In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# import fasttext.util
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from gensim.models import FastText
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
import random
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors

pip install gensim

pip install nltk

In [134]:
# Read Twitter_Data.csv into a DataFrame; the path is relative to the
# directory the kernel was started in.
file_path = 'Twitter_Data.csv'
df = pd.read_csv(file_path)

In [135]:
# Fetch the NLTK stop-word corpus and keep the English list as a set so that
# membership tests in preprocess_text are cheap.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# The WordNet corpus is the data the WordNetLemmatizer looks words up in.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Oleg_PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Oleg_PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [136]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [137]:
# List the distinct label values, including any missing ones.
df['category'].unique()

array([-1.,  0.,  1., nan])

In [138]:
# Remove rows that have no label. This rebinds `df`, so the unfiltered frame
# is only recoverable by re-running the cell that reads the CSV.
df = df.dropna(subset=['category'])

In [139]:
# Re-check the distinct labels after dropping rows with a missing category.
df['category'].unique()

array([-1.,  0.,  1.])

In [140]:
def preprocess_text(text):
    """Normalize a raw text value into a space-joined string of lemmas.

    Missing values (as detected by ``pd.isnull``) become an empty string.
    Otherwise the text is lower-cased, every character other than ``a``-``z``
    and whitespace is removed, tokens found in the module-level ``stop_words``
    are dropped, and each remaining token is passed through the module-level
    ``lemmatizer``.
    """
    if pd.isnull(text):
        return ""

    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    lemmas = []
    for token in letters_only.split():
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [141]:
# Add a cleaned copy of every tweet; rows whose clean_text is missing get ''
# because preprocess_text maps null values to an empty string.
df['processed_text'] = df['clean_text'].apply(preprocess_text)

In [142]:
# Model input is the preprocessed text; the target is the category label.
X = df['processed_text']
y = df['category']

In [143]:
# Hold out 20% of the rows for testing; random_state makes the shuffle repeatable.
# NOTE(review): no `stratify=` is passed, so the class mix of the two splits is
# left to chance - confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [144]:
# Sanity check: sizes of the train/test features and labels.
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((130378,), (32595,), (130378,), (32595,))

In [145]:
# TF-IDF features followed by multinomial Naive Bayes.
# An earlier variant of this cell used TfidfVectorizer() with default settings;
# the version below instead ignores terms present in more than 90% of the
# documents (max_df), ignores terms present in fewer than 5 documents (min_df),
# and uses both unigrams and bigrams (ngram_range).
pipeline_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.9, min_df=5, ngram_range=(1, 2))),
    ('nb', MultinomialNB())
])

In [146]:
# Fit the baseline pipeline on the training split and score it on the test split.
pipeline_tfidf.fit(X_train, y_train)
y_pred = pipeline_tfidf.predict(X_test)
# zero_division=1 sets the value reported for a metric whose denominator is zero.
report = classification_report(y_test, y_pred, zero_division=1)


In [147]:
# Show the per-class metrics of the baseline TF-IDF + Naive Bayes model.
print(report)

              precision    recall  f1-score   support

        -1.0       0.85      0.39      0.53      7230
         0.0       0.81      0.61      0.70     10961
         1.0       0.63      0.92      0.75     14404

    accuracy                           0.70     32595
   macro avg       0.76      0.64      0.66     32595
weighted avg       0.74      0.70      0.68     32595


In [148]:
# Candidate values for the `alpha` parameter of the 'nb' pipeline step.
param_grid = {
    'nb__alpha': [0.1, 0.5, 1.0, 1.5, 2.0]  
}


# Exhaustive search over the grid with 5-fold cross-validation on the
# training split only, ranked by accuracy.
grid_search = GridSearchCV(pipeline_tfidf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Keep the winning setting and its mean cross-validated score for display.
best_params = grid_search.best_params_
best_score = grid_search.best_score_


In [149]:
# Show the best alpha found by the grid search and its cross-validation accuracy.
print(best_params)
print(best_score)

{'nb__alpha': 0.1}
0.7193774912980231


In [150]:
# Score the estimator selected by the grid search on the held-out test split.
# `y_pred` and `report` are rebound here, replacing the baseline values.
best_pipeline = grid_search.best_estimator_
y_pred = best_pipeline.predict(X_test)
report = classification_report(y_test, y_pred, zero_division=1)

print(report)


              precision    recall  f1-score   support

        -1.0       0.72      0.54      0.62      7230
         0.0       0.81      0.66      0.73     10961
         1.0       0.68      0.86      0.76     14404

    accuracy                           0.72     32595
   macro avg       0.74      0.69      0.70     32595
weighted avg       0.73      0.72      0.72     32595


In [151]:
# Train a FastText model on the training texts only.
# Each text is split on whitespace into a list of words.
sentences = [text.split() for text in X_train]
# 100-dimensional vectors, context window of 5, every word kept (min_count=1),
# skip-gram training (sg=1), 10 passes over the corpus, 4 worker threads.
# NOTE(review): no `seed` is passed, so vectors may differ between runs.
ft_model = FastText(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4, sg=1, epochs=10)


In [152]:
class FastTextVectorizer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that maps each text to the mean of its word vectors.

    ``model`` must expose ``wv`` (supporting ``in`` and ``[]`` by word) and
    ``vector_size``. It is used as given; this class does not train it.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return the transformer itself."""
        return self

    def _embed(self, text):
        """Average the vectors of the whitespace tokens of ``text`` found in ``model.wv``."""
        word_vectors = self.model.wv
        found = [word_vectors[token] for token in text.split() if token in word_vectors]
        if not found:
            # No token had a vector (for example, empty text): fall back to zeros.
            return np.zeros(self.model.vector_size)
        return np.mean(found, axis=0)

    def transform(self, X, **transform_params):
        """Return an array holding one averaged vector per text in ``X``."""
        return np.array([self._embed(text) for text in X])


In [153]:
# Averaged FastText vectors fed into logistic regression.
# The vectorizer stores the `ft_model` object that exists when this cell runs.
# class_weight='balanced' reweights classes inversely to their frequency;
# max_iter=1000 raises the solver's iteration limit.
pipeline_ft = Pipeline([
    ('ft_vectorizer', FastTextVectorizer(ft_model)),
    ('lr', LogisticRegression(class_weight='balanced', max_iter=1000))
])

In [154]:
# Fit the FastText + logistic regression pipeline on the training split.
pipeline_ft.fit(X_train, y_train)

# Evaluate the model on the held-out test split.
y_pred = pipeline_ft.predict(X_test)
report_ft = classification_report(y_test, y_pred, zero_division=1)

print(report_ft)

              precision    recall  f1-score   support

        -1.0       0.41      0.60      0.49      7230
         0.0       0.59      0.59      0.59     10961
         1.0       0.67      0.51      0.58     14404

    accuracy                           0.56     32595
   macro avg       0.56      0.57      0.55     32595
weighted avg       0.59      0.56      0.56     32595


In [155]:
# Search space for the 'lr' step of pipeline_ft (3 x 2 x 2 = 12 combinations).
param_dist = {
    'lr__C': [0.1, 1, 10],               # Regularization values
    'lr__solver': ['liblinear', 'saga'],  # Solvers that support `class_weight`
    'lr__class_weight': ['balanced', None]
}

In [156]:
# Randomized search: 10 sampled settings (out of the 12 in param_dist),
# 5-fold cross-validation, ranked by accuracy; random_state fixes the sampling.
random_search = RandomizedSearchCV(
    pipeline_ft, param_distributions=param_dist, 
    n_iter=10, cv=5, scoring='accuracy', random_state=42
)

In [157]:
# Run the randomized search on the training split only.
random_search.fit(X_train, y_train)

In [158]:
# Rebinds `best_params` / `best_score`, which previously held the grid-search
# results for the TF-IDF pipeline.
best_params = random_search.best_params_
best_score = random_search.best_score_


print("Best Parameters:", best_params)
print("Best Cross-Validation Accuracy:", best_score)

Best Parameters: {'lr__solver': 'saga', 'lr__class_weight': None, 'lr__C': 1}
Best Cross-Validation Accuracy: 0.5872616752535585


In [159]:
# Predict on the held-out test split through the fitted search object and
# report per-class metrics.
y_pred = random_search.predict(X_test)
report = classification_report(y_test, y_pred, zero_division=1)
print(report)

              precision    recall  f1-score   support

        -1.0       0.51      0.29      0.37      7230
         0.0       0.59      0.58      0.59     10961
         1.0       0.59      0.74      0.66     14404

    accuracy                           0.58     32595
   macro avg       0.57      0.53      0.54     32595
weighted avg       0.58      0.58      0.57     32595


In [160]:
import gradio as gr

In [161]:
def classify_tfidf_naive_bayes(text):
    """Classify one raw text with the TF-IDF + Naive Bayes pipeline.

    The text is run through ``preprocess_text`` and passed as a one-element
    batch to the module-level ``pipeline_tfidf``.

    Returns:
        A display string of the form ``"Predicted Category: <label>"``.
    """
    # NOTE(review): this uses `pipeline_tfidf`, not the estimator selected by
    # the grid search - confirm which model the demo is meant to serve.
    prediction = pipeline_tfidf.predict([preprocess_text(text)])[0]
    return f"Predicted Category: {prediction}"

# Classification function for FastText + Logistic Regression
def classify_fasttext_logreg(text):
    """Classify one raw text with the FastText + logistic regression pipeline.

    The text is run through ``preprocess_text`` and passed as a one-element
    batch to the module-level ``pipeline_ft``.

    Returns:
        A display string of the form ``"Predicted Category: <label>"``.
    """
    # NOTE(review): this uses `pipeline_ft`, not the estimator selected by
    # the randomized search - confirm which model the demo is meant to serve.
    prediction = pipeline_ft.predict([preprocess_text(text)])[0]
    return f"Predicted Category: {prediction}"

In [162]:
# Two-tab Gradio UI: one tab per classifier, each with a text box, a label
# for the result, and a button that calls the matching classify_* function.
with gr.Blocks() as demo:
    gr.Markdown("## Text Classification Demo")
    
    # Tab 1: TF-IDF + Naive Bayes.
    with gr.Tab("TF-IDF + Naive Bayes"):
        with gr.Row():
            input_text_tfidf = gr.Textbox(label="Enter text for TF-IDF + Naive Bayes")
            output_label_tfidf = gr.Label()
        classify_button_tfidf = gr.Button("Classify")
        # On click, send the text box content to the classifier and show its return value.
        classify_button_tfidf.click(fn=classify_tfidf_naive_bayes, inputs=input_text_tfidf, outputs=output_label_tfidf)
    
    # Tab 2: FastText + Logistic Regression.
    with gr.Tab("FastText + Logistic Regression"):
        with gr.Row():
            input_text_fasttext = gr.Textbox(label="Enter text for FastText + Logistic Regression")
            output_label_fasttext = gr.Label()
        classify_button_fasttext = gr.Button("Classify")
        # On click, send the text box content to the classifier and show its return value.
        classify_button_fasttext.click(fn=classify_fasttext_logreg, inputs=input_text_fasttext, outputs=output_label_fasttext)


This product is fantastic! It really helped me improve my workflow.

I am really disappointed with the quality of this service.

The meeting was scheduled to discuss the progress of the project.

In [163]:
# Start the Gradio app for the classification demo.
demo.launch()

* Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




QA

In [164]:
# Read the question/answer CSV into a second DataFrame.
# This rebinds `file_path`, which earlier pointed at the Twitter CSV.
file_path = 'DataScienceBasics_QandA - Sheet1.csv'
df2 = pd.read_csv(file_path)

In [165]:
# Preview the first question/answer rows.
df2.head()

Unnamed: 0,Id,Question,Answer
0,1,What is data science?,Data science is an interdisciplinary field tha...
1,2,What are the key steps in the data science pro...,The key steps typically include problem defini...
2,3,What is the difference between supervised and ...,Supervised learning involves training a model ...
3,4,Explain the bias-variance tradeoff.,The bias-variance tradeoff is the balance betw...
4,5,What is feature engineering?,Feature engineering is the process of selectin...


In [166]:
# Creates a fresh WordNetLemmatizer and rebinds the global `lemmatizer`
# that was already created in the earlier NLTK setup cell.
lemmatizer = WordNetLemmatizer()

In [167]:
def preprocess_text(text):
    """Normalize a raw text value into a space-joined string of lemmas.

    This definition replaces the earlier ``preprocess_text`` in the notebook,
    so it keeps the same handling of missing values: anything ``pd.isnull``
    reports as missing (``None``, ``NaN``) becomes an empty string instead of
    failing on ``.lower()``.

    Args:
        text: A string, or a missing value.

    Returns:
        The cleaned text: lower-cased, restricted to ``a``-``z`` and
        whitespace, with stop words removed and remaining words lemmatized.
    """
    # Missing cells in a CSV column arrive as NaN (a float), which has no .lower().
    if pd.isnull(text):
        return ""
    # Convert to lower case
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove stop words and lemmatize the remaining words
    text = ' '.join(lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words)
    return text

In [168]:
# Add cleaned versions of both the question and the answer columns.
df2['processed_question'] = df2['Question'].apply(preprocess_text)
df2['processed_answer'] = df2['Answer'].apply(preprocess_text)


In [169]:
# Training corpus: every processed question followed by every processed answer,
# each split on whitespace into a list of words.
sentences = [q.split() for q in df2['processed_question']] + [a.split() for a in df2['processed_answer']]
# Rebinds `sentences` and `ft_model`: from here on `ft_model` is the model
# trained on the Q&A text. `pipeline_ft` was built with the earlier model
# object, so re-running that cell after this one would pick up this model instead.
ft_model = FastText(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4, sg=1, epochs=10)

In [170]:
def get_text_vector(text, model):
    """Return the mean word vector of ``text`` under ``model``.

    ``text`` is split on whitespace; tokens not present in ``model.wv`` are
    skipped. When no token has a vector, a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    vectors = []
    for token in text.split():
        if token in model.wv:
            vectors.append(model.wv[token])

    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)


In [171]:
# Store one averaged FastText vector per processed question and per processed answer.
df2['question_vector'] = df2['processed_question'].apply(lambda x: get_text_vector(x, ft_model))
df2['answer_vector'] = df2['processed_answer'].apply(lambda x: get_text_vector(x, ft_model))


In [172]:
def get_top_k_answers(query, data, model, k=5):
    """Return the ``k`` answers whose vectors are most similar to ``query``.

    Args:
        query: Raw question text; it is preprocessed here.
        data: DataFrame with an ``'Answer'`` column and an ``'answer_vector'``
            column holding one vector per row.
        model: Model passed to ``get_text_vector`` to embed the query.
        k: Number of answers to return. Whole-number floats (as some UI
            widgets deliver) are accepted.

    Returns:
        A list of ``'Answer'`` values, most similar first. The list is empty
        when ``k`` is not positive or ``data`` has no rows, and holds every
        answer when ``k`` exceeds the number of rows.
    """
    # A slice bound must be an int; a float k would raise TypeError below.
    k = int(k)
    # `[-0:]` selects the whole array and a negative k slices from the front,
    # so non-positive values are handled explicitly. An empty frame has
    # nothing to rank.
    if k <= 0 or len(data) == 0:
        return []
    query = preprocess_text(query)
    query_vector = get_text_vector(query, model)
    similarities = cosine_similarity([query_vector], list(data['answer_vector']))[0]
    # argsort is ascending: take the last k positions and reverse them.
    top_k_indices = np.argsort(similarities)[-k:][::-1]
    return data.iloc[top_k_indices]['Answer'].tolist()

In [173]:
def evaluate_top_k_accuracy(data, model, k=5):
    """Fraction of rows whose own answer is among the top-``k`` retrieved answers.

    Args:
        data: DataFrame with an ``'Answer'`` column plus whatever
            ``get_top_k_answers`` needs. The query text is read from
            ``'Question'`` when that column exists, otherwise from
            ``'processed_question'``.
        model: Model forwarded to ``get_top_k_answers``.
        k: Number of retrieved answers to consider per question.

    Returns:
        Accuracy in ``[0, 1]``; ``0.0`` when ``data`` has no rows.
    """
    total = len(data)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    # get_top_k_answers preprocesses its query itself. Feeding it the already
    # processed text would lemmatize every word a second time, which can
    # change tokens, so prefer the raw question when it is available.
    question_column = 'Question' if 'Question' in data.columns else 'processed_question'

    correct_count = 0
    for _, row in data.iterrows():
        top_k_answers = get_top_k_answers(row[question_column], data, model, k=k)
        if row['Answer'] in top_k_answers:
            correct_count += 1

    return correct_count / total

In [174]:
# Retrieval accuracy on the Q&A frame for k = 1, 3 and 5. The same rows are
# used both as queries and as the answer pool.
print("Top-1 Accuracy:", evaluate_top_k_accuracy(df2, ft_model, k=1))
print("Top-3 Accuracy:", evaluate_top_k_accuracy(df2, ft_model, k=3))
print("Top-5 Accuracy:", evaluate_top_k_accuracy(df2, ft_model, k=5))

Top-1 Accuracy: 0.215
Top-3 Accuracy: 0.425
Top-5 Accuracy: 0.53


In [175]:
def select_correct_question(data, model):
    """Run one random four-way "which question matches this answer" trial.

    A row is picked at random; its processed question is mixed with the
    processed questions of three other randomly sampled rows, and the option
    whose vector is most cosine-similar to the row's processed answer is taken
    as the prediction.

    Returns:
        ``True`` when the predicted option equals the picked row's question.
    """
    target_index = random.choice(data.index)
    target_question = data.loc[target_index, 'processed_question']
    target_answer = data.loc[target_index, 'processed_answer']

    # Three distractors drawn from every row except the target.
    other_indices = list(data.index.drop(target_index))
    distractor_questions = data.loc[random.sample(other_indices, 3), 'processed_question'].tolist()

    options = [target_question, *distractor_questions]
    random.shuffle(options)

    answer_vector = get_text_vector(target_answer, model)

    scores = []
    for candidate in options:
        candidate_vector = get_text_vector(candidate, model)
        scores.append(cosine_similarity([answer_vector], [candidate_vector])[0][0])

    return options[np.argmax(scores)] == target_question

In [176]:
def evaluate_question_selection(data, model, n_trials=100):
    """Share of ``n_trials`` random ``select_correct_question`` trials that succeed."""
    hits = 0
    for _ in range(n_trials):
        # Each trial yields a boolean; True counts as one hit.
        hits += select_correct_question(data, model)
    return hits / n_trials

In [177]:
def select_correct_answer(data, question, model):
    """Return the ``'Answer'`` of the row whose processed answer best matches ``question``.

    ``question`` is embedded as given (no preprocessing happens here) and
    compared by cosine similarity with the embedding of every value in
    ``data['processed_answer']``.
    """
    question_vector = get_text_vector(question, model)

    scores = []
    for answer_text in data['processed_answer']:
        answer_vector = get_text_vector(answer_text, model)
        scores.append(cosine_similarity([question_vector], [answer_vector])[0][0])

    # Position of the highest score, used positionally against the frame.
    return data.iloc[np.argmax(scores)]['Answer']

In [178]:
def evaluate_answer_selection(data, model):
    """Fraction of rows for which ``select_correct_answer`` returns the row's own answer.

    Args:
        data: DataFrame with ``'processed_question'`` and ``'Answer'`` columns,
            plus whatever ``select_correct_answer`` reads.
        model: Model forwarded to ``select_correct_answer``.

    Returns:
        Accuracy in ``[0, 1]``; ``0.0`` when ``data`` has no rows.
    """
    total = len(data)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0

    correct_count = 0
    for _, row in data.iterrows():
        predicted_answer = select_correct_answer(data, row['processed_question'], model)
        if predicted_answer == row['Answer']:
            correct_count += 1

    return correct_count / total

In [179]:
# Question selection relies on the unseeded `random` module, so its score
# can vary between runs; answer selection is computed over every row.
question_selection_accuracy = evaluate_question_selection(df2, ft_model)
answer_selection_accuracy = evaluate_answer_selection(df2, ft_model)

print("Accuracy for selecting the correct question:", question_selection_accuracy)
print("Accuracy for selecting the correct answer:", answer_selection_accuracy)

Accuracy for selecting the correct question: 0.78
Accuracy for selecting the correct answer: 0.215


In [180]:
def gradio_qa_system(question, k):
    """Gradio callback: return the top-``k`` answers for ``question`` as one string.

    Args:
        question: Raw question text from the text box.
        k: Number of answers to show, as delivered by the slider.

    Returns:
        The retrieved answers from the module-level ``df2`` / ``ft_model``,
        separated by blank lines.
    """
    # Gradio documents the slider value as a float, while the retrieval code
    # uses k as a slice bound, which must be an int.
    top_k_answers = get_top_k_answers(question, df2, ft_model, k=int(k))
    return "\n\n".join(top_k_answers)


# Single-tab Gradio UI for the retrieval demo. This rebinds `demo`, which
# earlier referred to the classification app.
with gr.Blocks() as demo:
    gr.Markdown("## Question Answering System Demo (Top-K)")
    
    with gr.Tab("Ask a Question"):
        with gr.Row():
            input_question = gr.Textbox(label="Enter your question here")
            # Slider limited to whole numbers from 1 to 5, starting at 3.
            input_k = gr.Slider(minimum=1, maximum=5, step=1, value=3, label="Top-K Answers")
            output_answers = gr.Textbox(label="Answers")
        
        submit_button = gr.Button("Get Answers")
        # On click, pass the question and the slider value to gradio_qa_system.
        submit_button.click(fn=gradio_qa_system, inputs=[input_question, input_k], outputs=output_answers)


In [181]:
# Start the Gradio app for the question-answering demo.
demo.launch()

* Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.


