In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [64]:

df = pd.read_csv('dataset.csv')

In [65]:
df.head()

Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
1,Watch: darvish gave hitter whiplash with slow ...,False
2,What do you call a turtle without its shell? d...,True
3,5 reasons the 2016 election feels so personal,False
4,"Pasco police shot mexican migrant from behind,...",False


In [66]:
df.isna().sum()

text     0
humor    0
dtype: int64

In [67]:
df['humor'].value_counts()

humor
False    100000
True     100000
Name: count, dtype: int64

In [68]:
def clean_text(text):
    """Normalize a raw text string before tokenization.

    Steps, in order: delete anything that looks like an HTML tag (``<...>``),
    delete every character that is not an ASCII letter or whitespace (digits
    and punctuation are removed, not replaced by a space), lowercase the
    result, and trim leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned, lowercased text. Interior whitespace is left untouched,
        so deleting a standalone token such as a number leaves a double space.
    """
    without_tags = re.sub(r'<.*?>', '', text)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_tags)
    # Trim whitespace at the start and at the end only.
    return letters_only.lower().strip()

df['text'] = df['text'].apply(clean_text)

In [69]:
df['text']

0              joe biden rules out  bid guys im not running
1         watch darvish gave hitter whiplash with slow p...
2          what do you call a turtle without its shell dead
3                   reasons the  election feels so personal
4         pasco police shot mexican migrant from behind ...
                                ...                        
199995    conor maynard seamlessly fits oldschool rb hit...
199996    how to you make holy water you boil the hell o...
199997    how many optometrists does it take to screw in...
199998    mcdonalds will officially kick off allday brea...
199999    an irish man walks on the street and ignores a...
Name: text, Length: 200000, dtype: object

In [70]:
# Fetch the NLTK stopword corpus and build a set of the English stopwords;
# remove_stopwords (defined right after) filters tokens against this set.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Drop every whitespace-separated token that is in ``stop_words``.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.

    Returns
    -------
    str
        The surviving tokens, in their original order, joined by single
        spaces.
    """
    # NOTE(review): the column this is applied to has already been through
    # clean_text, which strips apostrophes, so stopword entries that contain
    # an apostrophe presumably never match here -- confirm this is intended.
    return ' '.join(token for token in text.split() if token not in stop_words)

df['text'] = df['text'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Oleg_PC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [71]:
df['text']

0                       joe biden rules bid guys im running
1             watch darvish gave hitter whiplash slow pitch
2                            call turtle without shell dead
3                           reasons election feels personal
4         pasco police shot mexican migrant behind new a...
                                ...                        
199995    conor maynard seamlessly fits oldschool rb hit...
199996                            make holy water boil hell
199997    many optometrists take screw lightbulb one two...
199998    mcdonalds officially kick allday breakfast oct...
199999    irish man walks street ignores bar muahahaha l...
Name: text, Length: 200000, dtype: object

In [72]:
# Fetch the NLTK 'wordnet' package and create the single WordNetLemmatizer
# instance that lemmatize_text (defined right after) reuses for every row.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token with the shared ``lemmatizer``.

    Every token is passed to ``lemmatizer.lemmatize`` with no part-of-speech
    argument, so the lemmatizer's default is used.

    Parameters
    ----------
    text : str
        Text to lemmatize; it is split on whitespace.

    Returns
    -------
    str
        The lemmatized tokens, in their original order, joined by single
        spaces.
    """
    return ' '.join(map(lemmatizer.lemmatize, text.split()))

df['text'] = df['text'].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Oleg_PC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [73]:
df['text']

0                         joe biden rule bid guy im running
1             watch darvish gave hitter whiplash slow pitch
2                            call turtle without shell dead
3                             reason election feel personal
4         pasco police shot mexican migrant behind new a...
                                ...                        
199995    conor maynard seamlessly fit oldschool rb hit ...
199996                            make holy water boil hell
199997    many optometrist take screw lightbulb one two ...
199998    mcdonalds officially kick allday breakfast oct...
199999    irish man walk street ignores bar muahahaha li...
Name: text, Length: 200000, dtype: object

In [74]:
# Features are the preprocessed text column; the target is the humor label.
X, y = df['text'], df['humor']

# Hold out half of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

# CountVectorizer

In [75]:
# Encode the text with the bag-of-words method. The vocabulary is fitted on
# the training split only and then reused to transform the test split.
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Train the classifier (fit returns the fitted estimator itself).
nb_classifier = MultinomialNB().fit(X_train_bow, y_train)

# Predict on the test split.
y_pred_bow_st = nb_classifier.predict(X_test_bow)

In [76]:
def print_metrics_log_for_y_pred(y_test, y_pred):
    """Print accuracy, precision, recall, F1 and the classification report.

    Each score is computed and printed before the next one is computed, one
    per line as ``<name>: <value>``, followed by two blank lines, a
    ``Classification Report:`` header and the scikit-learn report text.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    for label, scorer in scorers:
        print(f"{label}: {scorer(y_test, y_pred)}")
    print("\n\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
print_metrics_log_for_y_pred(y_test,y_pred_bow_st)

Accuracy: 0.89831
Precision: 0.8879811713446538
Recall: 0.9119092325516359
F1 Score: 0.8997861500103476


Classification Report:
              precision    recall  f1-score   support

       False       0.91      0.88      0.90     49938
        True       0.89      0.91      0.90     50062

    accuracy                           0.90    100000
   macro avg       0.90      0.90      0.90    100000
weighted avg       0.90      0.90      0.90    100000


# TF-IDF

In [77]:
# Encode the text with the TF-IDF method. The vectorizer is fitted on the
# training split only and then reused to transform the test split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train the classifier (fit returns the fitted estimator itself).
logreg_classifier = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict on the test split.
y_pred_tfidf_st = logreg_classifier.predict(X_test_tfidf)


In [78]:
print_metrics_log_for_y_pred(y_test,y_pred_tfidf_st)

Accuracy: 0.89748
Precision: 0.9001005025125628
Recall: 0.8944908313691023
F1 Score: 0.8972868993708172


Classification Report:
              precision    recall  f1-score   support

       False       0.89      0.90      0.90     49938
        True       0.90      0.89      0.90     50062

    accuracy                           0.90    100000
   macro avg       0.90      0.90      0.90    100000
weighted avg       0.90      0.90      0.90    100000


Для кожного типу кодування тексту навчити векторизатор на навчальній вибірці, перетворити навчальні та тестові тексти, навчити моделі з тюнінгом гіперпараметрів.

# CountVectorizer з тюнінгом гіперпараметрів

In [None]:

# Pipeline chaining the vectorizer and the classifier, so that vectorizer
# settings are tuned together with the model.
pipeline_bow = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# Parameters to tune; keys use the "<step name>__<parameter>" form.
# 2 * 2 * 2 * 3 = 24 candidate combinations.
parameters_bow = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__max_df': [0.75, 1.0],
    'vect__min_df': [1, 2],
    'clf__alpha': [0.1, 1.0, 10.0],
}

# 5-fold cross-validated search over the grid, using all available cores.
grid_search_bow = GridSearchCV(
    estimator=pipeline_bow,
    param_grid=parameters_bow,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_bow.fit(X_train, y_train)

print("Best parameters for Bag of Words:")
print(grid_search_bow.best_params_)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [ ]:
# Predict on the held-out test split with the fitted bag-of-words grid
# search, then print the metrics for those predictions.
y_pred_bow = grid_search_bow.predict(X_test)

print_metrics_log_for_y_pred(y_test, y_pred_bow)

# TF-IDF з тюнінгом гіперпараметрів

In [None]:
# Pipeline chaining the TF-IDF vectorizer and the classifier, so that
# vectorizer settings are tuned together with the model.
pipeline_tfidf = Pipeline(steps=[
    ('vect', TfidfVectorizer()),
    ('clf', LogisticRegression()),
])

# Parameters to tune; keys use the "<step name>__<parameter>" form.
# 2 * 2 * 2 * 3 = 24 candidate combinations.
parameters_tfidf = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__max_df': [0.75, 1.0],
    'vect__min_df': [1, 2],
    'clf__C': [0.1, 1.0, 10.0],
}

# 5-fold cross-validated search over the grid, using all available cores.
grid_search_tfidf = GridSearchCV(
    estimator=pipeline_tfidf,
    param_grid=parameters_tfidf,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search_tfidf.fit(X_train, y_train)

print("Best parameters for TF-IDF:")
print(grid_search_tfidf.best_params_)

In [None]:
# Predict on the held-out test split with the fitted TF-IDF grid search,
# then print the metrics for those predictions.
y_pred_tfidf = grid_search_tfidf.predict(X_test)

print_metrics_log_for_y_pred(y_test, y_pred_tfidf)

In [None]:
# The four scores reported for each model, in the order of the table rows.
metric_functions = (accuracy_score, precision_score, recall_score, f1_score)

# Compute the metrics for the tuned bag-of-words model.
accuracy_bow, precision_bow, recall_bow, f1_bow = [
    metric(y_test, y_pred_bow) for metric in metric_functions
]

# Compute the metrics for the tuned TF-IDF model.
accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf = [
    metric(y_test, y_pred_tfidf) for metric in metric_functions
]

# Build the comparison table: one row per metric, one column per model.
bow_scores = [accuracy_bow, precision_bow, recall_bow, f1_bow]
tfidf_scores = [accuracy_tfidf, precision_tfidf, recall_tfidf, f1_tfidf]
comparison_table = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-score'],
    'Bag of Words': bow_scores,
    'TF-IDF': tfidf_scores,
})

print(comparison_table)