# Naive Bayes

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from gensim.models import Word2Vec
from sklearn.metrics import f1_score, balanced_accuracy_score, accuracy_score, classification_report, confusion_matrix
import wandb
import numpy as np
import sys

# Make the parent directory importable so project-local modules can be found.
# NOTE(review): the cell that loads the data calls `load_processed_data()`, but
# no import statement for it is visible in this cell — presumably an import
# from the preprocessing module belongs right after the path tweak; confirm it
# was not dropped, otherwise a fresh-kernel "Run All" will raise NameError.
sys.path.append('..')

## Load data

In [7]:
# Load the preprocessed dataset; the result is indexed by split name below.
df = load_processed_data()
train_data, test_data, dev_data = df["train"], df["test"], df["dev"]

# For every split, "lemma" is used as the model input and "label" as the target.
X_train, X_test, X_dev = train_data["lemma"], test_data["lemma"], dev_data["lemma"]
y_train, y_test, y_dev = train_data["label"], test_data["label"], dev_data["label"]

df: Train split loaded.
df: Dev split loaded.
df: Test split loaded.


## TF-IDF

In [17]:
# Grid of TF-IDF settings to compare: every vocabulary size is paired with
# unigram-only and unigram+bigram features; min_df is held fixed at 5.
# Ordering is vocabulary size first, then n-gram range.
tfidf_params_list = [
    {'max_features': vocab_size, 'ngram_range': ngrams, 'min_df': 5}
    for vocab_size in (3000, 5000, 10000)
    for ngrams in ((1, 1), (1, 2))
]

In [18]:
# Sweep over the TF-IDF configurations: for each one, fit a Multinomial Naive
# Bayes classifier on the training split and log its dev-set metrics to a
# single W&B run.
#
# The run is used as a context manager so that it is always finished, even if
# one configuration raises part-way through the sweep (previously
# `wandb.finish()` was only reached when every iteration succeeded, leaving a
# dangling run in the kernel after a failure).
with wandb.init(project="online_sexism_detection", name="tfidf_naive_bayes") as run:
    for tfidf_params in tfidf_params_list:
        # Fit the vectorizer on the training split only, then apply the same
        # fitted vectorizer to the dev split.
        vectorizer = TfidfVectorizer(**tfidf_params)
        X_train_tfidf = vectorizer.fit_transform(X_train)
        X_dev_tfidf = vectorizer.transform(X_dev)

        # Train Naive Bayes classifier
        model = MultinomialNB()
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_dev_tfidf)

        # Evaluate
        accuracy = accuracy_score(y_dev, y_pred)
        print(f"TF-IDF Params: {tfidf_params}")
        print("Accuracy:", accuracy)
        print(classification_report(y_dev, y_pred))

        # Log results (one step per configuration); the accuracy computed
        # above is reused instead of being recomputed.
        run.log({
            'tfidf_params': tfidf_params,
            "f1": f1_score(y_dev, y_pred),
            "balanced_accuracy": balanced_accuracy_score(y_dev, y_pred),
            "accuracy": accuracy,
        })

TF-IDF Params: {'max_features': 3000, 'ngram_range': (1, 1), 'min_df': 5}
Accuracy: 0.7905
              precision    recall  f1-score   support

           0       0.79      1.00      0.88      1514
           1       0.92      0.15      0.26       486

    accuracy                           0.79      2000
   macro avg       0.85      0.57      0.57      2000
weighted avg       0.82      0.79      0.73      2000

TF-IDF Params: {'max_features': 3000, 'ngram_range': (1, 2), 'min_df': 5}
Accuracy: 0.7965
              precision    recall  f1-score   support

           0       0.79      0.99      0.88      1514
           1       0.90      0.18      0.30       486

    accuracy                           0.80      2000
   macro avg       0.85      0.59      0.59      2000
weighted avg       0.82      0.80      0.74      2000

TF-IDF Params: {'max_features': 5000, 'ngram_range': (1, 1), 'min_df': 5}
Accuracy: 0.7855
              precision    recall  f1-score   support

           0      

0,1
accuracy,▅█▂▄▂▁
balanced_accuracy,▄█▂▄▂▁
f1,▄█▂▄▂▁

0,1
accuracy,0.7825
balanced_accuracy,0.55876
f1,0.21622


## Result analysis

In [21]:
# Refit one TF-IDF + Naive Bayes model with a single fixed configuration
# (5000 features, unigrams and bigrams, min_df of 5) for per-split inspection.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=5)

# The vectorizer is fitted on the training split; dev is only transformed.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_dev_tfidf = vectorizer.transform(X_dev)

# Train Naive Bayes classifier on the training features.
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

### Train

In [31]:
# Score the fitted model on the training split.
X = vectorizer.transform(X_train)
y_pred = model.predict(X)

# Scalar metrics first, each printed as "<name> <value>".
for label, metric in (
    ("f1_score", f1_score),
    ("accuracy_score", accuracy_score),
    ("balanced_accuracy_score", balanced_accuracy_score),
):
    print(label, metric(y_train, y_pred))

# Then the per-class report followed by the confusion matrix.
for summary in (classification_report, confusion_matrix):
    print(summary(y_train, y_pred))

f1_score 0.3757115749525617
accuracy_score 0.812
balanced_accuracy_score 0.6153129569320658
              precision    recall  f1-score   support

           0       0.80      1.00      0.89     10602
           1       0.97      0.23      0.38      3398

    accuracy                           0.81     14000
   macro avg       0.89      0.62      0.63     14000
weighted avg       0.84      0.81      0.76     14000

[[10576    26]
 [ 2606   792]]


### Dev

In [30]:
# Score the fitted model on the dev split.
X = vectorizer.transform(X_dev)
y_pred = model.predict(X)

# Scalar metrics first, each printed as "<name> <value>".
for label, metric in (
    ("f1_score", f1_score),
    ("accuracy_score", accuracy_score),
    ("balanced_accuracy_score", balanced_accuracy_score),
):
    print(label, metric(y_dev, y_pred))

# Then the per-class report followed by the confusion matrix.
for summary in (classification_report, confusion_matrix):
    print(summary(y_dev, y_pred))

f1_score 0.25486725663716814
accuracy_score 0.7895
balanced_accuracy_score 0.5717623171388033
              precision    recall  f1-score   support

           0       0.78      1.00      0.88      1514
           1       0.91      0.15      0.25       486

    accuracy                           0.79      2000
   macro avg       0.85      0.57      0.57      2000
weighted avg       0.82      0.79      0.73      2000

[[1507    7]
 [ 414   72]]


### Test

In [29]:
# Score the fitted model on the test split.
X = vectorizer.transform(X_test)
y_pred = model.predict(X)

# Scalar metrics first, each printed as "<name> <value>".
for label, metric in (
    ("f1_score", f1_score),
    ("accuracy_score", accuracy_score),
    ("balanced_accuracy_score", balanced_accuracy_score),
):
    print(label, metric(y_test, y_pred))

# Then the per-class report followed by the confusion matrix.
for summary in (classification_report, confusion_matrix):
    print(summary(y_test, y_pred))

f1_score 0.3
accuracy_score 0.797
balanced_accuracy_score 0.5870504576230818
              precision    recall  f1-score   support

           0       0.79      0.99      0.88      3030
           1       0.92      0.18      0.30       970

    accuracy                           0.80      4000
   macro avg       0.85      0.59      0.59      4000
weighted avg       0.82      0.80      0.74      4000

[[3014   16]
 [ 796  174]]
