# Naive Bayes

In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from gensim.models import Word2Vec
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import wandb
import numpy as np
import sys

# Import functions from preprocessing module
sys.path.append('..')
from utils.load_data import load_processed_data

## Load data

In [7]:
df = load_processed_data()
train_data = df["train"]
dev_data = df["dev"]

X_train = train_data['text']
y_train = train_data['label']
X_dev = dev_data['text']
y_dev = dev_data['label']

df: Train split loaded.
df: Dev split loaded.
df: Test split loaded.


In [8]:
train_data['label'].value_counts()

label
0    10602
1     3398
Name: count, dtype: int64

## TF-IDF

In [9]:
tfidf_params_list = [
    {'max_features': 3000, 'ngram_range': (1, 1), 'min_df': 5},
    {'max_features': 5000, 'ngram_range': (1, 2), 'min_df': 5},
    {'max_features': 10000, 'ngram_range': (1, 3), 'min_df': 3}
]

In [10]:
wandb.init(project="tfidf_naive_bayes", name="TF-IDF_Experiments")

for tfidf_params in tfidf_params_list:
    vectorizer = TfidfVectorizer(**tfidf_params)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_dev)

    # Train Naive Bayes classifier
    model = MultinomialNB()
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    # Evaluate
    accuracy = accuracy_score(y_dev, y_pred)
    print(f"TF-IDF Params: {tfidf_params}")
    print("Accuracy:", accuracy)
    print(classification_report(y_dev, y_pred))

    # Log results
    wandb.log({
        'tfidf_params': tfidf_params,
        'accuracy': accuracy,
        'confusion_matrix': confusion_matrix(y_dev, y_pred).tolist(),
        'classification_report': classification_report(y_dev, y_pred, output_dict=True)
    })

wandb.finish()

TF-IDF Params: {'max_features': 3000, 'ngram_range': (1, 1), 'min_df': 5}
Accuracy: 0.78975
              precision    recall  f1-score   support

           0       0.78      1.00      0.88      3030
           1       0.94      0.14      0.25       970

    accuracy                           0.79      4000
   macro avg       0.86      0.57      0.56      4000
weighted avg       0.82      0.79      0.72      4000

TF-IDF Params: {'max_features': 5000, 'ngram_range': (1, 2), 'min_df': 5}
Accuracy: 0.789
              precision    recall  f1-score   support

           0       0.78      1.00      0.88      3030
           1       0.91      0.14      0.25       970

    accuracy                           0.79      4000
   macro avg       0.85      0.57      0.56      4000
weighted avg       0.82      0.79      0.72      4000

TF-IDF Params: {'max_features': 10000, 'ngram_range': (1, 3), 'min_df': 3}
Accuracy: 0.7815
              precision    recall  f1-score   support

           0     

0,1
accuracy,█▇▁

0,1
accuracy,0.7815


## Word2Vec

In [11]:
word2vec_params_list = [
    {'vector_size': 50, 'window': 3, 'min_count': 1, 'workers': 4},
    {'vector_size': 100, 'window': 5, 'min_count': 2, 'workers': 4},
    {'vector_size': 200, 'window': 7, 'min_count': 3, 'workers': 4},
]

In [12]:
# Initialize W&B
wandb.init(project="word2vec_naive_bayes", name="Word2Vec_Settings_Experiments")

def compute_word2vec_features(text_data, model, vector_size):
    features = []
    for sentence in text_data:
        words = sentence.split()
        word_vecs = [model.wv[word] for word in words if word in model.wv]
        if word_vecs:
            features.append(np.mean(word_vecs, axis=0))
        else:
            features.append(np.zeros(vector_size))
    return np.array(features)

for params in word2vec_params_list:
    print(f"Training Word2Vec with params: {params}")
    
    # Train Word2Vec model
    sentences = [text.split() for text in X_train]
    word2vec_model = Word2Vec(sentences=sentences, **params)
    
    # Compute Word2Vec features
    X_train_word2vec = compute_word2vec_features(X_train, word2vec_model, params['vector_size'])
    X_test_word2vec = compute_word2vec_features(X_dev, word2vec_model, params['vector_size'])
    
    # Scale Word2Vec features
    X_train_word2vec_scaled = (X_train_word2vec - X_train_word2vec.min()) * 1000
    X_test_word2vec_scaled = (X_test_word2vec - X_test_word2vec.min()) * 1000

    # Train Naive Bayes classifier
    model = MultinomialNB()
    model.fit(X_train_word2vec_scaled, y_train)
    y_pred = model.predict(X_test_word2vec_scaled)

    # Evaluate
    accuracy = accuracy_score(y_dev, y_pred)
    print(f"Word2Vec Params: {params}")
    print("Accuracy:", accuracy)
    print(classification_report(y_dev, y_pred))

    # Log results
    wandb.log({
        'word2vec_params': params,
        'accuracy': accuracy,
        'confusion_matrix': confusion_matrix(y_dev, y_pred).tolist(),
        'classification_report': classification_report(y_dev, y_pred, output_dict=True)
    })

wandb.finish()

Training Word2Vec with params: {'vector_size': 50, 'window': 3, 'min_count': 1, 'workers': 4}
Word2Vec Params: {'vector_size': 50, 'window': 3, 'min_count': 1, 'workers': 4}
Accuracy: 0.662
              precision    recall  f1-score   support

           0       0.76      0.82      0.79      3030
           1       0.24      0.18      0.20       970

    accuracy                           0.66      4000
   macro avg       0.50      0.50      0.49      4000
weighted avg       0.63      0.66      0.64      4000

Training Word2Vec with params: {'vector_size': 100, 'window': 5, 'min_count': 2, 'workers': 4}
Word2Vec Params: {'vector_size': 100, 'window': 5, 'min_count': 2, 'workers': 4}
Accuracy: 0.5085
              precision    recall  f1-score   support

           0       0.80      0.47      0.59      3030
           1       0.27      0.63      0.38       970

    accuracy                           0.51      4000
   macro avg       0.54      0.55      0.49      4000
weighted avg      

0,1
accuracy,█▁▁

0,1
accuracy,0.51675
