# Naive Bayes

In [1]:
# (Re)load IPython's autoreload extension and select mode 2, so that edits to
# imported modules are picked up before each cell runs.
%reload_ext autoreload
%autoreload 2

## Imports

In [2]:
import bz2
import os
import pickle
import sys

import matplotlib.pyplot as plt
import pandas as pd
import scipy as sp
import seaborn as sns
from IPython.display import display
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

sys.path.append(os.path.abspath('../src'))
from fact_classification import *

2023-04-17 20:58:25.612993: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load datafiles

In [3]:
# Load the three data frames (local=True is forwarded to the project loader).
df, df_crowdsourced, df_ground_truth = data_loading(local=True)

# Impute missing Sentiment values with the mean Sentiment of the rows whose
# Verdict equals -1.
fallback_sentiment = df.loc[df['Verdict'] == -1, 'Sentiment'].mean()
df['Sentiment'] = df['Sentiment'].fillna(fallback_sentiment)

## Load features
Load the features matrix that we generated in the `feature_generation.ipynb` notebook. This is a large sparse matrix, so we convert it to Compressed Sparse Row (CSR) format to avoid running out of memory when fitting our models.

In [4]:
# Read the bz2-compressed pickle holding the feature frame.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with bz2.open('../results/df_features.bz2', 'rb') as features_file:
    df_features = pickle.load(features_file)


The Naive Bayes classifier does not accept negative values, so we scale the normalized data to values between 0 and 1 using the `MinMaxScaler`:

In [5]:
# Rescale every feature column with MinMaxScaler, then store the result as a
# compressed sparse row (CSR) matrix.
# NOTE(review): to_numpy() materialises the full dense matrix before the CSR
# conversion, so peak memory is that of the dense array.
# NOTE(review): the scaler is fitted on all rows, including the rows that are
# later assigned to the test split.
scaler = MinMaxScaler()
X = sp.sparse.csr_matrix(scaler.fit_transform(df_features.to_numpy()))

## Split data and generate indexes

We split the dataset according to the instructions in the assignment, where data up until and including year 2008 will be used for training, and data after 2008 will be used for testing. Here we also generate indexes for the various feature sets.

In [39]:
# Split the data; idx_train is used below as a row selector into X
# (presumably a boolean mask, since it is negated with ~ -- verify).
df_train, df_test, idx_train = test_train_split(df)

# Feature rows for each side of the split
X_train, X_test = X[idx_train], X[~idx_train]

# Target labels for the full data set and for each side of the split
y = df['Verdict']
y_train, y_test = df_train['Verdict'], df_test['Verdict']

# Boolean column masks over df_features.columns, one per feature family:
# numeric columns Sentiment and Length
col_idx_n = df_features.columns.isin(['Sentiment', 'Length'])

# TF-IDF features on the raw Text column with n-grams=1
col_idx_w1 = df_features.columns.str.startswith('W1_')

# TF-IDF features on the raw Text column with n-grams=2
col_idx_w2 = df_features.columns.str.startswith('W2_')

# TF-IDF features on the stemmed text with n-grams=1
col_idx_ws = df_features.columns.str.startswith('WS_')

# POS features
col_idx_p = df_features.columns.str.startswith('P_')

# NER labels
col_idx_e = df_features.columns.str.startswith('E_')

## Naive Bayes - MultinomialNB

In [40]:
# Candidate hyper-parameters explored by the grid search
param_grid = dict(
    alpha=[0.001, 0.01, 0.1, 0.5, 1, 5, 10, 100],
    fit_prior=[True, False],
)

# Base estimator wrapped by the search
nbc = MultinomialNB()

# 4-fold cross-validated grid search scored on weighted F1, using all cores
# and keeping the training scores alongside the validation scores.
clf = GridSearchCV(
    nbc,
    param_grid,
    scoring='f1_weighted',
    cv=4,
    n_jobs=-1,
    return_train_score=True,
    verbose=0,
)

## Define experiments

In [41]:
# Base column masks, keyed by the letter used in the experiment names
base_masks = {
    'N': col_idx_n,
    'W': col_idx_w1,
    'P': col_idx_p,
    'E': col_idx_e,
}


def union_of_masks(name):
    """Return the element-wise OR of the base masks named in `name`.

    `name` is an underscore-separated list of base-mask letters, e.g.
    'N_W_P'. A single-letter name returns the base mask itself.
    """
    letters = name.split('_')
    mask = base_masks[letters[0]]
    for letter in letters[1:]:
        mask = mask | base_masks[letter]
    return mask


# Experiment name -> column mask; insertion order fixes the run order
experiments = {
    name: union_of_masks(name)
    for name in (
        'N', 'W', 'P', 'E',
        'N_W', 'N_P', 'N_E',
        'N_W_P', 'N_W_E',
        'N_W_P_E',
    )
}


## Run experiments

In [42]:
# Run every experiment, collecting the fitted model and the train/test score
# frames. Score frames are gathered in lists and concatenated once after the
# loop; concatenating inside the loop re-copies all earlier rows each time.
train_scores = []
test_scores = []
models = {}

for features, index in experiments.items():
    exp_clf, exp_score_train, exp_score_test = run_experiment(
        # Hand each experiment its own unfitted copy of the grid search.
        # NOTE(review): if run_experiment fits `clf` in place and returns it,
        # sharing one instance would make every entry in `models` point at the
        # last fit -- confirm against run_experiment.
        clf=clone(clf),
        X_train=X_train[:, index],
        y_train=y_train,
        X_test=X_test[:, index],
        y_test=y_test,
        annotations={
            'algorithm': 'NBC',
            'features': features}
    )
    models[features] = exp_clf
    # Print test results
    print('Test metrics:')
    display(exp_score_test)
    # Keep the per-experiment results for the final tables
    train_scores.append(exp_score_train)
    test_scores.append(exp_score_test)

# One row block per experiment, re-indexed 0..n-1. pd.concat rejects an empty
# list, so fall back to an empty frame when no experiments were run.
df_score_train = pd.concat(train_scores, ignore_index=True) if train_scores else pd.DataFrame()
df_score_test = pd.concat(test_scores, ignore_index=True) if test_scores else pd.DataFrame()

Running experiment with algorithm "NBC" and features "N"
Best parameters found: {'alpha': 100, 'fit_prior': False}
Test metrics:


Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,NBC,N,0.675,0.106,0.446,0.548,0.79,0.074,0.33,0.584,0.728,0.087,0.379,0.561


Running experiment with algorithm "NBC" and features "W"
Best parameters found: {'alpha': 0.5, 'fit_prior': True}
Test metrics:


Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,NBC,W,0.714,0.563,0.678,0.687,0.944,0.122,0.404,0.705,0.813,0.201,0.506,0.66


Running experiment with algorithm "NBC" and features "P"
Best parameters found: {'alpha': 100, 'fit_prior': False}
Test metrics:


Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,NBC,P,0.771,0.203,0.459,0.622,0.737,0.247,0.462,0.607,0.754,0.223,0.461,0.614


Running experiment with algorithm "NBC" and features "E"
Best parameters found: {'alpha': 100, 'fit_prior': False}
Test metrics:


Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,NBC,E,0.687,0.203,0.523,0.587,0.856,0.063,0.382,0.638,0.762,0.096,0.442,0.599


Running experiment with algorithm "NBC" and features "N_W"
Best parameters found: {'alpha': 1, 'fit_prior': False}
Test metrics:


Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,NBC,N_W,0.763,0.448,0.614,0.687,0.878,0.238,0.525,0.71,0.816,0.311,0.566,0.691


Running experiment with algorithm "NBC" and features "N_P"
Best parameters found: {'alpha': 100, 'fit_prior': False}
Test metrics:


Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,NBC,N_P,0.771,0.203,0.488,0.63,0.761,0.252,0.452,0.62,0.766,0.225,0.469,0.624


Running experiment with algorithm "NBC" and features "N_E"
Best parameters found: {'alpha': 100, 'fit_prior': False}
Test metrics:


Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,NBC,N_E,0.713,0.2,0.549,0.61,0.83,0.136,0.417,0.639,0.767,0.162,0.474,0.619


Running experiment with algorithm "NBC" and features "N_W_P"
Best parameters found: {'alpha': 0.5, 'fit_prior': True}
Test metrics:


Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,NBC,N_W_P,0.727,0.578,0.683,0.698,0.945,0.149,0.428,0.715,0.821,0.237,0.526,0.675


Running experiment with algorithm "NBC" and features "N_W_E"
Best parameters found: {'alpha': 0.5, 'fit_prior': True}
Test metrics:


Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,NBC,N_W_E,0.729,0.556,0.692,0.699,0.942,0.136,0.45,0.717,0.822,0.219,0.545,0.678


Running experiment with algorithm "NBC" and features "N_W_P_E"
Best parameters found: {'alpha': 0.5, 'fit_prior': True}
Test metrics:


Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,NBC,N_W_P_E,0.737,0.564,0.691,0.705,0.94,0.17,0.461,0.723,0.826,0.261,0.553,0.688


## Results

In [43]:
# Show both score tables, best weighted F1 first
for title, scores in (
    ('Training metrics:', df_score_train),
    ('Testing metrics:', df_score_test),
):
    print(title)
    display(scores.sort_values(by='f_wavg', ascending=False))

Training metrics:


Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
1,NBC,W,0.865,0.883,0.853,0.864,0.968,0.443,0.748,0.864,0.914,0.59,0.797,0.854
7,NBC,N_W_P,0.868,0.854,0.841,0.86,0.963,0.445,0.753,0.862,0.913,0.586,0.795,0.852
8,NBC,N_W_E,0.866,0.859,0.839,0.859,0.961,0.444,0.753,0.86,0.911,0.585,0.794,0.851
9,NBC,N_W_P_E,0.871,0.821,0.828,0.856,0.954,0.457,0.761,0.859,0.911,0.587,0.793,0.851
4,NBC,N_W,0.895,0.702,0.775,0.848,0.912,0.523,0.819,0.851,0.904,0.599,0.797,0.848
5,NBC,N_P,0.835,0.217,0.491,0.692,0.761,0.257,0.576,0.667,0.796,0.235,0.53,0.677
2,NBC,P,0.832,0.213,0.473,0.686,0.75,0.251,0.57,0.658,0.789,0.231,0.517,0.669
6,NBC,N_E,0.782,0.172,0.523,0.66,0.805,0.168,0.484,0.666,0.793,0.17,0.503,0.663
3,NBC,E,0.748,0.149,0.501,0.63,0.844,0.068,0.437,0.67,0.793,0.093,0.467,0.646
0,NBC,N,0.727,0.095,0.371,0.58,0.682,0.092,0.442,0.566,0.704,0.094,0.403,0.572


Testing metrics:


Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
4,NBC,N_W,0.763,0.448,0.614,0.687,0.878,0.238,0.525,0.71,0.816,0.311,0.566,0.691
9,NBC,N_W_P_E,0.737,0.564,0.691,0.705,0.94,0.17,0.461,0.723,0.826,0.261,0.553,0.688
8,NBC,N_W_E,0.729,0.556,0.692,0.699,0.942,0.136,0.45,0.717,0.822,0.219,0.545,0.678
7,NBC,N_W_P,0.727,0.578,0.683,0.698,0.945,0.149,0.428,0.715,0.821,0.237,0.526,0.675
1,NBC,W,0.714,0.563,0.678,0.687,0.944,0.122,0.404,0.705,0.813,0.201,0.506,0.66
5,NBC,N_P,0.771,0.203,0.488,0.63,0.761,0.252,0.452,0.62,0.766,0.225,0.469,0.624
6,NBC,N_E,0.713,0.2,0.549,0.61,0.83,0.136,0.417,0.639,0.767,0.162,0.474,0.619
2,NBC,P,0.771,0.203,0.459,0.622,0.737,0.247,0.462,0.607,0.754,0.223,0.461,0.614
3,NBC,E,0.687,0.203,0.523,0.587,0.856,0.063,0.382,0.638,0.762,0.096,0.442,0.599
0,NBC,N,0.675,0.106,0.446,0.548,0.79,0.074,0.33,0.584,0.728,0.087,0.379,0.561


## Save results to file

In [52]:
# Write the train/test scoring metrics to file
score_saving(df_score_train, df_score_test, fname='NaiveBayes_score')

# Write the dict of trained models to a bz2-compressed pickle
with bz2.open('../models/NaiveBayes.bz2', mode='wb') as models_file:
    pickle.dump(models, models_file)

## Export results to LaTeX

In [None]:
# Export the training scores with to_latex (not defined in this notebook;
# presumably provided by the fact_classification star import -- verify).
to_latex(df_score_train)

In [None]:
# Export the test scores with to_latex (not defined in this notebook;
# presumably provided by the fact_classification star import -- verify).
to_latex(df_score_test)