# Baseline: Naive Bayes

This notebook uses Naive Bayes, one of the two baseline classifiers, to classify GEF biases from sentences.

## 1. Upload your dataset

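Upload three datasets (train, val and test). Each must be in Excel (`.xlsx`) format and must contain a `sentence` column. The later cells also read the label columns `DE`, `SE`, `NA` and `HD`, filtering on the values 0 and 1.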

In [None]:
import panel as pn
# Load the Panel extension with notifications switched on; the upload check
# in a later cell reports missing files through pn.state.notifications.
pn.extension(notifications=True)

In [None]:
from causation.utils import fileuploader 

# One upload widget per split. For every split we keep the rendered row, the
# widget itself and the object returned alongside it by fileuploader (the
# loading cell reads its 'data' entry).
sets = ['train', 'val', 'test']
uploaded = {}
for set_ in sets:
    file_input, upload_state = fileuploader('.xlsx')
    split_label = pn.pane.Str(f"{set_} set:".rjust(10))
    uploaded[set_] = {
        'row': pn.Row(split_label, file_input),
        'finput': file_input,
        'upload': upload_state,
    }

# Last expression of the cell, so the stacked upload rows are displayed.
pn.Column('# Upload datasets', *[uploaded[set_]['row'] for set_ in sets])

In [None]:
# Stop early, with a visible notification, unless every split has uploaded data.
has_uploads = all(uploaded[set_]['upload'].get('data', False) for set_ in uploaded.keys())
if not has_uploads:
    pn.state.notifications.error('Did you upload all 3 datasets?', duration=10_000)
    raise Exception('Did you upload all 3 datasets?')
import pandas as pd
from atap_corpus.corpus import Corpus, Corpora
import spacy

# Read each uploaded workbook and tag its rows with the split they came from,
# so the splits can be recovered after everything is merged into one corpus.
dfs = []
for set_ in sets:
    split_df = pd.read_excel(uploaded[set_]['upload'].get('data'))
    split_df['set'] = set_
    dfs.append(split_df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every sheet contributes its own 0..k labels, leaving duplicate row labels
# that make later label-based alignment ambiguous.
df = pd.concat(dfs, axis=0, ignore_index=True)

# Build the corpus from the 'sentence' column, tokenise it with a blank
# English spaCy pipeline and register a DTM named 'tokens' built from the
# token texts.
corpus = Corpus.from_dataframe(df, col_doc='sentence')
print(f"Tokenising and building DTM for {corpus.name}...")
corpus.run_spacy(spacy.blank('en'))
assert corpus.uses_spacy(), "Corpus must be using spacy for spacy tokenisation."
corpus.add_dtm_from_docs(lambda doc: list(t.text for t in doc), name='tokens')
assert corpus.get_dtm('tokens') is not None, "Corpus tokens DTM was not built."
"Successful. Please continue."

In [None]:
import numpy as np

# Build one train/test dataset per bias class. Each class name doubles as the
# name of the label column that is filtered on (0 -> neutral, 1 -> biased).
datasets = dict.fromkeys(['DE', 'SE', 'NA', 'HD'])
for clazz in datasets:
    datasets[clazz] = {}

    neutral = corpus.s.filter_by_item(name=clazz, items=0)
    biased = corpus.s.filter_by_item(name=clazz, items=1)
    # Draw as many neutral rows as there are biased ones.
    # NOTE(review): rand_stat=42 is presumably a fixed sampling seed - confirm
    # the keyword against the atap_corpus `sample` signature.
    neutral_sample = neutral.sample(len(biased), rand_stat=42)
    balanced_corp = biased.join(neutral_sample)

    # 'val' rows are folded into the training portion; 'test' is held out.
    portions = {
        'train': balanced_corp.s.filter_by_item("set", ["train", "val"]),
        'test': balanced_corp.s.filter_by_item("set", "test"),
    }
    for portion_name, portion in portions.items():
        # Dense feature array from the 'tokens' DTM, labels from the class column.
        datasets[clazz][f'X_{portion_name}'] = np.asarray(portion.dtms['tokens'].matrix.todense())
        datasets[clazz][f'y_{portion_name}'] = np.array(portion[clazz].tolist())
datasets.keys()

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a fresh Multinomial Naive Bayes model per bias class and store its
# test-set predictions under the 'nb' key of that class's dataset.
for dataset in datasets.values():
    # Multinomial variant because the features are treated as frequencies.
    model = MultinomialNB()
    model.fit(dataset['X_train'], dataset['y_train'])
    dataset['nb'] = model.predict(dataset['X_test'])

In [None]:
import numpy as np
from sklearn.metrics import classification_report
from typing import IO

def evaluate(y_pred: np.ndarray, y_true: np.ndarray, file: IO, labels=None, **kwargs):
    """Write a text classification report for the given predictions to ``file``.

    Args:
        y_pred: Predicted labels.
        y_true: Reference labels; must have the same shape as ``y_pred``.
        file: Writable stream that receives the report text.
        labels: Passed through to ``classification_report`` as ``labels``.
        **kwargs: Further keyword arguments passed to ``classification_report``.

    Raises:
        AssertionError: If the two arrays differ in shape or ``file`` is not
            writable.
    """
    shapes_match = y_pred.shape == y_true.shape
    assert shapes_match, "Mismatched shape between y_pred and y_true."
    assert file.writable(), "File is not writable."
    # output_dict=False makes classification_report return the formatted text.
    file.write(
        classification_report(
            y_true=y_true,
            y_pred=y_pred,
            labels=labels,
            output_dict=False,
            **kwargs,
        )
    )

In [None]:
import io

# Collect the per-class reports for the chosen classifier key in an in-memory
# buffer, then print everything in one go.
classifier = "nb"
with io.StringIO() as s:
    for clazz in datasets:
        s.write(f"==={clazz}===\n")
        evaluate(datasets[clazz][classifier], datasets[clazz]['y_test'], file=s)
    print(s.getvalue())