# Naive Bayes & Random Forest

In [1]:
!pip uninstall -y atap_corpus
!pip install --quiet "git+ssh://git@github.com/Australian-Text-Analytics-Platform/atap_corpus.git@48ec8f6438a259876bb401ef567aeaf970fb3e71"

In [2]:
# dataset setup
import pandas as pd
from atap_corpus.corpus import Corpus

df = pd.read_excel("./all_sent_model_annot_no_duplicates_cfSGRR scores-700.xlsx")
df = df.loc[:, ['sentence', 'det', 'se', 'nat', 'hom', 'pos']]
corpus: Corpus = Corpus.from_dataframe(df, col_doc='sentence')
str(corpus)

'<DataFrameCorpus enlightened-jackrabbit size: 4278>'

In [3]:
import spacy
corpus.run_spacy(spacy.blank('en'))
corpus.uses_spacy()

Processing:   0%|          | 0/4278 [00:00<?, ?it/s]

True

In [4]:
corpus.add_dtm(lambda doc: list(t.text for t in doc), name='tokens')

In [5]:
# datasets setup
import numpy as np
from sklearn.model_selection import train_test_split

split = 0.2
datasets = dict(det=None, se=None, nat=None, hom=None, pos=None)
for clazz in datasets.keys():
    datasets[clazz] = dict()
    neutral = corpus.s.filter_by_item(name=clazz, items=0)
    biased = corpus.s.filter_by_item(name=clazz, items=1)
    balanced_corp = biased.join(neutral.sample(len(biased), rand_stat=42))
    
    X, Y = np.asarray(balanced_corp.dtms['tokens'].matrix.todense()), balanced_corp[clazz]
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=split, random_state=42)
    datasets[clazz]['X'] = X
    datasets[clazz]['Y'] = Y
    datasets[clazz]['X_train'] = X_train
    datasets[clazz]['y_train'] = y_train
    datasets[clazz]['X_test'] = X_test
    datasets[clazz]['y_test'] = y_test
datasets.keys()

dict_keys(['det', 'se', 'nat', 'hom', 'pos'])

# Classifications
+ [ ] is biased classification
+ [ ] classification per class
+ [ ] classification as multiclass

## Naive Bayes

In [6]:
from sklearn.naive_bayes import MultinomialNB

for clazz, dataset in datasets.items():
    mnb = MultinomialNB()   # each feature is multinomial since its frequencies.
    mnb.fit(dataset['X_train'], dataset['y_train'])
    y_preds = mnb.predict(dataset['X_test'])
    dataset['nb'] = y_preds

## Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

n_estimators = 100
for clazz, dataset in datasets.items():
    clf = RandomForestClassifier(n_estimators=n_estimators)
    clf.fit(dataset['X_train'], dataset['y_train'])
    y_preds = clf.predict(dataset['X_test'])
    dataset['rf'] = y_preds

# Evaluations

In [8]:
# reused for (NB, RF)
# accuracy, precision, recall for each class.
import numpy as np
from sklearn.metrics import classification_report
from typing import IO

def evaluate(y_pred: np.ndarray, y_true: np.ndarray, file: IO, labels=None, **kwargs):
    assert y_pred.shape == y_true.shape, "Mismatched shape between y_pred and y_true."
    assert file.writable(), "File is not writable."
    report = classification_report(y_pred=y_pred, y_true=y_true, output_dict=False, labels=labels, **kwargs)
    file.write(report)

In [9]:
# for Naive Bayes and Random Forest, evaluate for each class.
import io

classifier = "nb"
file = io.TextIOWrapper(io.BufferedWriter(io.FileIO(f"{classifier}.txt", mode='w')), encoding='utf-8')
s = io.StringIO()
for clazz, dataset in datasets.items():
    file.write("===" + clazz + "===\n")
    s.write("===" + clazz + "===\n")
    evaluate(datasets[clazz][classifier], datasets[clazz]['y_test'], file=file)
    evaluate(datasets[clazz][classifier], datasets[clazz]['y_test'], file=s)
file.close()
s.seek(0)
print(s.read()); s.close()

===det===
              precision    recall  f1-score   support

           0       0.58      0.48      0.53        29
           1       0.62      0.71      0.67        35

    accuracy                           0.61        64
   macro avg       0.60      0.60      0.60        64
weighted avg       0.61      0.61      0.60        64
===se===
              precision    recall  f1-score   support

           0       0.73      0.62      0.67        26
           1       0.64      0.75      0.69        24

    accuracy                           0.68        50
   macro avg       0.69      0.68      0.68        50
weighted avg       0.69      0.68      0.68        50
===nat===
              precision    recall  f1-score   support

           0       0.50      0.50      0.50         8
           1       0.33      0.33      0.33         6

    accuracy                           0.43        14
   macro avg       0.42      0.42      0.42        14
weighted avg       0.43      0.43      0.43    

In [10]:
classifier = "rf"
file = io.TextIOWrapper(io.BufferedWriter(io.FileIO(f"{classifier}.txt", mode='w')), encoding='utf-8')
s = io.StringIO()
for clazz, dataset in datasets.items():
    file.write("===" + clazz + "===\n")
    s.write("===" + clazz + "===\n")
    evaluate(datasets[clazz][classifier], datasets[clazz]['y_test'], file=file)
    evaluate(datasets[clazz][classifier], datasets[clazz]['y_test'], file=s)
file.close()
s.seek(0)
print(s.read()); s.close()

===det===
              precision    recall  f1-score   support

           0       0.55      0.55      0.55        29
           1       0.63      0.63      0.63        35

    accuracy                           0.59        64
   macro avg       0.59      0.59      0.59        64
weighted avg       0.59      0.59      0.59        64
===se===
              precision    recall  f1-score   support

           0       0.75      0.69      0.72        26
           1       0.69      0.75      0.72        24

    accuracy                           0.72        50
   macro avg       0.72      0.72      0.72        50
weighted avg       0.72      0.72      0.72        50
===nat===
              precision    recall  f1-score   support

           0       0.57      0.50      0.53         8
           1       0.43      0.50      0.46         6

    accuracy                           0.50        14
   macro avg       0.50      0.50      0.50        14
weighted avg       0.51      0.50      0.50    