# Baseline: Random Forest

This notebook uses Random Forest, one of the two baseline classifiers, to classify GEF biases based on sentences.

## 1. Upload your dataset

Your dataset must be in Excel (`.xlsx`) format and must contain a `sentence` column.

In [None]:
import panel as pn
# Load the Panel extension with notifications enabled; a later cell reports
# missing uploads through pn.state.notifications.
pn.extension(notifications=True)

In [None]:
from causation.utils import fileuploader 

# One upload widget per data split. For each split we keep the rendered row,
# the file-input widget, and the object that fileuploader returns alongside it
# (later cells read the uploaded file from it via its 'data' entry).
sets = ['train', 'val', 'test']
uploaded = {}
for set_ in sets:
    finput, uploaded_data = fileuploader('.xlsx')
    label = pn.pane.Str(f"{set_} set:".rjust(10))
    uploaded[set_] = {
        'row': pn.Row(label, finput),
        'finput': finput,
        'upload': uploaded_data,
    }

# Last expression of the cell: renders the three upload rows under a heading.
pn.Column('# Upload datasets', *[uploaded[set_]['row'] for set_ in sets])

In [None]:
import pandas as pd
from atap_corpus.corpus import Corpus, Corpora
import spacy

# Stop early (with a visible notification) unless every split has an upload.
has_uploads = all(uploaded[set_]['upload'].get('data', False) for set_ in uploaded.keys())
if not has_uploads:
    pn.state.notifications.error('Did you upload all 3 datasets?', duration=10_000)
    raise Exception('Did you upload all 3 datasets?')

# Read each split and tag its rows with the split name so the combined corpus
# can be separated into train/val/test again later.
dfs = []
for set_ in sets:
    split_df = pd.read_excel(uploaded[set_]['upload'].get('data'))
    if 'sentence' not in split_df.columns:
        raise ValueError(f"The {set_} dataset must contain a 'sentence' column.")
    split_df['set'] = set_
    dfs.append(split_df)

# ignore_index=True gives the combined frame unique row labels; without it each
# split contributes its own 0..n-1 labels and the index contains duplicates.
df = pd.concat(dfs, axis=0, ignore_index=True)

corpus = Corpus.from_dataframe(df, col_doc='sentence')
print(f"Tokenising and building DTM for {corpus.name}...")
# A blank English pipeline only tokenises; no trained model is loaded.
corpus.run_spacy(spacy.blank('en'))
assert corpus.uses_spacy(), "Corpus must be using spacy for spacy tokenisation."
# Document-term matrix over the raw token texts, stored under the name 'tokens'.
corpus.add_dtm_from_docs(lambda doc: list(t.text for t in doc), name='tokens')
assert corpus.get_dtm('tokens') is not None, "Corpus tokens DTM was not built."
"Successful. Please continue."

In [None]:
import numpy as np

def _to_xy(split, label):
    """Return (dense 'tokens' DTM as an ndarray, values of column `label` as an ndarray) for a corpus split."""
    features = np.asarray(split.dtms['tokens'].matrix.todense())
    targets = np.array(split[label].tolist())
    return features, targets


# Build one balanced binary dataset per bias class.
datasets = {}
for clazz in ('DE', 'SE', 'NA', 'HD'):
    neutral = corpus.s.filter_by_item(name=clazz, items=0)
    biased = corpus.s.filter_by_item(name=clazz, items=1)
    # Draw as many neutral rows as there are biased rows (fixed seed 42) and
    # join the two groups into a single corpus.
    balanced_corp = biased.join(neutral.sample(len(biased), rand_stat=42))

    # The validation split is folded into the training data.
    train = balanced_corp.s.filter_by_item("set", ["train", "val"])
    test = balanced_corp.s.filter_by_item("set", "test")
    X_train, y_train = _to_xy(train, clazz)
    X_test, y_test = _to_xy(test, clazz)

    datasets[clazz] = {
        'X_train': X_train,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
    }
datasets.keys()

In [None]:
from sklearn.ensemble import RandomForestClassifier

n_estimators = 100
# Fixed seed so the forest (bootstrap samples and feature choices) is the same
# on every run; without it the reported metrics change between runs.
random_state = 42

# Train one independent Random Forest per bias class and store its test-set
# predictions under the 'rf' key, which the evaluation cells read back.
for clazz, dataset in datasets.items():
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    clf.fit(dataset['X_train'], dataset['y_train'])
    y_preds = clf.predict(dataset['X_test'])
    dataset['rf'] = y_preds

In [None]:
import tempfile
from pathlib import Path

# Scratch directory for this run's outputs: the text report and the plot are
# written here, and its contents are later bundled into the zip download.
# mkdtemp() does not remove the directory automatically.
tmpd = Path(tempfile.mkdtemp())

In [None]:
import numpy as np
from sklearn.metrics import classification_report
from typing import IO
import io

def evaluate(y_pred: np.ndarray, y_true: np.ndarray, file: IO, labels=None, **kwargs):
    """Write a plain-text classification report for the predictions to `file`.

    Args:
        y_pred: predicted labels.
        y_true: ground-truth labels; must have the same shape as `y_pred`.
        file: writable text stream that receives the report.
        labels: forwarded to `classification_report` as `labels`.
        **kwargs: any further keyword arguments for `classification_report`.

    Raises:
        AssertionError: if the shapes differ or `file` is not writable.
    """
    assert y_pred.shape == y_true.shape, "Mismatched shape between y_pred and y_true."
    assert file.writable(), "File is not writable."
    file.write(
        classification_report(
            y_true=y_true,
            y_pred=y_pred,
            labels=labels,
            output_dict=False,
            **kwargs,
        )
    )
    

classifier = "rf"

# Build the whole report once in memory: one "===CLASS===" header followed by
# that class's classification report. The same text is then saved and printed,
# so each report is computed only once.
s = io.StringIO()
for clazz, dataset in datasets.items():
    s.write("===" + clazz + "===\n")
    evaluate(dataset[classifier], dataset['y_test'], file=s)
report_text = s.getvalue()
s.close()

# write_text opens and closes the file itself, so no handle is left open if
# something goes wrong while producing the report.
tmpd.joinpath(f"{classifier}.txt").write_text(report_text, encoding='utf-8')
print(report_text)

In [None]:
def evaluate(y_pred: np.ndarray, y_true: np.ndarray, file=None, **kwargs):
    """Produce a classification report for the predictions.

    This definition replaces the earlier `evaluate` in the notebook namespace,
    so it supports both call forms: with `file` it writes the plain-text
    report to that stream (as the earlier definition did); without `file` it
    returns the report as a dict.

    Args:
        y_pred: predicted labels.
        y_true: ground-truth labels; must have the same shape as `y_pred`.
        file: optional writable text stream. When given, the text report is
            written to it and nothing is returned.
        **kwargs: forwarded to `classification_report` (e.g. `labels`).

    Returns:
        The report dict when `file` is None, otherwise None.

    Raises:
        AssertionError: if the shapes differ, or `file` is given but not writable.
    """
    assert y_pred.shape == y_true.shape, "Mismatched shape between y_pred and y_true."
    if file is not None:
        assert file.writable(), "File is not writable."
        file.write(classification_report(y_pred=y_pred, y_true=y_true, output_dict=False, **kwargs))
        return None
    report = classification_report(y_pred=y_pred, y_true=y_true, output_dict=True, **kwargs)
    return report

# One small frame per bias class: rows are the labels '0' and '1', columns are
# the three metrics taken from the dict-form classification report.
r_dfs = []
for clazz, dataset in datasets.items():
    report = evaluate(dataset[classifier], dataset['y_test'])
    r_df = pd.DataFrame.from_dict(report).T.loc[['0', '1'], ['precision', 'recall', 'f1-score']]
    r_dfs.append(r_df)
    
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

# Select a colormap. plt.get_cmap is used because matplotlib.cm.get_cmap was
# removed in Matplotlib 3.9.
cmap = plt.get_cmap('tab20c')

metrics = ['precision', 'recall', 'f1-score']
categories = list(datasets.keys())
# One colour per metric (the plotted series), sampled from the colormap
# independently of how many bias categories there are.
colors = cmap(np.linspace(0, 1, len(metrics), endpoint=False))

# Rows follow `categories`, columns follow `metrics`; only the scores of the
# positive label '1' are plotted.
values = np.array([[r_df.loc['1', metric] for metric in metrics] for r_df in r_dfs])

fig, axs = plt.subplots(1, 1, figsize=(12, 6))
for i, metric in enumerate(metrics):
    axs.scatter(categories, values[:, i], color=colors[i], label=metric)

axs.set_title("Random Forest")
axs.set_xlabel('Bias', fontsize=12)
axs.set_ylabel('Score', fontsize=12)
axs.grid(True)
# Legend sits outside the axes, to the right of the plot.
axs.legend(loc='upper left', bbox_to_anchor=(1, 1))
fig.tight_layout()
fig.savefig(tmpd.joinpath('plot.png'))
plt.show()

In [None]:
import zipfile
import os
from datetime import datetime
from pathlib import Path
import panel as pn

# Archive name: timestamp of this run plus the classifier key.
now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
zfname = Path(f'{now}-{classifier}.zip')

# Everything produced in the scratch directory, followed by the uploaded datasets.
file_names = [
    *tmpd.rglob("*"),
    *(u['upload']['data'] for u in uploaded.values()),
]

# Entries are stored under their base name only, so the archive is flat.
with zipfile.ZipFile(zfname, 'w') as zipf:
    for file_name in file_names:
        zipf.write(file_name, arcname=os.path.basename(file_name))
print(f"Saved as {zfname}.\nClick below to download.")

# download link for the zip.
pn.widgets.FileDownload(file=str(zfname), filename=zfname.name)