# Baseline model

In [1]:
# (Re)load IPython's autoreload extension and use mode 2, which reloads imported
# modules before each cell execution so edits to the project source are picked
# up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import sys

import nltk
import pandas as pd  # used directly below (pd.concat); do not rely on the wildcard import for it
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus (quiet=True suppresses the download log).
nltk.download('stopwords', quiet=True)
# NOTE(review): stop_words is not referenced by any later cell shown in this notebook.
stop_words = set(stopwords.words('english'))

# Make the project's source directory importable. The membership check keeps
# sys.path from growing by one duplicate entry every time this cell is re-run.
SRC_DIR = os.path.abspath('../src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

# NOTE(review): the wildcard import hides where data_loading, test_train_split,
# tfid, predict_it, score_it, dummy_model, score_saving and to_latex come from;
# prefer importing these names explicitly.
from fact_classification import *

2023-04-15 17:59:19.124571: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Importing data

In [3]:
# Load the three data frames through the project helper.
# local=True presumably reads from local files instead of downloading — TODO confirm.
df, df_crowdsourced, df_ground_truth = data_loading(local=True)

## Preparing for processing

### Preparing text

In [4]:
df.head()

Unnamed: 0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict
0,16,I think we've seen a deterioration of values.,George Bush,Vice President,REPUBLICAN,1988-09-25.txt,8,16,0.0,-1
1,17,I think for a while as a nation we condoned th...,George Bush,Vice President,REPUBLICAN,1988-09-25.txt,16,17,-0.456018,-1
2,18,"For a while, as I recall, it even seems to me ...",George Bush,Vice President,REPUBLICAN,1988-09-25.txt,29,18,-0.805547,-1
3,19,"So we've seen a deterioration in values, and o...",George Bush,Vice President,REPUBLICAN,1988-09-25.txt,35,19,0.698942,-1
4,20,"We got away, we got into this feeling that val...",George Bush,Vice President,REPUBLICAN,1988-09-25.txt,15,20,0.0,-1


## Split into test and train

According to the task description, we should split the dataset into train and test sets based on the year of the debate. All debates up to and including 2008 go into the train set, and more recent debates go into the test set. (We could also consider holding out a validation set closer to the end of the project for a final validation.)

In [5]:
# Split the full data frame into train and test parts using the project helper.
# NOTE(review): idx_train is not used by any later cell shown in this notebook.
df_train, df_test, idx_train = test_train_split(df)

### Create the TF-IDF matrix for the text column

In [6]:
# Vectorize the sentence text of both splits; the helper returns the train
# matrix, the test matrix and the vocabulary.
# n_gram_range=1 presumably restricts the features to single words — TODO confirm.
train_tfid, test_tfid, vocabulary = tfid(train=df_train.Text, test=df_test.Text, n_gram_range=1)

In [7]:
train_tfid

<18170x10641 sparse matrix of type '<class 'numpy.float64'>'
	with 277846 stored elements in Compressed Sparse Row format>

In [8]:
test_tfid

<5363x10641 sparse matrix of type '<class 'numpy.float64'>'
	with 70684 stored elements in Compressed Sparse Row format>

Looks good: we first fitted the vectorizer on the train set (so only words that occur in the train set are counted) and then transformed the test set with the same vectorizer. Both matrices have the same number of columns, which indicates this was done correctly. We keep them sparse to save memory.

## Predict using standard models

The base model used is `RandomForestClassifier` from the scikit-learn package. We set the `class_weight` parameter to `balanced_subsample` and leave the rest of the parameters as default.

In [9]:
# Fit on the train matrix and labels, then predict for both the train and test matrices.
# NOTE(review): no random seed is passed here; if predict_it does not fix one
# internally, the scores below will differ between runs — confirm.
pred_train, pred_test = predict_it(train_tfid, df_train.Verdict, test_tfid)

## Scoring

The overall score of our baseline model is OK, but because our data is so unbalanced it is important to also consider the scores for the individual classes.

In [10]:
# Score the baseline predictions on the test split first, then on the train
# split, using the same algorithm/feature labels for both result rows.
df_score_test, df_score_train = [
    score_it(y_true, y_pred, algorithm='RFC-baseline', features='W')
    for y_true, y_pred in (
        (df_test.Verdict, pred_test),
        (df_train.Verdict, pred_train),
    )
]

In [11]:
df_score_train

Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RFC-baseline,W,1.0,0.998,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999,1.0,1.0


In [12]:
df_score_test

Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RFC-baseline,W,0.667,0.6,0.813,0.698,0.987,0.058,0.226,0.677,0.796,0.105,0.353,0.598


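A very simple random forest classifier based only on the vectorized text gives a weighted-average F1 score of around 60 percent on the test set (with a weighted recall of about 68 percent). Comparing this with the near-perfect scores on the train set, we can clearly see that it is strongly overfitted. It will serve as a minimum-effort baseline model.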

## Proving the point of checking more than one accuracy measure and your data

We now build a model that gets a seemingly decent score but is completely useless: by predicting -1 for every sentence we still get a weighted-average F1 score of nearly 50% (0.47 on the test set).

In [13]:
# Generate dummy predictions, one per row of each split.
# Note: this rebinds pred_test / pred_train, replacing the random forest
# predictions computed earlier in the notebook.
pred_test, pred_train = dummy_model(len(df_test)), dummy_model(len(df_train))

In [14]:
# Score the dummy predictions. At this point pred_test / pred_train hold the
# dummy model's output (they were reassigned in the previous cell).
# The feature label is 'None' for both splits; the test row previously said
# 'Done', a typo that ended up in the exported results table.
dummy_score_test = score_it(df_test.Verdict, pred_test, algorithm='Dummy', features='None')
dummy_score_train = score_it(df_train.Verdict, pred_train, algorithm='Dummy', features='None')

# Append the dummy row below the baseline row. Any 'Dummy' row left over from an
# earlier run of this cell is dropped first, so re-running the cell does not
# keep adding duplicate rows.
df_score_test = pd.concat([
    df_score_test[df_score_test.algorithm != 'Dummy'],
    dummy_score_test,
])

df_score_train = pd.concat([
    df_score_train[df_score_train.algorithm != 'Dummy'],
    dummy_score_train,
])

In [15]:
df_score_test

Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RFC-baseline,W,0.667,0.6,0.813,0.698,0.987,0.058,0.226,0.677,0.796,0.105,0.353,0.598
0,Dummy,Done,0.618,0.0,0.0,0.382,1.0,0.0,0.0,0.618,0.764,0.0,0.0,0.472


In [16]:
df_score_train

Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RFC-baseline,W,1.0,0.998,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.999,1.0,1.0
0,Dummy,,0.666,0.0,0.0,0.444,1.0,0.0,0.0,0.666,0.8,0.0,0.0,0.533


In [17]:
# Save the train and test score tables (baseline + dummy rows) to files via the
# project helper. The output location and format are decided inside
# score_saving and are not visible from this notebook.
score_saving(df_score_train, df_score_test, fname='Baseline_score')

In [18]:
# Export df_score_test to LaTeX format for use in the report.
# The helper's result is the last expression of the cell, so the generated
# tabular markup is shown as the cell output.
to_latex(df_score_test)

\begin{tabular}{llrrrrrrrrrrrr}
\toprule
algorithm & features & p\_NFS & p\_UFS & p\_CFS & p\_wavg & r\_NFS & r\_UFS & r\_CFS & r\_wavg & f\_NFS & f\_UFS & f\_CFS & f\_wavg \\
\midrule
RFC-baseline & W & 0.667 & 0.600 & 0.813 & 0.698 & 0.987 & 0.058 & 0.226 & 0.677 & 0.796 & 0.105 & 0.353 & 0.598 \\
Dummy & Done & 0.618 & 0.000 & 0.000 & 0.382 & 1.000 & 0.000 & 0.000 & 0.618 & 0.764 & 0.000 & 0.000 & 0.472 \\
\bottomrule
\end{tabular}

