# Testing different features with RandomForest

In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project code are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import sys

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, PredefinedSplit

# Download the NLTK stopword corpus (quietly) and keep the English stopwords as a set.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

# Put the sibling ../src directory on the import path so the project module can be imported.
sys.path.append(os.path.abspath('../src'))
# NOTE(review): the star import hides where the helper names used below come from
# (presumably data_loading, score_loading, test_train_split, predict_it, score_it,
# ner_labels, tfid and to_latex) — confirm and consider importing them explicitly.
from fact_classification import *

Load the dataset.

In [3]:
# Unpack the three frames returned by data_loading(local=True).
(df, df_crowdsourced, df_ground_truth) = data_loading(local=True)

# Discard rows containing any missing value, then renumber the index from zero.
df = (
    df
    .dropna()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict
0,16,I think we've seen a deterioration of values.,George Bush,Vice President,REPUBLICAN,1988-09-25.txt,8,16,0.0,-1
1,17,I think for a while as a nation we condoned th...,George Bush,Vice President,REPUBLICAN,1988-09-25.txt,16,17,-0.456018,-1
2,18,"For a while, as I recall, it even seems to me ...",George Bush,Vice President,REPUBLICAN,1988-09-25.txt,29,18,-0.805547,-1
3,19,"So we've seen a deterioration in values, and o...",George Bush,Vice President,REPUBLICAN,1988-09-25.txt,35,19,0.698942,-1
4,20,"We got away, we got into this feeling that val...",George Bush,Vice President,REPUBLICAN,1988-09-25.txt,15,20,0.0,-1


Load the baseline model scoring results. We will use this to compare against our new models.

In [4]:
# Load scoring results dataframe from the baseline model.
# The result is unpacked as the train-score frame first, then the test-score frame.
df_score_train, df_score_test = score_loading()
# Show the baseline test scores; later cells append one row per new model to these frames.
df_score_test

Unnamed: 0,algorithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RFC,W,0.67,0.6,0.81,0.7,0.99,0.06,0.23,0.68,0.8,0.11,0.35,0.6
0,DUM,,0.62,0.0,0.0,0.38,1.0,0.0,0.0,0.62,0.76,0.0,0.0,0.47


Load the features matrix that we generated in the `feature_generation.ipynb` notebook.

In [5]:
# Read the pre-computed feature matrix. pd.read_parquet needs a parquet engine
# (pyarrow, or a recent enough fastparquet) installed in the kernel's environment.
df_features = pd.read_parquet('../results/features.gzip')
df_features

ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Pandas requires version '0.6.3' or newer of 'fastparquet' (version '0.5.0' currently installed).

In [None]:
# DataFrame.to_sparse() was removed in pandas 1.0; casting to a SparseDtype is the
# supported way to get a sparse-backed frame. Zeros are the fill value, so only
# non-zero feature values are stored.
# NOTE(review): assumes every feature column is numeric — confirm.
df_features_sparse = df_features.astype(pd.SparseDtype("float", 0.0))

## RandomForest with only Length and Sentiment features

We use Sentiment and Length because these have the highest correlation with the `Verdict` label.

In [None]:
# Split the cleaned sentences with the project's train/test helper.
df_train, df_test = test_train_split(df)

# PredefinedSplit index for GridSearchCV: -1 keeps a row in the training part only,
# 0 assigns it to the single validation fold.
# NOTE(review): the labels mark the first len(df_train) rows as training and the
# remaining rows as validation, so the matrix later passed to fit() is presumed to be
# ordered train-then-test — confirm.
fold_labels = np.concatenate([
    np.full(len(df_train), -1, dtype=int),
    np.full(len(df_test), 0, dtype=int),
])
test_fold = PredefinedSplit(fold_labels)

In [None]:
# Hyper-parameter values to try; GridSearchCV evaluates every combination on the
# predefined fold.
param_grid = dict(
    n_estimators=[5, 10, 20],
    criterion=['gini', 'log_loss'],
    max_depth=[5, 10, 20, None],
    class_weight=['balanced', 'balanced_subsample', None],
)

# Base forest. Its class_weight is only a starting value: the grid sets class_weight
# for every candidate.
rfc = RandomForestClassifier(
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=42,
)

clf = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=test_fold, n_jobs=-1)
clf.fit(df_features_sparse, df['Verdict'])

In [None]:

# Forest used for this experiment. It is defined in this cell so the cell does not
# depend on a `method` variable left over from an earlier kernel session (on a fresh
# kernel the name was undefined here). The settings match the forest used in the POS
# section: shallow trees (max_depth=5) to limit overfitting.
method = RandomForestClassifier(
    max_depth=5,
    random_state=42,
    class_weight="balanced_subsample",
)

# Only these two columns are used as features in this section.
feature_cols = ['Length', 'Sentiment']

pred_train, pred_test = predict_it(
    df_train[feature_cols],
    df_train.Verdict,
    df_test[feature_cols],
    method=method
)

# Append one row per partition to the running score tables.
df_score_test = pd.concat([
    df_score_test,
    score_it(
        df_test.Verdict,
        pred_test,
        algorithm='RFC',
        features='SL')]
).reset_index(drop=True)

df_score_train = pd.concat([
    df_score_train,
    score_it(
        df_train.Verdict,
        pred_train,
        algorithm='RFC',
        features='SL')]
).reset_index(drop=True)

In [None]:
# Display the accumulated test-set scores (baseline rows plus the models scored so far).
df_score_test

In [None]:
# Display the accumulated train-set scores for the same models.
df_score_train

Using only Sentiment and Length gives very low accuracy. To avoid overfitting, the max depth of the random forest is set to 5.

## RandomForest with POS

In [None]:
# Add the File_id and Verdict columns to the feature matrix, then split it.
features_with_labels = df_features.join(df[['File_id', 'Verdict']])
df_train, df_test = test_train_split(features_with_labels)

# Boolean mask over the columns: True for every column whose name starts with 'P_'.
col_idx = df_train.columns.str.startswith('P_')

method = RandomForestClassifier(
    class_weight="balanced_subsample",
    max_depth=5,
    random_state=42,
)

pred_train, pred_test = predict_it(
    df_train.loc[:, col_idx],
    df_train['Verdict'],
    df_test.loc[:, col_idx],
    method=method,
)

# Score each partition and append the resulting row to the running score tables.
pos_score_test = score_it(df_test['Verdict'], pred_test, algorithm='RFC', features='P')
df_score_test = pd.concat([df_score_test, pos_score_test]).reset_index(drop=True)

pos_score_train = score_it(df_train['Verdict'], pred_train, algorithm='RFC', features='P')
df_score_train = pd.concat([df_score_train, pos_score_train]).reset_index(drop=True)

In [None]:
# Display the test-set scores, now including the row for the 'P' feature set.
df_score_test

In [None]:
# Display the train-set scores, now including the row for the 'P' feature set.
df_score_train

Better than just using sentiment and length, but barely better than the baseline model. 

In [None]:
# Take an explicit copy of the 'P_' columns so the 'Verdict' column written in the
# next cell goes to an independent frame rather than to a slice of df_train.
a = df_train.loc[:, col_idx].copy()
a

In [None]:
# Attach the training labels by position: .values drops the index, so no index
# alignment takes place.
a.loc[:, 'Verdict'] = df_train['Verdict'].values
a

In [None]:
# Columns whose absolute correlation with 'Verdict' exceeds 0.15.
verdict_corr = a.corr()['Verdict'].abs()
a.columns[verdict_corr.gt(0.15)]

In [None]:
# Absolute correlations with 'Verdict', strongest first. Position 0 is skipped because
# it is normally 'Verdict' itself (correlation 1.0), leaving the next nine columns.
ranked_corr = a.corr()['Verdict'].abs().sort_values(ascending=False)
ranked_corr.iloc[1:10]


The most predictive categories are `vbd`, `vb`, `cd`, `nnp`, `vbp`, `prp`, `md`, and `in`. These are verb forms (past tense, base form, and non-third-person present), cardinal numbers, proper nouns, personal pronouns, modals, and prepositions, which seems logical. The least predictive categories are `rbs`, `ex`, `fw`, `uh`, and `rbr`: superlative and comparative adverbs, existential "there", foreign words, and interjections. Again this makes sense: interjections like "hmm" and "erm" probably depend more on the speaker than on the type of sentence.

## Combining methods (POS-tagging, sentiment and length)

In [None]:
# Dense frames built from the tf-idf matrices, one named column per vocabulary entry.
# NOTE(review): train_tfid, test_tfid and vocabulary are not assigned anywhere above
# this cell in the notebook as shown (the only visible tfid(...) call comes later) —
# confirm this cell still works after Restart & Run All.
pos_train = pd.DataFrame(train_tfid.toarray(), columns = vocabulary)
pos_test = pd.DataFrame(test_tfid.toarray(), columns = vocabulary)

In [None]:
# Renumber all four frames from zero so the index-based join in the next cell pairs
# rows by position. Each name is rebound to a new frame instead of using inplace=True,
# so no frame created by an earlier cell is mutated here.
df_train = df_train.reset_index(drop=True)
pos_train = pos_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
pos_test = pos_test.reset_index(drop=True)

In [None]:
# Index-aligned joins (DataFrame.join defaults to a left join on the index): place the
# tf-idf columns next to the existing columns of each partition.
df_train_pos = df_train.join(pos_train)
df_test_pos = df_test.join(pos_test)

In [None]:
# Feature columns for the combined model: the two sentence-level features followed by
# the lower-cased tag columns.
base_cols = ['Length', 'Sentiment']
tag_cols = [
    'cc', 'cd', 'dt', 'ex', 'fw', 'in', 'jj', 'jjr', 'jjs', 'md', 'nn',
    'nnp', 'nnps', 'nns', 'pdt', 'prp', 'rb', 'rbr', 'rbs', 'rp', 'to',
    'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz', 'wdt', 'wp', 'wrb',
]
train_cols = base_cols + tag_cols

In [None]:
# Same shallow forest configuration as the earlier sections (the stray chained
# assignment `method = method=...` is reduced to a single assignment).
method = RandomForestClassifier(
    max_depth=5,
    random_state=42,
    class_weight="balanced_subsample",
)

pred_train, pred_test = predict_it(
    df_train_pos[train_cols],
    df_train_pos.Verdict,
    df_test_pos[train_cols],
    method=method,
)

# algorithm='RFC' is passed explicitly, as in the other scoring cells, so every forest
# row in the score tables carries the same algorithm label.
df_score_test = pd.concat([
    df_score_test,
    score_it(
        df_test_pos.Verdict,
        pred_test,
        algorithm='RFC',
        features='Sentiment, Length, POS')]
).reset_index(drop=True)

df_score_train = pd.concat([
    df_score_train,
    score_it(
        df_train_pos.Verdict,
        pred_train,
        algorithm='RFC',
        features='Sentiment, Length, POS')]
).reset_index(drop=True)

In [None]:
# Display the test-set scores, now including the combined Sentiment/Length/POS row.
df_score_test

In [None]:
# Display the train-set scores, now including the combined Sentiment/Length/POS row.
df_score_train

Combining sentiment, length, and POS tags gives no big improvement.

## Combining methods (pos tagging and stemming)

## RandomForest Named Entity Recognition

We use the `spaCy` package to generate the NER labels.

In [None]:
# Add an 'ner_tag' column to df (in place), computed from the sentence text by the
# project's ner_labels helper.
df['ner_tag'] = ner_labels(df['Text'])

In [None]:
# Preview the two tag columns side by side.
# NOTE(review): 'pos_tag' is not created anywhere in the notebook as shown — confirm
# which cell or helper adds it before this one runs.
df[['pos_tag', 'ner_tag']].head(10)

In [None]:
# Concatenate the POS string with the string form of the NER tags, one value per sentence.
# NOTE(review): no separator is inserted between the two parts; if 'ner_tag' holds plain
# strings, the last POS token and the first NER token would fuse — confirm the
# representation of 'ner_tag'.
df['pos_ner_combined'] = df['pos_tag'] + df['ner_tag'].astype(str)

In [None]:
# Re-split df now that it carries the combined tag column.
df_train, df_test = test_train_split(df)

# Vectorise the combined POS+NER strings; the result is unpacked as the train matrix,
# the test matrix and the vocabulary.
train_tfid, test_tfid, vocabulary = tfid(
    train=df_train['pos_ner_combined'],
    test=df_test['pos_ner_combined'],
    n_gram_range=1,
)

# Slightly deeper trees than in the earlier sections (max_depth=7 instead of 5).
method = RandomForestClassifier(
    class_weight="balanced_subsample",
    max_depth=7,
    random_state=42,
)

pred_train, pred_test = predict_it(train_tfid, df_train.Verdict, test_tfid, method=method)

In [None]:
# Append the POS+NER results to the running score tables. The algorithm label is 'RFC',
# the same code used by the baseline row and the other forest rows, so the final table
# does not mix two names for the same algorithm.
df_score_test = pd.concat([
    df_score_test,
    score_it(
        df_test.Verdict,
        pred_test,
        algorithm='RFC',
        features='POS+NER')]
).reset_index(drop=True)

df_score_train = pd.concat([
    df_score_train,
    score_it(
        df_train.Verdict,
        pred_train,
        algorithm='RFC',
        features='POS+NER')]
).reset_index(drop=True)

In [None]:
# Display the test-set scores, now including the POS+NER row.
df_score_test

In [None]:
# Display the train-set scores, now including the POS+NER row.
df_score_train

Here we see a slight improvement in the weighted F-score when we combine the POS and NER labels.

In [None]:
# Dense copy of the training tf-idf matrix with one named column per vocabulary entry;
# the following cells correlate these columns with the label.
a = pd.DataFrame(train_tfid.toarray(), columns = vocabulary)

In [None]:
# Attach the training labels by position (.values drops the index, so nothing is aligned).
a['Verdict'] = df_train.Verdict.values

In [None]:
# Vocabulary entries whose absolute correlation with 'Verdict' is above 0.15.
verdict_corr = a.corr()['Verdict'].abs()
a.columns[verdict_corr.gt(0.15)]

In [None]:
# Absolute correlations with 'Verdict', strongest first. Position 0 is skipped because
# it is normally 'Verdict' itself (correlation 1.0), leaving the next fourteen entries.
ranked_corr = a.corr()['Verdict'].abs().sort_values(ascending=False)
ranked_corr.iloc[1:15]


Here we have four NER labels with a correlation above 0.15: money, date, percent, and cardinal.

In [None]:
# Dense frames of the POS+NER tf-idf matrices, one named column per vocabulary entry.
# These rebind pos_train / pos_test, which an earlier section also assigns.
pos_train = pd.DataFrame(train_tfid.toarray(), columns = vocabulary)
pos_test = pd.DataFrame(test_tfid.toarray(), columns = vocabulary)

In [None]:
# Display the training tf-idf frame.
pos_train

In [None]:
# Pass the accumulated test-set scores to the project's to_latex helper
# (presumably renders the table as LaTeX — confirm against the helper's definition).
to_latex(df_score_test)