# Testing different features with RandomForest

In [1]:
import pandas as pd
import numpy as np

import os
import sys
sys.path.append(os.path.abspath('../src'))

import string
import nltk
from nltk import ngrams

# Caching stopwords
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

from nltk.stem.porter import PorterStemmer

from fact_classification import *

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support


In [2]:
# Load the three frames returned by data_loading()
# (presumably provided by the star-import of fact_classification — confirm).
df, df_crowdsourced, df_ground_truth = data_loading()

## RandomForest with non-text features

Using Sentiment and Length because these have the highest correlation.

In [3]:
# Impute missing Sentiment values with the mean Sentiment of the rows whose Verdict is -1.
# NOTE(review): the mean is computed on the full frame, before the train/test split in
# the next cell — confirm that using test rows for imputation is intended.
df['Sentiment'] = df['Sentiment'].fillna(
    df.loc[df['Verdict'] == -1, 'Sentiment'].mean()
)

In [4]:
# Split the frame into a train and a test part with the project helper test_train_split().
df_train, df_test = test_train_split(df)

In [5]:
df.head(1)

Unnamed: 0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict,Year
0,16,I think we've seen a deterioration of values.,George Bush,Vice President,REPUBLICAN,1988-09-25.txt,8,16,0.0,-1,1988


In [6]:
# Baseline on the two non-text features. One shared column list feeds both the
# train and the test matrix so the two selections cannot drift apart.
non_text_features = ['Length', 'Sentiment']

# Shallow, class-balanced forest; max_depth is kept at 5 to limit overfitting.
# (The original line read `method = method=RandomForestClassifier(...)`, a stray
# chained assignment.)
method = RandomForestClassifier(
    max_depth=5,
    random_state=42,
    class_weight="balanced_subsample",
)
pred_train, pred_test = predict_it(
    df_train[non_text_features],
    df_train.Verdict,
    df_test[non_text_features],
    method=method,
)

# Use the same feature label in both score tables; the train label was previously
# misspelled 'Sentiment, Lenght', so the two tables did not match.
df_score_test = score_it(df_test.Verdict, pred_test, features='Sentiment, Length')
df_score_train = score_it(df_train.Verdict, pred_train, features='Sentiment, Length')

In [7]:
df_score_test

Unnamed: 0,alogrithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RandomForrestClassifier,"Sentiment, Length",0.705449,0.132019,0.391041,0.555236,0.433615,0.354735,0.453015,0.42961,0.537096,0.192425,0.419753,0.465856


In [8]:
df_score_train

Unnamed: 0,alogrithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RandomForrestClassifier,"Sentiment, Lenght",0.801037,0.148975,0.360263,0.632406,0.42117,0.351058,0.63645,0.464117,0.552071,0.209182,0.460091,0.495904


Using only Sentiment and Length gives very low scores (weighted F1 of about 0.47 on the test set). To avoid overfitting, the max depth of the random forest is set to 5.

## RandomForest with POS tags

In [9]:
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import nltk

# nltk.word_tokenize() (used by tokenize() below) needs the Punkt tokenizer models,
# which are not downloaded anywhere in the visible part of this notebook; fetch them
# so the notebook also runs in a fresh environment.
# NOTE(review): recent NLTK releases may ask for 'punkt_tab' instead — confirm for
# the NLTK version this project pins.
nltk.download('punkt', quiet=True)
# Tagger model used by nltk.pos_tag(); kept as the last expression so the cell
# still displays the download result.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\signe\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [10]:
def tokenize(column):
    """Split one text string into word tokens and keep only the alphabetic ones.

    Args:
        column: The text of a single cell (e.g. one value of df['Text']); it is
            passed unchanged to ``nltk.word_tokenize``.

    Returns:
        list: The tokens for which ``str.isalpha()`` is true, in their original order.
    """
    # Filtering approach taken from https://practicaldatascience.co.uk/machine-learning/how-to-perform-tokenization-in-nlp-with-nltk-and-python
    alphabetic_tokens = []
    for token in nltk.word_tokenize(column):
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

In [11]:
# Tokenize every sentence. Applying tokenize() to the Text column directly avoids
# materialising a full row object per record, which the previous row-wise
# df.apply(..., axis=1) did only to read a single field from it.
df['tokenized'] = df['Text'].apply(tokenize)
df[['tokenized']].head()

Unnamed: 0,tokenized
0,"[I, think, we, seen, a, deterioration, of, val..."
1,"[I, think, for, a, while, as, a, nation, we, c..."
2,"[For, a, while, as, I, recall, it, even, seems..."
3,"[So, we, seen, a, deterioration, in, values, a..."
4,"[We, got, away, we, got, into, this, feeling, ..."


In [12]:
# Tag all sentences in one batch call. The NLTK documentation recommends
# pos_tag_sents() over repeated pos_tag() calls when tagging more than one sentence.
# Each cell still ends up holding a list of (token, tag) pairs.
df['pos_tag'] = nltk.pos_tag_sents(df['tokenized'].tolist())

In [13]:
def get_pos(l):
    """Return the POS tags of one tagged sentence as the string form of a tuple.

    Args:
        l: Sequence of (token, tag) pairs, e.g. the output of ``nltk.pos_tag``.

    Returns:
        str: ``str()`` of the tuple of tags, e.g. ``"('PRP', 'VBP')"``.
            An empty input yields ``"()"``.
    """
    # Read the second item of each pair directly. The earlier list(zip(*l))[1] form
    # raised IndexError for a sentence with no tokens, because zip(*[]) is empty.
    return str(tuple(pair[1] for pair in l))

In [14]:
# Collapse each list of (token, tag) pairs into its tag string. Values that are
# already strings are left alone, so re-running this cell does not feed strings
# back into get_pos(), which expects pairs and would raise.
df['pos_tag'] = df['pos_tag'].apply(
    lambda tagged: tagged if isinstance(tagged, str) else get_pos(tagged)
)

In [15]:
df.pos_tag.head()

0    ('PRP', 'VBP', 'PRP', 'VBN', 'DT', 'NN', 'IN',...
1    ('PRP', 'VBP', 'IN', 'DT', 'NN', 'IN', 'DT', '...
2    ('IN', 'DT', 'NN', 'IN', 'PRP', 'VBP', 'PRP', ...
3    ('IN', 'PRP', 'VBN', 'DT', 'NN', 'IN', 'NNS', ...
4    ('PRP', 'VBD', 'RB', 'PRP', 'VBD', 'IN', 'DT',...
Name: pos_tag, dtype: object

In [16]:
# Re-split so the train/test frames carry the pos_tag column added after the first split.
df_train, df_test = test_train_split(df)
# TF-IDF over the POS-tag strings, via the project helper tfid().
train_tfid, test_tfid, vocabulary = tfid(train=df_train.pos_tag, test=df_test.pos_tag)
# Same forest settings as the non-text baseline. (The original line read
# `method = method=RandomForestClassifier(...)`, a stray chained assignment.)
method = RandomForestClassifier(
    max_depth=5,
    random_state=42,
    class_weight="balanced_subsample",
)
pred_train, pred_test = predict_it(train_tfid, df_train.Verdict, test_tfid, method=method)


In [21]:
# NOTE(review): the stored output of this cell is None, yet a later cell passes
# `vocabulary` as DataFrame column names and its stored output shows real tag
# columns. The saved outputs look like they come from out-of-order runs —
# restart the kernel and run top to bottom to confirm what tfid() returns.
print(vocabulary)

None


In [17]:
# Append the POS-tagging scores to the running score tables. ignore_index=True gives
# every row its own index label; without it each appended row was labelled 0, so
# label-based lookups such as .loc[0] matched several rows.
df_score_test = pd.concat(
    [
        df_score_test,
        score_it(df_test.Verdict, pred_test, algorithm='RandomForrest', features='POS-tagging'),
    ],
    ignore_index=True,
)
df_score_train = pd.concat(
    [
        df_score_train,
        score_it(df_train.Verdict, pred_train, algorithm='RandomForrest', features='POS-tagging'),
    ],
    ignore_index=True,
)

In [18]:
df_score_test

Unnamed: 0,alogrithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RandomForrestClassifier,"Sentiment, Length",0.705449,0.132019,0.391041,0.555236,0.433615,0.354735,0.453015,0.42961,0.537096,0.192425,0.419753,0.465856
0,RandomForrest,POS-tagging,0.803723,0.2418,0.523245,0.663868,0.716657,0.508828,0.402525,0.608988,0.757697,0.327818,0.455014,0.627277


In [19]:
df_score_train

Unnamed: 0,alogrithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RandomForrestClassifier,"Sentiment, Lenght",0.801037,0.148975,0.360263,0.632406,0.42117,0.351058,0.63645,0.464117,0.552071,0.209182,0.460091,0.495904
0,RandomForrest,POS-tagging,0.86931,0.266322,0.509989,0.724597,0.72112,0.531199,0.537751,0.659218,0.788311,0.354774,0.523502,0.682762


Better than just using sentiment and length, but still not much better than the baseline model. 

In [17]:
# Training TF-IDF matrix converted with .toarray() into a frame, using `vocabulary`
# as the column names (presumably one column per POS tag — confirm against tfid()).
a = pd.DataFrame(train_tfid.toarray(), columns = vocabulary)

In [18]:
# Attach the training labels. `.values` drops the index of df_train, so the labels
# are matched to the rows of `a` by position rather than by index label.
a['Verdict'] = df_train.Verdict.values

In [19]:
a.head(1)

Unnamed: 0,cc,cd,dt,ex,fw,in,jj,jjr,jjs,md,...,vb,vbd,vbg,vbn,vbp,vbz,wdt,wp,wrb,Verdict
0,0.0,0.0,0.26435,0.0,0.0,0.266813,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.535821,0.358341,0.0,0.0,0.0,0.0,-1


In [22]:
# The ten columns with the smallest absolute correlation to Verdict.
(
    a.corr()['Verdict']
    .abs()
    .sort_values()
    .head(10)
)


rbs    0.001731
ex     0.004541
fw     0.006304
uh     0.007696
rbr    0.012837
jj     0.015556
vbg    0.020498
wrb    0.021190
pdt    0.021273
vbz    0.022486
Name: Verdict, dtype: float64

The most predictive categories are "vbd, vb, cd, nnp, vbp, prp, md, and in". These represent verbs, cardinal numbers, proper nouns, personal pronouns, modals, and prepositions, which seems logical. The least predictive categories are "rbs, ex, fw, uh, rbr": superlative and comparative adverbs, the existential "there", foreign words, and interjections. Again this makes sense: interjections like "hmm" and "erm" probably depend more on the speaker than on the type of sentence. Based on this, we choose to keep only the categories that correlate most strongly and add them to the original dataset, since this does not add too much data.