# Testing different features with RandomForest

In [1]:
import pandas as pd
import numpy as np

import os
import sys
sys.path.append(os.path.abspath('../src'))

import string
import nltk
from nltk import ngrams

# Fetch the NLTK stopword corpus (quietly) and cache the English stopwords
# in a set so later membership checks are cheap.
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

from nltk.stem.porter import PorterStemmer

from fact_classification import *

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\signe\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Load the data through the project helper `data_loading`, which returns
# three objects that are unpacked here.
# NOTE(review): presumably the combined frame plus the crowdsourced and
# ground-truth subsets — confirm against data_loading.
df, df_crowdsourced, df_ground_truth = data_loading()

## RandomForest with non-text features

Using Sentiment and Length because these have the highest correlation.

In [3]:
# Impute missing Sentiment values with the mean Sentiment of the rows whose
# Verdict is -1.
# NOTE(review): the mean is taken over the full frame, before the train/test
# split that follows — confirm this is intended.
verdict_minus_one = df['Verdict'] == -1
sentiment_fill = df.loc[verdict_minus_one, 'Sentiment'].mean()
df['Sentiment'] = df['Sentiment'].fillna(sentiment_fill)

In [4]:
# Split the frame into train and test parts with the project helper.
# NOTE(review): the split criterion lives in test_train_split and is not
# visible here.
df_train, df_test = test_train_split(df)

In [5]:
# Peek at the first row to check which columns are available.
df.head(1)

Unnamed: 0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict,Year
0,16,I think we've seen a deterioration of values.,George Bush,Vice President,REPUBLICAN,1988-09-25.txt,8,16,0.0,-1,1988


In [6]:
# Baseline: random forest on the two non-text features only.
# max_depth is kept small to limit overfitting; "balanced_subsample"
# reweights the classes for each bootstrap sample.
feature_cols = ['Length', 'Sentiment']
method = RandomForestClassifier(
    max_depth=5,
    random_state=42,
    class_weight="balanced_subsample",
)
pred_train, pred_test = predict_it(
    df_train[feature_cols],
    df_train.Verdict,
    df_test[feature_cols],
    method=method,
)
# Same feature label for both tables so train and test rows can be matched.
df_score_test = score_it(df_test.Verdict, pred_test, features='Sentiment, Length')
df_score_train = score_it(df_train.Verdict, pred_train, features='Sentiment, Length')

In [7]:
# Test-set scores for the Sentiment + Length model.
df_score_test

Unnamed: 0,alogrithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RandomForrest,"Sentiment, Length",0.705449,0.132019,0.391041,0.555236,0.433615,0.354735,0.453015,0.42961,0.537096,0.192425,0.419753,0.465856


In [8]:
# Train-set scores for the Sentiment + Length model.
df_score_train

Unnamed: 0,alogrithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RandomForrest,"Sentiment, Lenght",0.801037,0.148975,0.360263,0.632406,0.42117,0.351058,0.63645,0.464117,0.552071,0.209182,0.460091,0.495904


Using only Sentiment and Length gives very low performance. To avoid overfitting, the max depth of the random forest is set to 5.

## RandomForest with POS tags

In [9]:
# Add a POS-tag column computed by the project helper `pos_tag_`.
# NOTE(review): presumably one tag sequence per sentence in df.Text — confirm.
df['pos_tag'] = pos_tag_(df)

In [10]:
# Inspect the first few POS-tag sequences.
df.pos_tag.head()

0    ('PRP', 'VBP', 'PRP', 'VBN', 'DT', 'NN', 'IN',...
1    ('PRP', 'VBP', 'IN', 'DT', 'NN', 'IN', 'DT', '...
2    ('IN', 'DT', 'NN', 'IN', 'PRP', 'VBP', 'PRP', ...
3    ('IN', 'PRP', 'VBN', 'DT', 'NN', 'IN', 'NNS', ...
4    ('PRP', 'VBD', 'RB', 'PRP', 'VBD', 'IN', 'DT',...
Name: pos_tag, dtype: object

In [11]:
# Re-split so the train/test frames carry the pos_tag column.
df_train, df_test = test_train_split(df)

# Vectorise the POS-tag sequences with the project helper `tfid`
# (presumably TF-IDF over single tags, given n_gram_range=1 — confirm).
train_tfid, test_tfid, vocabulary = tfid(
    train=df_train.pos_tag,
    test=df_test.pos_tag,
    n_gram_range=1,
)

# Slightly deeper forest than the Sentiment/Length baseline.
method = RandomForestClassifier(
    max_depth=7,
    random_state=42,
    class_weight="balanced_subsample",
)
pred_train, pred_test = predict_it(
    train_tfid,
    df_train.Verdict,
    test_tfid,
    method=method,
)


In [12]:
# Append the POS-tagging scores to the running result tables.
# ignore_index=True renumbers the rows; without it every appended row keeps
# index label 0, which makes label-based lookups ambiguous.
# NOTE: this cell is not idempotent — re-running it appends another row.
pos_score_test = score_it(
    df_test.Verdict, pred_test, algorithm='RandomForrest', features='POS-tagging'
)
pos_score_train = score_it(
    df_train.Verdict, pred_train, algorithm='RandomForrest', features='POS-tagging'
)
df_score_test = pd.concat([df_score_test, pos_score_test], ignore_index=True)
df_score_train = pd.concat([df_score_train, pos_score_train], ignore_index=True)

In [13]:
# Test-set scores so far: Sentiment + Length vs. POS-tagging.
df_score_test

Unnamed: 0,alogrithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RandomForrest,"Sentiment, Length",0.705449,0.132019,0.391041,0.555236,0.433615,0.354735,0.453015,0.42961,0.537096,0.192425,0.419753,0.465856
0,RandomForrest,POS-tagging,0.804499,0.248355,0.519573,0.664133,0.733856,0.484751,0.409537,0.618684,0.767556,0.328439,0.458039,0.634246


In [14]:
# Train-set scores so far: Sentiment + Length vs. POS-tagging.
df_score_train

Unnamed: 0,alogrithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RandomForrest,"Sentiment, Lenght",0.801037,0.148975,0.360263,0.632406,0.42117,0.351058,0.63645,0.464117,0.552071,0.209182,0.460091,0.495904
0,RandomForrest,POS-tagging,0.879003,0.31739,0.544022,0.744146,0.737151,0.583288,0.596686,0.688883,0.801852,0.41109,0.569139,0.708104


Better than just using sentiment and length, but still worse than the baseline model. 

In [15]:
# Dense frame of the training feature matrix, one column per vocabulary
# entry, used below to inspect correlations with the label.
a = pd.DataFrame(train_tfid.toarray(), columns = vocabulary)

In [16]:
# Attach the label. `.values` strips df_train's index so the labels are
# assigned by position; `a` has a fresh default index, so index-based
# alignment could mismatch rows.
a['Verdict'] = df_train.Verdict.values

In [17]:
# Columns whose absolute correlation with Verdict exceeds 0.15.
# Verdict itself is included, since it correlates perfectly with itself.
verdict_corr = a.corr()['Verdict'].abs()
a.columns[verdict_corr > 0.15]

Index(['cd', 'in', 'md', 'nnp', 'prp', 'vb', 'vbd', 'vbp', 'Verdict'], dtype='object')

In [18]:
# Nine strongest absolute correlations with Verdict. Position 0 is skipped
# because it is Verdict's correlation with itself.
verdict_corr_ranked = a.corr()['Verdict'].abs().sort_values(ascending=False)
verdict_corr_ranked.iloc[1:10]


vbd    0.297538
vb     0.271225
cd     0.215290
nnp    0.209826
vbp    0.195247
prp    0.171471
md     0.170495
in     0.153179
to     0.143571
Name: Verdict, dtype: float64

The most predictive categories are "vbd, vb, cd, nnp, vbp, prp, md, and in". These represent verbs (past tense, base form, and non-third-person present), cardinal numbers, proper nouns, personal pronouns, modals, and prepositions. This seems logical. The least predictive categories are "rbs, ex, fw, uh, rbr": superlative and comparative adverbs, existential "there", foreign words, and interjections. Again this makes logical sense: interjections like "hmm" and "erm" probably depend more on the speaker than on the type of sentence.

## Combining methods (POS tagging and sentiment & length)

In [19]:
# Densify both feature matrices into frames that share the vocabulary as
# column names, so they can be joined onto the original train/test frames.
pos_train, pos_test = (
    pd.DataFrame(matrix.toarray(), columns=vocabulary)
    for matrix in (train_tfid, test_tfid)
)

In [23]:
# Give all four frames a fresh 0..n-1 index so the index-based join that
# follows pairs each sentence with its own feature row.
# Reassigning instead of using inplace=True keeps the cell safe to re-run and
# avoids mutating a frame that may be a slice of `df`.
df_train = df_train.reset_index(drop=True)
pos_train = pos_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
pos_test = pos_test.reset_index(drop=True)

In [24]:
# Join the POS feature columns onto the original frames. `join` aligns on
# the index, so this relies on both sides sharing the same 0..n-1 index
# (set by the reset_index calls above).
df_train_pos = df_train.join(pos_train)
df_test_pos = df_test.join(pos_test)

In [27]:
# Model inputs: the two non-text features followed by the POS-tag columns.
non_text_cols = ['Length', 'Sentiment']
pos_tag_cols = [
    'cc', 'cd', 'dt', 'ex', 'fw', 'in', 'jj', 'jjr', 'jjs', 'md',
    'nn', 'nnp', 'nnps', 'nns', 'pdt', 'prp', 'rb', 'rbr', 'rbs', 'rp',
    'to', 'uh', 'vb', 'vbd', 'vbg', 'vbn', 'vbp', 'vbz', 'wdt', 'wp',
    'wrb',
]
train_cols = non_text_cols + pos_tag_cols

In [29]:
# Random forest on the combined feature set (Sentiment, Length and the
# POS-tag columns listed in train_cols).
method = RandomForestClassifier(
    max_depth=5,
    random_state=42,
    class_weight="balanced_subsample",
)
pred_train, pred_test = predict_it(
    df_train_pos[train_cols],
    df_train_pos.Verdict,
    df_test_pos[train_cols],
    method=method,
)
# Same feature label for both tables so train and test rows can be matched.
# NOTE: this replaces the earlier score tables rather than appending to them.
combined_label = 'Sentiment, Length, pos_tag'
df_score_test = score_it(df_test_pos.Verdict, pred_test, features=combined_label)
df_score_train = score_it(df_train_pos.Verdict, pred_train, features=combined_label)

In [30]:
# Test-set scores for the combined Sentiment + Length + POS model.
df_score_test

Unnamed: 0,alogrithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RandomForrest,"Sentiment, Length, pos_tag",0.806791,0.240712,0.521314,0.665125,0.716958,0.499197,0.411641,0.610479,0.759227,0.324804,0.460031,0.629207


In [31]:
# Train-set scores for the combined Sentiment + Length + POS model.
df_score_train

Unnamed: 0,alogrithm,features,p_NFS,p_UFS,p_CFS,p_wavg,r_NFS,r_UFS,r_CFS,r_wavg,f_NFS,f_UFS,f_CFS,f_wavg
0,RandomForrest,"Sentiment, Lenght, pos_tag",0.874428,0.272415,0.511954,0.72908,0.710048,0.516007,0.587929,0.66197,0.783711,0.35658,0.547317,0.685419


No big improvement with this

## Combining methods (pos tagging and stemming)

## RandomForest with Named Entity Recognition

In [16]:
# Load spaCy's "en_core_web_sm" pipeline; it is used below for named entity
# recognition.
# NOTE(review): these imports sit mid-notebook — consider moving them to the
# top import cell. `en_core_web_sm` is imported but the pipeline is loaded by
# name, so the module object itself is not referenced in the visible cells.
import spacy
import en_core_web_sm
ner = spacy.load("en_core_web_sm")

In [23]:
# Run the pipeline on a single sentence (row position 100) and list every
# detected entity as a (text, label) pair.
doc = ner(df.Text.iloc[100])
entity_pairs = [(entity.text, entity.label_) for entity in doc.ents]
print(entity_pairs)

[('Massachusetts', 'GPE'), ('five', 'CARDINAL')]


This is just an example of how it works on a single sentence. **TODO:** write the code to apply this to all the rows.