# Generating features for use with classifiers

In [1]:
# Reload the autoreload extension and switch it to mode 2 so that edits to
# imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import bz2
import os
import pickle
import sys
from scipy.sparse import csr_matrix
sys.path.append(os.path.abspath('../src'))

from fact_classification import *

In [3]:
# Load the three dataframes returned by the project's `data_loading` helper.
# NOTE(review): `local=True` presumably reads local copies instead of
# downloading — confirm against the helper's implementation.
df, df_crowdsourced, df_ground_truth = data_loading(local=True)
# Display the main dataframe.
df

Unnamed: 0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict
0,16,I think we've seen a deterioration of values.,George Bush,Vice President,REPUBLICAN,1988-09-25.txt,8,16,0.000000,-1
1,17,I think for a while as a nation we condoned th...,George Bush,Vice President,REPUBLICAN,1988-09-25.txt,16,17,-0.456018,-1
2,18,"For a while, as I recall, it even seems to me ...",George Bush,Vice President,REPUBLICAN,1988-09-25.txt,29,18,-0.805547,-1
3,19,"So we've seen a deterioration in values, and o...",George Bush,Vice President,REPUBLICAN,1988-09-25.txt,35,19,0.698942,-1
4,20,"We got away, we got into this feeling that val...",George Bush,Vice President,REPUBLICAN,1988-09-25.txt,15,20,0.000000,-1
...,...,...,...,...,...,...,...,...,...,...
23528,28958,He has promised a trillion dollars out of the ...,Al Gore,Vice President,DEMOCRAT,2000-10-17.txt,24,953,0.363438,1
23529,28965,(LAUGHTER) I -- there's an old high school deb...,George W. Bush,Governor,REPUBLICAN,2000-10-17.txt,23,960,-0.679982,-1
23530,29011,"Well, can I answer that?",George W. Bush,Governor,REPUBLICAN,2000-10-17.txt,5,1006,0.413020,-1
23531,29060,I look forward to the final weeks of this camp...,George W. Bush,Governor,REPUBLICAN,2000-10-17.txt,10,1055,0.625957,-1


From the data exploration stage we know that there are three NAs in the `Sentiment` column. Here we simply drop those three rows.

In [4]:
# Remove every row containing a missing value, then renumber the rows 0..n-1
# so positional and label-based indexing agree from here on.
rows_without_na = df.dropna()
df = rows_without_na.reset_index(drop=True)

We will use the numerical columns `Length` and `Sentiment` as features in the model training. Here we create a new dataframe to hold the features and normalize the `Length` values by subtracting the mean and dividing by the standard deviation.

In [5]:
# Start the feature frame from `Sentiment` and add a standardized `Length`
# column: (value - mean) / standard deviation.
# NOTE(review): the mean and standard deviation are computed over all rows,
# i.e. before any train/test split — confirm this is intended.
sentence_length = df['Length']
df_features = df[['Sentiment']].assign(
    Length=(sentence_length - sentence_length.mean()) / sentence_length.std()
)
df_features

Unnamed: 0,Sentiment,Length
0,0.000000,-0.776708
1,-0.456018,-0.147669
2,-0.805547,0.874519
3,0.698942,1.346298
4,0.000000,-0.226299
...,...,...
23525,0.363438,0.481370
23526,-0.679982,0.402740
23527,0.413020,-1.012597
23528,0.625957,-0.619448


## TF-IDF features

We will generate TF-IDF (Term Frequency - Inverse Document Frequency) tokens for the raw text, then for the cleaned and stemmed text, as well as for the POS tags and NER labels. We will later use various combinations of these tokens to see how they affect the classifier models.

### Raw text

First, we generate TF-IDF tokens on the raw Text column without any pre-processing. We split the dataset into training and testing in order to only generate the tokens on the training dataset. This ensures that we don't include unseen data in the token generation.

In [6]:
# Split first so that the TF-IDF tokens are generated from the training rows only.
# NOTE(review): this notebook re-splits before every feature group; that is only
# consistent if `test_train_split` is deterministic — confirm in the helper.
df_train, df_test, idx_train = test_train_split(df)

# TF-IDF on the unprocessed `Text` column.
raw_text_train = df_train['Text']
raw_text_test = df_test['Text']
train_tfid, test_tfid, vocabulary = tfid(train=raw_text_train, test=raw_text_test, n_gram_range=1)

# Number of distinct tokens found.
len(vocabulary)

10641

This has generated 10641 distinct tokens. We then add these tokens as features in a new dataframe, being careful about keeping track of the correct indexes so that they stay identical to the original dataframe. Since this is a large sparse matrix we keep this separate from the original dataframe so that we can use the Pandas SparseDtype datatype.

In [7]:
# Build the raw-text TF-IDF frame directly from the sparse matrices rather than
# via `.toarray()`: densifying first allocates a full rows x vocabulary float
# array only for it to be made sparse again immediately afterwards.
# Assumes `tfid` returns SciPy sparse matrices — TODO confirm.
raw_tfidf = pd.concat([
    # `idx_train` is used as a boolean mask: the masked labels of `df` belong to
    # the training rows and the complement to the test rows.
    pd.DataFrame.sparse.from_spmatrix(
        train_tfid, columns='T_' + vocabulary, index=df.index[idx_train]
    ),
    pd.DataFrame.sparse.from_spmatrix(
        test_tfid, columns='T_' + vocabulary, index=df.index[~idx_train]
    ),
]).astype(pd.SparseDtype(float, fill_value=0))

# Drop T_ columns left over from an earlier run of this cell so that re-running
# it does not fail on overlapping column names, then align on the row index.
df_features = df_features.drop(columns=raw_tfidf.columns, errors='ignore').join(raw_tfidf)

In [8]:
# Show the feature frame after adding the raw-text TF-IDF columns (prefix T_).
df_features

Unnamed: 0,Sentiment,Length,T_00,T_000,T_10,T_100,T_1000,T_101,T_104,T_105,...,T_zamaria,T_zane,T_zarqawi,T_zemin,T_zero,T_zippo,T_zone,T_zones,T_ãƒâ,T_šâ
0,0.000000,-0.776708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.456018,-0.147669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.805547,0.874519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.698942,1.346298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,-0.226299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23525,0.363438,0.481370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23526,-0.679982,0.402740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23527,0.413020,-1.012597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23528,0.625957,-0.619448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# List the column dtypes to check that the TF-IDF columns are stored as sparse.
df_features.dtypes

Sentiment               float64
Length                  float64
T_00         Sparse[float64, 0]
T_000        Sparse[float64, 0]
T_10         Sparse[float64, 0]
                    ...        
T_zippo      Sparse[float64, 0]
T_zone       Sparse[float64, 0]
T_zones      Sparse[float64, 0]
T_ãƒâ        Sparse[float64, 0]
T_šâ         Sparse[float64, 0]
Length: 10643, dtype: object

### Cleaned and stemmed text

Next, we clean the text by converting it to lowercase and removing stopwords, punctuation and newlines.

In [10]:
# Run the project's `clean_text` helper; the returned frame carries a
# `cleaned_text` column, which is read on the next line.
df = clean_text(df)
# Preview the first cleaned sentences.
df['cleaned_text'].head()

0                 think weve seen deterioration values
1               think nation condoned things condemned
2    while recall even seems talk legalizing decrim...
3    weve seen deterioration values one things thin...
4      got away got feeling value free education thing
Name: cleaned_text, dtype: object

Next, we perform stemming on the cleaned text.

In [11]:
# Store the output of the project's `stem` helper in a new `stemmed` column.
# NOTE(review): presumably `stem` reads `cleaned_text` — confirm in the helper.
df['stemmed'] = stem(df)
# Preview the first stemmed sentences.
df['stemmed'].head()

0                        think weve seen deterior valu
1                    think nation condon thing condemn
2    while recal even seem talk legal decrimin mari...
3    weve seen deterior valu one thing think term c...
4               got away got feel valu free educ thing
Name: stemmed, dtype: object

Finally, we generate tokens based on the stemmed text.

In [12]:
# Re-split so that the train/test frames include the new `stemmed` column.
# NOTE(review): the row assignment only matches the earlier split if
# `test_train_split` is deterministic — confirm in the helper.
df_train, df_test, idx_train = test_train_split(df)

# TF-IDF on the stemmed text.
stemmed_train = df_train['stemmed']
stemmed_test = df_test['stemmed']
train_tfid, test_tfid, vocabulary = tfid(train=stemmed_train, test=stemmed_test, n_gram_range=1)

# Number of distinct stemmed tokens found.
len(vocabulary)

7537

This has generated 7537 distinct tokens. We then add these as features in the dataframe.

In [13]:
# Build the stemmed-text TF-IDF frame directly from the sparse matrices rather
# than via `.toarray()`, which would allocate a full dense rows x vocabulary
# array only for it to be made sparse again immediately afterwards.
# Assumes `tfid` returns SciPy sparse matrices — TODO confirm.
stemmed_tfidf = pd.concat([
    # `idx_train` is used as a boolean mask: the masked labels of `df` belong to
    # the training rows and the complement to the test rows.
    pd.DataFrame.sparse.from_spmatrix(
        train_tfid, columns='W_' + vocabulary, index=df.index[idx_train]
    ),
    pd.DataFrame.sparse.from_spmatrix(
        test_tfid, columns='W_' + vocabulary, index=df.index[~idx_train]
    ),
]).astype(pd.SparseDtype(float, fill_value=0))

# Drop W_ columns left over from an earlier run of this cell so that re-running
# it does not fail on overlapping column names, then align on the row index.
df_features = df_features.drop(columns=stemmed_tfidf.columns, errors='ignore').join(stemmed_tfidf)

In [14]:
# Show the feature frame after adding the stemmed-text TF-IDF columns (prefix W_).
df_features

Unnamed: 0,Sentiment,Length,T_00,T_000,T_10,T_100,T_1000,T_101,T_104,T_105,...,W_zamaria,W_zane,W_zarqawi,W_zemin,W_zero,W_zerobas,W_zippo,W_zone,W_ãƒâ,W_šâ
0,0.000000,-0.776708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.456018,-0.147669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.805547,0.874519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.698942,1.346298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,-0.226299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23525,0.363438,0.481370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23526,-0.679982,0.402740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23527,0.413020,-1.012597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23528,0.625957,-0.619448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## POS-tagging

Now, we will add Part-of-Speech tags to the features.

In [15]:
# Store the output of the project's `pos_tag` helper in a new `POS` column;
# each entry is a space-separated string of tags (see the preview below).
df['POS'] = pos_tag(df)
# Preview the tag strings of the first sentences.
df['POS'].head()

0                         PRP VBP PRP VBN DT NN IN NNS
1    PRP VBP IN DT NN IN DT NN PRP VBD DT NNS PRP M...
2    IN DT NN IN PRP VBP PRP RB VBZ TO PRP IN EX VB...
3    IN PRP VBN DT NN IN NNS CC CD IN DT NNS WDT PR...
4      PRP VBD RB PRP VBD IN DT NN WDT JJ NN VBD DT NN
Name: POS, dtype: object

In [16]:
# Re-split so that the train/test frames include the new `POS` column.
# NOTE(review): the row assignment only matches the earlier splits if
# `test_train_split` is deterministic — confirm in the helper.
df_train, df_test, idx_train = test_train_split(df)

# TF-IDF over the POS-tag strings, treating each tag as a token.
pos_train = df_train['POS']
pos_test = df_test['POS']
train_tfid, test_tfid, vocabulary = tfid(train=pos_train, test=pos_test, n_gram_range=1)

# Number of distinct POS tags found.
len(vocabulary)

31

We see that we have 31 POS tags in the corpus.

In [17]:
# Rebuild the tag vocabulary in upper case as a 1-D object array so the column
# names use the conventional tag spelling.
# NOTE(review): presumably needed because `tfid` lowercases its tokens — confirm.
vocabulary = np.array([str(tag).upper() for tag in vocabulary], dtype=object)
vocabulary

array(['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN',
       'NNP', 'NNPS', 'NNS', 'PDT', 'PRP', 'RB', 'RBR', 'RBS', 'RP', 'TO',
       'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WRB'],
      dtype=object)

Add the POS-tags to the feature matrix.

In [18]:
# Build the POS-tag TF-IDF frame directly from the sparse matrices rather than
# via `.toarray()`, so no dense intermediate copy is created before the
# conversion to a sparse dtype.
# Assumes `tfid` returns SciPy sparse matrices — TODO confirm.
pos_tfidf = pd.concat([
    # `idx_train` is used as a boolean mask: the masked labels of `df` belong to
    # the training rows and the complement to the test rows.
    pd.DataFrame.sparse.from_spmatrix(
        train_tfid, columns='P_' + vocabulary, index=df.index[idx_train]
    ),
    pd.DataFrame.sparse.from_spmatrix(
        test_tfid, columns='P_' + vocabulary, index=df.index[~idx_train]
    ),
]).astype(pd.SparseDtype(float, fill_value=0))

# Drop P_ columns left over from an earlier run of this cell so that re-running
# it does not fail on overlapping column names, then align on the row index.
df_features = df_features.drop(columns=pos_tfidf.columns, errors='ignore').join(pos_tfidf)

In [19]:
# Show the feature frame after adding the POS-tag TF-IDF columns (prefix P_).
df_features

Unnamed: 0,Sentiment,Length,T_00,T_000,T_10,T_100,T_1000,T_101,T_104,T_105,...,P_UH,P_VB,P_VBD,P_VBG,P_VBN,P_VBP,P_VBZ,P_WDT,P_WP,P_WRB
0,0.000000,-0.776708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.535810,0.358368,0.000000,0.000000,0.0,0.0
1,-0.456018,-0.147669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.197258,0.269568,0.000000,0.316524,0.211702,0.000000,0.000000,0.0,0.0
2,-0.805547,0.874519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.159802,0.178578,0.000000,0.250997,0.163981,0.000000,0.0,0.0
3,0.698942,1.346298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.176210,0.000000,0.000000,0.141375,0.094556,0.123551,0.184139,0.0,0.0
4,0.000000,-0.226299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.700580,0.000000,0.000000,0.000000,0.000000,0.357149,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23525,0.363438,0.481370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.241793,0.000000,0.184628,0.193994,0.000000,0.169536,0.000000,0.0,0.0
23526,-0.679982,0.402740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.270966,0.000000,0.206903,0.000000,0.000000,0.189990,0.283161,0.0,0.0
23527,0.413020,-1.012597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.416894,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
23528,0.625957,-0.619448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.334258,0.000000,0.000000,0.0,0.0


## NER-Labels

We also add Named Entity Recognition labels to the feature matrix.

In [20]:
# Store the output of the project's `ner_labels` helper in a new `NER` column.
# As the preview below shows, the entry appears empty for most of the first rows.
df['NER'] = ner_labels(df)
# Preview the labels of the first sentences.
df['NER'].head()

0          
1          
2    PERSON
3          
4          
Name: NER, dtype: object

In [21]:
# Re-split so that the train/test frames include the new `NER` column.
# NOTE(review): the row assignment only matches the earlier splits if
# `test_train_split` is deterministic — confirm in the helper.
df_train, df_test, idx_train = test_train_split(df)

# TF-IDF over the NER-label strings, treating each label as a token.
ner_train = df_train['NER']
ner_test = df_test['NER']
train_tfid, test_tfid, vocabulary = tfid(train=ner_train, test=ner_test, n_gram_range=1)

# Number of distinct NER labels found.
len(vocabulary)

18

Here we see that we have 18 NER labels in the corpus. We add them to our feature matrix.

In [22]:
# Build the NER-label TF-IDF frame directly from the sparse matrices rather
# than via `.toarray()`, so no dense intermediate copy is created before the
# conversion to a sparse dtype.
# Assumes `tfid` returns SciPy sparse matrices — TODO confirm.
ner_tfidf = pd.concat([
    # `idx_train` is used as a boolean mask: the masked labels of `df` belong to
    # the training rows and the complement to the test rows.
    pd.DataFrame.sparse.from_spmatrix(
        train_tfid, columns='E_' + vocabulary, index=df.index[idx_train]
    ),
    pd.DataFrame.sparse.from_spmatrix(
        test_tfid, columns='E_' + vocabulary, index=df.index[~idx_train]
    ),
]).astype(pd.SparseDtype(float, fill_value=0))

# Drop E_ columns left over from an earlier run of this cell so that re-running
# it does not fail on overlapping column names, then align on the row index.
df_features = df_features.drop(columns=ner_tfidf.columns, errors='ignore').join(ner_tfidf)

# Show the complete feature frame (Sentiment, Length, T_, W_, P_ and E_ columns).
df_features

Unnamed: 0,Sentiment,Length,T_00,T_000,T_10,T_100,T_1000,T_101,T_104,T_105,...,E_money,E_norp,E_ordinal,E_org,E_percent,E_person,E_product,E_quantity,E_time,E_work_of_art
0,0.000000,-0.776708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.456018,-0.147669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.805547,0.874519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.0,0.0
3,0.698942,1.346298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,-0.226299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23525,0.363438,0.481370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.798806,0.0,0.0,0.601589,0.0,0.0,0.0,0.0,0.0,0.0
23526,-0.679982,0.402740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
23527,0.413020,-1.012597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
23528,0.625957,-0.619448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


## Export features to file

In [23]:
# Pickle the complete feature frame into a bz2-compressed file.
# NOTE(review): the `../results` directory presumably has to exist already — confirm.
features_path = '../results/df_features.bz2'
with bz2.open(features_path, mode='wb') as compressed_file:
    pickle.dump(df_features, compressed_file)