# Generating features for use with classifiers

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs, so edits to the project code imported below are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# `np` and `pd` are used directly in later cells; import them explicitly
# instead of relying on the wildcard import below to leak them into scope.
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

# Make the project sources under ../src importable from this notebook.
sys.path.append(os.path.abspath('../src'))

# NOTE(review): this wildcard import is presumably what provides data_loading,
# clean_text, stem, test_train_split, tfid and pos_tag used in later cells;
# consider importing those names explicitly.
from fact_classification import *

In [3]:
# data_loading returns three objects; only `df` is used in the rest of this
# notebook. NOTE(review): `local=True` presumably selects local files over a
# remote source — confirm in fact_classification.
df, df_crowdsourced, df_ground_truth = data_loading(local=True)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Sentence_id,Text,Speaker,Speaker_title,Speaker_party,File_id,Length,Line_number,Sentiment,Verdict
0,16,I think we've seen a deterioration of values.,George Bush,Vice President,REPUBLICAN,1988-09-25.txt,8,16,0.0,-1
1,17,I think for a while as a nation we condoned th...,George Bush,Vice President,REPUBLICAN,1988-09-25.txt,16,17,-0.456018,-1
2,18,"For a while, as I recall, it even seems to me ...",George Bush,Vice President,REPUBLICAN,1988-09-25.txt,29,18,-0.805547,-1
3,19,"So we've seen a deterioration in values, and o...",George Bush,Vice President,REPUBLICAN,1988-09-25.txt,35,19,0.698942,-1
4,20,"We got away, we got into this feeling that val...",George Bush,Vice President,REPUBLICAN,1988-09-25.txt,15,20,0.0,-1


From the data exploration stage we know that the `Sentiment` column contains three missing values (NAs). Here we simply drop those three rows.

In [4]:
# Drop only the rows whose `Sentiment` is missing. Restricting dropna to that
# column keeps rows that merely have a gap in some other column, which a bare
# dropna() would silently discard as well.
# The index is renumbered so that feature frames built later with a default
# 0..N-1 index line up with df row for row.
df = df.dropna(subset=['Sentiment']).reset_index(drop=True)

The dataset already includes two features that we will use: `Length` and `Sentiment`.

In [5]:
# Seed the feature table with the two columns the dataset already provides.
ready_made_columns = ['Length', 'Sentiment']
df_features = df.loc[:, ready_made_columns]
df_features.head()

Unnamed: 0,Length,Sentiment
0,8,0.0
1,16,-0.456018
2,29,-0.805547
3,35,0.698942
4,15,0.0


## TF-IDF features

We will go on and generate TF-IDF features based on the Text content.

First, we clean the text by converting it to lowercase and removing stopwords, punctuation and newlines.

In [6]:
# Replace df with the result of the project's clean_text helper. The output
# below shows the `Text` column lower-cased with punctuation and stopwords
# removed.
# NOTE: df is reassigned here, so earlier displays of df['Text'] are stale.
df = clean_text(df)
# Preview the cleaned text.
df['Text'].head()

0                 think weve seen deterioration values
1               think nation condoned things condemned
2    while recall even seems talk legalizing decrim...
3    weve seen deterioration values one things thin...
4      got away got feeling value free education thing
Name: Text, dtype: object

Then, we apply stemming to the cleaned text.

In [7]:
# Store the output of the project's stem helper in a new `stemmed` column;
# the cleaned `Text` column is left in place.
df['stemmed'] = stem(df)
# Preview the stemmed text.
df['stemmed'].head()

0                        think weve seen deterior valu
1                    think nation condon thing condemn
2    while recal even seem talk legal decrimin mari...
3    weve seen deterior valu one thing think term c...
4               got away got feel valu free educ thing
Name: stemmed, dtype: object

Finally, we split the data into training and test sets and generate TF-IDF features for the tokens of the stemmed text.

In [8]:
# Split the prepared frame into train and test parts using the project helper.
df_train, df_test = test_train_split(df)

# Compute TF-IDF matrices for both splits from the stemmed text. tfid returns
# the train matrix, the test matrix and the token vocabulary (used below as
# column names).
# NOTE(review): n_gram_range=1 presumably means unigrams only, and the
# vocabulary is presumably fitted on the train split — confirm in tfid.
train_tfid, test_tfid, vocabulary = tfid(
    train=df_train['stemmed'],
    test=df_test['stemmed'],
    n_gram_range=1
)

# Number of distinct tokens, i.e. the number of word-level TF-IDF columns.
len(vocabulary)

7537

This yields a vocabulary of 7537 distinct tokens. We then add one TF-IDF column per token (prefixed with `W_`) to our features dataframe.

In [9]:
# Word-level TF-IDF columns carry a W_ prefix to keep them apart from the
# other feature groups.
word_columns = 'W_' + vocabulary

# Key each split's TF-IDF rows by that split's own row labels. Stacking train
# on top of test and renumbering 0..N-1 would only line up with df_features
# when the split returns rows in df's original order (train block first, then
# test block); otherwise sentences silently receive another sentence's features.
# NOTE(review): assumes test_train_split keeps df's index labels — the
# assertions below fail loudly if it does not.
tfidf_words = pd.concat([
    pd.DataFrame(train_tfid.toarray(), columns=word_columns, index=df_train.index),
    pd.DataFrame(test_tfid.toarray(), columns=word_columns, index=df_test.index),
])

assert tfidf_words.index.is_unique, 'train/test splits share row labels'
assert tfidf_words.index.isin(df_features.index).all(), (
    'split row labels are not present in df_features'
)

# join aligns on the index, so each sentence receives its own TF-IDF row.
df_features = df_features.join(tfidf_words)

In [10]:
# Display the feature table; pandas truncates the rendered rows and columns.
df_features

Unnamed: 0,Length,Sentiment,W_000,W_05,W_10,W_100,W_1000,W_10000,W_100000,W_100billion,...,W_zamaria,W_zane,W_zarqawi,W_zemin,W_zero,W_zerobas,W_zippo,W_zone,W_ãƒâ,W_šâ
0,8,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16,-0.456018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,29,-0.805547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,35,0.698942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,15,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23525,24,0.363438,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23526,23,-0.679982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23527,5,0.413020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23528,10,0.625957,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## POS-tagging

In [11]:
# Store the output of the project's pos_tag helper in a new `POS` column; as
# the preview shows, each row holds a space-separated sequence of POS tags.
df['POS'] = pos_tag(df)
df['POS'].head()

0                                    NN VBP VBN NN NNS
1                                    VB NN VBN NNS VBD
2           IN RB RB VBZ NN VBG VBG JJ NNS VBP NNS VBP
3    NN VBN NN VBZ CD NNS VBP NNS VBP NN NNS JJ NNS...
4                           VBD RB VBN VBG NN JJ NN NN
Name: POS, dtype: object

In [12]:
# TF-IDF over the POS-tag sequences of the whole frame (no train/test split
# here). Called with a single series, tfid returns the matrix and vocabulary.
# NOTE: this overwrites `vocabulary` from the word-level TF-IDF step, so the
# word-feature cell cannot be re-run after this one without recomputing it.
pos_tfid, vocabulary = tfid(df['POS'])
# Show the sparse matrix summary (shape and number of stored values).
pos_tfid

<23530x32 sparse matrix of type '<class 'numpy.float64'>'
	with 114139 stored elements in Compressed Sparse Row format>

We see that we have 32 POS tags in the corpus after cleaning and stemming.

In [13]:
# Upper-case every POS tag name and keep the result as an object array of
# plain strings, so a prefix can later be added with ordinary string
# concatenation.
upper_tags = [str(tag).upper() for tag in vocabulary]
vocabulary = np.array(upper_tags, dtype=object)
vocabulary

array(['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN',
       'NNP', 'NNPS', 'NNS', 'PDT', 'PRP', 'RB', 'RBR', 'RBS', 'RP',
       'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT',
       'WP', 'WRB'], dtype=object)

In [14]:
# POS-tag TF-IDF columns carry a P_ prefix to tell them apart from the other
# feature groups.
pos_columns = 'P_' + vocabulary
pos_frame = pd.DataFrame(pos_tfid.toarray(), columns=pos_columns)

# Append the POS columns to the feature table, matching rows on the index.
df_features = df_features.join(pos_frame)

df_features.head()


Unnamed: 0,Length,Sentiment,W_000,W_05,W_10,W_100,W_1000,W_10000,W_100000,W_100billion,...,P_UH,P_VB,P_VBD,P_VBG,P_VBN,P_VBP,P_VBZ,P_WDT,P_WP,P_WRB
0,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.618519,0.481842,0.0,0.0,0.0,0.0
1,16,-0.456018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.485467,0.482287,0.0,0.598637,0.0,0.0,0.0,0.0,0.0
2,29,-0.805547,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.509182,0.0,0.453032,0.336504,0.0,0.0,0.0
3,35,0.698942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.252594,0.393555,0.292326,0.0,0.0,0.0
4,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.367665,0.399583,0.456363,0.0,0.0,0.0,0.0,0.0


## NER Labels

## Create CSR and export matrix

The features generated here are used in our other notebooks when training and testing various classifiers. Since the feature matrix is large and sparse, it can be converted to a Compressed Sparse Row (CSR) matrix before being saved to disk; the two cells below sketch that approach but are currently commented out.

In [15]:
# Convert to CSR matrix
# df_features = csr_matrix(df_features)

In [16]:
# Save CSR matrix to disk
# np.savez_compressed('../results/features.npz', df_features)

## Export matrix

In [17]:
# Write the full feature table as a gzip-compressed Parquet file.
# to_parquet does not create missing parent directories, so make sure
# ../results exists first (e.g. on a fresh checkout).
# NOTE(review): the file is Parquet despite its .gzip suffix; the name is kept
# unchanged because other notebooks presumably read this exact path.
os.makedirs('../results', exist_ok=True)
df_features.to_parquet('../results/features.gzip', compression='gzip')

: 