<a id='sec0'></a>

# Extracting 2-gram features

The extracted words here will be combined with others extracted in different ways

<a href='#sec1'><b>1. Import and pre-process data</b></a>

<a href='#sec2'><b>2. Create features based on frequency paramerization</b></a>
   
<a href='#sec3'><b>3. PCA & LDA</b></a>

In [1]:
import csv
import json
import random
import re
import numpy as np
import pandas as pd
import scipy.stats as scs
import matplotlib.pyplot as plt
import importlib as imp
import feature_engineering.frequency_selection as fefs
import feature_engineering.text_processing as fetp
import myplot.decomposition as mpd

from nltk import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


%matplotlib inline

Using TensorFlow backend.
Slow version of gensim.models.doc2vec is being used


<a id='sec1'></a>
# 1. Import and pre-process data
(<a href='#sec0'>Back to top</a>)

In [2]:
class_train = pd.read_csv('./data/training_variants')
text_train = pd.read_csv("./data/training_text", sep=r"\|\|", engine='python',
                         header=None, skiprows=1, names=["ID","Text"])
train = class_train.merge(text_train, on='ID')

# create class label container
class_labels = []
for i in range(9):
    class_labels.append('class' + str(i+1))

In [3]:
train[train['Text'] == 'null']

Unnamed: 0,ID,Gene,Variation,Class,Text
1109,1109,FANCA,S1088F,1,
1277,1277,ARID5B,Truncating Mutations,1,
1407,1407,FGFR3,K508M,6,
1639,1639,FLT1,Amplification,6,
2755,2755,BRAF,G596C,7,


In [4]:
print('Length of train before removing null entries: %d' % len(train))
train = train.drop(train.index[train['Text'] == 'null'])
print('Length of train before removing null entries: %d' % len(train))

Length of train before removing null entries: 3321
Length of train before removing null entries: 3316


# 1. Test on one piece of text

# 2. Process the entire text

Import data

In [5]:
whole_corpus = (pd.read_csv('./data/unclassified_stemmed_corpus.csv', header=None, squeeze=True)).tolist()
filtered_2gram_words = (pd.read_csv('./data/filtered_2gram_words.csv', header=None, squeeze=True)).tolist()

In [6]:
len(whole_corpus)

3316

In [7]:
%%time
# Vectorizer with the filtered terms
ngram_vectorizer2 = TfidfVectorizer(analyzer='word', ngram_range=(2, 2), vocabulary=filtered_2gram_words)
vectorized_doc = ngram_vectorizer2.fit_transform(whole_corpus)
X = vectorized_doc.toarray()

CPU times: user 28.8 s, sys: 3.72 s, total: 32.6 s
Wall time: 32.6 s


In [12]:
vocab = ngram_vectorizer2.vocabulary_

In [13]:
v = [None]*len(vocab)
for key, val in vocab.items():
    v[val] = key

In [57]:
def split_matrix(X, vocab_list, num_split=5):
    num_feat = X.shape[1]
    if num_feat != len(vocab_list):
        print('Error: # features and vocabulary does not match!')
        return
    
    n = num_feat // num_split
    indices = [0, n]
    split_Xs = []
    split_Vocabs = []
    for i in range(num_split-1):
        start = indices[i]
        end = indices[i+1]
        indices.append(end+n)
        split_Xs.append(X[:, start:end])
        split_Vocabs.append(vocab_list[start:end])
    split_Xs.append(X[:, end:])
    split_Vocabs.append(X[:, end:])
                        
    return split_Xs, split_Vocabs

In [74]:
def remove_zero_importance_features(rfc, X, y, vocab_list, percentile=0.95, test_size=0.15,
                                     print_scores=True, random_state=None):
    X_train, X_test, y_train, y_test = \
                train_test_split(X, y, test_size=test_size, random_state=random_state)
    rfc.fit(X_train, y_train)
    
    if print_scores:
        print('Accuracy %.3f' % accuracy_score(y_test, rfc.predict(X_test)))
        print('Log Loss %.3f' % log_loss(y_test, rfc.predict_proba(X_test), 
                                         labels=list(range(1, 10))))
    
    imps = rfc.feature_importances_
    selectF_indices = list(np.argwhere(imps >= np.percentile(imps, (100*percentile))).ravel())
    selectX = X[:, selectF_indices]
    selectVocab = [vocab_list[i] for i,_ in enumerate(selectF_indices)]
    
    return selectX, selectVocab

In [75]:
def myRFE(rfc, X, y, vocab_list,
          num_split=10, percentile=0.95, test_size=0.15, 
          print_scores=True, random_state=None):
    
    selection = []
    selection_vocab = []
    split_Xs, split_Vocabs = split_matrix(X, vocab_list, num_split=num_split)
    for i in range(num_split):
        print('>> Processing split%d' % (i+1))
        selectX, selectVocab = remove_zero_importance_features(rfc, split_Xs[i], y, split_Vocabs[i],
                                          percentile=percentile, test_size=test_size,
                                          print_scores=print_scores, random_state=random_state)
        selection.append(selectX)
        selection_vocab.append(selectVocab)
        
    newX = np.concatenate(selection, axis=1)
    newVocab = np.concatenate(selection_vocab, axis=1)
    
    return newX, newVocab

In [76]:
%%time
y = np.array(train['Class']).astype(int).ravel()
rfc = RandomForestClassifier(n_estimators=100, max_depth=50, n_jobs=8, random_state=33)
newX, newVocab = myRFE(rfc, X, y, v,
                          num_split=5, percentile=0.95, test_size=0.15, 
                          print_scores=True, random_state=345)

>> Processing split1
Accuracy 0.649
Log Loss 1.016
>> Processing split2
Accuracy 0.651
Log Loss 1.067
>> Processing split3
Accuracy 0.653
Log Loss 1.000
>> Processing split4
Accuracy 0.643
Log Loss 1.006
>> Processing split5
Accuracy 0.651
Log Loss 1.002


IndexError: index 3316 is out of bounds for axis 0 with size 3316

In [59]:
newX.shape

(3316, 51160)

In [13]:
%%time
y = np.array(train['Class']).astype(int).ravel()
rfc = RandomForestClassifier(n_estimators=100, max_depth=50, n_jobs=8, random_state=33)
X_train, X_test, y_train, y_test = \
                train_test_split(newX, y, test_size=0.15, random_state=345)
rfc.fit(X_train, y_train)
print('Accuracy %.3f' % accuracy_score(y_test, rfc.predict(X_test)))
print('Log Loss %.3f' % log_loss(y_test, rfc.predict_proba(X_test), 
                                 labels=list(range(1, 10))))

Accuracy 0.687
Log Loss 1.008
CPU times: user 1min 24s, sys: 899 ms, total: 1min 25s
Wall time: 14.7 s


In [14]:
X = None

In [86]:
%%time
y = np.array(train['Class']).astype(int).ravel()
rfc = RandomForestClassifier(n_estimators=100, max_depth=50, n_jobs=8, random_state=33)

newX1 = remove_zero_importance_features(rfc, newX, y, percentile=0.8, test_size=0.15,
                                          print_scores=True, random_state=345)

X_train, X_test, y_train, y_test = \
                train_test_split(newX1, y, test_size=0.15, random_state=345)
rfc.fit(X_train, y_train)
print('Accuracy %.3f' % accuracy_score(y_test, rfc.predict(X_test)))
print('Log Loss %.3f' % log_loss(y_test, rfc.predict_proba(X_test), 
                                 labels=list(range(1, 10))))
print(newX1.shape)

Accuracy 0.687
Log Loss 1.008
Accuracy 0.683
Log Loss 1.061
(3316, 10232)
CPU times: user 1min 40s, sys: 1.21 s, total: 1min 42s
Wall time: 18.2 s


USE RFECV

In [87]:
%%time
selector = RFECV(rfc, step=0.1, scoring='neg_log_loss', verbose=1, n_jobs=8)
selector = selector.fit(newX1, y)
rfe_indices = list(np.argwhere(selector.support_ == True).ravel())
X_2grams = newX[:, rfe_indices]

Fitting estimator with 10232 features.
Fitting estimator with 10232 features.
Fitting estimator with 10232 features.
Fitting estimator with 9209 features.
Fitting estimator with 9209 features.
Fitting estimator with 9209 features.
Fitting estimator with 8186 features.
Fitting estimator with 8186 features.
Fitting estimator with 8186 features.
Fitting estimator with 7163 features.
Fitting estimator with 7163 features.
Fitting estimator with 7163 features.
Fitting estimator with 6140 features.
Fitting estimator with 6140 features.
Fitting estimator with 6140 features.
Fitting estimator with 5117 features.
Fitting estimator with 5117 features.
Fitting estimator with 4094 features.
Fitting estimator with 5117 features.
Fitting estimator with 4094 features.
Fitting estimator with 3071 features.
Fitting estimator with 4094 features.
Fitting estimator with 3071 features.
Fitting estimator with 2048 features.
Fitting estimator with 3071 features.
Fitting estimator with 2048 features.
Fitting e

In [88]:
newX1.shape

(3316, 10232)

In [89]:
X_2grams.shape

(3316, 5117)

In [90]:
%%time
y = np.array(train['Class']).astype(int).ravel()
rfc = RandomForestClassifier(n_estimators=100, max_depth=50, n_jobs=8, random_state=33)
X_train, X_test, y_train, y_test = \
                train_test_split(X_2grams, y, test_size=0.15, random_state=345)
rfc.fit(X_train, y_train)
print('Accuracy %.3f' % accuracy_score(y_test, rfc.predict(X_test)))
print('Log Loss %.3f' % log_loss(y_test, rfc.predict_proba(X_test), 
                                 labels=list(range(1, 10))))

Accuracy 0.675
Log Loss 0.967
CPU times: user 8.93 s, sys: 113 ms, total: 9.04 s
Wall time: 1.79 s


# PCA & LDA

In [None]:
decomp_table = mpd.decomposition3D(selectX1, train['Class'])
mpd.decomposition3DPlot(decomp_table, train['Class'])
mpd.decomposition2DPlot(decomp_table, train['Class'])

In [None]:
%%time
decomp_table = mpd.decomposition3D(selectX1, train['Class'], 
                y=y, decomposer=LinearDiscriminantAnalysis(n_components=3))
mpd.decomposition3DPlot(decomp_table, train['Class'])
mpd.decomposition2DPlot(decomp_table, train['Class'])

In [None]:
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(kernel="rbf", gamma=10, n_components=3, n_jobs=8)
decomp_table = mpd.decomposition3D(selectX1, train['Class'], 
                                y=y, decomposer=kpca)
mpd.decomposition3DPlot(decomp_table, train['Class'])
mpd.decomposition2DPlot(decomp_table, train['Class'])