<a id='sec0'></a>

# Extracting 2-gram features

The 2-gram terms extracted here will later be combined with features extracted by other methods.

<a href='#sec1'><b>1. Import and pre-process data</b></a>

<a href='#sec2'><b>2. Create features based on frequency parameterization</b></a>
   
<a href='#sec3'><b>3. PCA & LDA</b></a>

In [1]:
import csv
import json
import random
import re
import numpy as np
import pandas as pd
import scipy.stats as scs
import matplotlib.pyplot as plt
import importlib as imp
import feature_engineering.frequency_selection as fefs
import feature_engineering.text_processing as fetp
import myplot.decomposition as mpd

from nltk import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.feature_selection import SelectFromModel, RFECV
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

%matplotlib inline

Using TensorFlow backend.
Slow version of gensim.models.doc2vec is being used


<a id='sec1'></a>
# 1. Import and pre-process data
(<a href='#sec0'>Back to top</a>)

In [2]:
# Load the variant table and the accompanying texts.
# The text file is '||'-delimited, so the separator is given as a regex and
# parsed with the Python engine; its first row is skipped and the two columns
# are named explicitly.
class_train = pd.read_csv('./data/training_variants')
text_train = pd.read_csv(
    "./data/training_text",
    sep=r"\|\|",
    engine='python',
    header=None,
    skiprows=1,
    names=["ID", "Text"],
)
# Join variants and texts on their shared ID column.
train = pd.merge(class_train, text_train, on='ID')

# Container of class label names: 'class1' through 'class9'.
class_labels = ['class' + str(class_no) for class_no in range(1, 10)]

In [3]:
# Read the per-class texts from disk; entries are looked up by class label
# (e.g. 'class1', 'class9') in the cells that follow.
with open('./data/classified_texts.json') as texts_file:
    classified_texts = json.load(texts_file)

In [9]:
# Preview the first 300 characters of the text stored under 'class9'.
class9_text = classified_texts['class9']
class9_text[:300]

'The RNA maturation is an important and complex biological process. It requires several small nuclear ribonucleoproteins (snRNPs) that comprise the two forms of spliceosomes. The major form of spliceosome (U2-type) is composed of U1, U2, U4/6 and U5 snRNPs, and catalyzes most splicing events in metaz'

# 2. Process the entire text

In [10]:
# Text stored under the 'class1' key; it is vectorized as a single document below.
doc1 = classified_texts['class1']

In [11]:
# Count every word 2-gram in doc1, treated as a one-document corpus.
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer='word')
counts = ngram_vectorizer.fit_transform([doc1])

# Invert the fitted vocabulary (term -> column index) into column index -> term.
class_dict = dict(
    (column, term) for term, column in ngram_vectorizer.vocabulary_.items()
)

In [19]:
# Dense view of the class-1 2-gram counts: table1 keeps the integer column
# positions, table2 relabels each column with its 2-gram term.
#
# class_dict (column index -> term) is reused from the cell that fitted
# `counts` rather than rebuilt from `ngram_vectorizer` here. That name is
# re-bound by a later cell, so rebuilding the mapping when this cell is re-run
# out of order would label the columns of `counts` with another vocabulary.
table1 = pd.DataFrame(counts.toarray())
table2 = table1.rename(columns=class_dict)

In [14]:
# Show the count matrix with its raw integer column positions.
dense_counts = counts.toarray()
pd.DataFrame(dense_counts)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,796572,796573,796574,796575,796576,796577,796578,796579,796580,796581
0,8,2,4,2,2,2,1,5,24,26,...,2,9,9,1,1,1,3,3,3,1


In [16]:
# Collect every distinct word 2-gram found in the sampled training texts.
# NOTE(review): only the first five documents are scanned (train.iloc[:5]) --
# confirm this sample is intended rather than a debugging leftover.
unfiltered_2gram_set = set()
for doc in train.iloc[:5]['Text']:
    ngram_vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
    ngram_vectorizer.fit([doc])
    # The keys of vocabulary_ are exactly the fitted feature names. Reading
    # them here avoids get_feature_names(), which newer scikit-learn releases
    # no longer provide, and works on older releases as well.
    unfiltered_2gram_set.update(ngram_vectorizer.vocabulary_)

# Sort so the term order (and hence the feature-column order built from it)
# is identical on every run; iteration order of a set of strings is not stable
# across interpreter sessions.
unfiltered_2gram_words = sorted(unfiltered_2gram_set)

In [18]:
%%time
filtered_2gram_words = []
filtered_2gram_words_indices = []
for i, term in enumerate(unfiltered_2gram_words):
    word1, word2 = term.split(' ')
    if not (re.search(r'^[A-Za-z]', word1) and re.search(r'^[A-Za-z]', word2)):
        continue
    if not (re.search(r'[A-Za-z0-9]$', word1) and re.search(r'[A-Za-z0-9]$', word2)):
        continue
    if (re.search(r'[@#%&*()+=]', word1) or re.search(r'[@#%&*()+=]', word2)):
        contiue
    if not (len(word1) > 1 or len(word2) > 1):
        continue
    if ((word1.lower() in stemmed_stop_words) or (word2.lower() in stemmed_stop_words)):
        continue
    if ((word1.lower() in remove_words) or (word2.lower() in remove_words)):
        continue
        
    filtered_2gram_words.append(term)
    filtered_2gram_words_indices.append(i)

print('Before filter: %d terms' % len(unfiltered_2gram_words))
print('After filter: %d terms' % len(filtered_2gram_words))

Before filter: 14366 terms
After filter: 5652 terms
CPU times: user 102 ms, sys: 0 ns, total: 102 ms
Wall time: 102 ms


In [21]:
%%time
corpus = []
for i in range(len(train['Text'])):
    corpus.append(train.iloc[i]['Text'])

# Vectorizer with the filtered terms
ngram_vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2), vocabulary=filtered_2gram_words)
vectorized_doc = ngram_vectorizer2.fit_transform(corpus)
X = vectorized_doc.toarray()

CPU times: user 24.3 s, sys: 256 ms, total: 24.6 s
Wall time: 24.6 s


In [40]:
%%time
y = np.array(class_train.Class).astype(int).ravel()
rfc = RandomForestClassifier(n_estimators=100, max_depth=50, n_jobs=7, random_state=33)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=345)
rfc.fit(X_train, y_train)

accuracy = accuracy_score(y_test, rfc.predict(X_test))
lloss = log_loss(y_test, rfc.predict_proba(X_test), labels=list(range(1, 10)))

print('Accuracy %.3f' % accuracy)
print('Log Loss %.3f' % lloss)

Accuracy 0.623
Log Loss 1.729
CPU times: user 7.44 s, sys: 57.9 ms, total: 7.5 s
Wall time: 1.44 s
