<a id='sec0'></a>

# Extracting 2-gram features

The 2-gram features extracted here will later be combined with features extracted in other ways.

<a href='#sec1'><b>1. Import and pre-process data</b></a>

<a href='#sec2'><b>2. Create features based on frequency parameterization</b></a>
   
<a href='#sec3'><b>3. PCA & LDA</b></a>

In [1]:
import csv
import json
import random
import re
import numpy as np
import pandas as pd
import scipy.stats as scs
import matplotlib.pyplot as plt
import importlib as imp
import feature_engineering.frequency_selection as fefs
import feature_engineering.text_processing as fetp
import myplot.decomposition as mpd

from nltk import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

Using TensorFlow backend.
Slow version of gensim.models.doc2vec is being used


<a id='sec1'></a>
# 1. Import and pre-process data
(<a href='#sec0'>Back to top</a>)

In [2]:
# Read the variant table and the '||'-separated text table, then join them on ID.
class_train = pd.read_csv('./data/training_variants')
text_train = pd.read_csv(
    "./data/training_text",
    sep=r"\|\|",
    engine='python',
    header=None,
    skiprows=1,
    names=["ID", "Text"],
)
train = class_train.merge(text_train, on='ID')

# Label names 'class1' through 'class9'.
class_labels = ['class' + str(number) for number in range(1, 10)]

In [3]:
# Text of the first row of the merged training frame; used as the sample document below.
doc1 = train['Text'].iloc[0]

In [20]:
# Tokenize the sample document after the project helper has pre-cleaned it.
# NOTE(review): presumably hyphens='on' makes replace_with_whitespace also turn
# hyphens into whitespace -- confirm in feature_engineering.text_processing.
tokenized_doc = word_tokenize(fetp.replace_with_whitespace(doc1, hyphens='on'))

# Reduce every token to its Porter stem and rebuild a single string.
stemmer = PorterStemmer()
tokenized_doc = [stemmer.stem(word) for word in tokenized_doc]
# Wrapped in a one-element list because the vectorizer is fed a collection of documents.
processed_doc = [' '.join(tokenized_doc)]

# Stem the English stop words too, so they are comparable with the stemmed tokens.
stop_words = set(stopwords.words('english'))
stemmed_stop_words = [stemmer.stem(word) for word in stop_words]

# Extra stemmed words to exclude on top of the stop words.
# Fixed: 'woukd' was a typo for 'would', so 'would' was never actually excluded.
remove_words = ['also', 'anoth', 'case', 'fig', 'gave', 'illustr', 'result', 'show', 'suggest', 'supplementari', 'tabl', 'play',
                'use', 'pictur', 'could', 'would', 'might', 'demonstr', 'whether', 'almost', 'elut', 'find', 'give', 'henc',
                'known', 'ml', 'name', 'ncbi', 'onlin', 'previous', 'purifi', 'method', 'must', 'seem', 'whose', 'when', 'where',
                'this', 'thus', 'town', 'wa', 'wash', 'download', 'yet']

In [5]:
# Count word-level bigrams only (ngram_range=(2, 2)) in the processed document.
ngram_vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(2, 2),
)
counts = ngram_vectorizer.fit_transform(processed_doc)

In [6]:
# Vocabulary of all bigrams found by the vectorizer, as a plain list.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(ngram_vectorizer, 'get_feature_names_out'):
    ngrams_unfiltered = list(ngram_vectorizer.get_feature_names_out())
else:
    ngrams_unfiltered = ngram_vectorizer.get_feature_names()

In [21]:
%%time
# Keep only the bigrams that look like meaningful word pairs, and remember the
# position of each kept bigram in `ngrams_unfiltered`.

# Compile the patterns once instead of on every iteration.
starts_with_letter = re.compile(r'^[A-Za-z]')
ends_with_alnum = re.compile(r'[A-Za-z0-9]$')
has_special_char = re.compile(r'[@#%&*()+=]')

# One set for both exclusion lists: same membership result, O(1) per lookup.
excluded_words = set(stemmed_stop_words) | set(remove_words)

ngrams_filtered = []
ngrams_filtered_indices = []
for i, term in enumerate(ngrams_unfiltered):
    word1, word2 = term.split(' ')
    # Both words must start with a letter ...
    if not (starts_with_letter.search(word1) and starts_with_letter.search(word2)):
        continue
    # ... and end with a letter or digit.
    if not (ends_with_alnum.search(word1) and ends_with_alnum.search(word2)):
        continue
    # Fixed: this branch used the misspelled name `contiue`, which raised
    # NameError as soon as a word contained one of these characters.
    if has_special_char.search(word1) or has_special_char.search(word2):
        continue
    # NOTE(review): this rejects a bigram only when BOTH words are a single
    # character. If any single-character word should be rejected, `or` must
    # become `and` -- left unchanged because the intent is not clear.
    if not (len(word1) > 1 or len(word2) > 1):
        continue
    # Drop bigrams containing a stemmed stop word or a manually excluded word.
    if word1.lower() in excluded_words or word2.lower() in excluded_words:
        continue

    ngrams_filtered.append(term)
    ngrams_filtered_indices.append(i)

CPU times: user 27.6 ms, sys: 972 µs, total: 28.6 ms
Wall time: 28.1 ms


In [22]:
# Number of bigrams that passed the filtering loop.
len(ngrams_filtered)

1307

In [26]:
# Second vectorizer whose vocabulary is fixed to the filtered bigrams.
ngram_vectorizer2 = CountVectorizer(
    analyzer='word',
    ngram_range=(2, 2),
    vocabulary=ngrams_filtered,
)
# Left as the last expression so the resulting sparse matrix is displayed.
ngram_vectorizer2.fit_transform(processed_doc)

<1x1307 sparse matrix of type '<class 'numpy.int64'>'
	with 1307 stored elements in Compressed Sparse Row format>

In [27]:
# Display the vocabulary of the second vectorizer as a plain list.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
(
    list(ngram_vectorizer2.get_feature_names_out())
    if hasattr(ngram_vectorizer2, 'get_feature_names_out')
    else ngram_vectorizer2.get_feature_names()
)

['abcam ab9106',
 'aberr splice',
 'abnorm three',
 'abolish atp',
 'acceptor site',
 'acid genbank',
 'acid mop',
 'acid number',
 'acid sequenc',
 'acid substitut',
 'actin sigma',
 'activ chromosom',
 'activ cyclin',
 'activ peg202',
 'activ posit',
 'activ protein',
 'activ reveal',
 'activ treatment',
 'ad amount',
 'ad kinas',
 'ad laemli',
 'ad sampl',
 'addit anomali',
 'addit evid',
 'addit regulatori',
 'adult tissu',
 'affect individu',
 'affin interact',
 'agaros bead',
 'ago cdk10',
 'alanin substitut',
 'albeit notabl',
 'allow dock',
 'alreadi report',
 'altern exon',
 'altern gene',
 'altern splice',
 'although discov',
 'amino acid',
 'amino termin',
 'anal atresia',
 'analysi quantit',
 'analyz chromosom',
 'analyz sall1',
 'anti actin',
 'anti cdk10',
 'anti cycm',
 'anti ets2',
 'anti flag',
 'anti goat',
 'anti mous',
 'anti myc',
 'anti rabbit',
 'anti raf1',
 'anti tubulin',
 'anti v5',
 'antibodi anti',
 'antibodi detect',
 'antibodi input',
 'antibodi product',