<a id='sec0'></a>

# Extracting 2-gram features

The 2-gram features extracted here will later be combined with features extracted by other methods.

<a href='#sec1'><b>1. Import and pre-process data</b></a>

<a href='#sec2'><b>2. Create features based on frequency parameterization</b></a>
   
<a href='#sec3'><b>3. PCA & LDA</b></a>

In [21]:
import csv
import json
import random
import re
import numpy as np
import pandas as pd
import scipy.stats as scs
import matplotlib.pyplot as plt
import importlib as imp
import feature_engineering.frequency_selection as fefs
import feature_engineering.text_processing as fetp
import myplot.decomposition as mpd

from nltk import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

<a id='sec1'></a>
# 1. Import and pre-process data
(<a href='#sec0'>Back to top</a>)

In [6]:
# Read the variant table and the text table, then join them on their shared ID column.
class_train = pd.read_csv('./data/training_variants')
text_train = pd.read_csv(
    "./data/training_text",
    sep=r"\|\|",          # fields are split on a literal "||" (regex separator, hence the python engine)
    engine='python',
    skiprows=1,           # skip the first line of the file and name the columns explicitly instead
    header=None,
    names=["ID", "Text"],
)
train = pd.merge(class_train, text_train, on='ID')

# Label strings 'class1' through 'class9', in ascending order.
class_labels = ['class' + str(label_no) for label_no in range(1, 10)]

In [12]:
# Text of the first row of the merged frame; used as the sample document in the cells below.
doc1 = train['Text'].iloc[0]

In [24]:
# Pre-process the sample document: clean it with the project helper, tokenize,
# then reduce every token to its Porter stem.
stemmer = PorterStemmer()
tokenized_doc = [
    stemmer.stem(token)
    for token in word_tokenize(fetp.replace_with_whitespace(doc1, hyphens='on'))
]

# CountVectorizer expects an iterable of documents, so wrap the re-joined
# text in a single-element list.
processed_doc = [' '.join(tokenized_doc)]

In [26]:
# Count word-level 2-grams only: ngram_range=(2, 2) sets both the minimum and
# maximum n-gram length to 2.
ngram_vectorizer = CountVectorizer(
    ngram_range=(2, 2),
    analyzer='word',
)
# Learn the 2-gram vocabulary from the document and build its count matrix in one step.
counts = ngram_vectorizer.fit_transform(raw_documents=processed_doc)

In [27]:
# List the learned 2-gram vocabulary.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement and returns a NumPy array, so it
# is converted with `.tolist()` to keep the displayed value a plain list of
# Python strings. The old method is kept as a fallback for older installs.
if hasattr(ngram_vectorizer, 'get_feature_names_out'):
    bigram_names = ngram_vectorizer.get_feature_names_out().tolist()
else:
    bigram_names = ngram_vectorizer.get_feature_names()
bigram_names

['000 and',
 '000 anti',
 '000 as',
 '000 at',
 '000 collect',
 '000 or',
 '000 we',
 '001 we',
 '001 western',
 '01 001',
 '068 bp',
 '10 000',
 '10 11',
 '10 bi',
 '10 howev',
 '10 in',
 '10 kb',
 '10 min',
 '10 mm',
 '10 nm',
 '10 none',
 '10 rat',
 '10 the',
 '10 vol',
 '10 μg',
 '10 μm',
 '100 μl',
 '100 μm',
 '1019 000',
 '11 mous',
 '11 thu',
 '11 to',
 '11 we',
 '12 we',
 '123 152',
 '13 and',
 '13 fig',
 '13 straightforward',
 '13362 are',
 '14 becaus',
 '14 fig',
 '15 000',
 '15 interestingli',
 '15 min',
 '15 mm',
 '15 vol',
 '152 504',
 '152 508',
 '152 514',
 '152 554',
 '152274 express',
 '16 17',
 '16 coimmunoprecipit',
 '16 with',
 '164 152',
 '1640 glutamax',
 '17 play',
 '170 6516',
 '172 1019',
 '18 000',
 '18 our',
 '19 500',
 '19 we',
 '1a onlin',
 '1a we',
 '1b an',
 '1b cdk1',
 '1b onlin',
 '1c is',
 '1c would',
 '1d and',
 '1d suggest',
 '1d we',
 '1e we',
 '1f these',
 '1g affect',
 '1g and',
 '1g predict',
 '1g splice',
 '1g which',
 '1i use',
 '1j and',
 '1j 

In [28]:
# Show the 2-gram counts as a dense integer array (one row per document).
counts.astype(int).toarray()

array([[2, 5, 1, ..., 1, 1, 1]])