In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import random
import json
import scipy.stats as scs
import feature_engineering as fe
import feature_engineering.text_processing as fete
import feature_engineering.frequency_selection as fefs
import csv
import xgboost as xgb

from xgboost import plot_importance
from pprint import pprint
from matplotlib  import cm
from collections import Counter
from importlib import reload
from gensim import corpora, matutils, models, similarities
from nltk import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import scale, normalize, robust_scale
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from mpl_toolkits.mplot3d import Axes3D

In [3]:
# Load the variant/class table and the document-text table, then join them
# on ID so each row of `train` carries both the class data and its text.
class_train = pd.read_csv('./data/training_variants')

# The text file separates its two fields with a literal "||". pandas treats a
# multi-character separator as a regular expression (python engine), so both
# pipes are escaped. The pattern is a raw string so that Python does not try
# to interpret "\|" as a string escape, which is invalid and emits a warning.
# The file's own header row is skipped and the column names supplied here.
text_train = pd.read_csv("./data/training_text", sep=r"\|\|", engine='python',
                         header=None, skiprows=1, names=["ID","Text"])
train = class_train.merge(text_train, on='ID')

# Label strings for the nine classes, 'class1' through 'class9'; used as
# dictionary keys and column names by the cells that follow.
class_labels = ['class' + str(number) for number in range(1, 10)]

  text_train = pd.read_csv("./data/training_text", sep="\|\|", engine='python',


In [4]:
# English stop-word vocabulary, held in a set so the per-token membership
# test in the tokenising cell is a hash lookup.
stop_words = {stop_word for stop_word in stopwords.words('english')}

# Single Porter stemmer instance shared by all documents.
stemmer = PorterStemmer()

<b>1-gram processing</b>

In [7]:
%%time
# Build a dictionary keyed by class label. Each value is a list with one
# entry per document of that class; every entry is that document's list of
# stemmed tokens. Token lists keep duplicates and order (they are
# deliberately NOT sets, for later use).

# Token filters, compiled once instead of being looked up for every token.
starts_with_letter = re.compile(r'^[A-Za-z]')
ends_with_alphanumeric = re.compile(r'[A-Za-z0-9]$')
has_special_character = re.compile(r'[@#%&*()+=]')

# Memo of word -> stem. The stem of a given word does not change between
# calls, so each distinct word is stemmed once and reused afterwards instead
# of re-running the stemmer on every occurrence.
stem_cache = {}

classified_tokenized_docs = {}
for class_number, label in enumerate(class_labels, start=1):
    print('%s being processed...' % label)
    docs = list(train.loc[train.Class == class_number, 'Text'])

    tokenized_docs = []
    for doc in docs:
        # Tokenize the doc after the project helper has replaced selected
        # characters with whitespace.
        # NOTE(review): exact behaviour of replace_with_whitespace and of
        # hyphens='on' lives in feature_engineering.text_processing - confirm.
        raw_tokens = word_tokenize(fete.replace_with_whitespace(doc, hyphens='on'))

        # Keep tokens that start with a letter, end with a letter or digit,
        # contain none of the listed special characters, are longer than one
        # character, and are not stop words (case-insensitive).
        kept_tokens = [word for word in raw_tokens
                       if starts_with_letter.search(word)
                       and ends_with_alphanumeric.search(word)
                       and not has_special_character.search(word)
                       and len(word) > 1
                       and word.lower() not in stop_words]

        # Apply the stemmer to each surviving token, via the memo.
        tokenized_doc = []
        for word in kept_tokens:
            stem = stem_cache.get(word)
            if stem is None:
                stem = stem_cache[word] = stemmer.stem(word)
            tokenized_doc.append(stem)

        tokenized_docs.append(tokenized_doc)

    classified_tokenized_docs[label] = tokenized_docs

class1 being processed...
class2 being processed...
class3 being processed...
class4 being processed...
class5 being processed...
class6 being processed...
class7 being processed...
class8 being processed...
class9 being processed...
CPU times: user 7min 25s, sys: 390 ms, total: 7min 25s
Wall time: 7min 26s


In [8]:
%%time
# Build a dictionary keyed by class label. For each class the value maps
# every token seen in that class to the fraction of that class's documents
# in which the token appears at least once (how often it occurs inside a
# single document is ignored).
appearance_frequency = {}
for label in class_labels:
    print('%s being processed...' % label)
    tokenized_docs = classified_tokenized_docs[label]
    num_docs = len(tokenized_docs)

    # Tally, per token, the number of documents containing it. Each document
    # contributes at most 1 per token: dict.fromkeys de-duplicates the token
    # list while keeping first-appearance order, so key order is reproducible.
    # Counting directly avoids building a documents x vocabulary table of
    # ones and NaN only to sum its columns.
    docs_containing = Counter()
    for doc in tokenized_docs:
        docs_containing.update(dict.fromkeys(doc, 1))

    # Convert document counts to fractions of the class's documents. A class
    # with no documents yields an empty mapping (no division takes place).
    app_freq = {token: doc_count / num_docs
                for token, doc_count in docs_containing.items()}

    appearance_frequency[label] = app_freq

class1 being processed...
class2 being processed...
class3 being processed...
class4 being processed...
class5 being processed...
class6 being processed...
class7 being processed...
class8 being processed...
class9 being processed...
CPU times: user 25.3 s, sys: 509 ms, total: 25.8 s
Wall time: 25.8 s


In [10]:
%%time
# Drop vocabulary that cannot separate the classes: words that are among the
# n most widely appearing words of EVERY class and that also appear in more
# than half of the documents of EVERY class.
# Rows of fracdocs are words, columns are class labels, and values are the
# fraction of a class's documents containing the word (0 where the word
# never occurs in that class).
fracdocs = pd.DataFrame(appearance_frequency).fillna(value=0)
n = 3000
top_words = []
for label in class_labels:
    tops = fracdocs[label].sort_values(ascending=False).head(n)
    top_words.append(list(tops.index))

# Words present in the top-n list of all classes
overlap1 = set(top_words[0])
for lis in top_words[1:]:
    overlap1.intersection_update(lis)
# The %d placeholder is filled with n; previously it was printed literally.
print('# intersecting words among top%d appearing words in each class: ' % n, len(overlap1))

# For each class, the shared words found in more than 50% of its documents.
# Every word in overlap1 comes from the fracdocs index, so no membership
# check is needed before the lookup.
remove_list = []
for label in class_labels:
    class_fractions = fracdocs[label]
    remove_words = [word for word in overlap1 if class_fractions[word] > 0.5]
    remove_list.append(remove_words)

# Words that pass the 50% threshold in all classes
overlap2 = set(remove_list[0])
for lis in remove_list[1:]:
    overlap2.intersection_update(lis)
print('# intersecting words with >50% appearance: ', len(overlap2))

print('Table shape before removal: ', fracdocs.shape)
# Pass an ordered list of row labels rather than a set
fracdocs = fracdocs.drop(sorted(overlap2))
print('Table shape after removal:  ', fracdocs.shape)

# intersecting words among top%d appearing words in each class:  1391
# intersecting words with >50% appearance:  287
Table shape before removal:  (125448, 9)
Table shape after removal:   (125161, 9)
CPU times: user 519 ms, sys: 1 µs, total: 519 ms
Wall time: 520 ms


In [13]:
%%time
# Select words by how differently they appear across classes, using three
# selectors from feature_engineering.frequency_selection, then pool the picks.
# NOTE(review): the exact meaning of mode / min_frequency / min_difference
# and of n=8 is defined in that module - confirm there before tuning.
ncw_relative = fefs.getNClassWords(
    fracdocs,
    doc_type='fraction_of_docs',
    mode='relative',
    min_frequency=0.45,
    min_difference=1.35,
    print_result=True,
)
ncw_absolute = fefs.getNClassWords(
    fracdocs,
    doc_type='fraction_of_docs',
    mode='absolute',
    min_frequency=0.6,
    min_difference=0.1,
    print_result=True,
)
ncw_exclusive = fefs.getNClassExclusiveWords(
    fracdocs,
    min_frequency=0.175,
    print_result=True,
)

# Same selection call for each extraction, in the same order as above
words1, words2, words3 = [
    fefs.selectNClassWords(word_groups, n=8)
    for word_groups in (ncw_relative, ncw_absolute, ncw_exclusive)
]

# Pool the three selections and collapse duplicates
select_words = set(words1 + words2 + words3)

# Report the size of each selection and of the pooled set
print("\n".join([
    "'Relative' extraction: %d words" % len(words1),
    "'Absolute' extraction: %d words" % len(words2),
    "'Exclusive' extraction: %d words" % len(words3),
    "# unique words extracted: %d words" % len(select_words),
]))

Input Type: fraction_of_docs
Minimum Difference = 1.35
Minimum Frequency = 0.45
# of words in one_class_words: 185
# of words in two_class_words: 56
# of words in three_class_words: 5
# of words in four_class_words: 7
# of words in five_class_words: 2
# of words in six_class_words: 3
# of words in seven_class_words: 19
# of words in eight_class_words: 36
# of words in other_words: 124842
Total # of words: 125155
Input Type: fraction_of_docs
Minimum Difference = 0.10
Minimum Frequency = 0.60
# of words in one_class_words: 176
# of words in two_class_words: 50
# of words in three_class_words: 7
# of words in four_class_words: 5
# of words in five_class_words: 6
# of words in six_class_words: 3
# of words in seven_class_words: 6
# of words in eight_class_words: 6
# of words in other_words: 124897
Total # of words: 125156
===== n-class words extractions by exclusive appearances =====
Minimum Frequency = 0.175000
# of words in one_class_words: 9
# of words in two_class_words: 0
# of words i