In [1]:
import ast
import gzip
import json
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle

# `sklearn.cross_validation` was deprecated in favour of
# `sklearn.model_selection` and removed in later scikit-learn releases.
# Prefer the new location; keep the legacy one only as a fallback.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [2]:
def parse(path):
  """Lazily yield one parsed record per non-blank line of a gzip file.

  Each line is parsed with `ast.literal_eval`, so it must be a Python
  literal (presumably a dict per review -- confirm against the data files).
  Unlike `eval`, `literal_eval` never executes code found in the file.

  Args:
    path: Location of the gzip-compressed input file.

  Yields:
    The Python object described by each line, in file order.

  Raises:
    ValueError / SyntaxError: If a line is not a valid Python literal.
  """
  # The context manager closes the gzip handle when the generator finishes
  # or is garbage-collected; the original never closed it.
  with gzip.open(path, 'rb') as g:
    for l in g:
      text = l.decode('utf-8').strip()
      if not text:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
      yield ast.literal_eval(text)

def getDF(path):
  """Load every record produced by `parse(path)` into a DataFrame.

  Records are keyed by their 0-based position in the file, and that
  position becomes the row index of the returned frame.

  Args:
    path: File location forwarded unchanged to `parse`.

  Returns:
    A pandas DataFrame with one row per record.
  """
  # enumerate() supplies the running row number that keys each record.
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

# Load one DataFrame per category archive. The files are read in the listed
# order, and each frame is bound to the same df1..df11 names that the later
# cells rely on.
(df1, df2, df3, df4, df5, df6,
 df7, df8, df9, df10, df11) = [
    getDF(archive)
    for archive in (
        'Automotive.json.gz',
        'Cloth_shoes.json.gz',
        'Digi_Mus.json.gz',
        'Electronics.json.gz',
        'garden.json.gz',
        'health.json.gz',
        'home_kitchen.json.gz',
        'kindle.json.gz',
        'office_prod.json.gz',
        'pet_supply.json.gz',
        'tools.json.gz',
    )
]

In [3]:
# Category labels, in the order used for the confusion-matrix rows/columns.
labels1 = [
    'Automotive',
    'Cloth_shoes',
    'Digi_Mus',
    'Electronics',
    'garden',
    'health',
    'home_kitchen',
    'kindle',
    'office_prod',
    'pet_supply',
    'tools',
]

In [4]:
# Tag every row of each category frame with its category name in a new
# 'Class' column; this column is the classification target further down.
for frame, category in (
    (df1, 'Automotive'),
    (df2, 'Cloth_shoes'),
    (df3, 'Digi_Mus'),
    (df4, 'Electronics'),
    (df5, 'garden'),
    (df6, 'health'),
    (df7, 'home_kitchen'),
    (df8, 'kindle'),
    (df9, 'office_prod'),
    (df10, 'pet_supply'),
    (df11, 'tools'),
):
    frame['Class'] = category

In [5]:
# Keep at most the first 10000 rows of every category so that no single
# category dominates the combined data set.
(df1_split, df2_split, df3_split, df4_split, df5_split, df6_split,
 df7_split, df8_split, df9_split, df10_split, df11_split) = [
    frame[0:10000]
    for frame in (df1, df2, df3, df4, df5, df6,
                  df7, df8, df9, df10, df11)
]

In [6]:
# Stack the per-category samples into one frame and shuffle the rows.
frames = [df1_split, df2_split, df3_split, df4_split, df5_split, df6_split, df7_split, df8_split, df9_split, df10_split, df11_split]
df = pd.concat(frames)
# Seed the shuffle: without a fixed random_state the row order changes on
# every run, which makes the (seeded) train/test split below -- and every
# score derived from it -- non-reproducible.
# NOTE(review): the concatenated frame keeps each source frame's own index,
# so index labels repeat across categories; avoid label-based lookups on it.
df = shuffle(df, random_state=0)

In [7]:
# Hold out 20% of the reviews for final evaluation; the fixed seed keeps the
# partition stable for a given input row order.
tr_x, ts_x, tr_y, ts_y = train_test_split(
    df['reviewText'],
    df['Class'],
    test_size=0.2,
    random_state=0
)

In [8]:
# Wrap the held-out texts and labels in DataFrames of their own.
df_ts_x, df_ts_y = (pd.DataFrame(part) for part in (ts_x, ts_y))

# Multinomial NB

In [9]:
def _nb_pipeline(vectorizer):
    """Return a Pipeline of `vectorizer` followed by its own MultinomialNB."""
    return Pipeline([
        ('vect', vectorizer),
        ('classifier', MultinomialNB())
    ])


# Bag-of-n-grams feature extractors; each one includes all n-gram sizes from
# 1 up to its maximum, with English stop words removed.
uni_vectorizer = CountVectorizer(min_df=1, stop_words='english')
bi_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1, stop_words='english')
tri_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=1, stop_words='english')
four_vectorizer = CountVectorizer(ngram_range=(1, 4), min_df=1, stop_words='english')

# Kept so the name `cls` still exists for any later cell, but it is no longer
# placed inside the pipelines.
cls = MultinomialNB()

# Each pipeline gets a separate classifier instance. Previously all four
# shared the single `cls` object, so fitting one pipeline replaced the fitted
# state of the classifier inside the other three, leaving them paired with a
# vocabulary they were not trained on.
unigram_pred = _nb_pipeline(uni_vectorizer)
bigram_pred = _nb_pipeline(bi_vectorizer)
trigram_pred = _nb_pipeline(tri_vectorizer)
fourgram_pred = _nb_pipeline(four_vectorizer)


UNIGRAM NB

In [10]:
# Cross-validation on the training portion: 5 random 70/30 splits.
cv1 = ShuffleSplit(test_size=0.3, n_splits=5, random_state=0)
scores = cross_val_score(estimator=unigram_pred, X=tr_x, y=tr_y, cv=cv1)
print ('cross_validation scores', scores)

# Fit on the whole training portion, then score the held-out test set.
unigram_result = unigram_pred.fit(tr_x, tr_y).predict(ts_x)
print(labels1)
confusion_matrix = metrics.confusion_matrix(
    ts_y, unigram_result, labels=labels1
)
print (confusion_matrix)
score = metrics.accuracy_score(ts_y, unigram_result)
print('Accuracy score is', score)

('cross_validation scores', array([ 0.8857197 ,  0.88734848,  0.88568182,  0.88988636,  0.88594697]))
['Automotive', 'Cloth_shoes', 'Digi_Mus', 'Electronics', 'garden', 'health', 'home_kitchen', 'kindle', 'office_prod', 'pet_supply', 'tools']
[[1683   20    0   30   70   42   38    1   26   17  122]
 [   9 1888    2   21   16   19    7    7   17    7   11]
 [   0    0 1988    8    0    0    1    2    0    0    0]
 [  32   19    8 1704   12   33    8    4   96   12   43]
 [  36   11    0   12 1799   12   48    0   13   21   62]
 [  53   72    1   53   52 1565   71    5   39   44   54]
 [  18   22    1   15   39   30 1790    7   12   17   52]
 [   0    1    4   34    3    0    3 1936    5    0    0]
 [  16    9    3   60   24   15   14    3 1801    4   31]
 [  33   15    1    4   82   70   20    0   11 1738   19]
 [  78   21    1   44   53   36   17    2   50    4 1686]]
('Accuracy score is', 0.88990909090909087)


BIGRAM NB

In [11]:
# Cross-validation on the training portion: 5 random 70/30 splits.
cv1 = ShuffleSplit(test_size=0.3, n_splits=5, random_state=0)
scores = cross_val_score(estimator=bigram_pred, X=tr_x, y=tr_y, cv=cv1)
print ('cross_validation scores', scores)

# Fit on the whole training portion, then score the held-out test set.
bigram_result = bigram_pred.fit(tr_x, tr_y).predict(ts_x)
print(labels1)
confusion_matrix = metrics.confusion_matrix(
    ts_y, bigram_result, labels=labels1
)
print ('confusion matrix is', confusion_matrix)
score = metrics.accuracy_score(ts_y, bigram_result)
print('Accuracy score is', score)

('cross_validation scores', array([ 0.84643939,  0.84757576,  0.84617424,  0.84753788,  0.84598485]))
['Automotive', 'Cloth_shoes', 'Digi_Mus', 'Electronics', 'garden', 'health', 'home_kitchen', 'kindle', 'office_prod', 'pet_supply', 'tools']
('confusion matrix is', array([[1413,    5,    5,   60,  343,   21,   26,    1,   32,   12,  131],
       [   6, 1740,   16,   53,  100,   16,   12,    5,   36,    3,   17],
       [   0,    0, 1996,    3,    0,    0,    0,    0,    0,    0,    0],
       [   9,    4,   17, 1739,   67,   10,    5,    3,   86,    6,   25],
       [   9,    2,    2,    8, 1931,    4,   20,    0,    5,    3,   30],
       [  28,   37,    9,   98,  287, 1370,   56,    4,   51,   15,   54],
       [   4,    8,    6,   20,  188,   11, 1694,    9,   15,    4,   44],
       [   0,    0,   25,   29,    5,    0,    1, 1923,    3,    0,    0],
       [   3,    3,    5,   64,   62,    4,    7,    3, 1806,    2,   21],
       [  12,    4,    8,   14,  304,   46,   14,    0,   

TRIGRAM NB

In [12]:
# Cross-validation on the training portion: 5 random 70/30 splits.
cv1 = ShuffleSplit(test_size=0.3, n_splits=5, random_state=0)
scores = cross_val_score(estimator=trigram_pred, X=tr_x, y=tr_y, cv=cv1)
print ('cross_validation scores', scores)

# Fit on the whole training portion, then score the held-out test set.
trigram_result = trigram_pred.fit(tr_x, tr_y).predict(ts_x)
print(labels1)
confusion_matrix = metrics.confusion_matrix(
    ts_y, trigram_result, labels=labels1
)
print ('confusion matrix is', confusion_matrix)
score = metrics.accuracy_score(ts_y, trigram_result)
print('Accuracy score is', score)

('cross_validation scores', array([ 0.83      ,  0.83253788,  0.83162879,  0.83284091,  0.83026515]))
['Automotive', 'Cloth_shoes', 'Digi_Mus', 'Electronics', 'garden', 'health', 'home_kitchen', 'kindle', 'office_prod', 'pet_supply', 'tools']
('confusion matrix is', array([[1340,    5,    8,   59,  430,   23,   27,    0,   32,    9,  116],
       [   6, 1685,   24,   60,  141,   12,    8,    5,   45,    1,   17],
       [   0,    0, 1996,    3,    0,    0,    0,    0,    0,    0,    0],
       [   7,    3,   19, 1734,   78,    8,    3,    4,   85,    5,   25],
       [   5,    2,    3,   10, 1945,    2,   17,    0,    3,    3,   24],
       [  22,   31,   17,  107,  364, 1294,   49,    3,   53,   15,   54],
       [   5,    8,    6,   23,  227,   12, 1656,    9,   16,    3,   38],
       [   0,    0,   37,   30,    5,    0,    1, 1910,    3,    0,    0],
       [   3,    3,    6,   65,   78,    5,    7,    3, 1792,    2,   16],
       [  11,    4,    9,   18,  375,   38,   10,    0,   