In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from collections import Counter
import gzip
from sklearn.utils import shuffle
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score



In [3]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df1 = getDF('Automotive.json.gz')
df2 = getDF('Cloth_shoes.json.gz')
df3 = getDF('Digi_Mus.json.gz')
df4 = getDF('Electronics.json.gz')
df5 = getDF('garden.json.gz')
df6 = getDF('health.json.gz')
df7 = getDF('home_kitchen.json.gz')
df8 = getDF('kindle.json.gz')
df9 = getDF('office_prod.json.gz')
df10 = getDF('pet_supply.json.gz')
df11 = getDF('tools.json.gz')

In [4]:
labels1 = ['Automotive', 'Cloth_shoes', 'Digi_Mus', 'Electronics', 'garden', 'health', 'home_kitchen', 'kindle', 'office_prod', 'pet_supply', 'tools']

In [5]:
df1['Class'] = 'Automotive'
df2['Class'] = 'Cloth_shoes'
df3['Class'] = 'Digi_Mus'
df4['Class'] = 'Electronics'
df5['Class'] = 'garden'
df6['Class'] = 'health'
df7['Class'] = 'home_kitchen'
df8['Class'] = 'kindle'
df9['Class'] = 'office_prod'
df10['Class'] = 'pet_supply'
df11['Class'] = 'tools'

In [6]:
df1_split = df1[0:10000]
df2_split = df2[0:10000]
df3_split = df3[0:10000]
df4_split = df4[0:10000]
df5_split = df5[0:10000]
df6_split = df6[0:10000]
df7_split = df7[0:10000]
df8_split = df8[0:10000]
df9_split = df9[0:10000]
df10_split = df10[0:10000]
df11_split = df11[0:10000]

In [7]:
frames = [df1_split, df2_split, df3_split, df4_split, df5_split, df6_split, df7_split, df8_split, df9_split, df10_split, df11_split]
df = pd.concat(frames)
df = shuffle(df)

In [8]:
tr_x, ts_x, tr_y, ts_y = train_test_split(df['reviewText'], df['Class'], test_size=0.2, random_state=0)

In [9]:
df_ts_x = pd.DataFrame(ts_x)
df_ts_y = pd.DataFrame(ts_y)

# BernoulliNB

In [10]:
uni_vectorizer = CountVectorizer(min_df=1, stop_words='english')
bi_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1, stop_words='english')
tri_vectorizer = CountVectorizer(ngram_range=(1, 3), min_df=1, stop_words='english')
four_vectorizer = CountVectorizer(ngram_range=(1, 4), min_df=1, stop_words='english')



cls = BernoulliNB()

unigram_pred = Pipeline([
    ('vect', uni_vectorizer),
    ('classifier', cls)
])

bigram_pred = Pipeline([
    ('vect', bi_vectorizer),
    ('classifier', cls)
])

trigram_pred = Pipeline([
    ('vect', tri_vectorizer),
    ('classifier', cls)
])

fourgram_pred = Pipeline([
    ('vect', four_vectorizer),
    ('classifier', cls)
])


UNIGRAM NB

In [10]:
#CROSS VALIDATION
cv1 = ShuffleSplit(n_splits=5, test_size = 0.3, random_state=0)
scores = cross_val_score(unigram_pred, tr_x, tr_y, cv=cv1)
print ('cross_validation scores', scores)
#
unigram_pred.fit(tr_x, tr_y)
unigram_result = unigram_pred.predict(ts_x)
print(labels1)
confusion_matrix = metrics.confusion_matrix(ts_y, unigram_result, labels=labels1)
print (confusion_matrix)
score = metrics.accuracy_score(ts_y, unigram_result)
print('Accuracy score is', score)

('cross_validation scores', array([ 0.78352273,  0.78386364,  0.78276515,  0.77829545,  0.78193182]))
['Automotive', 'Cloth_shoes', 'Digi_Mus', 'Electronics', 'garden', 'health', 'home_kitchen', 'kindle', 'office_prod', 'pet_supply', 'tools']
[[1687  144    0   19   45   27   24    0    5   22   50]
 [   4 1893    0   20   14   10    3    1    5    1    6]
 [   7  178 1719   61    0   10    4   19    0    9    3]
 [ 183  343    3 1301    1   60    3    0   72   16   29]
 [ 171  148    0    1 1313   68   91    0    7  173   60]
 [  83  244    1   29   49 1433   54    0   12   50   17]
 [  29  156    0    2   40   49 1651    2    9   28   25]
 [  10  186    6   13    0    8    4 1754    8    4    1]
 [  42  266    2   43   17   52   12    1 1506   11   38]
 [  44  100    0    1   91   46    5    0    2 1757    5]
 [ 244  211    0   17   30   58   26    1   16   24 1342]]
('Accuracy score is', 0.78890909090909089)


BIGRAM NB

In [11]:
#CROSS VALIDATION
cv1 = ShuffleSplit(n_splits=5, test_size = 0.3, random_state=0)
scores = cross_val_score(bigram_pred, tr_x, tr_y, cv=cv1)
print ('cross_validation scores', scores)
#
bigram_pred.fit(tr_x, tr_y)
bigram_result = bigram_pred.predict(ts_x)
print(labels1)
confusion_matrix = metrics.confusion_matrix(ts_y, bigram_result, labels=labels1)
print ('confusion matrix is', confusion_matrix)
score = metrics.accuracy_score(ts_y, bigram_result)
print('Accuracy score is', score)

('cross_validation scores', array([ 0.70170455,  0.67204545,  0.63253788,  0.63575758,  0.62409091]))
['Automotive', 'Cloth_shoes', 'Digi_Mus', 'Electronics', 'garden', 'health', 'home_kitchen', 'kindle', 'office_prod', 'pet_supply', 'tools']
('confusion matrix is', array([[1468,  441,    0,    6,   10,   41,   12,    0,    5,   15,   25],
       [   0, 1939,    0,    7,    3,    2,    2,    0,    0,    0,    4],
       [   1,  489, 1480,   17,    0,   11,    1,   10,    1,    0,    0],
       [ 129,  780,    2,  975,    0,   68,    1,    0,   36,    4,   16],
       [ 157,  451,    0,    1,  911,  126,  126,    0,    4,  188,   68],
       [  41,  505,    0,    7,   13, 1342,   27,    0,    1,   22,   14],
       [  14,  410,    0,    0,    9,   51, 1480,    1,    3,    9,   14],
       [   3,  495,    0,    5,    0,    3,    4, 1481,    2,    1,    0],
       [  25,  609,    1,   24,    3,   45,   15,    1, 1244,    3,   20],
       [  20,  331,    0,    1,   16,   45,    6,    0,   

TRIGRAM NB

In [11]:
#CROSS VALIDATION
cv1 = ShuffleSplit(n_splits=5, test_size = 0.3, random_state=0)
scores = cross_val_score(trigram_pred, tr_x, tr_y, cv=cv1)
print ('cross_validation scores', scores)
#
trigram_pred.fit(tr_x, tr_y)
trigram_result = trigram_pred.predict(ts_x)
print(labels1)
confusion_matrix = metrics.confusion_matrix(ts_y, trigram_result, labels=labels1)
print ('confusion matrix is', confusion_matrix)
score = metrics.accuracy_score(ts_y, trigram_result)
print('Accuracy score is', score)

('cross_validation scores', array([ 0.4994697 ,  0.43265152,  0.50109848,  0.55121212,  0.55549242]))
['Automotive', 'Cloth_shoes', 'Digi_Mus', 'Electronics', 'garden', 'health', 'home_kitchen', 'kindle', 'office_prod', 'pet_supply', 'tools']
('confusion matrix is', array([[1177,  761,    0,    5,    2,    9,    2,    0,    0,   39,   17],
       [   0, 1954,    0,    2,    0,    0,    0,    0,    2,    0,    1],
       [   1,  846, 1130,    8,    0,    1,    0,   39,    0,    0,    0],
       [  86, 1136,    0,  716,    0,   25,    1,    0,   11,   32,   16],
       [ 148,  671,    0,    0,  566,   27,   88,    0,    0,  471,   59],
       [  29,  966,    0,    5,    6,  877,   14,    0,    0,  106,    6],
       [  14,  792,    0,    0,    2,   17, 1121,    0,    2,   48,   12],
       [   3,  774,    0,    6,    0,    2,    1, 1175,    0,    1,    0],
       [  24, 1113,    1,   31,    1,   16,   12,    1,  803,   37,   20],
       [   2,  385,    0,    0,    4,   10,    0,    0,   