In [16]:
#Basic importing

import sqlite3
import pandas as pd
import numpy as np

def partition(x):
    if x<3:
        return 0
    return 1

con = sqlite3.connect('Amazon_reviews.sqlite')
data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score!=3 LIMIT 5000""", con)

actual_score=data["Score"]
positiveNegative=actual_score.map(partition)
data["Score"]=positiveNegative
data=data.drop(["UserId","ProductId","HelpfulnessNumerator","HelpfulnessDenominator","ProfileName","Summary","Time"],axis=1)

print(data.head(10))

   Id  Score                                               Text
0   1      1  I have bought several of the Vitality canned d...
1   2      0  Product arrived labeled as Jumbo Salted Peanut...
2   3      1  This is a confection that has been around a fe...
3   4      0  If you are looking for the secret ingredient i...
4   5      1  Great taffy at a great price.  There was a wid...
5   6      1  I got a wild hair for taffy and ordered this f...
6   7      1  This saltwater taffy had great flavors and was...
7   8      1  This taffy is so good.  It is very soft and ch...
8   9      1  Right now I'm mostly just sprouting this so my...
9  10      1  This is a very healthy dog food. Good for thei...


In [17]:
#Data Preprocessing

import nltk
import string
import re
from nltk.corpus import stopwords
nltk.download("stopwords")

stop = set(stopwords.words('english')) 
sno = nltk.stem.SnowballStemmer('english') 

def cleanhtml(sentence): 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def cleanpunc(sentence): 
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned

textArray=np.array(data["Text"])
cleanedTextArray=[]
for sent in textArray:
    sent=cleanhtml(sent);
    sent=cleanpunc(sent);
    sentArray=sent.split()
    cleanedSent=[]
    for word in sentArray:
        word=word.lower();
        if(word not in stop):
            word=sno.stem(word)
            cleanedSent.append(word);
    cleanedTextArray.append(cleanedSent)

data["Text"] = cleanedTextArray
print(data.head(10))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   Id  Score                                               Text
0   1      1  [bought, sever, vital, can, dog, food, product...
1   2      0  [product, arriv, label, jumbo, salt, peanut, p...
2   3      1  [confect, around, centuri, light, pillowi, cit...
3   4      0  [look, secret, ingredi, robitussin, believ, fo...
4   5      1  [great, taffi, great, price, wide, assort, yum...
5   6      1  [got, wild, hair, taffi, order, five, pound, b...
6   7      1  [saltwat, taffi, great, flavor, soft, chewi, c...
7   8      1  [taffi, good, soft, chewi, flavor, amaz, would...
8   9      1  [right, im, most, sprout, cat, eat, grass, lov...
9  10      1  [healthi, dog, food, good, digest, also, good,...


In [18]:
from sklearn import model_selection

#upsampling
data1 = data[data["Score"]==1]
data0 = data[data["Score"]==0]
print("Number of positive negative data pts: ",data1.shape[0], data0.shape[0])
data0new = pd.concat([data0,data0,data0,data0,data0,data0],ignore_index=True)

data = pd.concat([data1,data0new],ignore_index=True)
data = data.drop("Id",axis=1)
print("Number of positive negative data pts after Upsampling: ",data1.shape[0],data[data["Score"]==0].shape[0])
print(data.head(10))

#Train-Test Split
data_L= data["Score"]
data  = data.drop("Score",axis=1)
X_train,X_test,y_train,y_test = model_selection.train_test_split(data,data_L,test_size=0.1)

Number of positive negative data pts:  4187 813
Number of positive negative data pts after Upsampling:  4187 4878
   Score                                               Text
0      1  [bought, sever, vital, can, dog, food, product...
1      1  [confect, around, centuri, light, pillowi, cit...
2      1  [great, taffi, great, price, wide, assort, yum...
3      1  [got, wild, hair, taffi, order, five, pound, b...
4      1  [saltwat, taffi, great, flavor, soft, chewi, c...
5      1  [taffi, good, soft, chewi, flavor, amaz, would...
6      1  [right, im, most, sprout, cat, eat, grass, lov...
7      1  [healthi, dog, food, good, digest, also, good,...
8      1  [dont, know, cactus, tequila, uniqu, combin, i...
9      1  [one, boy, need, lose, weight, didnt, put, foo...


In [19]:
#Vectorize Text

import gensim
from tqdm import tqdm

list_of_sent = X_train["Text"]
w2v_model=gensim.models.Word2Vec(list_of_sent,min_count=5,size=50, workers=4)
w2v_words=list(w2v_model.wv.vocab)

listof_sent_vec=[]
#tqdm is for improving speed
#Vectorization and normalization both going on
for sent in tqdm(list_of_sent): 
    sent_vec = np.zeros(50) 
    cnt_words =0; 
    for word in sent: 
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    listof_sent_vec.append(sent_vec)
    
list_col=tuple(range(50))
X_train=pd.DataFrame(data=listof_sent_vec, columns=list_col)
print(X_train.head(10))
print(X_train.shape)

100%|█████████████████████████████████████████████████████████████████████████████| 8158/8158 [00:37<00:00, 219.49it/s]


         0         1         2         3         4         5         6   \
0  0.431903 -0.381995  0.378593  0.523820  0.284583 -0.207679 -0.177397   
1  0.575280 -0.680137  0.293370  0.591984  0.454178 -0.477862 -0.291293   
2  0.308826 -0.832527  0.698802  0.245989  0.033335 -0.172367 -0.668512   
3  0.339948 -0.658952  0.527192  0.199352  0.311589 -0.315483 -0.282666   
4  0.355836 -0.568258  0.296901  0.583916  0.445909 -0.350972  0.012591   
5  0.442231 -0.625233  0.235512  0.640576  0.417766 -0.269260 -0.169581   
6  0.737685 -0.599947  0.232405  0.819594  0.498369 -0.531497 -0.190363   
7  0.541054 -0.542808  0.399283  0.435575  0.424205 -0.466830 -0.274356   
8  0.497757 -0.740315  0.218907  0.108493  0.249225 -0.335615 -0.567747   
9  0.753847 -0.580651  0.176100  1.233916  0.662455 -0.949763 -0.057467   

         7         8         9   ...        40        41        42        43  \
0 -0.072036  0.247916 -0.334947  ...  0.324443  0.049323 -0.474039 -0.570390   
1  0.219156  0

In [20]:
list_of_sent = X_test["Text"]

listof_sent_vec=[]
#tqdm is for improving speed
#Vectorization and normalization both going on
for sent in tqdm(list_of_sent): 
    sent_vec = np.zeros(50) 
    cnt_words =0; 
    for word in sent: 
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    listof_sent_vec.append(sent_vec)
    
list_col=tuple(range(50))
X_test=pd.DataFrame(data=listof_sent_vec, columns=list_col)
print(X_test.head(10))
print(X_test.shape)

100%|███████████████████████████████████████████████████████████████████████████████| 907/907 [00:03<00:00, 232.41it/s]


         0         1         2         3         4         5         6   \
0  0.031603 -0.847403  0.133999  0.892748  0.365950 -0.633642  0.194094   
1  0.270566 -0.520761  0.296037  0.781815  0.276381 -0.426343 -0.254261   
2  0.331553 -0.546859  0.380111  0.309448  0.256770 -0.313957 -0.288418   
3  0.479919 -0.562102  0.532904  0.315516  0.244812 -0.287422 -0.343223   
4  0.342021 -0.731478  0.610917  0.290220  0.231881 -0.127621 -0.360007   
5  0.239157 -0.626500  0.319459  0.144555  0.265537 -0.271952 -0.294281   
6  1.119164 -0.463757  0.282506  0.520515  0.453185  0.015696 -0.402799   
7  0.642823 -0.454932  0.495139  0.344915  0.325437 -0.133436 -0.354844   
8  0.051358 -0.654712  0.681238  0.581920  0.089366 -0.078336 -0.125406   
9  0.304208 -0.483646  0.337412  0.288739  0.278590 -0.229991 -0.230316   

         7         8         9   ...        40        41        42        43  \
0  0.378647  0.039353 -0.193295  ... -0.103244  0.205858 -0.718903 -0.672153   
1  0.182156 -0

In [22]:
#Standardization
from sklearn.preprocessing import StandardScaler;

scaler = StandardScaler()
X_train= scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print(X_train[0])
print(X_test[0])

[ 0.01897236  1.35352734  0.02189467  0.03848828 -0.25931455  0.65818892
  0.27816396 -0.88346469 -0.19936649 -0.47473253 -0.17035279  0.98752834
 -0.53518416  0.77356854  0.01376524 -0.13505271  0.25711098  0.55351811
  0.07613625 -0.77118451  0.46461331  0.43185612 -0.05275839  0.00925503
 -0.61257239  0.39285602 -0.54316702  0.40757853 -0.72047777 -0.23868149
 -0.06038479 -0.45479151 -0.32938002 -0.39707486  0.44504935  0.57017544
  0.09032992 -0.66409583  0.73949395 -0.26850521  0.13615461  0.3389129
  0.1257821  -0.10969215  0.22905093 -0.69413646  0.59064034  0.61720237
  0.23423148 -0.01190825]
[-1.99898498 -1.90712679 -1.39671151  1.30870904  0.2236665  -1.55924123
  2.3988228   0.6414891  -1.09292678  0.13279559  1.52709884 -1.32633582
  2.7139989   0.41771387 -0.57255367  1.21710168  0.35531865 -1.46169719
  2.66288363  0.27392521  1.13362176 -1.2191306  -0.64300055  0.84897975
 -0.6699157  -0.25853123  1.62421362 -1.16586129  0.6693948   2.63686826
  1.2818412  -0.56842676  

In [23]:
#Naive Bayes(using 10-fold cv for alpha)

from sklearn import metrics;
from sklearn import naive_bayes;

#k-fold cv
alpha_values=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95,1]
cv_scores=[]

for alpha in alpha_values:
    nb_model=naive_bayes.BernoulliNB(alpha=alpha)
    scores=model_selection.cross_val_score(nb_model,X_train,y_train,cv=10,scoring="accuracy")
    cv_scores.append(scores.mean())

max_acc=cv_scores[0]
i=0
max_i=0
for acc in cv_scores:
    if(acc>max_acc):
        max_acc=acc
        max_i=i
    i=i+1
    
optimal_alpha=alpha_values[max_i]
print("Optimal value of alpha after 10-fold CV: "+str(optimal_alpha))

#Final model
nb_model=naive_bayes.BernoulliNB(alpha=optimal_alpha)
nb_model.fit(X_train,y_train)

arr=nb_model.predict(X_test)

acc_nb=metrics.accuracy_score(y_test, arr, normalize=True) * float(100)
cf_mat_NB=metrics.confusion_matrix(y_test,arr)
print("Accuracy of Naive Bayes: "+str(acc_nb))
print("Confusion matrix for Naive Bayes: \n",cf_mat_NB)

Optimal value of alpha after 10-fold CV: 0.1
Accuracy of Naive Bayes: 65.38037486218302
Confusion matrix for Naive Bayes: 
 [[307 152]
 [162 286]]


In [24]:
#Logostic Regresion(using grid search for parameter)

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#model and grid search
tuned_parameter=[ { 'C' : [10**-4,10**-2,10**0,10**2,10**4] } ]
LR_model=model_selection.GridSearchCV(LogisticRegression(solver="liblinear"),tuned_parameter,scoring='f1',cv=5)
LR_model.fit(X_train,y_train)

arr=LR_model.predict(X_test)
cf_mat_LR=metrics.confusion_matrix(y_test,arr)
acc_LR=metrics.accuracy_score(y_test,arr)*float(100)
print("The accuracy for LR: ", acc_LR)
print("The confusion_matrix for LR: \n", cf_mat_LR)

The accuracy for LR:  79.27232635060639
The confusion_matrix for LR: 
 [[392  67]
 [121 327]]


In [26]:
#Decision Tress

from sklearn import tree
from sklearn import metrics
from sklearn import model_selection

#Finding right depth
d_values=range(1,10)
cv_scores=[]

for d in d_values:
    DT=tree.DecisionTreeClassifier(max_depth=d)
    scores=model_selection.cross_val_score(DT,X_train,y_train,cv=10,scoring="accuracy")
    cv_scores.append(scores.mean())

max_acc=cv_scores[0]
i=0
max_i=0
for acc in cv_scores:
    if(acc>max_acc):
        max_acc=acc
        max_i=i
    i=i+1
    
optimal_d=d_values[max_i]
print("Optimal value of d after 10-fold CV: "+str(optimal_d))

#Model
DT_model=tree.DecisionTreeClassifier(max_depth=optimal_d)
DT_model.fit(X_train,y_train)

arr=DT_model.predict(X_test)

final_acc_DT=metrics.accuracy_score(y_test, arr, normalize=True) * float(100)
cf_mat_DT=metrics.confusion_matrix(y_test,arr)
print("Accuracy for DT: "+str(final_acc_DT))
print("Confusion matrix for DT: \n",cf_mat_DT)

Optimal value of d after 10-fold CV: 9
Accuracy for DT: 84.6747519294377
Confusion matrix for DT: 
 [[426  33]
 [106 342]]


In [27]:
#KNN(using 10-fold cv for k)

from sklearn import neighbors;
from sklearn import model_selection;
from sklearn import metrics;

#k-fold cv
k_values=range(1,50,2)
cv_scores=[]

for k in k_values:
    knn=neighbors.KNeighborsClassifier(n_neighbors=k)
    scores=model_selection.cross_val_score(knn,X_train,y_train,cv=10,scoring="accuracy")
    cv_scores.append(scores.mean())

max_acc=cv_scores[0]
i=0
max_i=0
for acc in cv_scores:
    if(acc>max_acc):
        max_acc=acc
        max_i=i
    i=i+1
    
optimal_k=k_values[max_i]
print("Optimal value of k after 10-fold CV: "+str(optimal_k))

#Final_model
knn_model=neighbors.KNeighborsClassifier(n_neighbors=optimal_k)
knn_model.fit(X_train,y_train)

arr=knn_model.predict(X_test)

final_acc_knn=metrics.accuracy_score(y_test, arr, normalize=True) * float(100)
cf_mat_knn=metrics.confusion_matrix(y_test,arr)
print("Accuracy for KNN: "+str(final_acc_knn))
print("Confusion matrix for KNN: \n",cf_mat_knn)

#KNN proved to be a better model than NB,LR,DT and notedly its ability to properly classify the positive points is great.
#It is a good candidate to be used as a first model for stacking/cascading.

Optimal value of k after 10-fold CV: 1
Accuracy for KNN: 95.25909592061743
Confusion matrix for KNN: 
 [[459   0]
 [ 43 405]]


In [28]:
#SVM

from sklearn.svm import SVC
from sklearn import metrics

SVM_model=SVC(C=1000,kernel="rbf",gamma="auto")
SVM_model.fit(X_train,y_train)

arr=SVM_model.predict(X_test)
cf_mat_SVM=metrics.confusion_matrix(y_test,arr)
acc_SVM=metrics.accuracy_score(y_test,arr)*float(100)
print("The accuracy for SVM: ", acc_SVM)
print("The confusion_matrix for SVM: \n", cf_mat_SVM)

#Observed that Linear SVM performs similarly as LR and not much effective
##RBF Kernel offered an accuracy of 92% at C=1000(behaves like knn)
#Poly Kernel had 77% accuracy and looked not so good for negative pts at degree=4 and C=100

The accuracy for SVM:  94.8180815876516
The confusion_matrix for SVM: 
 [[459   0]
 [ 47 401]]


In [29]:
#GBDT

from sklearn.ensemble import GradientBoostingClassifier as GBDT
from sklearn import metrics

GBDT_model=GBDT(loss="deviance",learning_rate=0.3,n_estimators=100)
GBDT_model.fit(X_train,y_train)

arr=GBDT_model.predict(X_test)
cf_mat_GBDT=metrics.confusion_matrix(y_test,arr)
acc_GBDT=metrics.accuracy_score(y_test,arr)*float(100)
print("The accuracy for GBDT: ", acc_GBDT)
print("The confusion_matrix for GBDT: \n", cf_mat_GBDT)

#GBDT performed reasonably well on the data

The accuracy for GBDT:  89.41565600882029
The confusion_matrix for GBDT: 
 [[432  27]
 [ 69 379]]


In [30]:
#Random Forests(implementing bagging)

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

RF_model=RandomForestClassifier(n_estimators=1000,max_depth=None)
RF_model.fit(X_train,y_train)

arr=RF_model.predict(X_test)
cf_mat_RF=metrics.confusion_matrix(y_test,arr)
acc_RF=metrics.accuracy_score(y_test,arr)*float(100)
print("The accuracy for RF: ", acc_RF)
print("The confusion_matrix for RF: \n", cf_mat_RF)

#RF proved to be the best classifier till now with an accuracy of 98% at 1000 trees.
#It classifies all positive points correctly 

The accuracy for RF:  98.34619625137817
The confusion_matrix for RF: 
 [[459   0]
 [ 15 433]]
