In [34]:
#Basic importing

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler;
from sklearn import model_selection

data=pd.read_csv("Amazon_reviews_vectorized.csv")
data=data.drop(["Unnamed: 0","Time"],axis=1)

#upsampling
data1=data[data["Score"]==1]
data0=data[data["Score"]==0]
data0new=pd.concat([data0,data0,data0,data0,data0],ignore_index=True)

data=pd.concat([data1,data0new],ignore_index=True)

#Standardization

data_L=data["Score"]
data=data.drop("Score",axis=1)

cols=data.columns
datastd=StandardScaler().fit_transform(data)
data=pd.DataFrame(data=datastd, columns=cols)

print(data_L.head())
print(data.head())

#Breaking data
X_train,X_test,y_train,y_test=model_selection.train_test_split(data,data_L,test_size=0.1)

0    1
1    1
2    1
3    1
4    1
Name: Score, dtype: int64
          0         1         2         3         4         5         6  \
0  1.271121 -1.282555 -1.315072  1.416699 -1.343868 -1.282359  1.450958   
1 -1.255075  1.183532  1.229887 -1.273021  1.130575  1.126262 -1.385114   
2 -2.346429  2.431473  2.336246 -2.263446  2.357211  2.158313 -2.221263   
3 -1.453606  1.413520  1.453984 -1.528507  1.462599  1.214627 -1.495196   
4 -1.277107  1.144231  1.258039 -1.299785  1.212882  1.342687 -1.179163   

          7         8         9  ...        40        41        42        43  \
0 -1.151886  1.409491 -1.176996  ... -1.275696  0.947599 -1.272468 -1.309511   
1  0.657298 -1.169387  1.193175  ...  1.253955 -1.448713  1.183008  1.241729   
2  2.087082 -2.384908  2.219964  ...  2.304383 -2.012602  2.263938  2.266795   
3  1.533178 -1.484417  1.489940  ...  1.463652 -1.343887  1.459697  1.416326   
4  1.065217 -1.283690  1.263329  ...  1.221818 -0.961523  1.158105  1.291899   

       

In [35]:
#Naive Bayes(using 10-fold cv for alpha)

from sklearn import model_selection;
from sklearn import metrics;
from sklearn import naive_bayes;

#k-fold cv
alpha_values=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95,1]
cv_scores=[]

for alpha in alpha_values:
    nb_model=naive_bayes.BernoulliNB(alpha=alpha)
    scores=model_selection.cross_val_score(nb_model,X_train,y_train,cv=10,scoring="accuracy")
    cv_scores.append(scores.mean())

max_acc=cv_scores[0]
i=0
max_i=0
for acc in cv_scores:
    if(acc>max_acc):
        max_acc=acc
        max_i=i
    i=i+1
    
optimal_alpha=alpha_values[max_i]
print("Optimal value of alpha after 10-fold CV: "+str(optimal_alpha))

#Final model
nb_model=naive_bayes.BernoulliNB(alpha=optimal_alpha)
nb_model.fit(X_train,y_train)

arr=nb_model.predict(X_test)

acc_nb=metrics.accuracy_score(y_test, arr, normalize=True) * float(100)
cf_mat_NB=metrics.confusion_matrix(y_test,arr)
print("Accuracy of Naive Bayes: "+str(acc_nb))
print("Confusion matrix for Naive Bayes: \n",cf_mat_NB)

Optimal value of alpha after 10-fold CV: 0.1
Accuracy of Naive Bayes: 54.601226993865026
Confusion matrix for Naive Bayes: 
 [[45 32]
 [42 44]]


In [36]:
#Logostic Regresion(using grid search for parameter)

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#model and grid search
tuned_parameter=[ { 'C' : [10**-4,10**-2,10**0,10**2,10**4] } ]
LR_model=model_selection.GridSearchCV(LogisticRegression(solver="liblinear"),tuned_parameter,scoring='f1',cv=5)
LR_model.fit(X_train,y_train)

arr=LR_model.predict(X_test)
cf_mat_LR=metrics.confusion_matrix(y_test,arr)
acc_LR=metrics.accuracy_score(y_test,arr)*float(100)
print("The accuracy for LR: ", acc_LR)
print("The confusion_matrix for LR: \n", cf_mat_LR)

The accuracy for LR:  63.190184049079754
The confusion_matrix for LR: 
 [[45 32]
 [28 58]]


In [37]:
#Decision Tress

from sklearn import tree
from sklearn import metrics
from sklearn import model_selection

#Finding right depth
d_values=range(1,10)
cv_scores=[]

for d in d_values:
    DT=tree.DecisionTreeClassifier(max_depth=d)
    scores=model_selection.cross_val_score(DT,X_train,y_train,cv=10,scoring="accuracy")
    cv_scores.append(scores.mean())

max_acc=cv_scores[0]
i=0
max_i=0
for acc in cv_scores:
    if(acc>max_acc):
        max_acc=acc
        max_i=i
    i=i+1
    
optimal_d=d_values[max_i]
print("Optimal value of d after 10-fold CV: "+str(optimal_d))

#Model
DT_model=tree.DecisionTreeClassifier(max_depth=optimal_d)
DT_model.fit(X_train,y_train)

arr=DT_model.predict(X_test)

final_acc_DT=metrics.accuracy_score(y_test, arr, normalize=True) * float(100)
cf_mat_DT=metrics.confusion_matrix(y_test,arr)
print("Accuracy for DT: "+str(final_acc_DT))
print("Confusion matrix for DT: \n",cf_mat_DT)

Optimal value of d after 10-fold CV: 9
Accuracy for DT: 82.82208588957054
Confusion matrix for DT: 
 [[77  0]
 [28 58]]


In [38]:
#KNN(using 10-fold cv for k)

from sklearn import neighbors;
from sklearn import model_selection;
from sklearn import metrics;

#k-fold cv
k_values=range(1,50,2)
cv_scores=[]

for k in k_values:
    knn=neighbors.KNeighborsClassifier(n_neighbors=k)
    scores=model_selection.cross_val_score(knn,X_train,y_train,cv=10,scoring="accuracy")
    cv_scores.append(scores.mean())

max_acc=cv_scores[0]
i=0
max_i=0
for acc in cv_scores:
    if(acc>max_acc):
        max_acc=acc
        max_i=i
    i=i+1
    
optimal_k=k_values[max_i]
print("Optimal value of k after 10-fold CV: "+str(optimal_k))

#Final_model
knn_model=neighbors.KNeighborsClassifier(n_neighbors=optimal_k)
knn_model.fit(X_train,y_train)

arr=knn_model.predict(X_test)

final_acc_knn=metrics.accuracy_score(y_test, arr, normalize=True) * float(100)
cf_mat_knn=metrics.confusion_matrix(y_test,arr)
print("Accuracy for KNN: "+str(final_acc_knn))
print("Confusion matrix for KNN: \n",cf_mat_knn)

#KNN proved to be a better model than NB,LR,DT and notedly its ability to properly classify the negative points is great.
#It is a good candidate to be used as a first model for stacking/cascading.

Optimal value of k after 10-fold CV: 1
Accuracy for KNN: 92.63803680981594
Confusion matrix for KNN: 
 [[77  0]
 [12 74]]


In [39]:
#SVM

from sklearn.svm import SVC
from sklearn import metrics

SVM_model=SVC(C=100,kernel="poly",degree=4,gamma="auto")
SVM_model.fit(X_train,y_train)

arr=SVM_model.predict(X_test)
cf_mat_SVM=metrics.confusion_matrix(y_test,arr)
acc_SVM=metrics.accuracy_score(y_test,arr)*float(100)
print("The accuracy for SVM: ", acc_SVM)
print("The confusion_matrix for SVM: \n", cf_mat_SVM)

#Observed that Linear SVM performs similarly as LR and not much effective
##RBF Kernel offered an accuracy of 100% for the negative class points but only 75% for positive pts at C=2000(behaves like knn)
#Poly Kernel offered an ok ok accuracy for negative but dumb for positive at degree=2 and C=1000
#Poly Kernel did okay for positive but behaved terrible for negative pts at degree=4 and C=100
#No SVM model did an overall good job on both classes so we will use them in cascading/stacking in some way or other

The accuracy for SVM:  59.50920245398773
The confusion_matrix for SVM: 
 [[19 58]
 [ 8 78]]


In [40]:
#GBDT

from sklearn.ensemble import GradientBoostingClassifier as GBDT
from sklearn import metrics

GBDT_model=GBDT(loss="deviance",learning_rate=0.3,n_estimators=100)
GBDT_model.fit(X_train,y_train)

arr=GBDT_model.predict(X_test)
cf_mat_GBDT=metrics.confusion_matrix(y_test,arr)
acc_GBDT=metrics.accuracy_score(y_test,arr)*float(100)
print("The accuracy for GBDT: ", acc_GBDT)
print("The confusion_matrix for GBDT: \n", cf_mat_GBDT)

#GBDT performed reasonably well on the data
#It performs with full accuracy on negative pts

The accuracy for GBDT:  92.63803680981594
The confusion_matrix for GBDT: 
 [[77  0]
 [12 74]]


In [41]:
#Random Forests(implementing bagging)

from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

RF_model=RandomForestClassifier(n_estimators=1000,max_depth=None)
RF_model.fit(X_train,y_train)

arr=RF_model.predict(X_test)
cf_mat_RF=metrics.confusion_matrix(y_test,arr)
acc_RF=metrics.accuracy_score(y_test,arr)*float(100)
print("The accuracy for RF: ", acc_RF)
print("The confusion_matrix for RF: \n", cf_mat_RF)

#RF proved to be the best classifier till now with an accuracy of 95-98% at 1000 trees.
#It classifies all negative points correctly 

The accuracy for RF:  98.77300613496932
The confusion_matrix for RF: 
 [[77  0]
 [ 2 84]]


NOW WE WILL BE USING TECHNIQUES THAT WILL COMBINE THE ABOVE MODEL TO FORM A BEST MODEL

In [42]:
#Stacking models

from mlxtend.classifier import StackingClassifier

LR=LR_model

SVM_=SVC(C=100,kernel="poly",degree=4,gamma="auto")
GBDT_=GBDT(loss="deviance",learning_rate=0.3,n_estimators=100)
RF_=RandomForestClassifier(n_estimators=1000,max_depth=None)

Stacked_model=StackingClassifier(classifiers=[SVM_,GBDT_,RF_], meta_classifier=LR)
Stacked_model.fit(X_train,y_train)

arr=Stacked_model.predict(X_test)

cf_mat_NBLR=metrics.confusion_matrix(y_test,arr)
acc_NBLR=metrics.accuracy_score(y_test,arr)*float(100)
print("The accuracy for SVM+GBDT+RF: ", acc_NBLR)
print("The confusion_matrix for SVM+GBDT+RF: \n", cf_mat_NBLR)

#We got the best accuracy by stacking SVM, GBDT and RF with LR as meta classifier
#The accuracy was similar to random forest except a few cases.

The accuracy for SVM+GBDT+RF:  98.77300613496932
The confusion_matrix for SVM+GBDT+RF: 
 [[77  0]
 [ 2 84]]
