In [1]:
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time

In [2]:
con=sqlite3.connect('final_db.sqlite')
data=pd.read_sql_query("""Select * from Reviews""",con)
data.shape

(364171, 12)

In [3]:
data.head(3)

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,138706,150524,6641040,ACITT7DI6IDDL,shari zychinski,0,0,positive,939340800,EVERY book is educational,this witty little book makes my son laugh at l...,witti littl book make son laugh loud recit car...
1,138688,150506,6641040,A2IW4PEEKO2R0U,Tracy,1,1,positive,1194739200,"Love the book, miss the hard cover version","I grew up reading these Sendak books, and watc...",grew read sendak book watch realli rosi movi i...
2,138689,150507,6641040,A1S4A3IQ2MU7V4,"sally sue ""sally sue""",1,1,positive,1191456000,chicken soup with rice months,This is a fun way for children to learn their ...,fun way children learn month year learn poem t...


In [4]:
columns = data.columns.values
print(data['index'][364170:])
d=np.asarray(data)
print('*******************************************')
print(d[364170,0])

364170    302474
Name: index, dtype: int64
*******************************************
302474


In [5]:
# Random Sampling of 50000 datapoints from 364171 data points
import random
n=364171
m=30000
p = m/n;

sampled_data =[];

for i in range(0,n):
    if random.random() <= p:
        sampled_data.append(d[i,:])
print(len(sampled_data))
sampled_data = np.asarray(sampled_data)
print(sampled_data.shape)

29930
(29930, 12)


In [6]:
print(columns.shape)

(12,)


In [7]:
data = pd.DataFrame(data=sampled_data,columns=columns)
data.head(3)

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,138697,150515,6641040,A2RTT81R6Y3R7X,Lindylu,0,0,positive,1303171200,One of our family's favorite books,This book is a family favorite and was read to...,book famili favorit read children small order ...
1,157836,171147,7310172001,A5F2CS558RBDA,Michael W. Riley,0,0,positive,1242259200,Good Product- VERY Slow Super Saver Shipping,My dogs LOVE this product. They beg for them. ...,dog love product beg also came recommend anim ...
2,157815,171126,7310172001,A3JDAZ5YVHY15U,Shadow and Blue,0,0,positive,1296086400,"Great Product, Great deal",My dogs love these treats and I was buying the...,dog love treat buy vet doubl price great deal ...


In [8]:
#Time based sorting
data = data.sort_values('Time',axis=0,ascending=True,inplace=False,kind='quicksort',na_position='last')
data.head(3)

Unnamed: 0,index,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
26,346028,374330,B00004CI84,AAI57M3OXP5NK,"""gibraud""",0,0,positive,1025654400,Love This Movie!,This movie is a very odd movie but I love it b...,movi odd movi love doesnt follow hollywood way...
86,137932,149700,B00006L2ZT,A19JWUIRF6DXLV,Andrew J Monzon,2,4,positive,1036800000,My favorite American Band!,I have been a huge fan of CVB ever since a fri...,huge fan cvb ever sinc friend mine loan casset...
27,346027,374329,B00004CI84,A1JZV9MCT6KOX4,"C. Eallonardo ""Kali's Copilot""",0,0,positive,1037923200,Good Tim Burton Flick,I like Tim Burton movies in general. But this...,like tim burton movi general good one your mov...


In [9]:
#Splitting the data to get train and test
from sklearn.model_selection import train_test_split
X_Train, X_test, Y_Train, Y_test = train_test_split(data['CleanedText'], data['Score'], test_size=0.3, random_state=0)
print(X_Train.shape)
print(Y_Train.shape)
print(X_test.shape)
print(Y_test.shape)

(20951,)
(20951,)
(8979,)
(8979,)


# Bag of Words

#### Uni Grams

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
X_Train_Vectors = CountVectorizer().fit_transform(X_Train.values)
print(X_Train_Vectors.shape)
X_test_Vectors = CountVectorizer().fit_transform(X_test.values)
print(X_test_Vectors.shape)
tsvd= TruncatedSVD(n_components=50,n_iter=10,random_state=0)
X_Train_Vectors = tsvd.fit_transform(X_Train_Vectors)
X_test_Vectors = tsvd.fit_transform(X_test_Vectors)
print(X_Train_Vectors.shape)
print(X_test_Vectors.shape)

(20951, 18605)
(8979, 12883)
(20951, 50)
(8979, 50)


In [11]:
print(X_Train_Vectors.shape)
print(Y_Train.shape)
print(X_test_Vectors.shape)
print(Y_test.shape)

(20951, 50)
(20951,)
(8979, 50)
(8979,)


# K-NN for Uni Grams - kd_tree

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
X_Train_Vectors = CountVectorizer().fit_transform(X_Train.values)
print(X_Train_Vectors.shape)
X_test_Vectors = CountVectorizer().fit_transform(X_test.values)
print(X_test_Vectors.shape)
tsvd= TruncatedSVD(n_components=50,n_iter=10,random_state=0)
X_Train_Vectors = tsvd.fit_transform(X_Train_Vectors.toarray())
X_test_Vectors = tsvd.fit_transform(X_test_Vectors.toarray())
print(X_Train_Vectors.shape)
print(X_test_Vectors.shape)

(20951, 18605)
(8979, 12883)
(20951, 50)
(8979, 50)


In [27]:
print(X_Train_Vectors.shape)
print(Y_Train.shape)
print(X_test_Vectors.shape)
print(Y_test.shape)

(20951, 50)
(20951,)
(8979, 50)
(8979,)


In [34]:
print(X_Train_Vectors.shape[0]//10)

2095


In [29]:
print(X_Train_Vectors[0])

[ 5.64945747  0.8437393   1.73967968 -0.69087387  0.20188122 -2.0972067
  1.97462896  1.00610092 -1.08617959 -0.02259577  0.07939133 -0.11407877
  0.9951742  -0.10122764  0.98954123 -0.78465485 -0.16177382  0.79131467
  0.91109779 -0.65557268 -0.74148424 -1.02902377  0.18104235 -1.92154698
  1.52863358 -0.65729354 -1.60842528  0.03158986 -0.31530939  0.33171852
 -0.59554004  0.39883793 -0.69313276  0.73289426 -0.9155187  -0.47576858
  0.74570942  1.88720633  0.51002068  0.30339862 -0.46912925  1.00083472
  0.98343167  0.36004021 -2.23169421  0.40788273  1.13663348 -1.83895286
  0.97824245  0.70964039]


In [48]:
#Construction of 10 fold cv
indic={}
n=X_Train_Vectors.shape[0]//10
print(n)
per=X_Train_Vectors.shape[0]%10
print(per)
indic={i:((i-1)*n,(i*n)) for i in range(1,10)}
indic[10]=(9*n,10*n+per)
print(indic)

2095
1
{1: (0, 2095), 2: (2095, 4190), 3: (4190, 6285), 4: (6285, 8380), 5: (8380, 10475), 6: (10475, 12570), 7: (12570, 14665), 8: (14665, 16760), 9: (16760, 18855), 10: (18855, 20951)}


In [61]:
#checking sample tree
from sklearn.neighbors import KDTree

tree=KDTree(X_Train_Vectors)
dist, ind =tree.query(X_Train_Vectors[0:10],k=3)
print(ind,dist)
print(tree)

[[    0  5670  9971]
 [    1   216 20467]
 [    2 11323 17155]
 [    3  9162 14034]
 [    4 13134 11163]
 [    5  9487 17555]
 [    6 20704   674]
 [    7  7727  9261]
 [    8  1661  2659]
 [    9  9173  6523]] [[ 0.          6.4727314   6.50859717]
 [ 0.          2.11925946  2.36323656]
 [ 0.          2.49817728  2.53004208]
 [ 0.          1.28846181  1.43072642]
 [ 0.          3.63163226  3.99390398]
 [ 0.          2.14594461  2.30231607]
 [ 0.          1.5097116   1.56627349]
 [ 0.          0.96433043  1.24576257]
 [ 0.          2.26580856  2.29005377]
 [ 0.          1.97094575  2.00002038]]
<sklearn.neighbors.kd_tree.KDTree object at 0x00000188F61F2FA8>


In [None]:
from sklearn.neighbors import KDTree

myList = list(range(1,50))
neighbors = list(filter(lambda x: x % 2 != 0, myList))

cv_scores = []
scores=np.array([])
X_Train_Vectors1=np.array([])
Y_Train1=np.array([])

for K in neighbors:
    for i in range(1,11):
        for a,b in indic.items():
            if(a==i):
                X_cv_dat=X_Train_Vectors[b[0]:b[1]]
                Y_cv_dat=Y_Train[b[0]:b[1]]
            else:
                if(X_Train_Vectors1.size==0):
                    X_Train_Vectors1=X_Train_Vectors[b[0]:b[1]]
                    Y_Train1=Y_Train1[b[0]:b[1]]
                else:
                    X_Train_Vectors1=np.concatenate((X_Train_Vectors1,X_Train_Vectors[b[0]:b[1]]))
                    Y_Train1=np.concatenate((Y_Train1,Y_Train[b[0]:b[1]]),axis=1)
        #For each fold find the score 
        tree = KDTree(X_Train_Vectors1)
        #Find Nearest neighbours for X_cv_dat
        dist, ind = tree.query(X_cv_dat, k=K)
        #Get the majority of nearest neighbors for each X_cv_dat and compare it with the class of Y_cv_dat
        
    
    #for each K value find the mean cv_scores and append it to cv_scores
        
        
        
        