In [35]:
import warnings
warnings.filterwarnings("ignore")


import sqlite3
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns


In [4]:
# Report the matplotlib version from the running kernel itself. The shell
# pipeline it replaces depended on the Windows-only `findstr` command and on
# `pip` resolving to the same environment as the kernel.
print("matplotlib", matplotlib.__version__)

matplotlib                         2.2.2    


You are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


# Reading Data

In [6]:

import os

# Location of the reviews SQLite database. The original hard-coded path stays
# the default; set the AMAZON_REVIEWS_DB environment variable to point
# somewhere else without editing the notebook.
DB_PATH = os.environ.get(
    'AMAZON_REVIEWS_DB',
    'D:/datasets/Amazon_Fine_food_Reviews/amazon-fine-food-reviews/database.sqlite',
)

# sqlite3.connect() creates a brand-new empty database when the file is
# missing, which only surfaces later as "no such table: Reviews". Fail early
# with a clear message instead.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError("SQLite database not found: " + DB_PATH)

conn = sqlite3.connect(DB_PATH)

# Neutral reviews (Score == 3) are excluded. There is no ORDER BY, so which
# 100000 rows are returned depends on the order SQLite scans the table.
filtered_data = pd.read_sql_query(" SELECT * FROM Reviews WHERE Score != 3 LIMIT 100000", conn)
def rating(x):
    """Turn a star score into a binary sentiment label.

    Returns 0 (negative) when ``x`` is below 3 and 1 (positive) for every
    other value. A score of exactly 3 would also map to 1, but such rows are
    already excluded by the ``Score != 3`` filter in the loading query.
    """
    return 0 if x < 3 else 1

# Replace the star rating with a binary label via rating():
# scores below 3 become 0 (negative), everything else becomes 1 (positive).
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(rating)
filtered_data['Score'] = positiveNegative

filtered_shape = filtered_data.shape
print("Number of data points in our data :", filtered_shape)
filtered_data.head(3)

Number of data points in our data : (100000, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [7]:
# One row per UserId over the whole Reviews table, restricted by the HAVING
# clause to users who wrote more than one review. COUNT(*) is that user's
# review count; the other selected columns are not aggregated, so each comes
# from a single row of the group picked by SQLite.
multi_review_users_sql = """SELECT UserId, ProductId, ProfileName, Time, Score, Text, COUNT(*) FROM Reviews
GROUP BY UserId
HAVING COUNT(*)>1
"""
display_unique_users = pd.read_sql_query(multi_review_users_sql, conn)

In [8]:
# Rows = users with more than one review, columns = the 7 selected fields.
unique_users_shape = display_unique_users.shape
print('Unique users:', unique_users_shape)
display_unique_users.head(4)

Unique users: (80668, 7)


Unnamed: 0,UserId,ProductId,ProfileName,Time,Score,Text,COUNT(*)
0,#oc-R115TNMSPFT9I7,B007Y59HVM,Breyton,1331510400,2,Overall its just OK when considering the price...,2
1,#oc-R11D9D7SHXIJB9,B005HG9ET0,"Louis E. Emory ""hoppy""",1342396800,5,"My wife has recurring extreme muscle spasms, u...",3
2,#oc-R11DNU2NBKQ23Z,B007Y59HVM,Kim Cieszykowski,1348531200,1,This coffee is horrible and unfortunately not ...,2
3,#oc-R11O5J5ZVQE25C,B005HG9ET0,Penguin Chick,1346889600,5,This will be the bottle that you grab from the...,3


In [9]:
# Sum of per-user review counts. display_unique_users only holds users with
# more than one review, so single-review users are not part of this total.
review_total = display_unique_users['COUNT(*)'].sum()
print('Total No. of Reviews:', review_total)

Total No. of Reviews: 393063


# Exploratory Data Analysis

# Data Cleaning

In [10]:
# Show the non-neutral reviews of one user to illustrate the duplicate
# problem. The user id is bound as a query parameter: the original wrote it
# in double quotes, which SQL reserves for identifiers and SQLite accepts as
# a string only as a legacy fallback.
display_duplicate_reviews = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND UserId = ?
ORDER BY ProductID
""", conn, params=("AR5J8UI46CURR",))
display_duplicate_reviews.head(6)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


It is observed (as shown in the table above) that the reviews data had many duplicate entries.
Hence it was necessary to remove duplicates in order to get unbiased results for the analysis of the data

In [11]:
# Order the reviews by ProductId (ascending, NaNs last, new frame returned).
# These are the sort_values defaults, so they are no longer spelled out.
sorted_data = filtered_data.sort_values(by='ProductId')


In [12]:
# Rows that agree on all of these columns are treated as one review posted
# under several products; only the first occurrence (lowest ProductId after
# the sort above) is kept. `subset` is given as a list because the documented
# type is a label or a sequence of labels, which a set is not.
DUPLICATE_KEY_COLUMNS = ["UserId", "ProfileName", "Time", "Text"]
final_data = sorted_data.drop_duplicates(subset=DUPLICATE_KEY_COLUMNS, keep='first')
print('final_data after removing duplicated row entries:', final_data.shape)

final_data after removing duplicated row entries: (87775, 10)


In [13]:
# Rows where HelpfulnessNumerator exceeds HelpfulnessDenominator, taken from
# the whole Reviews table.
corrupted_helpfulness_sql = """
SELECT *
FROM Reviews
WHERE HelpfulnessNumerator > HelpfulnessDenominator 
ORDER BY ProductID
"""
display_HelpfulnessNumerator_corrupted_reviews = pd.read_sql_query(corrupted_helpfulness_sql, conn)
display_HelpfulnessNumerator_corrupted_reviews.head(6)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


`HelpfulnessNumerator > HelpfulnessDenominator` is not possible in practice, so these two rows are also removed from the calculations.

In [14]:
# Keep only rows whose helpfulness numerator does not exceed the denominator.
helpfulness_is_valid = final_data['HelpfulnessNumerator'] <= final_data['HelpfulnessDenominator']
final_data = final_data[helpfulness_is_valid]
print('final_data after removing corrupted HelpfulnessNumerator:', final_data.shape)

final_data after removing corrupted HelpfulnessNumerator: (87773, 10)


In [15]:
# Class balance after cleaning: number of rows per binarised Score label
# (1 = positive, 0 = negative, as assigned by rating()).
final_data['Score'].value_counts()

1    73592
0    14181
Name: Score, dtype: int64

# PreProcessing review text

1.Begin by removing the html tags

2.Remove any punctuations or limited set of special characters like , or . or # etc.

3.Check if the word is made up of english letters and is not alpha-numeric

4.Check to see if the length of the word is greater than 2 (as it was found that there are no two-letter adjectives)

5.Convert the word to lowercase

6.Remove Stopwords

7.Finally, apply Snowball stemming to the word (it was observed to be better than Porter stemming)


In [16]:
# Pick two sample reviews (positions 0 and 100) to walk through the
# preprocessing steps one at a time.
review_texts = final_data['Text'].values
text_0 = review_texts[0]
text_100 = review_texts[100]
for sample in (text_0, text_100):
    print(sample)
    print('_'*50)

My dogs loves this chicken but its a product from China, so we wont be buying it anymore.  Its very hard to find any chicken products made in the USA but they are out there, but this one isnt.  Its too bad too because its a good product but I wont take any chances till they know what is going on with the china imports.
__________________________________________________
My Frenchbull is only given nylabone's to chew. He has had them since he was 7 weeks old. They are safe for him because he has a strong bite and they don't break off in large pieces that he could choke on. The Dinosaur Chew is perfect because it has so many places to hold and bite.  Dylabone is the only product I buy.
__________________________________________________


In [17]:
import re

# remove urls from text python: https://stackoverflow.com/a/40823105/4084039
# Anything starting with "http" up to the next whitespace is dropped.
text_0, text_100 = (re.sub(r"http\S+", "", sample) for sample in (text_0, text_100))

print(text_0, '_'*50, text_100, sep='\n')

My dogs loves this chicken but its a product from China, so we wont be buying it anymore.  Its very hard to find any chicken products made in the USA but they are out there, but this one isnt.  Its too bad too because its a good product but I wont take any chances till they know what is going on with the china imports.
__________________________________________________
My Frenchbull is only given nylabone's to chew. He has had them since he was 7 weeks old. They are safe for him because he has a strong bite and they don't break off in large pieces that he could choke on. The Dinosaur Chew is perfect because it has so many places to hold and bite.  Dylabone is the only product I buy.


In [18]:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/
# https://stackoverflow.com/questions/16206380/python-beautifulsoup-how-to-remove-all-tags-from-an-element
from bs4 import BeautifulSoup

# Parse each sample with the lxml parser and keep only its text content.
for sample in (text_0, text_100):
    soup = BeautifulSoup(sample, 'lxml')
    text = soup.get_text()
    print(text)
    print("="*50)

My dogs loves this chicken but its a product from China, so we wont be buying it anymore.  Its very hard to find any chicken products made in the USA but they are out there, but this one isnt.  Its too bad too because its a good product but I wont take any chances till they know what is going on with the china imports.
My Frenchbull is only given nylabone's to chew. He has had them since he was 7 weeks old. They are safe for him because he has a strong bite and they don't break off in large pieces that he could choke on. The Dinosaur Chew is perfect because it has so many places to hold and bite.  Dylabone is the only product I buy.


In [19]:
# https://stackoverflow.com/a/47091490/4084039

def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The irregular forms "won't" and "can't" are expanded first, then the
    generic suffixes ("n't", "'re", "'s", "'d", "'ll", "'t", "'ve", "'m").
    Matching is case-insensitive, so "Won't" becomes "Will not" (instead of
    "Wo not") and "DON'T" becomes "DO not" (instead of being left unexpanded).

    Note that every "'s" is rewritten to " is", which also affects
    possessives ("dog's bowl" -> "dog is bowl").

    Args:
        phrase: text to expand.

    Returns:
        The text with the contractions above expanded.
    """
    def _keep_capital(expansion):
        # Build a re.sub callback that capitalises the expansion when the
        # matched contraction itself started with a capital letter.
        def _replace(match):
            if match.group(0)[0].isupper():
                return expansion.capitalize()
            return expansion
        return _replace

    # specific: must run before the general "n't" rule, which would otherwise
    # leave "wo not" / "ca not" behind
    phrase = re.sub(r"won't", _keep_capital("will not"), phrase, flags=re.IGNORECASE)
    phrase = re.sub(r"can't", _keep_capital("can not"), phrase, flags=re.IGNORECASE)

    # general: applied in this order, "n't" before the bare "'t"
    general_rules = (
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for suffix, expansion in general_rules:
        phrase = re.sub(suffix, expansion, phrase, flags=re.IGNORECASE)
    return phrase

# Expand the contractions in the sample review and show the result.
text_100 = decontracted(text_100)
print(text_100)
print("="*50)

My Frenchbull is only given nylabone is to chew. He has had them since he was 7 weeks old. They are safe for him because he has a strong bite and they do not break off in large pieces that he could choke on. The Dinosaur Chew is perfect because it has so many places to hold and bite.  Dylabone is the only product I buy.


In [20]:
#remove words with numbers python: https://stackoverflow.com/a/18082370/4084039
# Raw string: in a normal string literal "\S" and "\d" are invalid escape
# sequences, which newer Python versions warn about.
text_100 = re.sub(r"\S*\d\S*", "", text_100).strip()
print(text_100)

My Frenchbull is only given nylabone is to chew. He has had them since he was  weeks old. They are safe for him because he has a strong bite and they do not break off in large pieces that he could choke on. The Dinosaur Chew is perfect because it has so many places to hold and bite.  Dylabone is the only product I buy.


In [21]:
# remove special characters: https://stackoverflow.com/a/5843547/4084039
# Every run of characters other than ASCII letters and digits becomes one space.
text_100 = re.sub('[^A-Za-z0-9]+', ' ', text_100)
print(text_100)

My Frenchbull is only given nylabone is to chew He has had them since he was weeks old They are safe for him because he has a strong bite and they do not break off in large pieces that he could choke on The Dinosaur Chew is perfect because it has so many places to hold and bite Dylabone is the only product I buy 


In [22]:
# Words dropped from every review during preprocessing. 'br' is included
# alongside the English function words; the negations 'no', 'nor' and 'not'
# are absent from the set, so they survive stop-word removal.
stopwords = {
    'br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
    "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
    'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
    'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',
    'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
    'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
    'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're',
    've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',
    "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
    "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
}

In [23]:
# Pre processing all reviews
from tqdm import tqdm

# Patterns are compiled once, outside the loop. Raw strings are used because
# "\S" and "\d" are invalid escape sequences in a normal string literal.
url_re = re.compile(r"http\S+")            # URLs
digit_word_re = re.compile(r"\S*\d\S*")    # any token containing a digit
non_alpha_re = re.compile('[^A-Za-z]+')    # runs of non-letters

preprocessed_reviews = []
# tqdm is for printing the status bar
for review_text in tqdm(final_data['Text'].values):
    review_text = url_re.sub("", review_text)
    review_text = BeautifulSoup(review_text, 'lxml').get_text()  # drop HTML tags
    review_text = decontracted(review_text)
    review_text = digit_word_re.sub("", review_text).strip()
    review_text = non_alpha_re.sub(' ', review_text)
    # https://gist.github.com/sebleier/554280
    # Only ASCII letters and spaces remain here, so lower-casing before the
    # split gives the same tokens as lower-casing each token afterwards.
    kept_words = [word for word in review_text.lower().split() if word not in stopwords]
    preprocessed_reviews.append(' '.join(kept_words))

100%|██████████████████████████████████████████████████████████████████████████| 87773/87773 [00:48<00:00, 1800.58it/s]


In [24]:
warnings.filterwarnings("ignore")
# Pre processing all review summary
from tqdm import tqdm

# Patterns are compiled once, outside the loop. Raw strings are used because
# "\S" and "\d" are invalid escape sequences in a normal string literal.
url_re = re.compile(r"http\S+")            # URLs
digit_word_re = re.compile(r"\S*\d\S*")    # any token containing a digit
non_alpha_re = re.compile('[^A-Za-z]+')    # runs of non-letters

preprocessed_summary = []
# tqdm is for printing the status bar
for summary_text in tqdm(final_data['Summary'].values):
    summary_text = url_re.sub("", summary_text)
    summary_text = BeautifulSoup(summary_text, 'lxml').get_text()  # drop HTML tags
    summary_text = decontracted(summary_text)
    summary_text = digit_word_re.sub("", summary_text).strip()
    summary_text = non_alpha_re.sub(' ', summary_text)
    # https://gist.github.com/sebleier/554280
    # Only ASCII letters and spaces remain here, so lower-casing before the
    # split gives the same tokens as lower-casing each token afterwards.
    kept_words = [word for word in summary_text.lower().split() if word not in stopwords]
    preprocessed_summary.append(' '.join(kept_words))

100%|██████████████████████████████████████████████████████████████████████████| 87773/87773 [00:37<00:00, 2358.54it/s]


# Featurization

# BOW

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the unigram vocabulary from the cleaned reviews, then encode every
# review as a row of term counts.
CountVect = CountVectorizer()
CountVect.fit(preprocessed_reviews)
leading_features = CountVect.get_feature_names()[:10]
print('first 10 features:', leading_features)

trans_preprocessed_reviews = CountVect.transform(preprocessed_reviews)
bow_shape = trans_preprocessed_reviews.shape
print(bow_shape)
print('unique words in all reviews:', bow_shape[1])

first 10 features: ['aa', 'aaa', 'aaaa', 'aaaaa', 'aaaaaaaaaaaa', 'aaaaaaaaaaaaaaa', 'aaaaaaahhhhhh', 'aaaaaaarrrrrggghhh', 'aaaaaawwwwwwwwww', 'aaaaah']
(87773, 54904)
unique words in all reviews: 54904


# BI-Grams,n-grams

In [26]:
#bi-gram, tri-gram and n-gram

# removing stop words like "not" should be avoided before building n-grams
# count_vect = CountVectorizer(ngram_range=(1,2))
# please do read the CountVectorizer documentation http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

# min_df=10 and max_features=5000 are free choices; tune them as needed
count_vect = CountVectorizer(
    ngram_range=(1, 2),
    min_df=10,
    max_features=5000,
)
final_bigram_counts = count_vect.fit_transform(preprocessed_reviews)

bigram_shape = final_bigram_counts.shape
print("the type of count vectorizer ", type(final_bigram_counts))
print("the shape of out text BOW vectorizer ", bigram_shape)
print("the number of unique words including both unigrams and bigrams ", bigram_shape[1])

the type of count vectorizer  <class 'scipy.sparse.csr.csr_matrix'>
the shape of out text BOW vectorizer  (87773, 5000)
the number of unique words including both unigrams and bigrams  5000


# Term Frequency-Inverse Document Frequency

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams that occur in at least 10 reviews.
tf_idf = TfidfVectorizer(ngram_range=(1, 2), min_df=10)
tf_idf.fit(preprocessed_reviews)
leading_tfidf_features = tf_idf.get_feature_names()[:10]
print('first 10 features:', leading_tfidf_features)

fianl_tfidf = tf_idf.transform(preprocessed_reviews)
tfidf_shape = fianl_tfidf.shape
print(tfidf_shape)
print('unique words in all reviews:', tfidf_shape[1])


first 10 features: ['aa', 'aafco', 'aback', 'abandon', 'abandoned', 'abdominal', 'ability', 'able', 'able add', 'able brew']
(87773, 51709)
unique words in all reviews: 51709


# Word2Vec

In [28]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
# Train your own Word2Vec model using your own text corpus
# Each cleaned review becomes a list of its whitespace-separated tokens.
list_of_sents = [review.split() for review in preprocessed_reviews]
print(list_of_sents[0:10])

[['dogs', 'loves', 'chicken', 'product', 'china', 'wont', 'buying', 'anymore', 'hard', 'find', 'chicken', 'products', 'made', 'usa', 'one', 'isnt', 'bad', 'good', 'product', 'wont', 'take', 'chances', 'till', 'know', 'going', 'china', 'imports'], ['dogs', 'love', 'saw', 'pet', 'store', 'tag', 'attached', 'regarding', 'made', 'china', 'satisfied', 'safe'], ['infestation', 'fruitflies', 'literally', 'everywhere', 'flying', 'around', 'kitchen', 'bought', 'product', 'hoping', 'least', 'get', 'rid', 'weeks', 'fly', 'stuck', 'going', 'around', 'notepad', 'squishing', 'buggers', 'success', 'rate', 'day', 'clearly', 'product', 'useless', 'even', 'dabbed', 'red', 'wine', 'banana', 'top', 'column', 'week', 'really', 'attracted', 'red', 'wine', 'glass', 'still', 'nothing', 'get', 'stuck', 'actually', 'saw', 'second', 'fly', 'land', 'watched', 'flapped', 'wings', 'frantically', 'within', 'secs', 'unstuck', 'product', 'total', 'waste', 'money'], ['worst', 'product', 'gotten', 'long', 'time', 'would

In [29]:
# Train Word2Vec on the tokenised reviews: words seen fewer than 5 times are
# ignored and each word is embedded in 50 dimensions.
w2v_model = Word2Vec(
    list_of_sents,
    min_count=5,
    size=50,
    workers=4,
)

# Sanity check: nearest neighbours of one positive and one negative word.
word_vectors = w2v_model.wv
print(word_vectors.most_similar('great'))
print('='*50)
print(word_vectors.most_similar('worst'))
 

[('fantastic', 0.8487455248832703), ('excellent', 0.8304535150527954), ('awesome', 0.8162674903869629), ('terrific', 0.7988094091415405), ('wonderful', 0.7926963567733765), ('good', 0.7916631698608398), ('perfect', 0.7444168925285339), ('amazing', 0.731562614440918), ('fabulous', 0.6899922490119934), ('nice', 0.6759129762649536)]
[('greatest', 0.7658005952835083), ('tastiest', 0.7239884734153748), ('best', 0.7095773220062256), ('nastiest', 0.7017833590507507), ('nicest', 0.6796818971633911), ('coolest', 0.6442915201187134), ('disgusting', 0.6400687098503113), ('surpass', 0.638436496257782), ('wins', 0.6310152411460876), ('hottest', 0.6301690936088562)]


In [30]:
# Vocabulary of the trained model, i.e. the words that passed min_count=5.
w2v_words = list(w2v_model.wv.vocab)
vocab_size = len(w2v_words)
print("number of words that occured minimum 5 times ", vocab_size)
print("sample words ", w2v_words[:50])

number of words that occured minimum 5 times  17386
sample words  ['dogs', 'loves', 'chicken', 'product', 'china', 'wont', 'buying', 'anymore', 'hard', 'find', 'products', 'made', 'usa', 'one', 'isnt', 'bad', 'good', 'take', 'chances', 'till', 'know', 'going', 'imports', 'love', 'saw', 'pet', 'store', 'tag', 'attached', 'regarding', 'satisfied', 'safe', 'infestation', 'literally', 'everywhere', 'flying', 'around', 'kitchen', 'bought', 'hoping', 'least', 'get', 'rid', 'weeks', 'fly', 'stuck', 'squishing', 'buggers', 'success', 'rate']


# AVG Word2Vec

In [31]:
# average Word2Vec
# compute average word2vec for each review.

# w2v_words is a list, so `word in w2v_words` scans the whole vocabulary for
# every token. A set gives the same answer in constant time.
w2v_vocab = set(w2v_words)
# Take the vector length from the model instead of hard-coding 50.
vector_dim = w2v_model.vector_size

sent_vectors = []  # the avg-w2v for each sentence/review is stored in this list
for sent in tqdm(list_of_sents):  # for each review/sentence
    sent_vec = np.zeros(vector_dim)  # running sum of the word vectors
    cnt_words = 0  # num of words with a valid vector in the sentence/review
    for word in sent:  # for each word in a review/sentence
        if word in w2v_vocab:
            sent_vec += w2v_model.wv[word]
            cnt_words += 1
    # A review with no in-vocabulary word keeps the all-zero vector.
    if cnt_words != 0:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))
print(sent_vectors[0])

100%|███████████████████████████████████████████████████████████████████████████| 87773/87773 [05:12<00:00, 280.57it/s]


87773
50
[ 0.12575598  0.46159474 -0.09309819  1.37069529 -0.49098364 -0.87857422
 -0.46466648  0.30025681  0.26689559 -0.43047276  0.21443228  0.44285296
  0.45005171 -0.06647766  0.81688548 -0.31373119  0.46187448  0.09634926
 -0.83351827 -0.20564972  0.24990304 -0.77213698  0.11448248  0.30530376
 -0.59726662 -0.28261033  0.03641088  0.85926547 -0.73224569  0.24772421
  0.1414935   0.27890404  0.0173071   0.09323656  0.26079548 -0.33653816
  0.67106487 -0.80707854 -0.01311716  0.48671423  0.14546982  0.57295235
  0.66897004 -0.01995566  0.15799741  0.12079119 -0.69715988 -0.243065
 -0.48326728 -0.54766804]


# tf-idf weighted word2vec

In [32]:
# S = ["abc def pqr", "def def def abc", "pqr pqr def"]
model = TfidfVectorizer()
tf_idf_matrix = model.fit_transform(preprocessed_reviews)
# Lookup table: vocabulary word -> its idf value
dictionary = {term: idf for term, idf in zip(model.get_feature_names(), model.idf_)}

In [33]:
# TF-IDF weighted Word2Vec
from collections import Counter

tfidf_feat = model.get_feature_names() # tfidf words/col-names

# Constant-time membership tests. `dictionary` is keyed by the same feature
# names as tfidf_feat, so `word in dictionary` is equivalent to
# `word in tfidf_feat` without scanning a list for every token.
w2v_vocab = set(w2v_words)
vector_dim = w2v_model.vector_size

tfidf_sent_vectors = []  # the tfidf-w2v for each sentence/review is stored in this list
for sent in tqdm(list_of_sents):  # for each review/sentence
    sent_vec = np.zeros(vector_dim)  # running weighted sum of the word vectors
    weight_sum = 0  # sum of the tf-idf weights used in this review
    term_counts = Counter(sent)  # counted once instead of sent.count() per word
    for word in sent:  # for each word in a review/sentence
        if word in w2v_vocab and word in dictionary:
            vec = w2v_model.wv[word]
            # dictionary[word] = idf value of word in whole corpus
            # term_counts[word] / len(sent) = tf value of word in this review
            # (named word_weight so the tf_idf vectorizer is not overwritten)
            word_weight = dictionary[word] * (term_counts[word] / len(sent))
            sent_vec += (vec * word_weight)
            weight_sum += word_weight
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)

100%|████████████████████████████████████████████████████████████████████████████| 87773/87773 [57:09<00:00, 25.60it/s]


# K-NN Brute Force on BOW

In [42]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import roc_auc_score


X = preprocessed_reviews
y = np.array(final_data['Score'])

# 70/30 train+cv / test split, then 70/30 train / cv. Both splits are seeded
# so the reported AUCs are reproducible.
X_1, X_test, y_1, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_cv, y_train, y_cv = train_test_split(X_1, y_1, test_size=0.3, random_state=0)

# The vocabulary is learned from the training split only; cv and test are
# encoded with that same vocabulary.
text_model = CountVectorizer()

final_X_train = text_model.fit_transform(X_train)
final_Xcv = text_model.transform(X_cv)
final_X_test = text_model.transform(X_test)

auc_cv = []
auc_train = []

K = list(range(1, 30, 4))

# For every candidate k, record the AUC on the cv split and on the training split.
for i in K:
    knn = KNeighborsClassifier(n_neighbors=i, weights='uniform', algorithm='brute', metric='cosine')
    knn.fit(final_X_train, y_train)
    pred = knn.predict_proba(final_Xcv)[:, 1]
    auc_cv.append(roc_auc_score(y_cv, pred))
    pred1 = knn.predict_proba(final_X_train)[:, 1]
    auc_train.append(roc_auc_score(y_train, pred1))


ImportError: cannot import name 'bincount'

In [None]:
from sklearn import metrics

# Refit with k=29 on the BOW training matrix and compute ROC curves for the
# test split (fpr1/tpr1) and the training split (fpr2/tpr2).
knn = KNeighborsClassifier(n_neighbors=29, weights='uniform', algorithm='brute', leaf_size=30, p=2, metric='cosine')
knn.fit(final_X_train, y_train)
predi = knn.predict_proba(final_X_test)[:, 1]
fpr1, tpr1, thresholds1 = metrics.roc_curve(y_test, predi)
pred = knn.predict_proba(final_X_train)[:, 1]
fpr2, tpr2, thresholds2 = metrics.roc_curve(y_train, pred)

In [None]:
# K-NN Brute Force on tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score


X = preprocessed_reviews
y = np.array(final_data['Score'])

tf_idf_vect = TfidfVectorizer(ngram_range=(1, 2), min_df=10)

# 70/30 train+cv / test split, then 70/30 train / cv. Both splits are seeded
# so the reported AUCs are reproducible.
X_1, X_test, y_1, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_cv, y_train, y_cv = train_test_split(X_1, y_1, test_size=0.3, random_state=0)

# The vocabulary and idf weights are learned from the training split only.
final_Xtrain = tf_idf_vect.fit_transform(X_train)
final_Xcv = tf_idf_vect.transform(X_cv)
final_Xtest = tf_idf_vect.transform(X_test)

auc_cv = []
auc_train = []
K = list(range(1, 50, 4))

# For every candidate k, record the AUC on the cv split and on the training split.
for i in K:
    knn = KNeighborsClassifier(n_neighbors=i, weights='uniform', algorithm='brute', metric='cosine')
    knn.fit(final_Xtrain, y_train)
    pred = knn.predict_proba(final_Xcv)[:, 1]
    pred1 = knn.predict_proba(final_Xtrain)[:, 1]
    auc_cv.append(roc_auc_score(y_cv, pred))
    auc_train.append(roc_auc_score(y_train, pred1))