In [1]:
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")


import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

from tqdm import tqdm
import os



In [None]:
# using SQLite Table to read data.

con = sqlite3.connect('../input/amazon-fine-food-reviews/database.sqlite') 

# filtering only positive and negative reviews i.e. 
# not taking into consideration those reviews with Score=3
# SELECT * FROM Reviews WHERE Score != 3 LIMIT 500000, will give top 500000 data points
# you can change the number to any other number based on your computing power

# filtered_data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3 LIMIT 500000""", con) 
# for tsne assignment you can take 5k data points

filtered_data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3 """, con) 

# Give reviews with Score>3 a positive rating(1), and reviews with a score<3 a negative rating(0).
def partition(x):
    if x < 3:
        return 0
    return 1

#changing reviews with score less than 3 to be positive and vice-versa
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition) 
filtered_data['Score'] = positiveNegative
print("Number of data points in our data", filtered_data.shape)
filtered_data.head(3)

In [None]:
display = pd.read_sql_query("""
SELECT UserId, ProductId, ProfileName, Time, Score, Text, COUNT(*)
FROM Reviews
GROUP BY UserId
HAVING COUNT(*)>1
""", con)

In [None]:
print(display.shape)
display.head()

In [None]:
display[display['UserId']=='AZY10LLTJ71NX']

In [None]:
display['COUNT(*)'].sum()

#  [2] Exploratory Data Analysis

## [2.1] Data Cleaning: Deduplication

It is observed (as shown in the table below) that the reviews data had many duplicate entries. Hence it was necessary to remove duplicates in order to get unbiased results for the analysis of the data.  Following is an example:

In [None]:
display= pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND UserId="AR5J8UI46CURR"
ORDER BY ProductID
""", con)
display.head()

As it can be seen above that same user has multiple reviews with same values for HelpfulnessNumerator, HelpfulnessDenominator, Score, Time, Summary and Text and on doing analysis it was found that <br>
<br> 
ProductId=B000HDOPZG was Loacker Quadratini Vanilla Wafer Cookies, 8.82-Ounce Packages (Pack of 8)<br>
<br> 
ProductId=B000HDL1RQ was Loacker Quadratini Lemon Wafer Cookies, 8.82-Ounce Packages (Pack of 8) and so on<br>

It was inferred after analysis that reviews with same parameters other than ProductId belonged to the same product just having different flavour or quantity. Hence in order to reduce redundancy it was decided to eliminate the rows having same parameters.<br>

The method used for the same was that we first sort the data according to ProductId and then just keep the first similar product review and delelte the others. for eg. in the above just the review for ProductId=B000HDL1RQ remains. This method ensures that there is only one representative for each product and deduplication without sorting would lead to possibility of different representatives still existing for the same product.

In [None]:
#Sorting data according to ProductId in ascending order
sorted_data=filtered_data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

In [None]:
#Deduplication of entries
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
final.shape

In [None]:
#Checking to see how much % of data still remains
(final['Id'].size*1.0)/(filtered_data['Id'].size*1.0)*100

<b>Observation:-</b> It was also seen that in two rows given below the value of HelpfulnessNumerator is greater than HelpfulnessDenominator which is not practically possible hence these two rows too are removed from calcualtions

In [None]:
display= pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND Id=44737 OR Id=64422
ORDER BY ProductID
""", con)

display.head()

In [None]:
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]

In [None]:
#Before starting the next phase of preprocessing lets see the number of entries left
print(final.shape)

#How many positive and negative reviews are present in our dataset?
final['Score'].value_counts()

#  [3] Preprocessing

## [3.1].  Preprocessing Review Text

Now that we have finished deduplication our data requires some preprocessing before we go on further with analysis and making the prediction model.

Hence in the Preprocessing phase we do the following in the order below:-

1. Begin by removing the html tags
2. Remove any punctuations or limited set of special characters like , or . or # etc.
3. Check if the word is made up of english letters and is not alpha-numeric
4. Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
5. Convert the word to lowercase
6. Remove Stopwords
7. Finally Snowball Stemming the word (it was obsereved to be better than Porter Stemming)<br>

After which we collect the words used to describe positive and negative reviews

In [None]:
# printing some random reviews
sent_0 = final['Text'].values[0]
print(sent_0)
print("="*50)

sent_1000 = final['Text'].values[1000]
print(sent_1000)
print("="*50)

sent_1500 = final['Text'].values[1500]
print(sent_1500)
print("="*50)

sent_4900 = final['Text'].values[4900]
print(sent_4900)
print("="*50)

In [None]:
# remove urls from text python: https://stackoverflow.com/a/40823105/4084039
sent_0 = re.sub(r"http\S+", "", sent_0)
sent_1000 = re.sub(r"http\S+", "", sent_1000)
sent_150 = re.sub(r"http\S+", "", sent_1500)
sent_4900 = re.sub(r"http\S+", "", sent_4900)

print(sent_0)

In [None]:
# https://stackoverflow.com/a/47091490/4084039
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [None]:
from bs4 import BeautifulSoup
# Combining all the above stundents 
from tqdm import tqdm
preprocessed_reviews = []
# tqdm is for printing the status bar
for sentance in tqdm(final['Text'].values):
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    # https://gist.github.com/sebleier/554280
    
    preprocessed_reviews.append(sentance.strip())

In [None]:
preprocessed_reviews[0]

<h2><font color='red'>[3.2] Preprocessing Review Summary</font></h2>

In [None]:
## Similartly you can do preprocessing for review summary also.

from tqdm import tqdm
preprocessed_summary = []
# tqdm is for printing the status bar
for sentance in tqdm(final['Summary'].values):
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    # https://gist.github.com/sebleier/554280
    
    preprocessed_summary.append(sentance.strip())

In [None]:
preprocessed_summary[0]

In [None]:
preprocessed_data=[m+' '+str(n) for m,n in zip(preprocessed_summary,preprocessed_reviews)]

In [None]:
preprocessed_data[0]

In [None]:
final['CleanedText']=preprocessed_data

In [None]:
final.shape

In [None]:
final.head()

In [None]:
# Random sampling 
data = final.take(np.random.permutation(len(final))[:10000])

In [None]:
from sklearn.model_selection import train_test_split
# Spliting into Train and test
X_train, X_test, y_train, y_test = train_test_split(data['CleanedText'].values,data['Score'].values,test_size=0.3,)

# [4] Featurization

## [4.1] BAG OF WORDS

In [None]:
# This function plots the confusion, precision and recall matrices
def plot_confusion_matrix(x_test, y_pred):
    C = confusion_matrix(x_test, y_pred)
    
    A =(((C.T)/(C.sum(axis=1))).T)
    B =(C/C.sum(axis=0))
    plt.figure(figsize=(20,4))
    
    labels = [0,1]
    
    plt.subplot(1, 3, 1)
    sns.heatmap(C, annot=True, cmap="YlGnBu", fmt="d", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Confusion matrix")
    
    plt.subplot(1, 3, 2)
    sns.heatmap(B, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Precision matrix")
    
    plt.subplot(1, 3, 3)
    # representing B in heatmap format
    sns.heatmap(A, annot=True, cmap="YlGnBu", fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.title("Recall matrix")
    
    plt.show()

In [None]:
from sklearn import preprocessing
#Bag of words
count_vect = CountVectorizer(max_features=2000, min_df=10) 

X_train = count_vect.fit_transform(X_train)
X_test = count_vect.transform(X_test)

#Normalize Data
X_train = preprocessing.normalize(X_train)
X_test = preprocessing.normalize(X_test)

print("Train Data Size: ",X_train.shape)
print("Test Data Size: ",X_test.shape)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import TimeSeriesSplit

clf = LogisticRegression()
l = [1000,500,100,50,10,5,1,0.5,0.1,0.05,0.01,0.005,0.001,0.0005,0.0001]
param_grid = { 'C': l}
tscv = TimeSeriesSplit(n_splits=10) 
gsv = GridSearchCV(clf,param_grid,cv=tscv, scoring='roc_auc', verbose=1, n_jobs = -1)
gsv.fit(X_train,y_train)

print("Best HyperParameter: ",gsv.best_params_)
print("Best Accuracy: %.2f%%"%(gsv.best_score_*100))

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.h

train_auc= gsv.cv_results_['mean_train_score']
train_auc_std= gsv.cv_results_['std_train_score']
cv_auc = gsv.cv_results_['mean_test_score'] 
cv_auc_std= gsv.cv_results_['std_test_score']

plt.plot(l, train_auc, label='Train AUC')
# this code is copied from here: https://stackoverflow.com/a/48803361/4084039
plt.gca().fill_between(l,train_auc - train_auc_std,train_auc + train_auc_std,alpha=0.2,color='darkblue')

plt.plot(l, cv_auc, label='CV AUC')
# this code is copied from here: https://stackoverflow.com/a/48803361/4084039
plt.gca().fill_between(l ,cv_auc - cv_auc_std,cv_auc + cv_auc_std,alpha=0.2,color='darkorange')
plt.legend()
plt.xlabel("hyperparameter")
plt.ylabel("score")
plt.title("ERROR PLOTS")
plt.show()

In [None]:
#Metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

#Testing Accuracy on Test data
clf = LogisticRegression(C = 10)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

print("Accuracy on test set: %0.3f%%"%(accuracy_score(y_test, y_pred)*100))
print("Precision on test set: %0.3f"%(precision_score(y_test, y_pred)))
print("Recall on test set: %0.3f"%(recall_score(y_test, y_pred)))
print("F1-Score on test set: %0.3f"%(f1_score(y_test, y_pred)))
plot_confusion_matrix(y_test, y_pred)

In [None]:
v = count_vect.vocabulary_

In [None]:
len(v)

In [None]:
feature_names = count_vect.get_feature_names()
coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
top = zip(coefs_with_fns[:10], coefs_with_fns[:-(10 + 1):-1])

for (coef_1, fn_1), (coef_2, fn_2) in top:
    print("\t%.4f\t%-15s\t\t\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [None]:
type(coefs_with_fns)

In [None]:
len(coefs_with_fns)

In [None]:
import numpy as np
ar = np.array(coefs_with_fns)
weights = ar[:,0]
weights = weights.astype(np.float32)
labels = ar[:,1]
pos_index = np.where(weights>0)[0]
weights[pos_index],labels[pos_index]

In [None]:
neg_index = np.where(weights<0)[0]
weights[neg_index],labels[neg_index]

In [None]:
neg = labels[neg_index]
pos = labels[pos_index]
neg

In [None]:
pos = labels[pos_index]
pos

In [None]:
positive = np.stack([weights[pos_index], labels[pos_index]],1)
negative = np.stack([weights[neg_index], labels[neg_index]],1)


In [None]:
positive = np.sort(positive)[::-1]

In [None]:
type(positive)

In [None]:
negative[0][1]

In [None]:
neg.tolist()
type(neg)

In [None]:
list = ['bland', 'horrible', 'money']
print(type(list))
print(list)

In [None]:
ar = np.array(['bland', 'horrible', 'money'])
ar.tolist()
print(type(ar))
print(ar)

In [None]:
type(neg)
print(neg)

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

EXAMPLE_TEXT = "It was a horrible movie"

print(word_tokenize(EXAMPLE_TEXT))

In [None]:
token = word_tokenize(EXAMPLE_TEXT)


In [None]:
'horrible' in neg

In [None]:
for i in token:
    if i in pos:
        print("{} positive impact".format(i))
    elif i in neg:
        print("{} negative impact".format(i))
    else:
        print("{} no impact".format(i))

In [None]:
def predict(string):
    clf = joblib.load('model.pkl')
    count_vect = joblib.load('count_vect.pkl')
    review_text = clean_text(string)
    test_vect = count_vect.transform(([review_text]))
    pred = clf.predict(test_vect)
    print(pred[0])
    if pred[0]:
        prediction = "Positive"
    else:
        prediction = "Negative"
    feature_names = count_vect.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    ar = np.array(coefs_with_fns)
    weights = ar[:,0]
    weights = weights.astype(np.float32)
    labels = ar[:,1]
    pos_index = np.where(weights>0)[0]
    weights[pos_index],labels[pos_index]
    neg_index = np.where(weights<0)[0]
    weights[neg_index],labels[neg_index]
    positive = np.stack([weights[pos_index], labels[pos_index]],1)
    negative = np.stack([weights[neg_index], labels[neg_index]],1)
    neg = labels[neg_index]
    pos = labels[pos_index]
    token = word_tokenize(string)
    for i in token:
    if i in pos:
        print("{} positive impact".format(i))
    elif i in neg:
        print("{} negative impact".format(i))
    else:
        print("{} no impact".format(i))
    
    return prediction

In [None]:
coefs_with_fns

In [None]:
def feature_Importance(vectorizer, clf, n=1837):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    print("\tNegative\t\t\t\tPositive")
    print("________________________________________________________________________________________________")
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
        
feature_Importance(count_vect,clf)

In [None]:
def clean_text(sentance):
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    return sentance.strip()

In [None]:
def predict(string):
    clf = LogisticRegression(C = 10)
    count_vect = CountVectorizer(max_features=2000, min_df=10)
    review_text = clean_text(string)
    test_vect = count_vect.transform(([review_text]))
    pred = clf.predict(test_vect)
    print(pred[0])
    if pred[0]:
        prediction = "Positive"
    else:
        prediction = "Negative"
    return prediction

In [None]:
import gensim
from sklearn.externals import joblib
word2vec_path = "../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin.gz"
sim = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
joblib.dump(sim, 'w2v.pkl')


In [None]:
sim = joblib.load('w2v.pkl')
w2v_vocub = sim.wv.vocab
len(w2v_vocub)

In [None]:
sim.wv.most_similar('great')

In [9]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.externals import joblib
import sqlite3
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize


###################################################
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase


def partition(x):
    if x < 3:
        return 0
    return 1



def clean_text(sentance):
    sentance = re.sub(r"http\S+", "", sentance)
    sentance = BeautifulSoup(sentance, 'lxml').get_text()
    sentance = decontracted(sentance)
    sentance = re.sub("\S*\d\S*", "", sentance).strip()
    sentance = re.sub('[^A-Za-z]+', ' ', sentance)
    return sentance.strip()


def predict(string):
    clf = joblib.load('model.pkl')
    count_vect = joblib.load('count_vect.pkl')
    review_text = clean_text(string)
    test_vect = count_vect.transform(([review_text]))
    pred = clf.predict(test_vect)
    print(pred[0])
    if pred[0]:
        prediction = "Positive"
    else:
        prediction = "Negative"
    feature_names = count_vect.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    ar = np.array(coefs_with_fns)
    weights = ar[:,0]
    weights = weights.astype(np.float32)
    labels = ar[:,1]
    pos_index = np.where(weights>0)[0]
    weights[pos_index],labels[pos_index]
    neg_index = np.where(weights<0)[0]
    weights[neg_index],labels[neg_index]
    positive = np.stack([weights[pos_index], labels[pos_index]],1)
    negative = np.stack([weights[neg_index], labels[neg_index]],1)
    neg = labels[neg_index]
    pos = labels[pos_index]
    token = word_tokenize(string)
    sim = joblib.load('w2v.pkl')
    for i in token:
        if i in pos:
            print("{} positive impact \n simmilar word for {} are {}\n".format(i, i, sim.wv.most_similar('i')))
        elif i in neg:
            print("{} negative impact \n simmilar word for {} are {}\n".format(i, i, sim.wv.most_similar('i')))
        else:
            print("{} no impact\n".format(i))
    
    return prediction

###################################################


con = sqlite3.connect('../input/amazon-fine-food-reviews/database.sqlite')
filtered_data = pd.read_sql_query(""" SELECT * FROM Reviews WHERE Score != 3 LIMIT 10000""", con)
filtered_data['Score'] = filtered_data['Score'].map(partition)
sorted_data = filtered_data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
final = sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
final = final.sort_values('Time', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
final = final[final.HelpfulnessNumerator <= final.HelpfulnessDenominator]


preprocessed_reviews = []
for sentence in final['Text'].values:
    preprocessed_reviews.append(clean_text(sentence))

count_vect = CountVectorizer()
count_vect.fit(preprocessed_reviews)
joblib.dump(count_vect, 'count_vect.pkl')
X = count_vect.transform(preprocessed_reviews)
print(X.shape)
Y = final['Score'].values
clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3, eta0=0.1, alpha=0.001)
clf.fit(X, Y)
joblib.dump(clf, 'model.pkl')

#import gensim
#word2vec_path = "../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin.gz"
#word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
#joblib.dump(sim, 'w2v.pkl')


# Train your own Word2Vec model using your train text corpus
i=0
list_of_sentance=[]
for sentance in final['Text']:
    list_of_sentance.append(sentance.split())

# Own w2Vec
sim=Word2Vec(list_of_sentance,min_count=5,size=50, workers=4)
joblib.dump(sim, 'w2v.pkl')

(9564, 18375)


['w2v.pkl']

In [10]:
print(predict('I love my country'))

1
I no impact

love positive impact 
 simmilar word for love are [('I', 0.8809255957603455), ('we', 0.8263564705848694), ('this,', 0.8078132271766663), ('glad', 0.7827478647232056), ('excited', 0.7738275527954102), ('it,', 0.7496438026428223), ('finally', 0.7463163733482361), ('them,', 0.7442788481712341), ('So', 0.7347532510757446), ('unable', 0.7346459627151489)]

my positive impact 
 simmilar word for my are [('I', 0.8809255957603455), ('we', 0.8263564705848694), ('this,', 0.8078132271766663), ('glad', 0.7827478647232056), ('excited', 0.7738275527954102), ('it,', 0.7496438026428223), ('finally', 0.7463163733482361), ('them,', 0.7442788481712341), ('So', 0.7347532510757446), ('unable', 0.7346459627151489)]

country negative impact 
 simmilar word for country are [('I', 0.8809255957603455), ('we', 0.8263564705848694), ('this,', 0.8078132271766663), ('glad', 0.7827478647232056), ('excited', 0.7738275527954102), ('it,', 0.7496438026428223), ('finally', 0.7463163733482361), ('them,', 0.7

In [None]:
def feature_Importance(vectorizer, clf, n=25):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    print("\tNegative\t\t\t\tPositive")
    print("________________________________________________________________________________________________")
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
        
feature_Importance(count_vect,clf)

## [4.2] Bi-Grams and n-Grams.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

#Breaking into Train and test
X_train, X_test, y_train, y_test = train_test_split(df_sample['CleanedText'].values,df_sample['Score'].values,test_size=0.3,shuffle=False)

#taking one words and two consecutive words together
bi_gram = CountVectorizer(ngram_range=(1,2)) 
X_train = bi_gram.fit_transform(X_train)
#Normalize Data
X_train = preprocessing.normalize(X_train)
print("Train Data Size: ",X_train.shape)
X_test = bi_gram.transform(X_test)
#Normalize Data
X_test = preprocessing.normalize(X_test)
print("Test Data Size: ",X_test.shape)

## [4.3] TF-IDF

In [None]:
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2), min_df=10)
tf_idf_vect.fit(data['CleanedText'])
print("some sample features(unique words in the corpus)",tf_idf_vect.get_feature_names()[0:10])
print('='*50)

tf_idf = tf_idf_vect.transform(data['CleanedText'])
#Normalize Data
tf_idf_pre = preprocessing.normalize(tf_idf)

print("the type of count vectorizer ",type(tf_idf))
print("the shape of out text TFIDF vectorizer ",tf_idf.get_shape())
print("the number of unique words including both unigrams and bigrams ", tf_idf.get_shape()[1])

## [4.4] Word2Vec

In [0]:
# Train your own Word2Vec model using your train text corpus
i=0
list_of_sentance=[]
for sentance in data['CleanedText']:
    list_of_sentance.append(sentance.split())

In [None]:
# Using Google News Word2Vectors

# in this project we are using a pretrained model by google
# its 3.3G file, once you load this into your memory 
# it occupies ~9Gb, so please do this step only if you have >12G of ram
# we will provide a pickle file wich contains a dict , 
# and it contains all our courpus words as keys and  model[word] as values
# To use this code-snippet, download "GoogleNews-vectors-negative300.bin" 
# from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
# it's 1.9GB in size.


# http://kavita-ganesan.com/gensim-word2vec-tutorial-starter-code/#.W17SRFAzZPY
# you can comment this whole cell
# or change these varible according to your need

is_your_ram_gt_16g=False
want_to_use_google_w2v = False
want_to_train_w2v = True

if want_to_train_w2v:
    # min_count = 5 considers only words that occured atleast 5 times
    w2v_model=Word2Vec(list_of_sentance,min_count=5,size=50, workers=4)
    print(w2v_model.wv.most_similar('great'))
    print('='*50)
    print(w2v_model.wv.most_similar('worst'))
    
elif want_to_use_google_w2v and is_your_ram_gt_16g:
    if os.path.isfile('GoogleNews-vectors-negative300.bin'):
        w2v_model=KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
        print(w2v_model.wv.most_similar('great'))
        print(w2v_model.wv.most_similar('worst'))
    else:
        print("you don't have gogole's word2vec file, keep want_to_train_w2v = True, to train your own w2v ")

In [None]:
w2v_words = list(w2v_model.wv.vocab)
print("number of words that occured minimum 5 times ",len(w2v_words))
print("sample words ", w2v_words[0:50])

## [4.4.1] Converting text into vectors using Avg W2V, TFIDF-W2V

#### [4.4.1.1] Avg W2v

In [None]:
# average Word2Vec
# compute average word2vec for each review.
sent_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sent in tqdm(list_of_sentance): # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length 50, you might need to change this to 300 if you use google's w2v
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
    if cnt_words != 0:
        sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

#### [4.4.1.2] TFIDF weighted W2v

In [0]:
# S = ["abc def pqr", "def def def abc", "pqr pqr def"]
model = TfidfVectorizer()
tf_idf_matrix = model.fit_transform(data['CleanedText'].values)
# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(model.get_feature_names(), list(model.idf_)))

In [None]:
# TF-IDF weighted Word2Vec
tfidf_feat = model.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in tqdm(list_of_sentance): # for each review/sentence 
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        if word in w2v_words and word in tfidf_feat:
            vec = w2v_model.wv[word]
#             tf_idf = tf_idf_matrix[row, tfidf_feat.index(word)]
            # to reduce the computation we are 
            # dictionary[word] = idf value of word in whole courpus
            # sent.count(word) = tf valeus of word in this review
            tf_idf = dictionary[word]*(sent.count(word)/len(sent))
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
    if weight_sum != 0:
        sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1