In [10]:
%matplotlib inline

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn import cross_validation

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)


# using the SQLite Table to read data.
con = sqlite3.connect('./database.sqlite') 



#filtering only positive and negative reviews i.e. 
# not taking into consideration those reviews with Score=3
filtered_data = pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3
""", con) 




# Give reviews with Score>3 a positive rating, and reviews with a score<3 a negative rating.
def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'

#changing reviews with score less than 3 to be positive and vice-versa
actualScore = filtered_data['Score']
positiveNegative = actualScore.map(partition) 
filtered_data['Score'] = positiveNegative

In [2]:
filtered_data.shape #looking at the number of attributes and size of the data
filtered_data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...


Data Cleaning: Deduplication


In [3]:
display= pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND UserId="AR5J8UI46CURR"
ORDER BY ProductID
""", con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [4]:
#Sorting data according to ProductId in ascending order
sorted_data=filtered_data.sort_values('ProductId', axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')

In [5]:
#Deduplication of entries
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first', inplace=False)
#final=final.drop_duplicates(subset={"UserId","ProductId","Time"}, keep='first', inplace=False)
final.shape

(364173, 10)

In [6]:
#Checking to see how much % of data still remains
(final['Id'].size*1.0)/(filtered_data['Id'].size*1.0)*100

69.25890143662969

In [7]:
display= pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND Id=44737 OR Id=64422
ORDER BY ProductID
""", con)
display


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [8]:
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]


In [13]:
#Before starting the next phase of preprocessing lets see the number of entries left
print(final.shape)

#How many positive and negative reviews are present in our dataset?
final['Score'].value_counts()

(364171, 10)


positive    307061
negative     57110
Name: Score, dtype: int64

In [None]:
Data-preprocessing

In [194]:
# find sentences containing HTML tags
import re

i=0;
for sent in final['Text'].values:
    if (len(re.findall('<.*?>', sent))):
        print(i)
        print(sent)
        break;
    i += 1;    

        

6
I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [15]:
import re

import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stop = set(stopwords.words('english')) #set of stopwords
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

def cleanhtml(sentence): #function to clean the word of any html-tags
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', sentence)
    return cleantext
def cleanpunc(sentence): #function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned = re.sub(r'[.|,|)|(|\|/]',r' ',cleaned)
    return  cleaned
print(stop)
print('************************************')
print(sno.stem('tasty'))

{'most', "isn't", 'as', 'so', "weren't", 'haven', "shan't", "wasn't", 'we', 'each', "hasn't", 'up', 'it', 'just', 'll', 't', "don't", 'are', 'does', 'further', 'how', 'down', 'they', 'no', 'which', "haven't", 're', 'shan', 'again', 'itself', "didn't", 'on', 'other', 'me', 'she', 'being', 'having', 'once', 'any', 'her', 'now', 'those', 'such', "you're", 'doing', 'be', "that'll", 'both', 'them', 'were', 'didn', 'ain', 'wasn', 'this', 'whom', 'under', 'then', 'too', 'our', 'do', 'there', 'or', 'herself', 've', 'but', 'out', 'own', 'have', 'wouldn', "wouldn't", 'my', 'am', 'these', 'an', 'same', 'after', 'weren', 'here', 'than', 'had', 'y', 'his', 'themselves', 'why', 'will', 'and', 'through', "hadn't", 'below', 'mustn', 'won', 'yourself', "it's", "you'd", 'a', 'very', 'couldn', 'ourselves', "shouldn't", "couldn't", 'ma', 'where', 'should', 'few', 'who', 'himself', 'don', 'while', 's', "mustn't", 'not', 'when', "doesn't", 'hasn', 'i', 'd', 'aren', 'at', 'about', "you've", 'shouldn', 'the',

In [16]:
#Code for implementing step-by-step the checks mentioned in the pre-processing phase
i=0
str1=' '
final_string=[]
all_positive_words=[] # store words from +ve reviews here
all_negative_words=[] # store words from -ve reviews here.
s=''
for sent in final['Text'].values:
    filtered_sentence=[]
    #print(sent);
    sent=cleanhtml(sent) # remove HTMl tags
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if((cleaned_words.isalpha()) & (len(cleaned_words)>2)):    
                if(cleaned_words.lower() not in stop):
                    s=(sno.stem(cleaned_words.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if (final['Score'].values)[i] == 'positive': 
                        all_positive_words.append(s) #list of all words used to describe positive reviews
                    if(final['Score'].values)[i] == 'negative':
                        all_negative_words.append(s) #list of all words used to describe negative reviews reviews
                else:
                    continue
            else:
                continue 
    #print(filtered_sentence)
    str1 = b" ".join(filtered_sentence) #final string of cleaned words
    #print("***********************************************************************")
    
    final_string.append(str1)
    i+=1

In [17]:
final['CleanedText']=final_string #adding a column of CleanedText which displays the data after pre-processing of the review 

In [18]:
# store final table into an SQlLite table for future.
conn = sqlite3.connect('final.sqlite')
c=conn.cursor()
conn.text_factory = str
final.to_sql('Reviews', conn, flavor=None, schema=None, if_exists='replace', index=True, index_label=None, chunksize=None, dtype=None)

 Bag of Words (BoW)

In [19]:
final_df = pd.DataFrame(final) # converting table to a dataframe

In [20]:
random_final = final_df.sample(n=10000)# Sampling 10krows 
random_final

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
369243,399221,B000VK4K3W,AA157EV59BJGM,domenico luvera,0,0,negative,1348617600,Formula Changed,The Newman's Own Organics 2nd Generation Turke...,b'newman organ generat turkey veget chang ingr...
358793,388076,B007RTR8UM,A2KUVD844LL7QM,Angela M. Hey,0,0,positive,1338249600,Creamy conditioner that flows well out of the ...,The conditioner ingredients are unremarkable -...,b'condition ingredi unremark chemic plus oil a...
185818,201549,B000NURB18,A23OH1HHVZDKLI,juliet,0,0,positive,1284854400,&lt;3 it!!!,a friend gave this to me and I love it!!! So ...,b'friend gave love good doesnt need anyth'
444079,480169,B007GOCR5E,A1Z54EM24Y40LL,c2,2,2,positive,1339372800,One of the boys' favorites,I always have Powerade in the house - for ever...,b'alway powerad hous everyth flu over hot base...
395554,427721,B003DIIMLK,A14DFCE6FVU07L,FLRoss,1,4,negative,1303948800,ICK!,What happened Annie?.. There are very few Anni...,b'happen anni anni product say yuck prob one r...
281055,304509,B0017OE536,A287V946KJP73N,Jason Bourne,2,2,positive,1285891200,delicious with everything! alone....well an a...,This product helped a friend lose 50 lbs and I...,b'product help friend lose lbs lost lbs produc...
191005,207094,B0015DQG22,A4UCU29FTOFCF,Janet Krisman,0,0,negative,1347235200,Lifesavers?,"The so-called ""new formula"" has a strong chemi...",b'new formula strong chemic tast dont like hop...
20335,22212,B000KV61FC,A3FV8UWW0KVI82,W. Sprague,0,1,negative,1281657600,Dog doesn't use it at all,I thought this would be fantastic to keep my p...,b'thought would fantast keep puppi busi chew k...
411032,444538,B001HTISGQ,ATN552TF5V40Z,Lh,4,4,positive,1251763200,They really mean sticky,I grew up with sticky rice but this is actuall...,b'grew sticki rice actual hard pull apart cont...
60477,65742,B003KSL1B6,A3RR2P5IS3DGPR,"Dr. M. A. Dixon ""hyper-observant""",1,1,positive,1287792000,My favorite cinnamon,"While not organic, Saigon Cinnamon is the best...",b'organ saigon cinnamon best tast cinnamon opi...


In [21]:
random_final = random_final.sort_values('Time') # Sorting the dataframe based on Time

In [22]:
random_final

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
138691,150509,0006641040,A3CMRKGE0P909G,Teresa,3,4,positive,1018396800,A great way to learn the months,This is a book of poetry about the months of t...,b'book poetri month year goe month cute littl ...
325019,351770,B0000DG4B3,A1IU7S4HCK1XK0,Joanna Daneman,13,15,positive,1072656000,If they must have mac and cheese like the box ...,This is the powdered dried cheese like in &quo...,b'powder dri chees like canadian friend call r...
422107,456525,B001D6C13O,A281NPSIMI1C2R,"Rebecca of Amazon ""The Rebecca Review""",2,2,positive,1074988800,Organic Darjeeling best with Organic Sucanat,"""Sitting on the porch of a bungalow on a tea p...",b'sit porch bungalow tea plantat darjeel see p...
316306,342462,B000084F3O,A3DWUM6SN3N3NR,"Author Brian Wallace (Mind Transmission, Inc.)",4,33,negative,1076457600,the most unnatural odor,I really hate to do this (having been a fan of...,b'realli hate fan van patten year feel must sa...
389412,421062,B0000E227M,A55MRYPUAX4QU,Avid Reader,3,3,positive,1076803200,"Musky, exquisite",The quality of this coffee is the first thing ...,b'qualiti coffe first thing one notic goe with...
125303,135926,B0001217A4,A20Q6AL0RPC1US,J. Giles,0,3,positive,1080864000,"Yo, Cherry Limeade Recipe","Yes, summer's coming. Rather than drinking tea...",b'yes summer come rather drink tea sweet break...
222671,241462,B0001E5CJO,A239VY115ZCDFU,"S. Johnston ""scott13""",4,5,positive,1090368000,you should check this out...,"Despite the silly premise and acting, i can wa...",b'despit silli premis act watch fim quit sure ...
413420,447119,B0002PCET8,A2TWEIJVSW1G61,A. Rehnblom,8,8,positive,1095206400,yummm,This stuff tastes just like Sweettarts. I rec...,b'stuff tast like sweettart recommend anyon ge...
422255,456689,B00068K7UE,A1W9KQRCZ9ORHB,"Stuart Gardner ""www.sdgardner.com""",4,5,positive,1097712000,Excellent - Fish Breath No More,"If your cat has ""foul"" breath after a chicken ...",b'cat foul breath chicken dinner even fish rea...
457717,494909,B0002ML9U6,AQ8DU6XVA3USJ,"Alejandra Vernon ""artist & illustrator""",15,17,positive,1098057600,for fretting felines and canines,This 100% natural product for cats and dogs go...,b'natur product cat dog go stress time includ ...


In [22]:
#BoW
count_vect = CountVectorizer() 
final_counts = count_vect.fit_transform(random_final['CleanedText'].values)


In [23]:
type(final_counts) # sparse matrix

scipy.sparse.csr.csr_matrix

In [24]:
final_counts.get_shape() # shape of matrix after Vectorizing

(10000, 12925)

In [35]:
# Standardizing the data with mean = 0 and std.dev = 1
from sklearn.preprocessing import StandardScaler
standardized_data = StandardScaler(with_mean=False).fit_transform(final_counts)
print(standardized_data.shape)



(10000, 12925)


In [36]:
X = standardized_data.toarray() # converting the standardized data to dense array
y = np.array(random_final['Score']) # converting Review attribute to a numpy array

In [37]:
# split the data set into train and test 70% train and 30% test 
from sklearn.model_selection import train_test_split

X1, X_test, y1, y_test = train_test_split(X, y, test_size=0.3, shuffle=False, random_state=0)

In [38]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
                               
tss = TimeSeriesSplit(n_splits=10) # TimeSeries Split with number of splits=10

parameters = [{'C': [10**-8, 10**-6, 10**-4, 10**-2, 10**0, 10**2, 10**4, 10**6, 10**8]}]# specifying the range of the hyperparameter lambda(1/C) for GridSearch

Lr = LogisticRegression()

# GridSearch for finding right hyperparameter with with 10 fold CV on TimeSeriesSplit
model_BOW = GridSearchCV(Lr, parameters, cv=tss, refit=True)

# Fit the training data
model_BOW.fit(X1, y1)



GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [1e-08, 1e-06, 0.0001, 0.01, 1, 100, 10000, 1000000, 100000000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [120]:
model_BOW.best_params_  # print the best hyperparameter C

{'C': 0.01}

In [121]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best C
Lr_op = LogisticRegression(penalty='l2', C=0.01)

# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)


print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test, pred_Lr_op)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test, pred_Lr_op)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test, pred_Lr_op).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test, pred_Lr_op, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test, pred_Lr_op, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision



Accuracy =  0.8883333333333333
Confusion Matrix
 [[ 209  267]
 [  68 2456]]
(tn, fn, fp, tp) = (209, 267, 68, 2456)
Recall =  0.9730586370839936
f1-Score =  0.9361539927577663
Precision =  0.901946382666177


In [124]:
Lr_op.coef_ # weights of eatures

array([[-0.01712988,  0.00451658,  0.00214061, ...,  0.00021602,
         0.        ,  0.00586088]])

In [125]:
a = np.empty(0)
a = Lr_op.coef_ # storing in a numpy array

In [136]:
X_m = a + np.random.normal(0, 0.002)  # adding a small noise on weights and testing whether they are collinear or not

In [151]:
X_m # array after adding noise

array([[-1.70719825e-02,  4.57447204e-03,  2.19850284e-03, ...,
         2.73908585e-04,  5.78927724e-05,  5.91877651e-03]])

In [152]:
# As we can see the difference is very very low in terms of weight matrix and updated weight matrix after adding a noise.
# They are not collinear.

In [153]:
# print top 20 features for each class for BOW
print("Top 20 Positive Features for BOW and Top 20 Negative Features for BOW ")
print("\n\n")

# defining a function for finding the top features of each class
def top_most_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names() # getting all the feature names
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names)) # retrieve the coefficient & sort them based on values 
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1]) 
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print ("\t%-15s\t%.4f\t%-15s\t\t%-15s\t%.4f\t%-15s" % (fn_2, coef_1, 'positive', fn_1, coef_2, 'negative'))

        
top_most_features(count_vect, Lr_op)


Top 20 Positive Features for BOW and Top 20 Negative Features for BOW 



	great          	-0.2171	positive       		disappoint     	0.3940	negative       
	love           	-0.1693	positive       		aw             	0.3477	negative       
	best           	-0.1669	positive       		stale          	0.2998	negative       
	good           	-0.1463	positive       		return         	0.2298	negative       
	excel          	-0.1444	positive       		worst          	0.2115	negative       
	delici         	-0.1391	positive       		bland          	0.2087	negative       
	nice           	-0.1361	positive       		didnt          	0.1947	negative       
	find           	-0.1335	positive       		horribl        	0.1870	negative       
	enjoy          	-0.1323	positive       		wont           	0.1726	negative       
	easi           	-0.1250	positive       		wast           	0.1696	negative       
	snack          	-0.1244	positive       		bare           	0.1596	negative       
	perfect        	-0.1240	positive  

In [106]:
# instantiate learning model for best C
Lr_op = LogisticRegression(penalty = 'l1', C=0.01)

# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

# Sparsity of array
w1 = Lr_op.coef_
print("Sparsity: " ,np.count_nonzero(w1))


print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test, pred_Lr_op)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test, pred_Lr_op)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test, pred_Lr_op).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test, pred_Lr_op, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test, pred_Lr_op, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision

Sparsity:  115

Accuracy =  0.861
Confusion Matrix
 [[  78  398]
 [  19 2505]]
(tn, fn, fp, tp) = (78, 398, 19, 2505)
Recall =  0.992472266244057
f1-Score =  0.9231619679380872
Precision =  0.8629004478126077


In [108]:
# instantiate learning model for best C
Lr_op = LogisticRegression(penalty = 'l1', C=0.007)

# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

# Checking sparsity of array with reduced C
w1 = Lr_op.coef_
print("Sparsity: " ,np.count_nonzero(w1))


print()
print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision

Sparsity:  55

Precision =  0.8534599728629579


In [109]:
# instantiate learning model for best alpha
Lr_op = LogisticRegression(penalty = 'l1', C=0.004)

# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

# Checking sparsity of array with reduced C
w1 = Lr_op.coef_
print("Sparsity: " ,np.count_nonzero(w1))


print()
print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision

Sparsity:  13

Precision =  0.8434782608695652


In [110]:
# instantiate learning model for best alpha
Lr_op = LogisticRegression(penalty = 'l1', C=0.002)

# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

# Checking sparsity of array with reduced C
w1 = Lr_op.coef_
print("Sparsity: " ,np.count_nonzero(w1))


print()
print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision

Sparsity:  2

Precision =  0.8413333333333334


In [113]:
# instantiate learning model for best alpha
Lr_op = LogisticRegression(penalty = 'l1', C=0.0002)

# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

# Checking sparsity of array with reduced C
w1 = Lr_op.coef_
print("Sparsity: " ,np.count_nonzero(w1))


print()
print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision

Sparsity:  0

Precision =  0.0


  'precision', 'predicted', average, warn_for)


In [118]:
# instantiate learning model for best alpha
Lr_op = LogisticRegression(penalty = 'l1', C=0.8)

# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

# Checking sparsity of array with increased C
w1 = Lr_op.coef_
print("Sparsity: " ,np.count_nonzero(w1))


print()
print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision

Sparsity:  2176

Precision =  0.9164430816404753


In [47]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_rand
                               
tss_1 = TimeSeriesSplit(n_splits=10) # TimeSeries Split with number of splits=10

parameters_1 = {'C': sp_rand() }# specifying the range of the hyperparameter C for RandomizedSearchCV

Lr_ran = LogisticRegression()

# RandomizedSearchCV for finding right hyperparameter with with 10 fold CV on TimeSeriesSplit
model_BOW_ran = RandomizedSearchCV(Lr_ran, parameters_1, cv=tss_1, refit=True)

# Fit the training data
model_BOW_ran.fit(X1, y1)


RandomizedSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
          error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f3535b17908>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [155]:
model_BOW_ran.best_params_ # best C or 1/lambda

{'C': 0.02875544258870244}

In [156]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best C
Lr_op = LogisticRegression(penalty='l2', C = 0.02875544258870244)

# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test, pred_Lr_op)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test, pred_Lr_op)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test, pred_Lr_op).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test, pred_Lr_op, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test, pred_Lr_op, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision



Accuracy =  0.8843333333333333
Confusion Matrix
 [[ 217  259]
 [  88 2436]]
(tn, fn, fp, tp) = (217, 259, 88, 2436)
Recall =  0.96513470681458
f1-Score =  0.9335121670818164
Precision =  0.9038961038961039


In [76]:
# instantiate learning model for best C
Lr_op = LogisticRegression(penalty='l1', C = 0.02875544258870244)


# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

# Checking sparsity of array with C
w1 = Lr_op.coef_
print("Sparsity: " ,np.count_nonzero(w1))

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test, pred_Lr_op)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test, pred_Lr_op)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test, pred_Lr_op).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test, pred_Lr_op, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test, pred_Lr_op, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision

Sparsity:  931

Accuracy =  0.8846666666666667
Confusion Matrix
 [[ 174  302]
 [  44 2480]]
(tn, fn, fp, tp) = (174, 302, 44, 2480)
Recall =  0.9825673534072901
f1-Score =  0.9347908028646814
Precision =  0.8914450035945363


In [77]:
# instantiate learning model for best alpha
Lr_op = LogisticRegression(penalty='l1', C = 0.020)


# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

# Checking sparsity of array with reduced C
w1 = Lr_op.coef_
print("Sparsity: " ,np.count_nonzero(w1))

print()

print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision

Sparsity:  685

Precision =  0.882937211777226


In [78]:
# instantiate learning model for best alpha
Lr_op = LogisticRegression(penalty='l1', C = 0.015)


# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

# Checking sparsity of array with reduced C
w1 = Lr_op.coef_
print("Sparsity: " ,np.count_nonzero(w1))

print()


print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision

Sparsity:  462

Precision =  0.8773717498243149


In [81]:
# instantiate learning model for best alpha
Lr_op = LogisticRegression(penalty='l1', C = 0.010)


# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

# Checking sparsity of array with reduced C
w1 = Lr_op.coef_
print("Sparsity: " ,np.count_nonzero(w1))

print()

print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision

Sparsity:  115

Precision =  0.8629004478126077


In [82]:
# instantiate learning model for best alpha
Lr_op = LogisticRegression(penalty='l1', C = 0.007)


# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

# Checking sparsity of array with reduced C
w1 = Lr_op.coef_
print("Sparsity: " ,np.count_nonzero(w1))

print()

print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision

Sparsity:  55

Precision =  0.8534599728629579


In [83]:
# instantiate learning model for best alpha
Lr_op = LogisticRegression(penalty='l1', C = 0.002)


# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

# Checking sparsity of array with reduced C
w1 = Lr_op.coef_
print("Sparsity: " ,np.count_nonzero(w1))

print()

print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision

Sparsity:  2

Precision =  0.8413333333333334


In [98]:
# instantiate learning model for best alpha
Lr_op = LogisticRegression(penalty='l1', C = 0.0004)


# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

# Checking sparsity of array with reduced C
w1 = Lr_op.coef_
print("Sparsity: " ,np.count_nonzero(w1))

print()

print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision

Sparsity:  0

Precision =  0.0


  'precision', 'predicted', average, warn_for)


In [99]:
# instantiate learning model for best alpha
Lr_op = LogisticRegression(penalty='l1', C = 0.5)


# fitting the model
Lr_op.fit(X1, y1)

# predict the response
pred_Lr_op = Lr_op.predict(X_test)

# Checking sparsity of array with increased C
w1 = Lr_op.coef_
print("Sparsity: " ,np.count_nonzero(w1))

print()

print("Precision = ",precision_score(y_test, pred_Lr_op, pos_label='positive'))# Precision

Sparsity:  2133

Precision =  0.9153318077803204


 TF-IDF

In [23]:
# Taking Tf-idf bigram and vectorizing it
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
final_tf_idf = tf_idf_vect.fit_transform(random_final['CleanedText'].values)


In [24]:
final_tf_idf.get_shape() # Shape of tf-idf vectorizer


(10000, 235352)

In [235]:
# Standardizing the data with mean=0 and std.dev=1
from sklearn.preprocessing import StandardScaler
standardized_data_tf = StandardScaler(with_mean=False).fit_transform(final_tf_idf)
print(standardized_data_tf.shape)

(10000, 230972)


In [181]:
X_tf = standardized_data_tf.toarray() # storing the values after standardization in a numpy array
y_tf = np.array(random_final['Score']) # storing the values of Scores in numpy array

In [182]:
# split the data set into train and test, 70% train and 30% test 
X2, X_test2, y2, y_test2 = train_test_split(X_tf, y_tf, test_size=0.3, shuffle=False, random_state=0)

In [184]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
                               
tss_tf = TimeSeriesSplit(n_splits=10) # TimeSeries Split with number of splits=10

parameters_tf = [{'C': [10**-8, 10**-6, 10**-4, 10**-2, 10**0, 10**2, 10**4, 10**6, 10**8]}]# specifying the range of the hyperparameter C for GridSearch

Lr_tf = LogisticRegression()

# GridSearch for finding right hyperparameter with with 10 fold CV on TimeSeriesSplit
model_tf = GridSearchCV(Lr_tf, parameters_tf, cv=tss_tf, refit=True)

# Fit the training data
model_tf.fit(X2, y2)



GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [1e-08, 1e-06, 0.0001, 0.01, 1, 100, 10000, 1000000, 100000000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [185]:
model_tf.best_params_  # print the best hyperparameter C

{'C': 0.01}

In [186]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best C
Lr_tf = LogisticRegression(penalty='l2', C=0.01)

# fitting the model
Lr_tf.fit(X2, y2)

# predict the response
pred_Lr_tf = Lr_tf.predict(X_test2)


print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test2, pred_Lr_tf)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test2, pred_Lr_tf)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test2, pred_Lr_tf).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test2, pred_Lr_tf, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test2, pred_Lr_tf, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test2, pred_Lr_tf, pos_label='positive'))# Precision



Accuracy =  0.893
Confusion Matrix
 [[ 241  235]
 [  86 2438]]
(tn, fn, fp, tp) = (241, 235, 86, 2438)
Recall =  0.9659270998415214
f1-Score =  0.9382335963055608
Precision =  0.9120838009726898


In [187]:
Lr_tf.coef_ # weights of the features of tf-idf

array([[ 3.44296975e-01, -7.33319413e-02, -5.05756970e-02,
        -3.88375215e-02,  2.35701281e-01,  1.21512996e-01,
        -3.77018290e-01,  5.68081899e-01,  6.70595783e-01,
        -2.15901516e-01,  4.62851848e-02, -2.23349184e-03,
        -1.03468668e-01, -4.87580273e-02, -1.29410402e-01,
        -1.78656586e-02, -3.14731018e-03, -2.71647002e-01,
         2.76581550e-01, -3.02133963e-01,  4.11345397e-04,
         3.26997745e-02,  2.41330521e-02, -2.86670188e-01,
        -2.56182740e-01, -4.47187236e-02, -2.05758113e-01,
         6.44139432e-02, -1.08971534e-02, -7.49406618e-02,
         3.12927216e-01, -5.94701965e-02,  4.19310999e-06,
        -1.44160040e-01,  5.43187044e-02, -5.84022756e-02,
         1.85875981e-01,  1.04249139e-01,  2.82745629e-02,
         1.23403580e-01,  2.08463803e-03,  8.59077982e-03,
         1.53544439e-02,  3.13736579e-01, -9.57707682e-03,
        -1.40603857e-02,  4.16678250e-03, -1.29515909e-01,
         3.08293064e-02, -3.85846679e-02, -8.90637435e-0

In [188]:
c = np.empty(0)
c = Lr_tf.coef_ # put in a numpy array i.e weights

In [190]:
X_tf_idf = c + np.random.normal(0, 0.002) # adding noise i.e pertubation test

In [194]:
X_tf_idf

array([[ 3.45201315e-01, -7.24276006e-02, -4.96713563e-02,
        -3.79331808e-02,  2.36605622e-01,  1.22417336e-01,
        -3.76113949e-01,  5.68986239e-01,  6.71500124e-01,
        -2.14997175e-01,  4.71895255e-02, -1.32915113e-03,
        -1.02564327e-01, -4.78536866e-02, -1.28506061e-01,
        -1.69613179e-02, -2.24296947e-03, -2.70742661e-01,
         2.77485891e-01, -3.01229623e-01,  1.31568611e-03,
         3.36041153e-02,  2.50373928e-02, -2.85765848e-01,
        -2.55278399e-01, -4.38143829e-02, -2.04853773e-01,
         6.53182839e-02, -9.99281273e-03, -7.40363211e-02,
         3.13831557e-01, -5.85658558e-02,  9.08533819e-04,
        -1.43255699e-01,  5.52230451e-02, -5.74979349e-02,
         1.86780322e-01,  1.05153479e-01,  2.91789036e-02,
         1.24307920e-01,  2.98897874e-03,  9.49512053e-03,
         1.62587846e-02,  3.14640919e-01, -8.67273611e-03,
        -1.31560450e-02,  5.07112321e-03, -1.28611568e-01,
         3.17336471e-02, -3.76803272e-02, -8.81594028e-0

In [None]:
# As we can see the difference is very very low in terms of weight matrix and updated weight matrix after adding a noise.
# They are not collinear.

In [196]:
# print top 20 features for each class for tf-idf
print("Top 20 Positive Features for BOW and Top 20 Negative Features for tf-idf ")
print("\n\n")

# defining a function for finding the top features of each class
def top_most_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names() # getting all the feature names
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names)) # retrieve the coefficient & sort them based on values 
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1]) 
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print ("\t%-15s\t%.4f\t%-15s\t\t%-15s\t%.4f\t%-15s" % (fn_2, coef_1, 'positive', fn_1, coef_2, 'negative'))

        
top_most_features(tf_idf_vect, Lr_tf)


Top 20 Positive Features for BOW and Top 20 Negative Features for tf-idf 



	abandon eat    	-0.3770	positive       		abandon        	0.6706	negative       
	abandon contain	-0.3021	positive       		abc six        	0.5681	negative       
	aaah           	-0.2867	positive       		abid           	0.3443	negative       
	abil synthes   	-0.2716	positive       		abc            	0.3137	negative       
	abil compar    	-0.2562	positive       		abid kid       	0.3129	negative       
	abc news       	-0.2232	positive       		abl coffe      	0.2766	negative       
	aback          	-0.2172	positive       		abl follow     	0.2357	negative       
	abl ever       	-0.2159	positive       		abandon idea   	0.2002	negative       
	abil hydrat    	-0.2058	positive       		abil brew      	0.1859	negative       
	abl enjoy      	-0.1715	positive       		abl itll       	0.1791	negative       
	abl bear       	-0.1704	positive       		abl brew       	0.1525	negative       
	abl keep       	-0.1531	positiv

In [200]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best C
Lr_tf = LogisticRegression(penalty='l1', C=0.01)

# fitting the model
Lr_tf.fit(X2, y2)

# predict the response
pred_Lr_tf = Lr_tf.predict(X_test2)

# Checking sparsity of array with C
w2 = Lr_tf.coef_
print("Sparsity: " ,np.count_nonzero(w2))

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test2, pred_Lr_tf)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test2, pred_Lr_tf)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test2, pred_Lr_tf).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test2, pred_Lr_tf, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test2, pred_Lr_tf, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test2, pred_Lr_tf, pos_label='positive'))# Precision


Sparsity:  38

Accuracy =  0.8603333333333333
Confusion Matrix
 [[  78  398]
 [  21 2503]]
(tn, fn, fp, tp) = (78, 398, 21, 2503)
Recall =  0.9916798732171157
f1-Score =  0.9227649769585253
Precision =  0.8628059289900034


In [204]:
# instantiate learning model for best alpha
Lr_tf = LogisticRegression(penalty='l1', C = 0.007)


# fitting the model
Lr_tf.fit(X2, y2)

# predict the response
pred_Lr_tf = Lr_tf.predict(X_test2)

# Checking sparsity of array with reduced C
w2 = Lr_tf.coef_
print("Sparsity: " ,np.count_nonzero(w2))

print()

print("Precision = ",precision_score(y_test2, pred_Lr_tf, pos_label='positive'))# Precision

Sparsity:  22

Precision =  0.8523421588594705


In [205]:
# instantiate learning model for best alpha
Lr_tf = LogisticRegression(penalty='l1', C = 0.004)


# fitting the model
Lr_tf.fit(X2, y2)

# predict the response
pred_Lr_tf = Lr_tf.predict(X_test2)

# Checking sparsity of array with reduced C
w2 = Lr_tf.coef_
print("Sparsity: " ,np.count_nonzero(w2))

print()

print("Precision = ",precision_score(y_test2, pred_Lr_tf, pos_label='positive'))# Precision

Sparsity:  7

Precision =  0.8418945963975984


In [207]:
# instantiate learning model for best alpha
Lr_tf = LogisticRegression(penalty='l1', C = 0.0004)


# fitting the model
Lr_tf.fit(X2, y2)

# predict the response
pred_Lr_tf = Lr_tf.predict(X_test2)

# Checking sparsity of array with reduced C
w2 = Lr_tf.coef_
print("Sparsity: " ,np.count_nonzero(w2))

print()

print("Precision = ",precision_score(y_test2, pred_Lr_tf, pos_label='positive'))# Precision

Sparsity:  0

Precision =  0.0


  'precision', 'predicted', average, warn_for)


In [208]:
# instantiate learning model for best alpha
Lr_tf = LogisticRegression(penalty='l1', C = 0.5)


# fitting the model
Lr_tf.fit(X2, y2)

# predict the response
pred_Lr_tf = Lr_tf.predict(X_test2)

# Checking sparsity of array with increased C
w2 = Lr_tf.coef_
print("Sparsity: " ,np.count_nonzero(w2))

print()

print("Precision = ",precision_score(y_test2, pred_Lr_tf, pos_label='positive'))# Precision

Sparsity:  373

Precision =  0.9246653919694072


In [212]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_rand
                               
tss_tf_r = TimeSeriesSplit(n_splits=10) # TimeSeries Split with number of splits=10

parameters_tf_r = {'C': sp_rand()} # specifying the range of the hyperparameter C for RandomizedSearchCV

Lr_ran_tf = LogisticRegression()

# RandomizedSearchCV for finding right hyperparameter with with 10 fold CV on TimeSeriesSplit
model_tf_r = RandomizedSearchCV(Lr_ran_tf, parameters_tf_r, cv=tss_tf_r, refit=True)

# Fit the training data
model_tf_r.fit(X2, y2)


RandomizedSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
          error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f352e45c4a8>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [213]:
model_tf_r.best_params_ # best C

{'C': 0.06988426976771223}

In [214]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best C
Lr_tf_r = LogisticRegression(penalty='l2', C = 0.06988426976771223)

# fitting the model
Lr_tf_r.fit(X2, y2)

# predict the response
pred_Lr_tf_r = Lr_tf_r.predict(X_test2)

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test2, pred_Lr_tf_r)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test2, pred_Lr_tf_r)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test2, pred_Lr_tf_r).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test2, pred_Lr_tf_r, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test2, pred_Lr_tf_r, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test2, pred_Lr_tf_r, pos_label='positive'))# Precision



Accuracy =  0.8936666666666667
Confusion Matrix
 [[ 267  209]
 [ 110 2414]]
(tn, fn, fp, tp) = (267, 209, 110, 2414)
Recall =  0.9564183835182251
f1-Score =  0.9380221488245579
Precision =  0.9203202439954251


In [223]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best C
Lr_tf_r = LogisticRegression(penalty='l1', C = 0.06988426976771223)

# fitting the model
Lr_tf_r.fit(X2, y2)

# predict the response
pred_Lr_tf_r = Lr_tf_r.predict(X_test2)

# Checking sparsity of array with C
w3 = Lr_tf_r.coef_
print("Sparsity: " ,np.count_nonzero(w3))

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test2, pred_Lr_tf_r)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test2, pred_Lr_tf_r)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test2, pred_Lr_tf_r).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test2, pred_Lr_tf_r, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test2, pred_Lr_tf_r, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test2, pred_Lr_tf_r, pos_label='positive'))# Precision


Sparsity:  283

Accuracy =  0.8976666666666666
Confusion Matrix
 [[ 243  233]
 [  74 2450]]
(tn, fn, fp, tp) = (243, 233, 74, 2450)
Recall =  0.9706814580031695
f1-Score =  0.9410409064720568
Precision =  0.9131569139023481


In [228]:
# instantiate learning model for best alpha
Lr_tf_r = LogisticRegression(penalty='l1', C = 0.0408)


# fitting the model
Lr_tf_r.fit(X2, y2)

# predict the response
pred_Lr_tf_r = Lr_tf_r.predict(X_test2)

# Checking sparsity of array with reduced C
w3 = Lr_tf_r.coef_
print("Sparsity: " ,np.count_nonzero(w3))

print()

print("Precision = ",precision_score(y_test2, pred_Lr_tf_r, pos_label='positive'))# Precision

Sparsity:  228

Precision =  0.907448377581121


In [229]:
# instantiate learning model for best alpha
Lr_tf_r = LogisticRegression(penalty='l1', C = 0.0308)


# fitting the model
Lr_tf_r.fit(X2, y2)

# predict the response
pred_Lr_tf_r = Lr_tf_r.predict(X_test2)

# Checking sparsity of array with reduced C
w3 = Lr_tf_r.coef_
print("Sparsity: " ,np.count_nonzero(w3))

print()

print("Precision = ",precision_score(y_test2, pred_Lr_tf_r, pos_label='positive'))# Precision

Sparsity:  189

Precision =  0.8998178506375227


In [230]:
# instantiate learning model for best alpha
Lr_tf_r = LogisticRegression(penalty='l1', C = 0.0108)


# fitting the model
Lr_tf_r.fit(X2, y2)

# predict the response
pred_Lr_tf_r = Lr_tf_r.predict(X_test2)

# Checking sparsity of array with reduced C
w3 = Lr_tf_r.coef_
print("Sparsity: " ,np.count_nonzero(w3))

print()

print("Precision = ",precision_score(y_test2, pred_Lr_tf_r, pos_label='positive'))# Precision

Sparsity:  45

Precision =  0.8644536652835408


In [231]:
# instantiate learning model for best alpha
Lr_tf_r = LogisticRegression(penalty='l1', C = 0.0018)


# fitting the model
Lr_tf_r.fit(X2, y2)

# predict the response
pred_Lr_tf_r = Lr_tf_r.predict(X_test2)

# Checking sparsity of array with reduced C
w3 = Lr_tf_r.coef_
print("Sparsity: " ,np.count_nonzero(w3))

print()

print("Precision = ",precision_score(y_test2, pred_Lr_tf_r, pos_label='positive'))# Precision

Sparsity:  1

Precision =  0.8413333333333334


In [232]:
# instantiate learning model for best alpha
Lr_tf_r = LogisticRegression(penalty='l1', C = 0.0004)


# fitting the model
Lr_tf_r.fit(X2, y2)

# predict the response
pred_Lr_tf_r = Lr_tf_r.predict(X_test2)

# Checking sparsity of array with reduced C
w3 = Lr_tf_r.coef_
print("Sparsity: " ,np.count_nonzero(w3))

print()

print("Precision = ",precision_score(y_test2, pred_Lr_tf_r, pos_label='positive'))# Precision

Sparsity:  0

Precision =  0.0


  'precision', 'predicted', average, warn_for)


In [234]:
# instantiate learning model for best alpha
Lr_tf_r = LogisticRegression(penalty='l1', C = 0.9)


# fitting the model
Lr_tf_r.fit(X2, y2)

# predict the response
pred_Lr_tf_r = Lr_tf_r.predict(X_test2)

# Checking sparsity of array with increased C
w3 = Lr_tf_r.coef_
print("Sparsity: " ,np.count_nonzero(w3))

print()

print("Precision = ",precision_score(y_test2, pred_Lr_tf_r, pos_label='positive'))# Precision

Sparsity:  384

Precision =  0.9253159708923784


 Word2Vec

In [25]:
# Training Word2Vec model using our own text corpus

import gensim
i=0
list_of_sent=[]
for sent in random_final['Text'].values:
    filtered_sentence=[]
    sent=cleanhtml(sent)
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            if(cleaned_words.isalpha()):    
                filtered_sentence.append(cleaned_words.lower())
            else:
                continue 
    list_of_sent.append(filtered_sentence)
    

In [26]:
w2v_model=gensim.models.Word2Vec(list_of_sent,min_count=5,size=50, workers=6)  


 Avg W2V

In [27]:
# average Word2Vec
# compute average word2vec for each review.
sent_vectors = []; # the avg-w2v for each sentence/review is stored in this list
for sent in list_of_sent: # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    cnt_words =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = w2v_model.wv[word]
            sent_vec += vec
            cnt_words += 1
        except:
            pass
    sent_vec /= cnt_words
    sent_vectors.append(sent_vec)
print(len(sent_vectors)) # length(rows) of average word2vec model
print(len(sent_vectors[0])) # dimensionality of table i.e. number of attributes

10000
50


In [28]:
# Standardizing the data with mean=0 and std.dev=1
from sklearn.preprocessing import StandardScaler
standardized_data_av = StandardScaler().fit_transform(sent_vectors)
print(standardized_data_av.shape)

(10000, 50)


In [29]:
X_av = np.array(standardized_data_av) # storing the values after standardization in a numpy array
y_av = np.array(random_final['Score']) # storing the values of Scores in numpy array

In [30]:
# split the data set into train and test, 70% train and 30% test 
X3, X_test3, y3, y_test3 = train_test_split(X_av, y_av, test_size=0.3, shuffle=False, random_state=0)

In [34]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
                               
tss_av = TimeSeriesSplit(n_splits=10) # TimeSeries Split with number of splits=10

parameters_av = [{'C': [10**-8, 10**-6, 10**-4, 10**-2, 10**0, 10**2, 10**4, 10**6, 10**8]}]# specifying the range of the hyperparameter C for GridSearch

Lr_av = LogisticRegression()

# GridSearch for finding right hyperparameter with with 10 fold CV on TimeSeriesSplit
model_av = GridSearchCV(Lr_av, parameters_av, cv=tss_av, refit=True)

# Fit the training data
model_av.fit(X3, y3)



GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [1e-08, 1e-06, 0.0001, 0.01, 1, 100, 10000, 1000000, 100000000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [35]:
model_av.best_params_ # print the best hyperparameter C

{'C': 1}

In [36]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best C
Lr_av = LogisticRegression(penalty='l2', C = 1)

# fitting the model
Lr_av.fit(X3, y3)

# predict the response
pred_Lr_av = Lr_av.predict(X_test3)

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test3, pred_Lr_av)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test3, pred_Lr_av)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test3, pred_Lr_av).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test3, pred_Lr_av, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test3, pred_Lr_av, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test3, pred_Lr_av, pos_label='positive'))# Precision



Accuracy =  0.869
Confusion Matrix
 [[ 148  312]
 [  81 2459]]
(tn, fn, fp, tp) = (148, 312, 81, 2459)
Recall =  0.9681102362204724
f1-Score =  0.9260026360384108
Precision =  0.8874052688560087


In [37]:
Lr_av.coef_ # weights of aver-w2v features

array([[ 0.08005884,  0.08280384,  0.33503268,  0.10609063, -0.51906066,
         0.37434723,  0.6006179 ,  0.04445225, -0.27361939,  0.65260072,
        -0.59205295, -0.00717538, -0.39121655,  0.04496764, -0.31790092,
         0.05801493,  0.65085597,  0.52352541,  0.21409915, -0.21894988,
        -0.0274133 ,  0.02651862,  0.56263945, -0.43338965, -0.05337297,
        -0.42315488, -0.11941252, -0.65448379, -0.81239727,  1.0579726 ,
        -0.29500718, -0.4507781 , -0.2468308 , -0.51595347,  1.33254397,
        -0.41452788, -0.03099506,  0.3503463 ,  0.03604411, -0.03547868,
        -0.76691235, -0.06297229,  0.11322419, -0.694697  ,  0.08773148,
         0.29493774,  0.58331405, -0.67724479,  0.42696756,  0.55977858]])

In [38]:
a_av = np.empty(0)
a_av = Lr_av.coef_ # storing in anumpy array

In [52]:
X_av = a_av + np.random.normal(0, 0.02) # adding a small noise in weights

In [54]:
X_av

array([[ 0.09323852,  0.09598352,  0.34821236,  0.11927032, -0.50588098,
         0.38752691,  0.61379758,  0.05763193, -0.2604397 ,  0.66578041,
        -0.57887326,  0.00600431, -0.37803687,  0.05814733, -0.30472124,
         0.07119461,  0.66403565,  0.53670509,  0.22727884, -0.2057702 ,
        -0.01423362,  0.03969831,  0.57581914, -0.42020997, -0.04019329,
        -0.4099752 , -0.10623284, -0.64130411, -0.79921758,  1.07115229,
        -0.2818275 , -0.43759841, -0.23365112, -0.50277379,  1.34572365,
        -0.40134819, -0.01781538,  0.36352598,  0.0492238 , -0.022299  ,
        -0.75373267, -0.04979261,  0.12640388, -0.68151732,  0.10091116,
         0.30811742,  0.59649373, -0.66406511,  0.44014724,  0.57295826]])

In [None]:
# As the weights don't differ significantly hence they are not collinear and we can use the weights as feature importance

In [79]:
# print top 20 positive features for positive class
print("Top 10 Positive Features for Average Word2Vec")
print()
print(w2v_model.most_similar(positive=['amazing'], topn=10))
print()
print()

# print top 20 positive features for negative class
print("Top 10 Negative Features for Average Word2Vec")
print()
print(w2v_model.most_similar(negative=['awful'], topn=10))
print()
print()


Top 10 Positive Features for Average Word2Vec

[('wonderful', 0.8924843072891235), ('fantastic', 0.883131742477417), ('fabulous', 0.8691504597663879), ('awesome', 0.8667495250701904), ('truly', 0.8283336162567139), ('terrific', 0.8055059909820557), ('awful', 0.8021843433380127), ('excellent', 0.799284815788269), ('unbeatable', 0.7787255048751831), ('smooth', 0.7784639596939087)]


Top 10 Negative Features for Average Word2Vec

[('keep', 0.1298961490392685), ('went', 0.08189871907234192), ('were', 0.06614072620868683), ('days', 0.045755207538604736), ('are', 0.043285585939884186), ('am', 0.03632929548621178), ('started', 0.017165493220090866), ('couple', 0.007540944963693619), ('im', 0.006798788905143738), ('came', 0.0007816553115844727)]




In [104]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best C
Lr_av = LogisticRegression(penalty='l1', C = 1)

# fitting the model
Lr_av.fit(X3, y3)

# predict the response
pred_Lr_av = Lr_av.predict(X_test3)

# Sparsity of av-w2v features with C
w4 = Lr_av.coef_
print("Sparsity: " ,np.count_nonzero(w4))

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test3, pred_Lr_av)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test3, pred_Lr_av)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test3, pred_Lr_av).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test3, pred_Lr_av, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test3, pred_Lr_av, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test3, pred_Lr_av, pos_label='positive'))# Precision


Sparsity:  47

Accuracy =  0.8683333333333333
Confusion Matrix
 [[ 146  314]
 [  81 2459]]
(tn, fn, fp, tp) = (146, 314, 81, 2459)
Recall =  0.9681102362204724
f1-Score =  0.9256540560888387
Precision =  0.8867652362062748


In [107]:
# instantiate learning model for best alpha
Lr_av = LogisticRegression(penalty='l1', C = 0.05)


# fitting the model
Lr_av.fit(X3, y3)

# predict the response
pred_Lr_av = Lr_av.predict(X_test3)

# Sparsity of av-w2v features with reduced C
w4 = Lr_av.coef_
print("Sparsity: " ,np.count_nonzero(w4))

print()

print("Precision = ",precision_score(y_test3, pred_Lr_av, pos_label='positive'))# Precision

Sparsity:  36

Precision =  0.8772919605077574


In [108]:
# instantiate learning model for best alpha
Lr_av = LogisticRegression(penalty='l1', C = 0.01)


# fitting the model
Lr_av.fit(X3, y3)

# predict the response
pred_Lr_av = Lr_av.predict(X_test3)

# Sparsity of av-w2v features with reduced C
w4 = Lr_av.coef_
print("Sparsity: " ,np.count_nonzero(w4))

print()

print("Precision = ",precision_score(y_test3, pred_Lr_av, pos_label='positive'))# Precision

Sparsity:  13

Precision =  0.8565587734241908


In [109]:
# instantiate learning model for best alpha
Lr_av = LogisticRegression(penalty='l1', C = 0.005)


# fitting the model
Lr_av.fit(X3, y3)

# predict the response
pred_Lr_av = Lr_av.predict(X_test3)

# Sparsity of av-w2v features with reduced C
w4 = Lr_av.coef_
print("Sparsity: " ,np.count_nonzero(w4))

print()

print("Precision = ",precision_score(y_test3, pred_Lr_av, pos_label='positive'))# Precision

Sparsity:  7

Precision =  0.8490123870103783


In [112]:
# instantiate learning model for best alpha
Lr_av = LogisticRegression(penalty='l1', C = 0.0004)


# fitting the model
Lr_av.fit(X3, y3)

# predict the response
pred_Lr_av = Lr_av.predict(X_test3)

# Sparsity of av-w2v features with reduced C
w4 = Lr_av.coef_
print("Sparsity: " ,np.count_nonzero(w4))

print()

print("Precision = ",precision_score(y_test3, pred_Lr_av, pos_label='positive'))# Precision

Sparsity:  0

Precision =  0.0


  'precision', 'predicted', average, warn_for)


In [116]:
# instantiate learning model for best alpha
Lr_av = LogisticRegression(penalty='l1', C = 10)


# fitting the model
Lr_av.fit(X3, y3)

# predict the response
pred_Lr_av = Lr_av.predict(X_test3)

# Sparsity of av-w2v features with increased C
w4 = Lr_av.coef_
print("Sparsity: " ,np.count_nonzero(w4))

print()

print("Precision = ",precision_score(y_test3, pred_Lr_av, pos_label='positive'))# Precision

Sparsity:  50

Precision =  0.8882863340563991


In [117]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_rand
                               
                              
tss_av_r = TimeSeriesSplit(n_splits=10) # TimeSeries Split with number of splits=10

parameters_av_r = {'C': sp_rand()}# specifying the range of the hyperparameter C for RandomizedSearchCV

Lr_av_r = LogisticRegression()

# RandomizedSearchCV for finding right hyperparameter with with 10 fold CV on TimeSeriesSplit
model_av_r = RandomizedSearchCV(Lr_av_r, parameters_av_r, cv=tss_av_r, refit=True)

# Fit the training data
model_av_r.fit(X3, y3)



RandomizedSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
          error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f926454e550>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [118]:
model_av_r.best_params_ # best hyperaparameter C

{'C': 0.40039032299062904}

In [119]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best C
Lr_av_r = LogisticRegression(penalty='l2', C = 0.40039032299062904)

# fitting the model
Lr_av_r.fit(X3, y3)

# predict the response
pred_Lr_av_r = Lr_av_r.predict(X_test3)

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test3, pred_Lr_av_r)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test3, pred_Lr_av_r)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test3, pred_Lr_av_r).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test3, pred_Lr_av_r, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test3, pred_Lr_av_r, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test3, pred_Lr_av_r, pos_label='positive'))# Precision



Accuracy =  0.8683333333333333
Confusion Matrix
 [[ 145  315]
 [  80 2460]]
(tn, fn, fp, tp) = (145, 315, 80, 2460)
Recall =  0.968503937007874
f1-Score =  0.9256820319849484
Precision =  0.8864864864864865


In [120]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best C
Lr_av_r = LogisticRegression(penalty='l1', C = 0.40039032299062904)

# fitting the model
Lr_av_r.fit(X3, y3)

# predict the response
pred_Lr_av_r = Lr_av_r.predict(X_test3)

# Sparsity of av-w2v features with C
w5 = Lr_av_r.coef_
print("Sparsity: " ,np.count_nonzero(w5))

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test3, pred_Lr_av_r)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test3, pred_Lr_av_r)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test3, pred_Lr_av_r).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test3, pred_Lr_av_r, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test3, pred_Lr_av_r, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test3, pred_Lr_av_r, pos_label='positive'))# Precision


Sparsity:  43

Accuracy =  0.8676666666666667
Confusion Matrix
 [[ 144  316]
 [  81 2459]]
(tn, fn, fp, tp) = (144, 316, 81, 2459)
Recall =  0.9681102362204724
f1-Score =  0.9253057384760112
Precision =  0.8861261261261262


In [122]:
# instantiate learning model for best alpha
Lr_av_r = LogisticRegression(penalty='l1', C = 0.05)


# fitting the model
Lr_av_r.fit(X3, y3)

# predict the response
pred_Lr_av_r = Lr_av_r.predict(X_test3)


# Sparsity of av-w2v features with reduced C
w5 = Lr_av_r.coef_
print("Sparsity: " ,np.count_nonzero(w5))

print()

print("Precision = ",precision_score(y_test3, pred_Lr_av_r, pos_label='positive'))# Precision

Sparsity:  36

Precision =  0.8772919605077574


In [123]:
# instantiate learning model for best alpha
Lr_av_r = LogisticRegression(penalty='l1', C = 0.01)


# fitting the model
Lr_av_r.fit(X3, y3)

# predict the response
pred_Lr_av_r = Lr_av_r.predict(X_test3)

# Sparsity of av-w2v features with reduced C
w5 = Lr_av_r.coef_
print("Sparsity: " ,np.count_nonzero(w5))

print()

print("Precision = ",precision_score(y_test3, pred_Lr_av_r, pos_label='positive'))# Precision

Sparsity:  13

Precision =  0.8565587734241908


In [126]:
# instantiate learning model for best alpha
Lr_av_r = LogisticRegression(penalty='l1', C = 0.004)


# fitting the model
Lr_av_r.fit(X3, y3)

# predict the response
pred_Lr_av_r = Lr_av_r.predict(X_test3)

# Sparsity of av-w2v features with reduced C
w5 = Lr_av_r.coef_
print("Sparsity: " ,np.count_nonzero(w5))

print()

print("Precision = ",precision_score(y_test3, pred_Lr_av_r, pos_label='positive'))# Precision

Sparsity:  7

Precision =  0.8468468468468469


In [127]:
# instantiate learning model for best alpha
Lr_av_r = LogisticRegression(penalty='l1', C = 0.0001)


# fitting the model
Lr_av_r.fit(X3, y3)

# predict the response
pred_Lr_av_r = Lr_av_r.predict(X_test3)

# Sparsity of av-w2v features with reduced C
w5 = Lr_av_r.coef_
print("Sparsity: " ,np.count_nonzero(w5))

print()

print("Precision = ",precision_score(y_test3, pred_Lr_av_r, pos_label='positive'))# Precision

Sparsity:  0

Precision =  0.0


  'precision', 'predicted', average, warn_for)


In [133]:
# instantiate learning model for best alpha
Lr_av_r = LogisticRegression(penalty='l1', C = 2.5)


# fitting the model
Lr_av_r.fit(X3, y3)

# predict the response
pred_Lr_av_r = Lr_av_r.predict(X_test3)

# Sparsity of av-w2v features with increased C
w5 = Lr_av_r.coef_
print("Sparsity: " ,np.count_nonzero(w5))

print()

print("Precision = ",precision_score(y_test3, pred_Lr_av_r, pos_label='positive'))# Precision

Sparsity:  48

Precision =  0.8880057803468208


In [None]:
TF_IDF-w2v

In [134]:
# TF-IDF weighted Word2Vec
tfidf_feat = tf_idf_vect.get_feature_names() # tfidf words/col-names
# final_tf_idf is the sparse matrix with row= sentence, col=word and cell_val = tfidf

tfidf_sent_vectors = []; # the tfidf-w2v for each sentence/review is stored in this list
row=0;
for sent in list_of_sent: # for each review/sentence
    sent_vec = np.zeros(50) # as word vectors are of zero length
    weight_sum =0; # num of words with a valid vector in the sentence/review
    for word in sent: # for each word in a review/sentence
        try:
            vec = w2v_model.wv[word]
            # obtain the tf_idfidf of a word in a sentence/review
            tfidf = final_tf_idf[row, tfidf_feat.index(word)]
            sent_vec += (vec * tf_idf)
            weight_sum += tf_idf
        except:
            pass
    sent_vec /= weight_sum
    tfidf_sent_vectors.append(sent_vec)
    row += 1
    

    



In [135]:
print(len(tfidf_sent_vectors)) # number of rows in tf-idf-w2v
print(len(tfidf_sent_vectors[0])) # number of features

10000
50


In [136]:
df = pd.DataFrame(tfidf_sent_vectors) # converting to dataframe 

In [137]:
df.isnull().any() # check if there is any Nan entry

0     True
1     True
2     True
3     True
4     True
5     True
6     True
7     True
8     True
9     True
10    True
11    True
12    True
13    True
14    True
15    True
16    True
17    True
18    True
19    True
20    True
21    True
22    True
23    True
24    True
25    True
26    True
27    True
28    True
29    True
30    True
31    True
32    True
33    True
34    True
35    True
36    True
37    True
38    True
39    True
40    True
41    True
42    True
43    True
44    True
45    True
46    True
47    True
48    True
49    True
dtype: bool

In [138]:
df = df.fillna(0) # fill with 0 inplace of NaN

In [139]:
# Standardizing the data with mean=0 and std.dev=1
from sklearn.preprocessing import StandardScaler
standardized_data_av_tf = StandardScaler().fit_transform(df)
print(standardized_data_av_tf.shape)

(10000, 50)


In [140]:
X_av_tf = np.array(standardized_data_av_tf) # storing the values after standardization in a numpy array
y_av_tf = np.array(random_final['Score']) # storing the values of Scores in numpy array

In [141]:
# split the data set into train and test, 70% train and 30% test 
X4, X_test4, y4, y_test4 = train_test_split(X_av_tf, y_av_tf, test_size=0.3, shuffle=False, random_state=0)

In [142]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
                               
tss_av_tf = TimeSeriesSplit(n_splits=10) # TimeSeries Split with number of splits=10

parameters_av_tf = [{'C': [10**-8, 10**-6, 10**-4, 10**-2, 10**0, 10**2, 10**4, 10**6, 10**8]}]# specifying the range of the hyperparameter C for GridSearch

Lr_av_tf = LogisticRegression()

# GridSearch for finding right hyperparameter with with 10 fold CV on TimeSeriesSplit
model_av_tf = GridSearchCV(Lr_av_tf, parameters_av_tf, cv=tss_av_tf, refit=True)

# Fit the training data
model_av_tf.fit(X4, y4)



GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
       error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'C': [1e-08, 1e-06, 0.0001, 0.01, 1, 100, 10000, 1000000, 100000000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [145]:
model_av_tf.best_params_ # best hyperaparamter C

{'C': 1e-08}

In [146]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best C
Lr_av_tf = LogisticRegression(penalty='l2', C = 1e-08)

# fitting the model
Lr_av_tf.fit(X4, y4)

# predict the response
pred_Lr_av_tf = Lr_av_tf.predict(X_test4)

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test4, pred_Lr_av_tf)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test4, pred_Lr_av_tf)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test4, pred_Lr_av_tf).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test4, pred_Lr_av_tf, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test4, pred_Lr_av_tf, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test4, pred_Lr_av_tf, pos_label='positive'))# Precision



Accuracy =  0.8466666666666667
Confusion Matrix
 [[   0  460]
 [   0 2540]]
(tn, fn, fp, tp) = (0, 460, 0, 2540)
Recall =  1.0
f1-Score =  0.9169675090252708
Precision =  0.8466666666666667


In [147]:
a_av_tf = np.empty(0)
a_av_tf = Lr_av_tf.coef_ # put the weights in a numpy array

In [148]:
a_av_tf

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]])

In [160]:
X_av_tf = a_av_tf + np.random.normal(0, 0.02) # adding a small noise

In [195]:
X_av_tf

array([[0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857,
        0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857,
        0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857,
        0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857,
        0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857,
        0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857,
        0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857,
        0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857, 0.0094857,
        0.0094857, 0.0094857]])

In [196]:
# As seen that they are changing i.e. thay are not collinear. So, we can take edge weights as feature importance 

In [213]:
# print top 20 positive features for positive class
print("Top 10 Positive Features for Average Word2Vec")
print()
print( w2v_model.most_similar(positive=['amazing'], topn=10))
print()
print()

# print top 20 positive features for negative class
print("Top 10 Negative Features for Average Word2Vec")
print()
print(w2v_model.most_similar(negative=['awful'], topn=10))
print()
print()


Top 10 Positive Features for Average Word2Vec

[('wonderful', 0.8924843072891235), ('fantastic', 0.883131742477417), ('fabulous', 0.8691504597663879), ('awesome', 0.8667495250701904), ('truly', 0.8283336162567139), ('terrific', 0.8055059909820557), ('awful', 0.8021843433380127), ('excellent', 0.799284815788269), ('unbeatable', 0.7787255048751831), ('smooth', 0.7784639596939087)]


Top 10 Negative Features for Average Word2Vec

[('keep', 0.1298961490392685), ('went', 0.08189871907234192), ('were', 0.06614072620868683), ('days', 0.045755207538604736), ('are', 0.043285585939884186), ('am', 0.03632929548621178), ('started', 0.017165493220090866), ('couple', 0.007540944963693619), ('im', 0.006798788905143738), ('came', 0.0007816553115844727)]




In [164]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best C
Lr_av_tf = LogisticRegression(penalty='l1', C = 1e-08)

# fitting the model
Lr_av_tf.fit(X4, y4)

# predict the response
pred_Lr_av_tf = Lr_av_tf.predict(X_test4)

# Sparsity of av-w2v features with C
w6 = Lr_av_tf.coef_
print("Sparsity: " ,np.count_nonzero(w6))

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test4, pred_Lr_av_tf)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test4, pred_Lr_av_tf)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test4, pred_Lr_av_tf).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test4, pred_Lr_av_tf, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test4, pred_Lr_av_tf, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test4, pred_Lr_av_tf, pos_label='positive'))# Precision


Sparsity:  0

Accuracy =  0.15333333333333332
Confusion Matrix
 [[ 460    0]
 [2540    0]]
(tn, fn, fp, tp) = (460, 0, 2540, 0)
Recall =  0.0
f1-Score =  0.0
Precision =  0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [168]:
# instantiate learning model for best alpha
Lr_av_tf = LogisticRegression(penalty='l1', C = 1e-01)


# fitting the model
Lr_av_tf.fit(X4, y4)

# predict the response
pred_Lr_av_tf = Lr_av_tf.predict(X_test3)

# Sparsity of av-w2v features with reduced C
w6 = Lr_av_tf.coef_
print("Sparsity: " ,np.count_nonzero(w6))

print()

print("Precision = ",precision_score(y_test4, pred_Lr_av_tf, pos_label='positive'))# Precision

Sparsity:  0

Precision =  0.8466666666666667


In [169]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_rand
                               
                              
tss_av_tf_r = TimeSeriesSplit(n_splits=10) # TimeSeries Split with number of splits=10

parameters_av_tf_r = {'C': sp_rand()}# specifying the range of the hyperparameter C for RandomizedSearchCV

Lr_av_tf_r = LogisticRegression()

# RandomizedSearchCV for finding right hyperparameter with with 10 fold CV on TimeSeriesSplit
model_av_tf_r = RandomizedSearchCV(Lr_av_tf_r, parameters_av_tf_r, cv=tss_av_tf_r, refit=True)

# Fit the training data
model_av_tf_r.fit(X4, y4)



RandomizedSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=10),
          error_score='raise',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f9257e3bb38>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [170]:
model_av_tf_r.best_params_ # best hyperparameter C

{'C': 0.41034631322795734}

In [171]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best C
Lr_av_tf_r = LogisticRegression(penalty='l2', C = 0.41034631322795734)

# fitting the model
Lr_av_tf_r.fit(X4, y4)

# predict the response
pred_Lr_av_tf_r = Lr_av_tf_r.predict(X_test4)

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test4, pred_Lr_av_tf_r)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test4, pred_Lr_av_tf_r)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test4, pred_Lr_av_tf_r).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test4, pred_Lr_av_tf_r, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test4, pred_Lr_av_tf_r, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test4, pred_Lr_av_tf_r, pos_label='positive'))# Precision



Accuracy =  0.8466666666666667
Confusion Matrix
 [[   0  460]
 [   0 2540]]
(tn, fn, fp, tp) = (0, 460, 0, 2540)
Recall =  1.0
f1-Score =  0.9169675090252708
Precision =  0.8466666666666667


In [190]:
from sklearn.metrics import recall_score, f1_score, confusion_matrix, accuracy_score, precision_score


# instantiate learning model for best alpha
Lr_av_tf_r = LogisticRegression(penalty='l1', C = 0.41034631322795734)

# fitting the model
Lr_av_tf_r.fit(X4, y4)

# predict the response
pred_Lr_av_tf_r = Lr_av_tf_r.predict(X_test4)

# Sparsity of av-w2v features with C
w7 = Lr_av_tf.coef_
print("Sparsity: " ,np.count_nonzero(w7))

print()

# evaluate various performance measures:
print("Accuracy = ",accuracy_score(y_test4, pred_Lr_av_tf_r)) # accuracy
print("Confusion Matrix\n", confusion_matrix(y_test4, pred_Lr_av_tf_r)) # Confusion Matrix
tn, fn, fp, tp = confusion_matrix(y_test4, pred_Lr_av_tf_r).ravel() # tn, fn, fp, tp
print("(tn, fn, fp, tp) =",(tn, fn, fp, tp))
print("Recall = ",recall_score(y_test4, pred_Lr_av_tf_r, pos_label='positive')) # Recall
print("f1-Score = "  ,f1_score(y_test4, pred_Lr_av_tf_r, pos_label='positive')) # f1-Score
print("Precision = ",precision_score(y_test4, pred_Lr_av_tf_r, pos_label='positive'))# Precision


Sparsity:  0

Accuracy =  0.8466666666666667
Confusion Matrix
 [[   0  460]
 [   0 2540]]
(tn, fn, fp, tp) = (0, 460, 0, 2540)
Recall =  1.0
f1-Score =  0.9169675090252708
Precision =  0.8466666666666667


In [193]:
# instantiate learning model for best alpha
Lr_av_tf_r = LogisticRegression(penalty='l1', C = 0.01)


# fitting the model
Lr_av_tf_r.fit(X4, y4)

# predict the response
pred_Lr_av_tf_r = Lr_av_tf.predict(X_test4)

# Sparsity of av-w2v features with reduced C
w7 = Lr_av_tf_r.coef_
print("Sparsity: " ,np.count_nonzero(w7))

print()

print("Precision = ",precision_score(y_test4, pred_Lr_av_tf_r, pos_label='positive'))# Precision

Sparsity:  0

Precision =  0.0


  'precision', 'predicted', average, warn_for)
