In [10]:
import pandas as pd
import numpy as np

df = pd.read_csv("Amazon_Unlocked_Mobile.csv")
#df.head()

## Panda also provide mechanism to sample the data
#df = df.sample(frac=0.1, random_state=10)
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [11]:
## preprocessing the data
df.dropna(inplace=True)
# remove any neutral rating
df = df[df['Rating'] != 3]

df.loc[:,'Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)


In [12]:
## mean showcase that most of the review is positively rated
df['Positively Rated'].mean()

0.7482686025879323

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['Positively Rated'], random_state=0)

In [18]:
print("X_train Shape: ", X_train.shape)

X_train Shape:  (231207,)


### CountVectorizer

##### Convert a collection of text documents to a matrix of token counts

##### This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.

##### If you do not provide an a-priori dictionary and you do not use an analyzer that does some kind of feature selection then the number of features will be equal to the vocabulary size found by analyzing the data.

In [46]:
from sklearn.feature_extraction.text import CountVectorizer

vect= CountVectorizer()

In [47]:
X_train_vectorized = vect.fit_transform(X_train)

In [48]:
X_test_vectorized = vect.transform(X_test)

In [49]:
## With Spare data/matric LogisticRegression is best suited and also because 
## output is binary (1 or 0)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(X_test_vectorized)

print("AUC Score: ", roc_auc_score(y_test, predictions))

AUC Score:  0.9268686014350423


In [43]:
X_test_vectorized

<77070x53216 sparse matrix of type '<class 'numpy.int64'>'
	with 2046809 stored elements in Compressed Sparse Row format>

In [44]:
X_train_vectorized

<231207x53216 sparse matrix of type '<class 'numpy.int64'>'
	with 6117776 stored elements in Compressed Sparse Row format>

In [51]:
feature_names = np.array(vect.get_feature_names())

In [52]:
len(feature_names)

53216

In [56]:
## Sort coefficient from model
sorted_model_coef = model.coef_[0].argsort()

In [58]:
print("Smallest coeff: ", feature_names[sorted_model_coef[:10]])
print("Largest coef: ", feature_names[sorted_model_coef][-11:-1])

Smallest coeff:  ['worst' 'false' 'worthless' 'junk' 'mony' 'garbage' 'useless' 'messing'
 'unusable' 'blacklist']
Largest coef:  ['love' 'lovely' 'amazing' 'perfecto' 'efficient' 'loves' 'loving'
 'excellent' 'exelente' 'excelente']


### Tfidf
#### term frequecy inverse document frequency
##### terms with frequent occurance in particular document but not in all the document are given higher weightage, vs terms that appear more frequent across various document. 

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5
vect = TfidfVectorizer(min_df =5).fit(X_train)
len(vect.get_feature_names())

17951

In [62]:
X_train_vectorized = vect.transform(X_train)

X_test_vectorized = vect.transform(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(X_test_vectorized)

aucscore = roc_auc_score(y_test, predictions)

print ("AUC Score: ", aucscore)

AUC Score:  0.9266100666746837


In [63]:
feature_name = np.array(vect.get_feature_names())

In [64]:
feature_name[:10]

array(['00', '000', '0000', '000000', '000mah', '007', '00pm', '01', '02',
       '03'], dtype='<U31')

In [75]:
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print("Smallest Tfidf ", feature_name[sorted_tfidf_index[:10]])
print("Largest Tfidf ", feature_name[sorted_tfidf_index[-11:-1]])

Smallest Tfidf  ['commenter' 'pthalo' 'warmness' 'storageso' 'aggregration' '1300'
 '625nits' 'a10' 'submarket' 'brawns']
Largest Tfidf  ['goog' 'excellant' 'problems' 'aceptable' 'basico' 'goood' 'luis' 'epic'
 'gooood' 'batteries']


In [76]:
sorted_coef_index = model.coef_[0].argsort()

print("Smallest Model Coef ", feature_name[sorted_coef_index[:10]])
print("Largest Model Coef ", feature_name[sorted_coef_index[-11:-1]])

Smallest Model Coef  ['not' 'worst' 'useless' 'disappointed' 'terrible' 'return' 'waste' 'poor'
 'horrible' 'doesn']
Largest Model Coef  ['far' 'loves' 'best' 'easy' 'perfectly' 'awesome' 'amazing' 'perfect'
 'excellent' 'great']


In [77]:
## Model here treats both positive and negative review the same. And assigns Predicted Rating as 0
print(model.predict(vect.transform(['not an issue, phone is working', 'an issue, phone is not working'])))

[0 0]


### n-grams
#### using countervectorization with i/p parameter of ngram_range

In [78]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

vect = CountVectorizer(min_df=5, ngram_range=(1, 2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

X_test_vectorized = vect.transform(X_test)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

prediction = model.predict(X_test_vectorized)
print("AUC Score: ", roc_auc_score(y_test, prediction))


AUC Score:  0.9266100666746837


In [80]:
print("AUC Score: ", roc_auc_score(y_test, prediction))

AUC Score:  0.9671263879424379


In [82]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print("Smallest Coef Index", feature_names[sorted_coef_index[:30]])
print("Largest Coef Index", feature_names[sorted_coef_index[-11:-1]])

Smallest Coef Index ['no good' 'worst' 'junk' 'not good' 'not happy' 'horrible' 'garbage'
 'terrible' 'looks ok' 'nope' 'not very' 'defective' 'sucks' 'useless'
 'one star' 'not satisfied' 'poor' 'product good' 'not worth' 'awful'
 'never worked' 'didn like' 'broken' 'ok not' 'at best' 'disappointed'
 'like game' 'noooooo' 'them all' 'fake']
Largest Coef Index ['amazing' 'great' 'no issues' 'awesome' 'exelente' 'no problems'
 'perfect' 'excellent' 'excelente' 'excelent']


In [81]:
print(model.predict(vect.transform(['not an issue, phone is working','an issue, phone is not working'])))

[1 0]
