<h1>Sentiment Analysis</h1>

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Read Amazon_Unlocked_Mobile dataset

df = pd.read_csv('Amazon_unlocked_Mobile.csv')

# Look at the dataset

df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


Firstly we clean up the dataset

In [5]:
# Drop any row with missing values

df.dropna(inplace=True)

In [6]:
# Remove any rating equal to 3, as we assume them negative
df = df[df['Rating']!=3]

In [7]:
# Created a new column with shows 1 for any rating greater than 3 
# and else shows 0

df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)

In [9]:
# Look at the dataset
df.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0,1
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0,1
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0,1
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0,1
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0,1
5,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,1,I already had a phone with problems... I know ...,1.0,0
6,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,The charging port was loose. I got that solder...,0.0,0
7,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,2,"Phone looks good but wouldn't stay charged, ha...",0.0,0
8,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I originally was using the Samsung S2 Galaxy f...,0.0,1
11,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,This is a great product it came after two days...,0.0,1


In [10]:
# Look at the mean 
df['Positively Rated'].mean()

0.7482686025879323

This shows we have imbalanced classes

In [13]:
# Split the dataset into training and testing dataset

from sklearn.model_selection import train_test_split

X_train, x_test, Y_train, Y_test = train_test_split(df['Reviews'], df['Positively Rated'], random_state=0)

In [20]:
# Look at the training dataset
X_train[0]

"I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!"

In [21]:
X_train[100]

'excelente'

In [18]:
# dimension  of training dataset

X_train.shape

(231207,)

We are having 231207 rows of reviews

<h2>CountVectorizer</h2>

converting the text data into numerical data so that we can use it in sklearn

In [22]:
# apply the bag-of-words approach
# to get the vocabulary generated after fitting CountVectorizer to the trainig dataset

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(X_train)

In [26]:
# Have a look on the vocabulary
vect.get_feature_names()[::2000]

['00',
 '4less',
 'adr6275',
 'assignment',
 'blazingly',
 'cassettes',
 'condishion',
 'debi',
 'dollarsshipping',
 'esteem',
 'flashy',
 'gorila',
 'human',
 'irullu',
 'like',
 'microsaudered',
 'nightmarish',
 'p770',
 'poori',
 'quirky',
 'responseive',
 'send',
 'sos',
 'synch',
 'trace',
 'utiles',
 'withstanding']

This vocabulary is built on any tokens that occurred in the training data 

In [27]:
len(vect.get_feature_names())

53216

Number of features we have  = 53216

In [28]:
X_train_vectorized = vect.transform(X_train)

X_train_vectorized

<231207x53216 sparse matrix of type '<class 'numpy.int64'>'
	with 6117776 stored elements in Compressed Sparse Row format>

Train the model using LogisticRegression

In [30]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vectorized, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [32]:
# Make prediction on X_test after transforming X_test dataset
from sklearn.metrics import roc_auc_score
# Any words in X_test that didn't appeared in X_train will be ignored 
predictions = model.predict(vect.transform(x_test))
roc_auc_score(Y_test, predictions)

0.92799846340909387

In [33]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['worst' 'false' 'mony' 'worthless' 'junk' 'garbage' 'messing' 'useless'
 'blacklist' 'unsatisfied']

Largest Coefs: 
['excelent' 'excelente' 'exelente' 'excellent' 'loving' 'efficient' 'loves'
 'perfecto' 'lovely' 'amazing']


<h2>Tfidf (Term frequency inverse document frequency)</h2>

It allow us to weight terms based on how important they are to a document

High weight is given to terms that appears often in a particular document, but don't appear often on the corpus

Features with low Tfidf are either commonly used across all the documents or rarely used and only occur in long documents

And features with high Tfidf are vice-versa

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
# min_df = 5 : remove any vocabulary that appear in fewer than 5 documents

vect = TfidfVectorizer(min_df=5).fit(X_train)

In [36]:
len(vect.get_feature_names())

17951

Number of features we have now  = 17951

In [39]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, Y_train)

predictions = model.predict(vect.transform(x_test))

print('AUC: ', roc_auc_score(Y_test, predictions))

AUC:  0.926610066675


Accuracy doest not change but we get the same accuracy even after reducing the number of features

In [40]:
feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['commenter' 'pthalo' 'warmness' 'storageso' 'aggregration' '1300'
 '625nits' 'a10' 'submarket' 'brawns']

Largest tfidf: 
['defective' 'batteries' 'gooood' 'epic' 'luis' 'goood' 'basico'
 'aceptable' 'problems' 'excellant']


In [41]:
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['not' 'worst' 'useless' 'disappointed' 'terrible' 'return' 'waste' 'poor'
 'horrible' 'doesn']

Largest Coefs: 
['love' 'great' 'excellent' 'perfect' 'amazing' 'awesome' 'perfectly'
 'easy' 'best' 'loves']


In [42]:
# These reviews are treated the same by our current model
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[0 0]


<h1>n-grams</h1>

In [43]:
# Fit the CountVectorizer to the training data specifiying a minimum 
# document frequency of 5 and extracting 1-grams and 2-grams
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

198917

Number of featues increases by adding n-grams to 198917

In [45]:
model = LogisticRegression()
model.fit(X_train_vectorized, Y_train)

predictions = model.predict(vect.transform(x_test))

print('AUC: ', roc_auc_score(Y_test, predictions))

AUC:  0.96715244318


Now the accuracy increase to 96.7%

In [46]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['no good' 'worst' 'junk' 'not good' 'not happy' 'horrible' 'garbage'
 'terrible' 'looks ok' 'nope']

Largest Coefs: 
['not bad' 'excelent' 'excelente' 'excellent' 'perfect' 'no problems'
 'exelente' 'awesome' 'no issues' 'great']


In [47]:
# These reviews are now correctly identified
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[1 0]


1 is positive review and 0 is negative review

In [51]:
print(model.predict(vect.transform(['having bad issue with gadget ',
                                    'everything working great'])))

[0 1]
