# Sentiment analysis on Amazon reviews on unlocked phone
Source: Kaggle.com

## Data Prep

In [11]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [12]:
reviews_df = pd.read_csv("Amazon_Unlocked_Mobile.csv")
reviews_df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [7]:
reviews_df.shape

(413840, 6)

In [17]:
reviews_df["Brand Name"].value_counts().nlargest(5)

Samsung       65747
BLU           63248
Apple         58186
LG            22417
BlackBerry    16872
Name: Brand Name, dtype: int64

In [23]:
reviews_df[::100000]

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
100000,BlackBerry Storm 2 9550 Unlocked Phone - No Wa...,BlackBerry,79.99,2,"After three weeks, I began to have problems wi...",0.0
200000,HUAWEI Ascend P7 P7-L10 16GB Unlocked GSM 4G L...,Huawei,182.99,5,exelente,0.0
300000,POSH MOBILE TITAN PRO HD ANDROID GSM UNLOCKED ...,Posh Mobile,97.59,5,This little phone is so cool and usefull,0.0
400000,Sprint LG Marquee LS855 Black (CDMA) Android S...,,44.99,4,Decent little phone with basic functions. Ok f...,0.0


## Data Cleaning

In [25]:
reviews_df.isna().sum()

Product Name        0
Brand Name      65171
Price            5933
Rating              0
Reviews            62
Review Votes    12296
dtype: int64

In [38]:
reviews_new = reviews_df.copy()

In [39]:
#Drop missing values
reviews_new.dropna(inplace = True)
#Remove neutral values
reviews_new = reviews_new[reviews_new["Rating"]!=3]

In [40]:
reviews_new.shape

(308277, 6)

In [41]:
#Encode 1,2 as 0 (negative reviews) and encode 4,5 as 1 (positive reviews)
reviews_new["Positively Rated"] = np.where(reviews_new["Rating"]>3,1,0)

### CountVectorizer - simplest version

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
X_train, X_test, y_train, y_test = train_test_split(reviews_new["Reviews"], reviews_new["Positively Rated"], random_state = 24)

In [70]:
vectorizer = CountVectorizer()
v_fit = vectorizer.fit(X_train)
X_train_v = v_fit.transform(X_train)

In [123]:
X_train_v.shape

(231207, 53572)

In [104]:
features = np.array(v_fit.get_feature_names())

### Logistic Regression 

In [71]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [138]:
lr = LogisticRegression(solver = "liblinear")

In [73]:
lr.fit(X_train_v, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [74]:
y_pred = lr.predict(v_fit.transform(X_test))

In [75]:
roc_auc_score(y_pred, y_test)

0.9418624726982646

In [115]:
args = lr.coef_[0].argsort()
top_positive_words = features[args[:10]]
top_negative_words = features[args[-10:]]

In [116]:
print(top_positive_words)
print(top_negative_words)

['worst' 'mony' 'junk' 'worthless' 'garbage' 'lemon' 'cuts' 'useless'
 'false' 'nope']
['lovely' 'awesome' 'love' 'perfecto' 'loves' 'loving' 'excellent'
 'exelente' 'excelente' 'excelent']


### Countvectorizer - removing stop words, taking only words in 10 or more docs, lower case, n grams

In [117]:
from sklearn.feature_extraction.text import CountVectorizer

In [133]:
v_fit2 = CountVectorizer(stop_words= "english", min_df = 10, ngram_range = (1,2) ).fit(X_train)

In [140]:
X_train_v2 = v_fit2.transform(X_train)

In [141]:
lr.fit(X_train_v2,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [143]:
y_pred2 = lr.predict(v_fit2.transform(X_test))

In [144]:
roc_auc_score(y_pred2, y_test)

0.9549209280631135

In [154]:
features2 = np.array(v_fit2.get_feature_names())

In [155]:
args2 = lr.coef_[0].argsort()

In [158]:
features2[args2[:20]]

array(['junk', 'ok best', 'worst', 'garbage', 'ok better', 'horrible',
       'pos', 'price ok', 'blacklist', 'product doesn', 'good going',
       'cam ok', 'nope', 'useless', 'ok complains', 'worthless',
       'great support', 'cuts', 'terrible', 'looks ok'], dtype='<U29')

In [159]:
features2[args2[-20:]]

array(['bueno', 'awsome', 'exellent', 'exelent', 'downside', 'does need',
       'superb', 'great', 'perfectly', 'perfecto', 'amazing', 'awesome',
       'loving', 'love', 'perfect', 'exelente', 'loves', 'excellent',
       'excelent', 'excelente'], dtype='<U29')

### Tf-idf

In [160]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [163]:
v_fit_ti = TfidfVectorizer(stop_words ="english", min_df = 10, ngram_range= (1,2)).fit(X_train)

In [164]:
X_train_vti = v_fit_ti.transform(X_train)

In [165]:
lr.fit(X_train_vti, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [166]:
y_preds_ti = lr.predict(v_fit_ti.transform(X_test))

In [168]:
roc_auc_score(y_preds_ti, y_test)

0.944460956150688

In [169]:
args_ti = lr.coef_[0].argsort()

In [170]:
features_ti = np.array(v_fit_ti.get_feature_names())

In [171]:
features_ti[args_ti[:10]]

array(['return', 'worst', 'disappointed', 'terrible', 'poor', 'useless',
       'horrible', 'returning', 'waste', 'slow'], dtype='<U29')

In [173]:
features_ti[args_ti[-10:]]

array(['easy', 'best', 'awesome', 'far', 'loves', 'amazing', 'perfect',
       'excellent', 'love', 'great'], dtype='<U29')