### Import Libaries

In [1]:
import pandas as  pd
import numpy as np

from sklearn.model_selection import train_test_split


### Data Prep

In [2]:
df = pd.read_csv("Amazon_Unlocked_Mobile.csv")
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [4]:
#drop missing value
df.dropna(inplace = True)

#avoiding rating 3 who are neutral
df = df[df['Rating']!= 3]

#add new col for binary output 1 and 0
df['Positively Rating'] = np.where(df['Rating']>3, 1, 0)

df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rating
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0,1
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0,1
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0,1
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0,1
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0,1


In [5]:
#to see data imbalance
df['Positively Rating'].mean()

0.7482686025879323

In [6]:
df['Positively Rating'].value_counts()


1    230674
0     77603
Name: Positively Rating, dtype: int64

### Data Splitting

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['Positively Rating'],
                                                    test_size = 0.3, random_state = 0)

In [10]:
print("X_train 1st entry:\n\n", X_train[0])
print("\n Shape of X_train:\n", X_train.shape)

X_train 1st entry:

 I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!

 Shape of X_train:
 (215793,)


### CountVectorization

1. tokenizing each documents by finding wordSeq, Characters/numbers at least two letters
2. separated by word boundaries
3. converts everthings to lowercase
4.build vocabulary using token

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(X_train)


In [12]:
#get vocabular for 1000 features
count_list = vect.get_feature_names()[::1000]
sorted(count_list)

['00',
 '20mis',
 '500mah',
 '_as',
 'aewsome',
 'animales',
 'atributos',
 'batteries',
 'bluowner',
 'buystry',
 'centre',
 'coincides',
 'conserves',
 'crushes',
 'defraudadas',
 'direction',
 'dsiapointed',
 'enero',
 'excellentmuch',
 'faxes',
 'fountain',
 'gestionar',
 'guessed',
 'hint',
 'imperfection',
 'internalo',
 'jpg',
 'lego',
 'lr',
 'memoryi',
 'monthsunlocked',
 'net',
 'ocurre',
 'overally',
 'performancewhat',
 'poca',
 'procedures',
 'quemaron',
 'recoup',
 'respectivos',
 'sacrifices',
 'sencondone',
 'sisiter',
 'soundvery',
 'streaming',
 't68i',
 'thicknesscons',
 'transiitons',
 'unfortunally',
 'varify',
 'watches',
 'workswell']

note: Very messy including words with number, wrong spelling

In [13]:
len(vect.get_feature_names())

51932

### Transform CountVectorized Vocabulary for Bag of Words

In [14]:
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<215793x51932 sparse matrix of type '<class 'numpy.int64'>'
	with 5718068 stored elements in Compressed Sparse Row format>

### Model Buiding

In [15]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

In [16]:
from sklearn.metrics import roc_auc_score

prediction = model.predict(vect.transform(X_test))
prediction

array([1, 0, 1, ..., 1, 1, 1])

In [17]:
print("AUC: ", roc_auc_score(y_test, prediction)) 

AUC:  0.9185889373890636


In [18]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

sorted_coef_index

array([51047, 20782, 26081, ..., 18128, 17958, 17959])

In [19]:
print('small coefficient: \n {} \n'.format(feature_names[sorted_coef_index[:10]]))

small coefficient: 
 ['worst' 'garbage' 'junk' 'unusable' 'useless' 'false' 'disappointing'
 'worthless' 'waste' 'awful'] 



In [20]:
print('above suggests negative review')

above suggests negative review


In [21]:
print('largest coefficient: \n {} \n'.format(feature_names[sorted_coef_index[:-11:-1]]))

largest coefficient: 
 ['excelente' 'excelent' 'exelente' 'loves' 'loving' 'excellent' 'love'
 'perfecto' 'complaints' 'perfect'] 



In [22]:
print('above suggests positive review')

above suggests positive review


### TfidfVectorizer

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
vect =  TfidfVectorizer().fit(X_train)
vect

In [25]:
len(vect.get_feature_names())

51932

note: features with high tfidf have high frequency in documents. TfidfVectorizer has processed same tokenization like CountVectorizer

min_df: minimum frequecy of a word in Document and reduce features to avoid overfitting

In [26]:
vect =  TfidfVectorizer(min_df =5).fit(X_train) #min_df: minimum frequecy of a word in Document 
vect

In [27]:
len(vect.get_feature_names())

17391

In [28]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

prediction = model.predict(vect.transform(X_test))
prediction

array([1, 0, 1, ..., 1, 1, 1])

In [29]:
X_train_vectorized[0]

<1x17391 sparse matrix of type '<class 'numpy.float64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [30]:
print('AUC: ', roc_auc_score(y_test, prediction))

AUC:  0.9257627946433948


In [31]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

sorted_coef_index

array([10538, 17165, 16389, ...,  5954,  7283,  9389])

In [32]:
print('small coefficient: \n {} \n'.format(feature_names[sorted_coef_index[:10]]))

small coefficient: 
 ['not' 'worst' 'useless' 'terrible' 'waste' 'disappointed' 'return' 'poor'
 'horrible' 'doesn'] 



In [33]:
print('above suggests negative review')

above suggests negative review


In [34]:
print('largest coefficient: \n {} \n'.format(feature_names[sorted_coef_index[:-11:-1]]))

largest coefficient: 
 ['love' 'great' 'excellent' 'perfect' 'amazing' 'awesome' 'perfectly'
 'easy' 'best' 'loves'] 



In [35]:
print('above suggests positive review')

above suggests positive review


In [36]:
print(model.predict(vect.transform(['not an issue, phone is working', 
                                    'an issue, phone is working'])))

[0 1]


### n-grams

sequence of words feature are known as n-grams. 

**Unigrams:** not, is, working

**Bigrams:** is working, not working

**Trigrams:** is not working

**ngram_range:** min:1, max:2

In [37]:
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['Positively Rating'],
                                                    test_size = 0.3, random_state = 0)

In [38]:
# document frequency of 5 and extracting 1-grams and 2-grams
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)


In [39]:
len(vect.get_feature_names())

189258

In [40]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC and ROC Score: ', roc_auc_score(y_test, predictions))



AUC and ROC Score:  0.9622163457383004


In [41]:
feature_names = np.array(vect.get_feature_names())
feature_names

array(['00', '00 activation', '00 also', ..., 'աɨtɦ ʍʏ', 'աօʀҡ',
       'աօʀҡ աɨtɦ'], dtype='<U34')

In [42]:
sorted_coef_index = model.coef_[0].argsort()

sorted_coef_index

array([105858, 186426, 107233, ..., 106897,  53936,  53895])

In [43]:
print('Smallest Coefficient:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))

Smallest Coefficient:
['no good' 'worst' 'not good' 'not happy' 'junk' 'not worth' 'garbage'
 'not satisfied' 'terrible' 'horrible']



In [44]:
print('above suggests negative review')

above suggests negative review


In [45]:
print('Largeest Coefficient:\n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))

Largeest Coefficient:
['excelent' 'excelente' 'not bad' 'exelente' 'excellent' 'perfect'
 'awesome' 'no problems' 'no issues' 'amazing']



In [46]:
print('above suggests positive review')

above suggests positive review


In [47]:
print(model.predict(vect.transform(['not an issue, phone is working', 
                                    'an issue, phone is working', 'phone is not performing'])))

[1 1 0]
