In [16]:
import pandas as pd
import numpy as np
import re
import nltk

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem.porter import PorterStemmer
english_stemmer=nltk.stem.SnowballStemmer('english')

In [17]:
def preprocess_word(word):
    # Remove punctuation
    word = word.strip('\'"?!,.():;')
    # Convert more than 2 letter repetitions to 2 letter
    # funnnnny --> funny
    word = re.sub(r'(.)\1+', r'\1\1', word)
    # Remove - & '
    word = re.sub(r'(-|\')', '', word)
    #Stem the word PorterStemmer()
    stemmer = english_stemmer
    word = stemmer.stem(word)
    return word

In [18]:
def is_valid_word(word):
	# Check if word begins with an alphabet
	return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)


In [19]:
def handle_emojis(text):
	# Smile -- :), : ), :-), (:, ( :, (-:, :')
	text = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS ', text)
	# Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
	text = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS ', text)
	# Love -- <3, :*
	text = re.sub(r'(<3|:\*)', ' EMO_POS ', text)
	# Wink -- ;-), ;), ;-D, ;D, (;,  (-;
	text = re.sub(r'(;-?\)|;-?D|\(-?;)', ' EMO_POS ', text)
	# Sad -- :-(, : (, :(, ):, )-:
	text = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG ', text)
	# Cry -- :,(, :'(, :"(
	text = re.sub(r'(:,\(|:\'\(|:"\()', ' EMO_NEG ', text)
	return text

In [20]:
def remove_links(text):
	# Replace URLS by URL
	text = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', 'URL ', text)
	# Replace EMAILS by EMAIL
	text = re.sub(r'(\w+)@([A-Z0-9]+)\.([A-Z]{2,4})', 'EMAIL', text)
	# Replace hashtags by the hashtag
	text = re.sub(r'#(\S+)', r' \1 ', text)
	return text


In [21]:
def remove_spaces(text):
	# Replace tab with space
	text.replace('\t',' ')
	# Remove new lines 
	text.replace('\n',' ')
	text.replace('\r', ' ')
	# Replace 2+ dots with space
	text = re.sub(r'\.{2,}', ' ', text)
	# Strip space, " and ' from text
	text = text.strip(' "\'')
	# Replace multiple spaces with a single space
	text = re.sub(r'\s+', ' ', text)
	return text

In [22]:

def compelete_clean(text):
	if text:
		# Convert to lower case
		final_text = []
		text = text.lower()
		text =remove_links(text)
		text =handle_emojis(text)
		text =remove_spaces(text)
		words = text.split()
		for word in words:
			if is_valid_word(word):
				final_text.append(preprocess_word(word))
		return ' '.join(final_text)
	return ''



In [23]:
def convert_reviews(x):
	if x > 3 :
		return 'POSITIVE'
	elif x < 3 :
		return 'NEGATIVE'
	else:
		return 'NEUTRE'


In [24]:
df = pd.read_csv('Amazon_Unlocked_Mobile.csv')

In [25]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [26]:
df["sentiment"] = df["Rating"].apply(convert_reviews)

In [27]:
df["Reviews"] = df["Reviews"].astype(str).apply(compelete_clean)

In [65]:
reviews = df[['sentiment','Reviews']].dropna()

In [66]:
print(reviews.head())

  sentiment                                            Reviews
0  POSITIVE  i feel so lucki to have found this use to us n...
1  POSITIVE  nice nice up grade from my pantach revu veri c...
2  POSITIVE                                         veri pleas
3  POSITIVE  it work good but it goe slow sometim but it a ...
4  POSITIVE  great phone to replac my lost phone the onli t...


In [67]:
reviews.to_csv('prep_unlocked_mobile.csv',index=False)

In [68]:
reviews = pd.read_csv('prep_unlocked_mobile.csv')

In [71]:
reviews["sentiment"] = reviews["sentiment"].astype(str)
reviews["Reviews"] = reviews["Reviews"].astype(str)

In [72]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413840 entries, 0 to 413839
Data columns (total 2 columns):
sentiment    413840 non-null object
Reviews      413840 non-null object
dtypes: object(2)
memory usage: 6.3+ MB


In [73]:
train, test = train_test_split(reviews[['sentiment','Reviews']], test_size=0.2)

In [74]:
print("Vectorization ...")
countVector = CountVectorizer(min_df = 1, ngram_range = (1, 4))
X_train_counts = countVector.fit_transform(train["Reviews"])

Vectorization ...


In [75]:
#applying tfidf to term frequency
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [76]:
X_new_counts = countVector.transform(test["Reviews"])
X_test_tfidf = tfidf_transformer.transform(X_new_counts)

In [77]:
# Save TFIDF

import pickle

pickle.dump(X_train_tfidf, open("train_comment_features.pickle", "wb"))
pickle.dump(X_test_tfidf, open("test_comment_features.pickle", "wb"))

In [78]:
y_train = train["sentiment"]
y_test = test["sentiment"]

In [80]:
prediction = dict()

Logistic regression learning method




[LibLinear]

In [None]:
#Visualize the accuracy, recall and f1-score for Logistic Regression
print(metrics.classification_report(y_test, prediction['Logistic'], target_names = ["NEGATIVE", "POSITIVE","NEUTRE"]))
print(accuracy_score(y_test, prediction['Logistic']))

In [None]:
# Save and Load model
from sklearn.externals import joblib
joblib.dump(clf, 'logreg.joblib') 
logreg = joblib.load('logreg.joblib') 

In [112]:
NB = MultinomialNB(alpha=0.0001)
NB.fit( X_train_tfidf, y_train )

MultinomialNB(alpha=0.0001, class_prior=None, fit_prior=True)

In [113]:
pred_1 = NB.predict( X_test_tfidf )

In [114]:
pred_1.shape

(82768,)

In [115]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print(classification_report(y_test, pred_1, target_names=['POSITIVE','NEGATIVE','NEUTRE']))
print(accuracy_score(y_test, pred_1))

              precision    recall  f1-score   support

    POSITIVE       0.94      0.92      0.93     19418
    NEGATIVE       0.93      0.66      0.77      6404
      NEUTRE       0.95      0.99      0.97     56946

   micro avg       0.95      0.95      0.95     82768
   macro avg       0.94      0.86      0.89     82768
weighted avg       0.95      0.95      0.94     82768

0.9461023583993814


In [116]:
from sklearn.linear_model import SGDClassifier, SGDRegressor
SGDC = SGDClassifier(loss='modified_huber', n_iter=50, random_state=0, shuffle=True)
SGDC.fit( X_train_tfidf, y_train )



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',
       max_iter=None, n_iter=50, n_iter_no_change=5, n_jobs=None,
       penalty='l2', power_t=0.5, random_state=0, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [117]:
pred_2 = SGDC.predict( X_test_tfidf )

In [118]:
print(classification_report(y_test, pred_2, target_names=['POSITIVE','NEGATIVE','NEUTRE']))
print(accuracy_score(y_test, pred_2))

              precision    recall  f1-score   support

    POSITIVE       0.85      0.87      0.86     19418
    NEGATIVE       0.92      0.12      0.21      6404
      NEUTRE       0.90      0.98      0.94     56946

   micro avg       0.89      0.89      0.89     82768
   macro avg       0.89      0.66      0.67     82768
weighted avg       0.89      0.89      0.86     82768

0.8888217668664218


In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
RandomF = RandomForestClassifier()
RandomF.fit(X_train_tfidf, y_train)

In [None]:
pred_3 = RandomForestClassifier.predict( X_test_tfidf )

In [None]:
print(classification_report(y_test, pred_3, target_names=['POSITIVE','NEGATIVE','NEUTRE']))
print(accuracy_score(y_test, pred_3))

In [None]:
GradientBo = GradientBoostingClassifier()
GradientBo.fit( X_train_tfidf, y_train )

In [None]:
pred_4 = GradientBoostingClassifier.predict( X_test_tfidf )

In [None]:
print(classification_report(y_test, pred_4, target_names=['POSITIVE','NEGATIVE','NEUTRE']))
print(accuracy_score(y_test, pred_4))