## Sentiment Analysis on Amazon Product Text Reviews.
#### - By NIKITA UDAYSING PATIL.

In [None]:
# import libraries 
import numpy as np
import  pandas as pd
import warnings
warnings.filterwarnings('ignore')
from textblob  import TextBlob
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
# pip install nltk
import os
import nltk 
import nltk.corpus
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from nltk.corpus import stopwords 
from nltk import sent_tokenize, word_tokenize
from bs4 import BeautifulSoup 
import re
!pip install wordcloud
from wordcloud import WordCloud


In [None]:
# Load the review dataset from the Excel workbook and display it
data=pd.read_excel(r'iphone data.xlsx')
data

In [None]:
# Summary statistics of the raw data, as computed by DataFrame.describe()
data.describe()

In [None]:
# Number of reviews per country. Printed explicitly because a notebook cell
# only displays its last expression, so this result was previously lost.
print(data.review_country.value_counts())
# Number of columns that contain at least one missing value
data.isnull().any().sum()

In [None]:
# Keep only the rows that have no missing value in any column
data1=data.dropna()
data1

In [None]:
# How many reviews carry each distinct rating value
data1.review_rating.value_counts()

In [None]:
# Map the raw rating strings to single-digit labels in one pass.
# Values not listed here are left unchanged.
rating_labels = {
    '5.0 out of 5 stars': '5',
    '4.0 out of 5 stars': '4',
    '3.0 out of 5 stars': '3',
    '2.0 out of 5 stars': '2',
    '1.0 out of 5 stars': '1',
}
# Work on an owned copy so the column assignment never targets a view of `data`,
# and read from data1 itself rather than from the unfiltered `data` frame.
data1 = data1.copy()
data1['review_rating'] = data1['review_rating'].replace(rating_labels)
data1


In [None]:
# Report describe() statistics and the row count of the cleaned frame
summary = data1.describe()
print("Summary statistics of numerical features : \n", summary)

review_count = len(data1)
print("\nTotal number of reviews: ", review_count)



In [None]:
# Column dtypes of the cleaned frame
data1.dtypes


# Step 1: Data Visualization.

In [None]:
# Bar chart of the number of reviews per rating, ordered by rating label
plt.figure(figsize=(8, 8))
rating_counts = data1['review_rating'].value_counts().sort_index()
rating_counts.plot(kind='bar', color='violet')
plt.title('Distribution of Rating')
plt.xlabel('Rating')
plt.ylabel('Count')

In [None]:
# Histogram of review lengths (in characters), restricted to reviews
# shorter than 100 characters
review_length = data1["review_text"].dropna().map(len)
short_reviews = review_length.loc[review_length < 100]
plt.figure(figsize=(5, 5))
short_reviews.hist(color='maroon', grid=False)
plt.title("Distribution of Review Length")
plt.xlabel('Review length (Number of character)')
plt.ylabel('Count')

In [None]:
# Plot distribution of rating
#plt.figure(figsize=(18,8))
# sns.countplot(df['Rating'])
#data1['reviewed_at'].value_counts().sort_index().plot(kind='bar',color='violet')
#plt.title('Distribution of Rating')
#plt.xlabel('Rating')
#plt.ylabel('Count')

# Step 2: Data Preparation

For illustrative purposes, I use TextBlob to compute the polarity of each "review_text". The polarity is then converted into a binary label: 'positive_sentiment' (1) when sentiment_polarity > 0 and 'negative_sentiment' (0) when sentiment_polarity < 0. Reviews with sentiment_polarity == 0 are dropped because they are considered 'neutral'.

# Prepare Data

In [None]:
def find_pol(review_rating):
    """Return the TextBlob sentiment polarity of a piece of text.

    Despite its name, ``review_rating`` is the text to score: the notebook
    applies this function to the ``review_text`` column. The notebook treats
    a result > 0 as positive and < 0 as negative.
    """
    blob = TextBlob(review_rating)
    return blob.sentiment.polarity
# Score every review's text and store the polarity in a new column
data1['sentiment_polarity'] = data1['review_text'].apply(find_pol)
data1.head()

In [None]:
# Text of every review whose polarity is below zero
negative_mask = data1.sentiment_polarity < 0
most_negative = data1[negative_mask].review_text
print(most_negative)



In [None]:
# Text of every review whose polarity is above zero
positive_mask = data1.sentiment_polarity > 0
most_positive = data1[positive_mask].review_text
print(most_positive)

In [None]:
 
# Drop missing values
data1 = data1.dropna()

# Remove 'neutral' reviews (polarity exactly 0). Previously they were only
# filtered into data2 while the labels and the train/test split were built
# from data1, so every neutral review ended up labelled as positive.
data1 = data1[data1['sentiment_polarity'] != 0].copy()

# Encode 1 as (positive sentiment) and 0 as (negative sentiment)
data1['sentiment'] = np.where(data1['sentiment_polarity'] < 0, 0, 1)

# data2 stays available as the name of the neutral-free frame for later cells
data2 = data1
data1

# Train Test Split

In [None]:
# Hold out 20% of the labelled reviews for testing; the seed is fixed so the
# split is repeatable
features = data1['review_text']
labels = data1['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.20, random_state=0
)


In [None]:
# Sizes of the training/testing features and labels
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Step 3: Bag of Word

The main aim of this project is to classify textual reviews into positive and negative sentiment. Two steps are needed: first, convert the textual reviews into a numerical representation (word embedding); second, fit supervised machine learning algorithms on that numerical representation.

The word embedding used here is a frequency-based embedding, the Bag of Words (BoW) model. This model learns a vocabulary from a given corpus and represents each document by counts of the words it contains. In this part, we explore BoW features with supervised machine learning algorithms.

The following steps of workflow:

* step 1: Preprocess raw text reviews into clean ones.
* step 2: Create a BoW using CountVectorizer / TfidfVectorizer in sklearn.
* step 3: Transform review text to numerical representations.
* step 4: Fit a supervised learning algorithm (e.g. Naive Bayes, Logistic Regression, etc.).
* step 5: Improve the model performance with GridSearch.

# Text Preprocessing

The following steps are implemented to convert raw text_reviews into clean text_reviews.

* Remove HTML tags using BeautifulSoup.
* Remove non-letter characters such as digits and symbols.
* Convert uppercase letters to lowercase.
* Remove stop words such as "the", "and", ...
* Reduce words to their root form by stemming.

In [None]:
def cleanText(raw_text, remove_stopwords=False, stemming=False, split_text=False):
    """Turn one raw review into cleaned, lower-case text.

    HTML markup is stripped with BeautifulSoup, every character outside
    a-z / A-Z is replaced by a space, and the result is lower-cased and
    split on whitespace.

    Parameters
    ----------
    raw_text : the raw review text (may contain HTML).
    remove_stopwords : when truthy, drop NLTK's English stop words.
    stemming : when equal to True, stem each word with the English
        Snowball stemmer.
    split_text : when equal to True, return the list of words instead of
        a single string.

    Returns
    -------
    The list of cleaned words if ``split_text`` is set, otherwise the
    words joined by single spaces.
    """
    plain = BeautifulSoup(raw_text, 'lxml').get_text()
    tokens = re.sub("[^a-zA-Z]", " ", plain).lower().split()

    if remove_stopwords:
        english_stops = set(stopwords.words("english"))
        tokens = [tok for tok in tokens if tok not in english_stops]

    # The `== True` comparisons are kept as-is so exactly the same argument
    # values switch these options on.
    if stemming == True:
        snowball = SnowballStemmer('english')
        tokens = list(map(snowball.stem, tokens))

    if split_text == True:
        return tokens
    return " ".join(tokens)

In [None]:
# Demonstrate the cleaner on one review with every option switched on
sample_review = data.review_text[5]
cleanText(raw_text=sample_review, remove_stopwords=True, stemming=True, split_text=True)

In [None]:
# Clean every review of the training and testing sets with cleanText's
# default options (no stop-word removal, no stemming)
X_train_cleaned = [cleanText(review) for review in X_train]
print('Show a cleaned review in the training set : \n',  X_train_cleaned[5])

X_test_cleaned = [cleanText(review) for review in X_test]

# CountVectorizer with Multinomial Naive Bayes 
(Benchmark Model)

Now our text reviews are cleaned! The next step is to convert them into numerical representations for the machine learning algorithm.
In the sklearn library, we can use CountVectorizer, which implements both tokenization and counting in a single class.

In [None]:
# Fit and transform the training data to a document-term matrix using CountVectorizer
countVect = CountVectorizer()
X_train_countVect = countVect.fit_transform(X_train_cleaned)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# get_feature_names_out() and fall back only on old releases that lack it.
if hasattr(countVect, "get_feature_names_out"):
    count_features = countVect.get_feature_names_out().tolist()
else:
    count_features = countVect.get_feature_names()
print("Number of features : %d \n" % len(count_features))
print("Show some feature names : \n", count_features[::100])


In [None]:
# Train a MultinomialNB classifier on the CountVectorizer document-term matrix
mnb = MultinomialNB()
mnb.fit(X_train_countVect, y_train)

In [None]:
# Display the training document-term matrix
X_train_countVect

In [None]:
# Shape of the training matrix next to the shape of its labels
X_train_countVect.shape, y_train.shape

In [None]:
def modelEvaluation(predictions, y_true=None):
    '''
    Print accuracy, AUC, classification report and confusion matrix
    for a set of predicted labels.

    predictions : predicted class labels, in the same order as the true labels.
    y_true      : true class labels. When omitted, the notebook-level y_test
                  is used, which is the original behaviour.

    Note: the AUC is computed from whatever is passed as predictions; the
    notebook passes hard class labels from .predict(), not probabilities.
    '''
    if y_true is None:
        y_true = y_test
    print ("\nAccuracy on validation set: {:.4f}".format(accuracy_score(y_true, predictions)))
    print("\nAUC score : {:.4f}".format(roc_auc_score(y_true, predictions)))
    print("\nClassification report : \n", metrics.classification_report(y_true, predictions))
    print("\nConfusion Matrix : \n", metrics.confusion_matrix(y_true, predictions))

In [None]:
# Evaluate the Naive Bayes model on the held-out test reviews
X_test_countVect = countVect.transform(X_test_cleaned)
predictions = mnb.predict(X_test_countVect)
modelEvaluation(predictions)

# TfidfVectorizer with Logistic Regression.


Some words appear frequently but carry little meaningful information about the sentiment of a particular review. Instead of using raw occurrence counts, we can use the tf-idf transform to scale down the impact of words that occur frequently in the given corpus.

In the sklearn library, we can use TfidfVectorizer, which implements both tokenization and tf-idf weighted counting in a single class.

In [None]:
# Fit and transform the training data to a document-term matrix using TfidfVectorizer
tfidf = TfidfVectorizer(min_df=5) #minimum document frequency of 5
# Fit on the cleaned training text: the model is evaluated on X_test_cleaned,
# so fitting on the raw X_train made train and test preprocessing differ.
X_train_tfidf = tfidf.fit_transform(X_train_cleaned)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# get_feature_names_out() and fall back only on old releases that lack it.
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_features = tfidf.get_feature_names_out().tolist()
else:
    tfidf_features = tfidf.get_feature_names()
print("Number of features : %d \n" % len(tfidf_features))
print("Show some feature names : \n", tfidf_features[::100])



In [None]:
# Fit a logistic regression on the tf-idf matrix, then show the shapes of
# the matrix and of the labels it was trained with
lr = LogisticRegression()
lr.fit(X_train_tfidf, y_train)
X_train_tfidf.shape, y_train.shape

In [None]:
# Look at the top 10 features with smallest and the largest coefficients
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# get_feature_names_out() and fall back only on old releases that lack it.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = np.array(tfidf.get_feature_names_out().tolist())
else:
    feature_names = np.array(tfidf.get_feature_names())
sorted_coef_index = lr.coef_[0].argsort()  # ascending: most negative coefficient first
# Slice 10 entries, as the label says (the slice used to be [:100]).
print('\nTop 10 features with smallest coefficients :\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Top 10 features with largest coefficients : \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

In [None]:
# Evaluate the logistic regression on the held-out test reviews
X_test_tfidf = tfidf.transform(X_test_cleaned)
predictions = lr.predict(X_test_tfidf)
modelEvaluation(predictions)

# pipeline and GridSearch


In the sklearn library, we can build a pipeline to streamline the workflow and use GridSearch on the pipeline model to implement hyper-parameter tuning for both the vectorizer and the classifier in one go!

In [None]:
# Pipeline: tf-idf vectorizer followed by logistic regression
estimators = [
    ("tfidf", TfidfVectorizer()),
    ("lr", LogisticRegression()),
]
model = Pipeline(steps=estimators)

In [None]:
# Grid search over vectorizer and classifier hyper-parameters, scored by accuracy
params = {
    "tfidf__stop_words": [None, "english"],   # use stopwords or don't
    "tfidf__ngram_range": [(1, 1), (1, 2)],   # 1-grams or 1- and 2-grams
    "tfidf__max_features": [10, None],        # max features
    "tfidf__min_df": [1, 5],                  # min document frequency of a word
    "lr__C": [0.1, 1, 10],                    # regularization param of logistic regression
}

grid = GridSearchCV(model, params, scoring="accuracy", n_jobs=-1)
grid.fit(X_train_cleaned, y_train)
print("The best paramenter set is : \n", grid.best_params_)



In [None]:
# Evaluate the grid-searched model on the held-out test reviews
predictions = grid.predict(X_test_cleaned)
modelEvaluation(predictions)

# Step 4: Word Cloud

In this part, we use a word cloud to show the words that appear most often in the review text, so I again preprocess and clean the raw text reviews.

Here I combine all the text reviews into one variable to make the following steps easier.

In [None]:
# First rows of the frame with neutral reviews removed
data2.head()

In [None]:
# Join the selected reviews into one comma-separated string.
# NOTE(review): only the negative-polarity reviews (most_negative) are joined here,
# although the surrounding text speaks of all reviews -- confirm which is intended.
x=', '.join(most_negative)
x

In [None]:
# Lower-case the joined text so word matching is case-insensitive
text=x.lower() # lowercase letters
print(text)

In [None]:
# Tokenize the lower-cased text and drop NLTK's English stop words
allstopwords=stopwords.words('english')
# Membership tests against a set are O(1); testing against the list made
# every token lookup a linear scan.
stopword_set = set(allstopwords)
filtered_text=text

tokenized_text=word_tokenize(filtered_text)
review_text=[word for word in tokenized_text if word not in stopword_set]
print(review_text)


In [None]:
# Re-assemble the remaining tokens into one space-separated string
filtered_sentence=(" ").join(review_text)
filtered_sentence

In [None]:
# Keep letters only: every character outside a-z / A-Z becomes a space.
# review_text is rebound from the token list to this cleaned string.
review_text= re.sub("[^a-zA-Z]", " ",filtered_sentence )
review_text

In [None]:
# Word cloud of the cleaned review text on a 10x10 inch figure
cloud_builder = WordCloud(background_color="black")
cloud = cloud_builder.generate(review_text)
plt.figure(figsize=(10, 10))
plt.imshow(cloud)
plt.axis('off')
plt.show()

In [None]:
# Word cloud of the same review_text again, this time on a 10x15 inch figure
cloud=WordCloud(background_color="black").generate(review_text)
plt.figure(figsize=(10,15))
plt.imshow(cloud)
plt.axis('off')
plt.show()

Here I found that words like "good", "phone", "camera", "battery", "life", "excellent", "awesome" and so on are used frequently in most of the reviews.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
# Decision tree (entropy criterion, maximum depth 3) trained on the CountVectorizer features.
# NOTE: this rebinds `model`, which earlier named the tf-idf + logistic regression pipeline.
model=DecisionTreeClassifier(criterion='entropy',max_depth=3)
model.fit(X_train_countVect,y_train)

In [None]:
# Plot the fitted decision tree (the trailing semicolon suppresses the cell's text output)
tree.plot_tree(model);

In [None]:
# Draw the fitted tree with colour-filled nodes on a small 200-dpi figure
fig, axis = plt.subplots(1, 1, figsize=(3, 3), dpi=200)
tree.plot_tree(model, filled=True);

In [None]:
# Predict test labels with the decision tree and count predictions per class
X_test_counts = countVect.transform(X_test_cleaned)
predictions = model.predict(X_test_counts)
pd.Series(predictions).value_counts()

In [None]:
# True class distribution in the test set, for comparison with the predicted counts
y_test.value_counts()

In [None]:
# Display the decision tree's predicted labels
predictions

In [None]:
# Cross-tabulation of true labels against predicted labels (a confusion-matrix-style table)
pd.crosstab(y_test,predictions)

In [None]:
# Accuracy: fraction of test reviews whose predicted label equals the true label
np.mean(predictions==y_test)

Inference: After applying supervised learning algorithms, we found that the Naive Bayes model gives an accuracy of 93.81%, the logistic regression model gives 93.71%, and the Decision Tree model gives 94.11%.