#### Importing Libraries

In [1]:
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
%matplotlib inline

#### Importing item.csv dataset

In [2]:
item = pd.read_csv('./amazon-items.csv')
item.head()

FileNotFoundError: [Errno 2] No such file or directory: './amazon-items.csv'

#### Importing reviews.csv dataset

In [None]:
review = pd.read_csv('./amazon-reviews.csv')
review.head()

#### Merging the two datasets

In [None]:
# Left-join each review with the metadata of the item it refers to (shared key: "asin").
df = review.merge(item, how="left", on="asin")

In [None]:
# Give the columns that collided in the merge readable names
# ("_x" comes from the review table, "_y" from the item table).
merged_column_names = {
    "rating_x": "rating",
    "title_x": "title",
    "title_y": "item_title",
    "rating_y": "overall_rating",
}
df = df.rename(columns=merged_column_names)
df.head(100)

In [None]:
print("The dataset contains {0[0]: .0f} rows and {0[1]: .0f} variables.".format(df.shape))
df.head()

#### Checking the null values

In [None]:
df.isnull().sum()

In [None]:
df.dropna(inplace=True)

In [None]:
df.isnull().sum()

#### Removing all unnecessary columns

In [None]:
# Keep only the columns used by the analysis below.
# Select first and copy afterwards: the result is then an independent frame,
# so the column assignments in later cells do not raise SettingWithCopyWarning
# (which happens when the column subset is taken *after* the copy).
columns_kept = ['asin', 'brand', 'rating', 'date', 'totalReviews', 'overall_rating', 'item_title', 'body']
data = df[columns_kept].copy()
data.head(10)

#### Total Mobile count

In [None]:
plt.figure(figsize=(20,12))
sns.countplot(x = 'brand', data =data)

In [None]:
# Average rating per brand
# "rating" is selected before aggregating: calling mean() on the whole grouped
# frame also tries to average the text columns, which raises TypeError on
# pandas >= 2.0 and only wastes work on older versions.
ax = (
    data.groupby("brand")["rating"]
    .mean()
    .sort_values()
    .plot(kind="barh", figsize=(8,5), title="Average rating per Brand")
)
plt.show()

#### Extracting year and month from date

In [None]:
# Split "date" at its first comma: column 0 is the text before the comma,
# column 1 the text after it.
new = data["date"].str.split(",", n = 1, expand = True)

# Text before the comma -> "Dated". Surrounding whitespace is stripped.
data["Dated"] = new[0].str.strip()

# Text after the comma -> "year". Stripping removes the blank that follows the
# comma, so the year values do not start with a space.
data["year"] = new[1].str.strip()

data = data.drop(['date'], axis=1)
data.head()

In [None]:
# Break "Dated" at its first space: the leading token becomes "month",
# the remainder becomes "day"; the helper column "Dated" is then dropped.
new1 = data["Dated"].str.split(" ", n = 1, expand = True)

data = data.assign(month=new1[0], day=new1[1]).drop(columns=['Dated'])
data.head()

#### Plotting reviews over time

In [None]:

# Count the reviews for every (year, brand) pair, then draw the counts as an area chart.
reviews_per_year = data.pivot_table(
    index="year",
    columns="brand",
    values="asin",
    aggfunc="count",
    fill_value=0,
)
ax = reviews_per_year.plot.area(title="Yearly Number of Reviews per Brand", figsize=(10, 6))

From this plot we can conclude that Samsung is the most-rated brand, while Xiaomi has the highest average rating.

### Data  Preprocessing

In [None]:
def review_cleaning(text):
    """Normalise a raw review string for the text-mining steps.

    The input is converted with ``str`` and lower-cased; then, in this order,
    the following are deleted: square-bracketed spans, URLs (``http(s)://...``
    or ``www....``), HTML-like tags, ASCII punctuation, newline characters and
    every token that contains a digit.

    Parameters
    ----------
    text : object
        Review text. Non-string values are stringified first.

    Returns
    -------
    str
        The cleaned text. Deleted pieces are not replaced by a space, so the
        whitespace around them is left untouched.
    """
    # The patterns are raw strings: "\[", "\S", "\w" and "\d" are invalid
    # escape sequences in ordinary literals and trigger a warning on current
    # Python versions.
    text = str(text).lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [None]:
data['body']=data['body'].apply(lambda x:review_cleaning(x))
data.head()

In [None]:
#Tokenization of text
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer=ToktokTokenizer()

#Setting English stopwords
stopword_list=nltk.corpus.stopwords.words('english')

In [None]:
#set stopwords to english
from nltk.corpus import stopwords

stop=set(stopwords.words('english'))
print(stop)

#removing the stopwords
def remove_stopwords(text, is_lower_case=False):
    """Return ``text`` with every stopword token removed.

    The text is split with the module-level ``tokenizer`` and each token is
    stripped of surrounding whitespace. Tokens found in the module-level
    ``stopword_list`` are dropped; when ``is_lower_case`` is False the token
    is lower-cased for that lookup only. The surviving tokens are joined with
    single spaces.
    """
    stripped_tokens = (token.strip() for token in tokenizer.tokenize(text))
    if is_lower_case:
        kept = [tok for tok in stripped_tokens if tok not in stopword_list]
    else:
        kept = [tok for tok in stripped_tokens if tok.lower() not in stopword_list]
    return ' '.join(kept)


#Apply function on review column
data['body'] = data['body'].apply(remove_stopwords)
data.head()

In [None]:
#Stemming the text

def simple_stemmer(text):
    """Porter-stem every whitespace-separated word of ``text``.

    Returns the stems joined with single spaces.
    """
    # Same class as nltk.porter.PorterStemmer, reached through the name the
    # notebook imports from nltk.stem.porter in its first cell.
    stemmer = PorterStemmer()
    stems = []
    for word in text.split():
        stems.append(stemmer.stem(word))
    return ' '.join(stems)

#Apply function on review column
data['body'] = data['body'].apply(simple_stemmer)
data.head()

### Lemmatization

In [None]:
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
 

def lemmatize(text):
    """Lemmatise each whitespace-separated word of ``text`` with WordNet.

    Every word is passed to ``WordNetLemmatizer.lemmatize`` without a
    part-of-speech argument; the results are joined with single spaces.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text.split():
        lemmas.append(wordnet_lemmatizer.lemmatize(token))
    return ' '.join(lemmas)


data['body'] = data['body'].apply(lemmatize)
print(data['body'])

### Sentiment Analysis

In [None]:
def f(row):
    """Map the star rating of one review to a sentiment label.

    Ratings 1 and 2 give 'Negative', ratings 3, 4 and 5 give 'Positive';
    any other rating value gives the integer -1.
    """
    rating = row['rating']
    if rating in (1.0, 2.0):
        return 'Negative'
    if rating in (3.0, 4.0, 5.0):
        return 'Positive'
    return -1

In [None]:

data['sentiment'] = data.apply(f, axis=1)
data.head(10)

In [None]:
data['sentiment'].value_counts()

In [None]:
lb=LabelEncoder()
data['sentiment'] = lb.fit_transform(data['sentiment'])
print(data['sentiment'])

In [None]:
    # Features are the cleaned review bodies, targets the encoded sentiment labels.
    X = data['body']
    y = data['sentiment']

    # Train-test split (70% train / 30% test).
    # random_state is fixed so the split, and every score computed from it,
    # is identical on each run of the notebook.
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)

    print(X_train.shape,y_train.shape)
    print(X_test.shape,y_test.shape)
    

### Word2Vec

In [None]:
# Train word embeddings on the review bodies and print the vector of one word.
from gensim.models import Word2Vec
sentences = data['body']

# One token list per review.
# NOTE(review): nltk.word_tokenize relies on NLTK tokenizer data ("punkt");
# no cell visible here downloads it -- confirm it is installed.
sentences = [nltk.word_tokenize(sentence) for sentence in sentences]

# min_count=2: words that occur fewer than twice are left out of the vocabulary.
model = Word2Vec(sentences, min_count=2)
#words = model.wv.vocab

# Embedding of the token 'phone'.
# NOTE(review): presumably raises KeyError when 'phone' is not in the
# vocabulary -- verify before relying on it.
vectors = model.wv['phone']
print(vectors)

### Bags of words model

It is used to convert text documents to numerical vectors or bag of words.

In [None]:
#Count vectorizer for bag of words
# max_df is the float 1.0 ("a term may appear in up to 100% of the documents").
# The integer 1 is an absolute count and means "ignore every term found in
# more than ONE document", which throws away all n-grams shared between reviews.
# min_df=1 keeps every term that occurs at all; recent scikit-learn releases
# reject an integer min_df of 0.
cv=CountVectorizer(min_df=1,max_df=1.0,binary=False,ngram_range=(1,3))

#transformed train reviews (vocabulary is learned from the training split only)
cv_train_reviews=cv.fit_transform(X_train)
#transformed test reviews (re-uses the training vocabulary)
cv_test_reviews=cv.transform(X_test)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)


### Term Frequency-Inverse Document Frequency model (TFIDF)

It is used to convert text documents to matrix of tfidf features.

In [None]:
#Tfidf vectorizer
# max_df is the float 1.0 ("a term may appear in up to 100% of the documents").
# The integer 1 is an absolute count and means "ignore every term found in
# more than ONE document", which throws away all n-grams shared between reviews.
# min_df=1 keeps every term that occurs at all; recent scikit-learn releases
# reject an integer min_df of 0.
tv=TfidfVectorizer(min_df=1,max_df=1.0,use_idf=True,ngram_range=(1,3))

#transformed train reviews (vocabulary and idf weights come from the training split only)
tv_train_reviews=tv.fit_transform(X_train)

#transformed test reviews (re-uses the training vocabulary and idf weights)
tv_test_reviews=tv.transform(X_test)

print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

### Machine Learning Classification

#### Logistic regression model for both bag of words and tfidf features

In [None]:
#training the model
# Each feature space gets its own estimator. fit() returns the estimator
# itself, so fitting one shared object twice would leave only the second
# (TF-IDF) fit in place and make lr_bow and lr_tfidf two names for it.
lr_params = dict(penalty='l2', max_iter=500, C=1, random_state=42)

#Fitting the model for Bag of words
lr_bow = LogisticRegression(**lr_params).fit(cv_train_reviews, y_train)
print(lr_bow)

#Fitting the model for tfidf features
lr_tfidf = LogisticRegression(**lr_params).fit(tv_train_reviews, y_train)
print(lr_tfidf)

# `lr` stays bound to the TF-IDF model (the state it always ended up in),
# for any cell that still refers to it directly.
lr = lr_tfidf

#### Logistic regression model performance on test dataset

In [None]:
# Each test matrix is scored by the estimator fitted on the matching features,
# not by the shared `lr`, whose coefficients come from whichever fit ran last.
# NOTE(review): this only separates the two results when lr_bow and lr_tfidf
# are distinct fitted objects -- confirm in the fitting cell.

#Predicting the model for bag of words
lr_bow_predict=lr_bow.predict(cv_test_reviews)
print(lr_bow_predict)

##Predicting the model for tfidf features
lr_tfidf_predict=lr_tfidf.predict(tv_test_reviews)
print(lr_tfidf_predict)

#### Accuracy of the model

In [None]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(y_test,lr_bow_predict)
print("lr_bow_score :{:.2f}%".format(lr_bow_score*100))

#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(y_test,lr_tfidf_predict)
print("lr_tfidf_score : {:.2f}%".format(lr_tfidf_score*100))

#### Classification report

In [None]:
# LabelEncoder numbers the classes in sorted order, so 0 = 'Negative' and
# 1 = 'Positive'. target_names must follow that order, otherwise the two rows
# of the report are labelled the wrong way round.
sentiment_names = ['Negative', 'Positive']

#Classification report for bag of words
lr_bow_report=classification_report(y_test,lr_bow_predict,target_names=sentiment_names)
print(lr_bow_report)

#Classification report for tfidf features
lr_tfidf_report=classification_report(y_test,lr_tfidf_predict,target_names=sentiment_names)
print(lr_tfidf_report)

#### Stochastic gradient descent or Linear support vector machines for bag of words and tfidf features

In [None]:
#training the linear svm
# Each feature space gets its own estimator. fit() returns the estimator
# itself, so fitting one shared object twice would leave only the second
# (TF-IDF) fit in place and make svm_bow and svm_tfidf two names for it.
svm_params = dict(loss='hinge', max_iter=500, random_state=42)

#fitting the svm for bag of words
svm_bow = SGDClassifier(**svm_params).fit(cv_train_reviews, y_train)
print(svm_bow)

#fitting the svm for tfidf features
svm_tfidf = SGDClassifier(**svm_params).fit(tv_train_reviews, y_train)
print(svm_tfidf)

# `svm` stays bound to the TF-IDF model (the state it always ended up in),
# for any cell that still refers to it directly.
svm = svm_tfidf

#### Model performance on test data

In [None]:
# Each test matrix is scored by the estimator fitted on the matching features,
# not by the shared `svm`, whose weights come from whichever fit ran last.
# NOTE(review): this only separates the two results when svm_bow and svm_tfidf
# are distinct fitted objects -- confirm in the fitting cell.

#Predicting the model for bag of words
svm_bow_predict=svm_bow.predict(cv_test_reviews)
print(svm_bow_predict)

#Predicting the model for tfidf features
svm_tfidf_predict=svm_tfidf.predict(tv_test_reviews)
print(svm_tfidf_predict)

#### Accuracy of the model

In [None]:
#Accuracy score for bag of words
svm_bow_score=accuracy_score(y_test,svm_bow_predict)
print("svm_bow_score :{:.2f}%".format(svm_bow_score*100))

#Accuracy score for tfidf features
svm_tfidf_score=accuracy_score(y_test,svm_tfidf_predict)
print("svm_tfidf_score :{:.2f}%".format(svm_tfidf_score*100))

#### Classification report

In [None]:
# LabelEncoder numbers the classes in sorted order, so 0 = 'Negative' and
# 1 = 'Positive'. target_names must follow that order, otherwise the two rows
# of the report are labelled the wrong way round.
sentiment_names = ['Negative', 'Positive']

#Classification report for bag of words
svm_bow_report=classification_report(y_test,svm_bow_predict,target_names=sentiment_names)
print(svm_bow_report)

#Classification report for tfidf features
svm_tfidf_report=classification_report(y_test,svm_tfidf_predict,target_names=sentiment_names)
print(svm_tfidf_report)

#### Multinomial Naive Bayes for bag of words and tfidf features

In [None]:
#training the model
# Each feature space gets its own estimator. fit() returns the estimator
# itself, so fitting one shared object twice would leave only the second
# (TF-IDF) fit in place and make mnb_bow and mnb_tfidf two names for it.

#fitting the naive Bayes model for bag of words
mnb_bow = MultinomialNB().fit(cv_train_reviews, y_train)

#fitting the naive Bayes model for tfidf features
mnb_tfidf = MultinomialNB().fit(tv_train_reviews, y_train)

# `mnb` stays bound to the TF-IDF model (the state it always ended up in),
# for any cell that still refers to it directly.
mnb = mnb_tfidf


#### Model performance on test data


In [None]:

# Each test matrix is scored by the estimator fitted on the matching features,
# not by the shared `mnb`, whose parameters come from whichever fit ran last.
# NOTE(review): this only separates the two results when mnb_bow and mnb_tfidf
# are distinct fitted objects -- confirm in the fitting cell.

#Predicting the model for bag of words
mnb_bow_predict=mnb_bow.predict(cv_test_reviews)
print(mnb_bow_predict)

#Predicting the model for tfidf features
mnb_tfidf_predict=mnb_tfidf.predict(tv_test_reviews)
print(mnb_tfidf_predict)

#### Accuracy of the model

In [None]:
#Accuracy score for bag of words
mnb_bow_score=accuracy_score(y_test,mnb_bow_predict)
print("mnb_bow_score :{:.2f}%".format(mnb_bow_score*100))

#Accuracy score for tfidf features
mnb_tfidf_score=accuracy_score(y_test,mnb_tfidf_predict)
print("mnb_tfidf_score :{:.2f}%".format(mnb_tfidf_score*100))

#### Classification report

In [None]:
# LabelEncoder numbers the classes in sorted order, so 0 = 'Negative' and
# 1 = 'Positive'. target_names must follow that order, otherwise the two rows
# of the report are labelled the wrong way round.
sentiment_names = ['Negative', 'Positive']

#Classification report for bag of words
mnb_bow_report=classification_report(y_test,mnb_bow_predict,target_names=sentiment_names)
print(mnb_bow_report)

#Classification report for tfidf features
mnb_tfidf_report=classification_report(y_test,mnb_tfidf_predict,target_names=sentiment_names)
print(mnb_tfidf_report)

#### Conclusion

We can observe that the multinomial naive Bayes model performs well compared to logistic regression and linear support vector machines.

## Sentiment Analysis using VADER

There are two common approaches for text sentiment analysis: the lexical method and the machine learning method.

The lexical method maps the new text to a pre-defined "dictionary of sentiment". VADER is one example of such a method. With VADER, the sentiment score of a sentence is the normalised sum of the sentiment scores of each word in that sentence.

In [None]:
nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
sid = SentimentIntensityAnalyzer()
sid.polarity_scores(df['body'].iloc[11])

In [None]:
# Working frame for the VADER analysis.
# Select first and copy afterwards: the result is then an independent frame,
# so the column assignments in later cells do not raise SettingWithCopyWarning
# (which happens when the column subset is taken *after* the copy).
vader_columns = ['asin', 'brand', 'rating','overall_rating','title', 'item_title', 'body']
dfs = df[vader_columns].copy()
dfs.head()

In [None]:
def review_cleaning(text):
    """Normalise a raw review or title string before VADER scoring.

    This redefines the ``review_cleaning`` function from the preprocessing
    section with the same behaviour: the input is stringified and lower-cased,
    then square-bracketed spans, URLs, HTML-like tags, ASCII punctuation,
    newline characters and tokens containing a digit are deleted, in that
    order.

    Parameters
    ----------
    text : object
        Text to clean. Non-string values are stringified first.

    Returns
    -------
    str
        The cleaned text. Deleted pieces are not replaced by a space.
    """
    # Raw strings avoid the invalid escape sequences ("\[", "\S", "\w", "\d")
    # that ordinary literals would contain and that current Python versions
    # warn about. The order of the patterns matters and is kept as it was.
    removal_patterns = (
        r'\[.*?\]',
        r'https?://\S+|www\.\S+',
        r'<.*?>+',
        '[%s]' % re.escape(string.punctuation),
        r'\n',
        r'\w*\d\w*',
    )
    text = str(text).lower()
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text

In [None]:
dfs['body']=dfs['body'].apply(lambda x:review_cleaning(x))
dfs['title']=dfs['title'].apply(lambda x:review_cleaning(x))
dfs.head()

In [None]:
dfs['title_score'] = dfs['title'].apply(lambda x: sid.polarity_scores(x))
dfs.head()

In [None]:
dfs['title_compound'] = dfs['title_score'].apply(lambda x: x['compound'])
dfs.head()

In [None]:
  def f(dfs):
      """Label one row by its VADER compound score for the review title.

      A 'title_compound' of 0.05 or more gives "Positive", -0.05 or less gives
      "Negative", anything else gives "Neutral".
      """
      score = dfs['title_compound']
      if score >= 0.05:
          return "Positive"
      if score <= -0.05:
          return "Negative"
      return "Neutral"

dfs['title_sentiment'] = dfs.apply(f, axis=1)
dfs.head()


In [None]:
dfs['title_sentiment'].value_counts()

In [None]:
dfs['body_score'] = dfs['body'].apply(lambda x: sid.polarity_scores(x))
dfs.head()

In [None]:
dfs['body_compound'] = dfs['body_score'].apply(lambda x: x['compound'])
dfs.head()

In [None]:
  def f(dfs):
      """Label one row by its VADER compound score for the review body.

      A 'body_compound' of 0.05 or more gives "Positive", -0.05 or less gives
      "Negative", anything else gives "Neutral".
      """
      compound = dfs['body_compound']
      if compound >= 0.05:
          label = "Positive"
      else:
          label = "Negative" if compound <= -0.05 else "Neutral"
      return label
 

In [None]:
dfs['body_sentiment'] = dfs.apply(f, axis=1)
dfs.head()

In [None]:
dfs['body_sentiment'].value_counts()

In [None]:
# Number of reviews per brand, split by the VADER sentiment of the review body.
plt.figure(figsize=(18,8))
sns.countplot(x = 'brand', hue = 'body_sentiment', data = dfs)
# The x-axis carries the brands (sentiment is shown by the bar colour),
# so it is labelled as such.
plt.xlabel('Brand', fontsize = 18)
plt.ylabel('Count', fontsize = 18)
plt.title('Comments sentiment analysis', fontsize = 24)

In [None]:
# Number of reviews per brand, split by the VADER sentiment of the review title.
plt.figure(figsize=(18,8))
sns.countplot(x = 'brand', hue = 'title_sentiment', data = dfs)
# The x-axis carries the brands (sentiment is shown by the bar colour),
# so it is labelled as such.
plt.xlabel('Brand', fontsize = 18)
plt.ylabel('Count', fontsize = 18)
plt.title('Title sentiment analysis', fontsize = 24)

In [None]:
print(classification_report(dfs['title_sentiment'], dfs['body_sentiment']))

#### Conclusion for sentiment analysis using vader

As a result, we can conclude that the majority of the reviews and titles are positive, and the correlation between the rating and the sentiment of the reviews is also favorable.

## Word Cloud

In [None]:
reviews = df.copy()

In [None]:
reviews["positivity"] = reviews["rating"].apply(lambda x: 1 if x>3 else(0 if x==3 else -1))

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
import string
stop = set(stopwords.words('english'))
punc = set(string.punctuation)
keywords = reviews["brand"].apply(lambda x: x.lower()).unique().tolist()
keywords.append("phone")
lemma = WordNetLemmatizer()
def clean_text(text):
    """Prepare one review for the word clouds.

    The text is lower-cased and split on whitespace. From every word all
    characters in ``punc`` except the apostrophe are removed; words that are
    then found in ``stop`` or in ``keywords`` are dropped, and the remaining
    ones are lemmatised with ``lemma``. The result is joined with single
    spaces.
    """
    kept = []
    for raw_word in text.lower().split():
        # strip punctuation characters but let apostrophes through
        word = "".join(ch for ch in raw_word if ch == "'" or ch not in punc)
        if word in stop or word in keywords:
            continue
        kept.append(lemma.lemmatize(word))
    return " ".join(kept)


In [None]:
reviews["body"] = reviews["body"].astype("str")
reviews["clean_text"] = reviews["body"].apply(clean_text)

##### Creating wordcloud

In [None]:
def word_freq_dict(text):
    """Count how often each whitespace-separated word occurs in ``text``.

    Parameters
    ----------
    text : str
        Text to analyse; it is split on whitespace.

    Returns
    -------
    dict
        Maps each distinct word to its number of occurrences, with the keys
        in order of first appearance. An empty or whitespace-only text gives
        an empty dict.
    """
    # Counter tallies the words in a single pass. Calling list.count() for
    # every word rescans the whole list each time, which is quadratic in the
    # number of words.
    return dict(Counter(text.split()))


##### Brand subsets

In [None]:

# Per-brand subsets, newest review first.
# "date" holds text, so sorting the raw column orders the rows alphabetically
# rather than chronologically. The sort key parses the text into datetimes;
# values that cannot be parsed become NaT and are placed last.
def _as_datetime(column):
    """Sort key: parse a column of date strings, turning bad values into NaT."""
    return pd.to_datetime(column, errors="coerce")

apple = reviews[reviews["brand"]=="Apple"].sort_values(by=["date"], key=_as_datetime, ascending=False)
samsung = reviews[reviews["brand"]=="Samsung"].sort_values(by=["date"], key=_as_datetime, ascending=False)


In [None]:
from wordcloud import WordCloud, ImageColorGenerator

# Function to create a wordcloud from dictionary of word frequency
def wordcloud_from_frequency(word_freq_dict, title, figure_size=(10, 6)):
    """Draw a word cloud from a word -> frequency mapping and show it.

    Parameters
    ----------
    word_freq_dict : mapping
        Word frequencies handed to ``generate_from_frequencies``. Inside this
        function the name shadows the notebook-level ``word_freq_dict``
        function.
    title : str
        Title placed above the figure.
    figure_size : tuple, default (10, 6)
        Passed to ``plt.figure`` as ``figsize``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # NOTE(review): `wordcloud` is neither a parameter nor a local. It is
    # looked up as a global when the function runs, so an object with a
    # generate_from_frequencies method must be bound to that name beforehand.
    wordcloud.generate_from_frequencies(word_freq_dict)
    plt.figure(figsize=figure_size)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(title)
    plt.show()

In [None]:
# Function to plot top10 positive words and top10 negative words in a grouped bar plot (from dictionaries)

def _top_words_frame(word_freq, num_doc, topn, label):
    """Return the `topn` most frequent words with per-document frequency and a sentiment label."""
    top_words = (
        pd.DataFrame.from_dict(word_freq, orient="index")
        .sort_values(by=0, ascending=False)
        .head(topn)
    )
    top_words.columns = ["frequency"]
    # Normalise the raw counts by the number of documents they were counted in.
    top_words["frequency"] = top_words["frequency"] / num_doc
    top_words["label"] = label
    return top_words


def topn_wordfreq_bar_both(pos_word_freq_dict, neg_word_freq_dict, pos_num_doc, neg_num_doc, topn, title, palette, height=6, aspect=2):
    """Plot the top-N positive and top-N negative words as one grouped bar chart.

    Parameters
    ----------
    pos_word_freq_dict, neg_word_freq_dict : dict
        Word -> count mappings for the positive and the negative reviews.
    pos_num_doc, neg_num_doc : int
        Number of reviews behind each mapping; the counts are divided by it.
    topn : int
        How many of the most frequent words to keep per sentiment.
    title : str
        Title of the chart.
    palette : list
        Colours passed to ``sns.catplot``.
    height, aspect : numbers
        Size arguments passed to ``sns.catplot``.

    Returns
    -------
    None
        The chart is displayed with ``plt.show()``.
    """
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df_append = pd.concat([
        _top_words_frame(pos_word_freq_dict, pos_num_doc, topn, "Positive"),
        _top_words_frame(neg_word_freq_dict, neg_num_doc, topn, "Negative"),
    ])
    # The words are the index; reset_index exposes them as the "index" column used for x.
    df_append = df_append.reset_index()
    # Plot
    sns.catplot(x="index", y="frequency", hue="label", data=df_append,
                kind="bar",
                palette=palette,
                height=height, aspect=aspect,
                legend_out=False)
    plt.title(title)
    plt.show()

In [None]:
# Wordclouds for 1000 reviews for Apple

apple_pos = " ".join(apple[apple["positivity"]==1]["clean_text"][0:1000])
apple_pos_word_freq = word_freq_dict(apple_pos)
wordcloud = WordCloud(width=5000, 
                      height=3000, 
                      max_words=200, 
                      colormap="Blues",
                      background_color="white")
wordcloud_from_frequency(apple_pos_word_freq, "Most Frequent Words in the Latest 1000 Positive Reviews for Apple")

In [None]:
# Match "new" as a whole word. A substring test on the cleaned text would also
# count reviews that only contain words such as "renewed" or "news".
mentions_new = apple["clean_text"].apply(lambda x: "new" in x.split())
apple[mentions_new]["item_title"].value_counts().sort_values(ascending=True).tail(10).plot(kind="barh")
plt.title("Most reviews that mention 'new' are from renewed iPhone buyers")
plt.show()

In [None]:
# Flag items whose title marks them as renewed or refurbished.
# "Refurbished" was misspelled in the search string, so such titles could
# never match; the same misspelling is corrected in the printed message.
apple["renewed"] = apple["item_title"].apply(lambda x: ("Renewed" in x) or ("Refurbished" in x))
print("{0: 0.1%} iPhones that were sold on Amazon are renewed/refurbished.".format(apple["renewed"].sum() / len(apple["renewed"])))

In [None]:
apple_neg = " ".join(apple[apple["positivity"]==-1]["clean_text"][0:1000])
apple_neg_word_freq = word_freq_dict(apple_neg)
wordcloud = WordCloud(width=5000, 
                      height=3000, 
                      max_words=200, 
                      colormap="Blues",
                      background_color="black")
wordcloud_from_frequency(apple_neg_word_freq, "Most Frequent Words in the Latest 1000 Negative Reviews for Apple")

In [None]:
topn_wordfreq_bar_both(apple_pos_word_freq, apple_neg_word_freq, 
                       min(sum(apple["positivity"]==1), 1000), 
                       min(sum(apple["positivity"]==-1), 1000), 
                       10, 
                       "Top10 Frequent Words in Latest Positive and Negative Reviews for Apple", 
                       ["lightblue", "lightcoral"], 
                       height=6, aspect=2)

In [None]:
# Wordclouds for 1000 reviews for Samsung

samsung_pos = " ".join(samsung[samsung["positivity"]==1]["clean_text"][0:1000])
samsung_pos_word_freq = word_freq_dict(samsung_pos)
wordcloud = WordCloud(width=5000, 
                      height=3000, 
                      max_words=200, 
                      colormap="Greens",
                      background_color="white")
wordcloud_from_frequency(samsung_pos_word_freq, "Most Frequent Words in the Latest 1000 Positive Reviews for Samsung")

In [None]:
samsung_neg = " ".join(samsung[samsung["positivity"]==-1]["clean_text"][0:1000])
samsung_neg_word_freq = word_freq_dict(samsung_neg)
wordcloud = WordCloud(width=5000, 
                      height=3000, 
                      max_words=200, 
                      colormap="Greens",
                      background_color="black")
wordcloud_from_frequency(samsung_neg_word_freq, "Most Frequent Words in the Latest 1000 Negative Reviews for Samsung")


In [None]:
topn_wordfreq_bar_both(samsung_pos_word_freq, samsung_neg_word_freq, 
                       min(sum(samsung["positivity"]==1), 1000), 
                       min(sum(samsung["positivity"]==-1), 1000), 
                       10, 
                       "Top10 Frequent Words in Latest Positive and Negative Reviews for Samsung", 
                       ["steelblue", "orange"], 
                       height=6, aspect=2)

The visualisations above show that:

The most important considerations for cell phone buyers are battery health and screen condition.

The majority of iPhones sold on Amazon are renewed/refurbished. Apple customers are satisfied if their purchases are in (near) new condition, and they usually complain if there is a scratch on the screen or the battery health is poor.

Samsung buyers are generally satisfied with the overall performance of their purchases, but they have complained about the screen. Some customers also complained about unlocked phones sold by carriers.