In [None]:
# Common imports
import numpy as np
import pandas as pd

# To plot pretty figures
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# to make this notebook's output identical at every run
np.random.seed(42)

#display max columns and rows
# NOTE(review): the two lines below only read the current option values; nothing is
# assigned, so pandas keeps whatever display limits are already in effect.
pd.options.display.max_rows
pd.options.display.max_columns

# For regular expressions
import re
# For handling string
import string
# For performing mathematical operations
import math

#text preprocessing
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
import unicodedata
tokenizer = ToktokTokenizer()
from sklearn.feature_extraction.text import CountVectorizer

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")
from scipy import stats
warnings.filterwarnings('ignore')

In [None]:
import os
os.getcwd()

In [None]:
import os
# List every regular file (directories are excluded) in the current working
# directory, one name per line.
files = [f for f in os.listdir('.') if os.path.isfile(f)]
for f in files:
    # print the current name; printing `files` repeated the whole list once per file
    print(f)

In [None]:
#Get the data
# Location of the raw Amazon Electronics reviews TSV. Edit this one constant to
# point at your own copy of the file.
DATA_PATH = "/Users/dustin.ellis/Desktop/Desktop/Drexel University Data Science/Classes/DSCI_591-Capstone_1/Exploratory Data Analysis/amazon_reviews_us_Electronics_v1_00.tsv"
try:
    # Current pandas spelling: skip malformed rows and emit a warning for each one.
    el = pd.read_csv(DATA_PATH, sep="\t", on_bad_lines="warn")
except TypeError:
    # Older pandas does not know `on_bad_lines`; use the keyword the notebook was written with.
    el = pd.read_csv(DATA_PATH, sep="\t", error_bad_lines=False)
el.head()

In [None]:
#get the information about the dataset
el.info()

In [None]:
#get basic stats on numerical columns. Customer ID and product parent should be treated as objects. 
el.describe()

In [None]:
#get counts of star ratings to get a sense of how many reviews there are
el['star_rating'].value_counts()

In [None]:
el['helpful_votes'].value_counts()

In [None]:
el['total_votes'].value_counts()

In [None]:
# plot a histogram for each numerical attribute to get a feel of data
%matplotlib inline
import matplotlib.pyplot as plt
el.hist(bins=50, figsize=(20,15))
plt.show()

This next section is the data cleaning. In the cleaning process, the vine column was removed as it did not contain useful information. A year column was created, as the specific date was not of interest and year is typically used to examine trends in technology; keeping the date column would therefore be redundant. I wanted to ensure that only reviews of verified purchases made through the US marketplace were used, so any rows that did not meet these criteria were dropped. Rows containing null values were also dropped. 

In [None]:
# Only the review year is needed (not the exact date) and the vine flag is not
# useful, so derive the year first and then discard both source columns.
review_years = pd.DatetimeIndex(el['review_date']).year
el['year'] = review_years
el = el.drop(columns=['vine', 'review_date'])

# number of reviews per year, least active year first
el['year'].value_counts(ascending=True)

In [None]:
#el['helpful_votes'].value_counts()
#if you run, you will see that many people do not leave helpful vote clicks on reviews. 

In [None]:
#el['review_headline'].value_counts()
#I was curious to see if there were common headlines people write when reviewing. It appears that commonly, people will write out a star review and then elaborate on it rather than make a direct comment about something in their review to preview.

In [None]:
#Keep only verified purchases that were made through the US marketplace
is_unverified = el['verified_purchase'] == 'N'
is_non_us = el['marketplace'] != 'US'
el = el.loc[~(is_unverified | is_non_us)].copy()
el

In [None]:
#check for nulls
el.isnull().sum()
#drop nulls
el.dropna(inplace=True)
el.isnull().sum()

In [None]:
#(Create a generic Title per Amazons guidelines) is a null value for product title that was discovered. It is important to remove these values, as a blank product title could be anything and we only want to include products that have a clear product title.
el = el[el.product_title   != "(Create a generic Title per Amazons guidelines)"]

In [None]:
#Get a sense of the unique products in the dataset. There are products such as mounts, cases, sleeves, cleaners, cables, etc. that are not electronic devices themselves. I decided to remove these products as reviews for these kinds of products are going to widely differ from reviews for actual electronic devices.
#el['product_title'].unique().tolist()

This next segment is pre-processing. The dataframe was converted into lowercase text to make it easier to work with. Only products that had more than 500 reviews were selected, yielding a total of 679 unique products. Products that were mounts, cases, sleeves, or other items sold in electronics departments that are not devices themselves were removed. Lastly, 60% of the data was randomly selected to work with to make analysis feasible. 

In [None]:
#Keep only products with more than 500 reviews (the filter is a strict > 500),
#then renumber the remaining rows from 0
el=el.groupby('product_title').filter(lambda x:len(x)>500).reset_index(drop=True)
#report how many distinct product titles survive the filter
print('Number of products=>',len(el['product_title'].unique()))

In [None]:
#lowercase the dataframe
# NOTE: every column is cast to str before lowercasing, so after this cell all
# columns (including the numeric ones) hold text.
el = el.apply(lambda x: x.astype(str).str.lower())
el.head()

In [None]:
#remove rows where product is not necessarily an electronic device
el = el[~el['product_title'].str.contains('mount|splitter|hdmi|case|cable|sleeve|adapter|famale|female|male|charger|chord|amazonbasics|spray|cleaner|wipes|batteries')]

In [None]:
#el['product_title'].unique().tolist()

In [None]:
# Randomly keep 60% of the rows; random_state fixes the draw so reruns pick the same rows
el = el.sample(frac=0.6,random_state=200)
# NOTE(review): reset_index() returns a new frame that is only displayed here; the result
# is not assigned, so `el` keeps the index labels it had after sampling.
el.reset_index()

In [None]:
#look at some product reviews to get a sense of what they are like
#for index,text in enumerate(el['review_body'][35:40]):
  #print('Review %d:\n'%(index+1),text)

In [None]:
#peak at unique products that remaine after cleaning. 
#el['product_title'].unique().tolist()

This next segment is where exploratory data analysis and feature engineering were conducted. The number of reviews for the top and bottom 20 products was calculated and plotted. Next, review bodies were cleaned by expanding contractions and removing punctuation. This was done so that polarity, helpfulness ratio, frequency of words within reviews, etc. could be analyzed accurately. Word clouds were generated for each product to understand which words appear most frequently in the reviews for that product. Other graphs examined the helpfulness of reviews in relation to star ratings and review text length. 

In [None]:
# Review counts per product after the junk products have been filtered out
# (value_counts orders them most-reviewed first); chart the 20 leaders.
products = el["product_title"].value_counts()
top_20 = products.iloc[:20]
plt.figure(figsize=(12,8))
ax = top_20.plot(kind='bar')
ax.set_title("Number of Reviews for Top 20 Products")
ax.set_xlabel('Product Title')
ax.set_ylabel('Number of Reviews')

In [None]:
# Review counts per product after the junk products have been filtered out
# (value_counts orders them most-reviewed first); chart the 20 at the tail.
products = el["product_title"].value_counts()
bottom_20 = products.iloc[-20:]
plt.figure(figsize=(12,8))
ax = bottom_20.plot(kind='bar')
ax.set_title("Number of Reviews for Bottom 20 Products")
ax.set_xlabel('Product Title')
ax.set_ylabel('Number of Reviews')

In [None]:
# Dictionary of English Contractions
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

# Regular expression for finding contractions.
# - Alternatives are tried left to right, so longer keys are listed first; otherwise
#   "can't" wins over "can't've" and a dangling "'ve" is left behind.
# - Keys are escaped so they are always matched literally.
# - Matching ignores case: the review text was lowercased earlier in the notebook,
#   while several keys ("I'm", "I've", ...) start with a capital letter.
contractions_re=re.compile(
    '(%s)' % '|'.join(re.escape(key) for key in sorted(contractions_dict, key=len, reverse=True)),
    flags=re.IGNORECASE,
)

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    """Return `text` with every recognised contraction replaced by its expansion.

    Parameters
    ----------
    text : str
        Text to process.
    contractions_dict : dict
        Maps a contraction to its expansion. Matches are found with the
        module-level `contractions_re`, which is built from the default dictionary.

    Returns
    -------
    str
        The text with contractions expanded. A match that has no entry in
        `contractions_dict` (in any of the tried casings) is left unchanged.
    """
    def replace(match):
        found = match.group(0)
        # The regex is case-insensitive, so try the text as written, then the
        # lowercase and capitalised spellings used by the dictionary keys.
        for candidate in (found, found.lower(), found.capitalize()):
            if candidate in contractions_dict:
                return contractions_dict[candidate]
        return found
    return contractions_re.sub(replace, text)

# Expanding Contractions in the reviews
el['review_body']=el['review_body'].apply(lambda x:expand_contractions(x))

In [None]:
#lowercase review bodies
el['cleaned_review_bodies']=el['review_body'].apply(lambda x: x.lower())

In [None]:
#remove digits and words containing digits
# raw string: \w and \d are regex classes, not Python string escape sequences
el['cleaned_revs'] = el['cleaned_review_bodies'].apply(lambda x: re.sub(r'\w*\d\w*','',x))

In [None]:
#Remove punctuations
# Continue from 'cleaned_revs' (digits already stripped). Reading from
# 'cleaned_review_bodies' here overwrote the column and undid the digit removal.
el['cleaned_revs']=  el['cleaned_revs'].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))

In [None]:
# Removing extra spaces
el['cleaned_revs']=el['cleaned_revs'].apply(lambda x: re.sub(' +',' ',x))

In [None]:
#preview of cleaned review body text
#for index,text in enumerate(el['cleaned_revs'][35:40]):
  #print('Review %d:\n'%(index+1),text)

In [None]:
pip install -U pip setuptools wheel

In [None]:
#Installation of spaCy to access language analytical tools 
import spacy

In [None]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl

In [None]:
# Loading model
# The dependency parser and named-entity recogniser are switched off because only
# lemmas and stop-word flags are read below.
nlp = spacy.load('en_core_web_sm',disable=['parser', 'ner'])

# Lemmatization with stopwords removal
# nlp.pipe streams the reviews through the pipeline in batches, which avoids the
# per-call overhead of running nlp() separately on every row.
el['lemmatized'] = [
    ' '.join(token.lemma_ for token in doc if not token.is_stop)
    for doc in nlp.pipe(el['cleaned_revs'])
]


In [None]:
#group reviews according to products

el_grouped=el[['product_title','lemmatized']].groupby(by='product_title').agg(lambda x:' '.join(x))
el_grouped.head()

In [None]:
# Creating Document Term Matrix: one row per product, one column per word,
# cells hold how often the word occurs in that product's lemmatized reviews.
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(analyzer='word')
data=cv.fit_transform(el_grouped['lemmatized'])
# Newer scikit-learn only offers get_feature_names_out(); older releases only
# offer get_feature_names(). Use whichever this installation provides.
if hasattr(cv, 'get_feature_names_out'):
    vocabulary = cv.get_feature_names_out()
else:
    vocabulary = cv.get_feature_names()
el_dtm = pd.DataFrame(data.toarray(), columns=vocabulary)
el_dtm.index=el_grouped.index
el_dtm.head(3)

In [None]:
pip install wordcloud

In [None]:
pip install -U textwrap3

In [None]:
# Importing wordcloud for plotting word clouds and textwrap for wrapping longer text
from wordcloud import WordCloud
from textwrap import wrap

# Function for generating word clouds
def generate_wordcloud(data,title):
  """Draw a word cloud for `data` (word -> frequency) with `title` above it.

  The title is wrapped at 60 characters so long product names stay readable.
  The figure is shown immediately; nothing is returned.
  """
  cloud = WordCloud(width=400, height=330, max_words=150,colormap="Dark2")
  cloud.generate_from_frequencies(data)
  wrapped_title = '\n'.join(wrap(title,60))
  plt.figure(figsize=(10,8))
  plt.imshow(cloud, interpolation='bilinear')
  plt.axis("off")
  plt.title(wrapped_title,fontsize=13)
  plt.show()
  
# Plotting word cloud for each product. Run if you would like to see these!
# Each row of the document-term matrix holds the word counts of one product, so
# the rows are iterated directly. Transposing el_dtm in place (as before) flipped
# the matrix on every run, which broke the cell when it was executed a second time.
for product, term_counts in el_dtm.iterrows():
    generate_wordcloud(term_counts.sort_values(ascending=False), product)

In [None]:
pip install -U textblob

In [None]:
from textblob import TextBlob
el['polarity']= el['lemmatized'].apply(lambda x:TextBlob(x).sentiment.polarity)

In [None]:
# The three reviews with the highest polarity score. They are picked by rank,
# not at random, so the heading no longer claims a random draw.
print("3 Reviews with Highest Polarity:")
top_idx = el['polarity'].sort_values(ascending=False).index[:3]
for position, review in enumerate(el.loc[top_idx, 'review_body'], start=1):
    print('Review {}:\n'.format(position), review)

In [None]:
# The three reviews with the lowest polarity score. They are picked by rank,
# not at random, so the heading no longer claims a random draw.
print("3 Reviews with Lowest Polarity:")
bottom_idx = el['polarity'].sort_values(ascending=True).index[:3]
for position, review in enumerate(el.loc[bottom_idx, 'review_body'], start=1):
  print('Review {}:\n'.format(position), review)

In [None]:
#polarity plot of 20 products with lowest polarity
product_polarity_sorted=pd.DataFrame(el.groupby('product_title')['polarity'].mean().sort_values(ascending=True)[:20])

plt.figure(figsize=(16,8))
plt.xlabel('Polarity')
plt.ylabel('Products')
plt.title('Polarity of Different Amazon Product Reviews')
polarity_graph=plt.barh(np.arange(len(product_polarity_sorted.index)),product_polarity_sorted['polarity'],color='blue',)

# Label every bar with its product name (near the axis) and its mean polarity
# (just past the bar end). For a horizontal bar the vertical centre is
# y + height/2; the bar *width* is the polarity value and must not be used for y.
for bar,(product,polarity) in zip(polarity_graph,product_polarity_sorted['polarity'].items()):
  bar_centre = bar.get_y()+bar.get_height()/2
  plt.text(0.005,bar_centre,'{}'.format(product),va='center',fontsize=11,color='white')
  plt.text(bar.get_width()+0.001,bar_centre,'%.3f'%polarity,va='center',fontsize=11,color='black')

plt.yticks([])
plt.show()

In [None]:
#polarity plot of 20 products with highest polarity
product_polarity_sorted=pd.DataFrame(el.groupby('product_title')['polarity'].mean().sort_values(ascending=False)[:20])

plt.figure(figsize=(16,8))
plt.xlabel('Polarity')
plt.ylabel('Products')
plt.title('Polarity of Different Amazon Product Reviews')
polarity_graph=plt.barh(np.arange(len(product_polarity_sorted.index)),product_polarity_sorted['polarity'],color='orange',)

# Label every bar with its product name (near the axis) and its mean polarity
# (just past the bar end). For a horizontal bar the vertical centre is
# y + height/2; the bar *width* is the polarity value and must not be used for y.
for bar,(product,polarity) in zip(polarity_graph,product_polarity_sorted['polarity'].items()):
  bar_centre = bar.get_y()+bar.get_height()/2
  plt.text(0.005,bar_centre,'{}'.format(product),va='center',fontsize=11,color='white')
  plt.text(bar.get_width()+0.001,bar_centre,'%.3f'%polarity,va='center',fontsize=11,color='black')

plt.yticks([])
plt.show()

You are able to get reading levels for the reviews. A reason why you may want to do this is to see whether people write more complex reviews for 1 star vs. 5 star reviews. 

The Dale–Chall readability formula is a readability test that provides a numeric gauge of the comprehension difficulty that readers come upon when reading a text. It uses a list of 3000 words that groups of fourth-grade American students could reliably understand, considering any word not on that list to be difficult.


The Flesch–Kincaid readability tests are readability tests designed to indicate how difficult a passage in English is to understand. The Flesch Reading-Ease test uses word length and sentence length, along with weighting factors, to determine reading ease. In the Flesch reading-ease test, higher scores indicate material that is easier to read; lower numbers mark passages that are more difficult to read. 

The Gunning fog index is a readability test for English writing. The index estimates the years of formal education a person needs to understand the text on the first reading. For instance, a fog index of 12 requires the reading level of a United States high school senior (around 18 years old).

To do this analysis, you have to import textstat. Textstat is an easy-to-use library for calculating statistics from text. It helps determine readability, complexity, and grade level.

Running this analysis is really time consuming. This is probably best to do with small chunks of the data, rather than a large dataset. 

In [None]:
#import textstat

#el['dale_chall_score']=el['review_body'].apply(lambda x: textstat.dale_chall_readability_score(x))
#el['flesh_reading_ease']=el['review_body'].apply(lambda x: textstat.flesch_reading_ease(x))
#el['gunning_fog']=el['review_body'].apply(lambda x: textstat.gunning_fog(x))

#print('Dale Chall Score of upvoted reviews=>',el[el['helpful_votes']>1]['dale_chall_score'].mean())
#print('Dale Chall Score of not upvoted reviews=>',el[el['helpful_votes']<=1]['dale_chall_score'].mean())

#print('Flesch Reading Score of upvoted reviews=>',el[el['helpful_votes']>1]['flesh_reading_ease'].mean())
#print('Flesch Reading Score of not upvoted reviews=>',el[el['helpful_votes']<=1]['flesh_reading_ease'].mean())

#print('Gunning Fog Index of upvoted reviews=>',el[el['helpful_votes']>1]['gunning_fog'].mean())
#print('Gunning Fog Index of not upvoted reviews=>',el[el['helpful_votes']<=1]['gunning_fog'].mean())


In [None]:
# Star rating, helpful votes and total votes are held as objects at this point;
# cast them to integers so calculations and comparisons can be performed on them.
for count_col in ("star_rating", "helpful_votes", "total_votes"):
    el[count_col] = el[count_col].astype(str).astype(int)

In [None]:
j_1 = el[(el['product_title']=="beats solo hd over-ear headphone") & (el['star_rating']>3)]
j_1

In [None]:
# NOTE(review): 381279 is a hard-coded index label, presumably read off the j_1 table
# displayed by the previous cell — confirm it still exists whenever the data or the
# sampling changes, since a missing label makes this lookup fail.
print(j_1['cleaned_review_bodies'][381279])

In [None]:
from nltk.tokenize import RegexpTokenizer
def RegExpTokenizer(Sent):
    """Split `Sent` into a list of tokens, each a run of word characters."""
    return RegexpTokenizer(r'\w+').tokenize(Sent)

# Tokenize every review in j_1 (each value is coerced to str first)
ListWords1 = [RegExpTokenizer(str(review_text)) for review_text in j_1['cleaned_review_bodies']]
# show the tokens of the second review
print(ListWords1[1])

In [None]:
#All Words
from nltk import FreqDist
def Bag_Of_Words(ListWords1):
    """Count how often each lowercased token occurs across all token lists.

    `ListWords1` is an iterable of token lists. The result is a FreqDist that
    maps every lowercased token to its number of occurrences.
    """
    return FreqDist(token.lower() for tokens in ListWords1 for token in tokens)

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
all_words3 = Bag_Of_Words(ListWords1)
plt.figure(figsize=(15,10))
# Generate a word cloud image.
# The cloud is sized by the actual token counts. Joining the distinct keys into one
# string (as before) presented every word exactly once, so the frequencies that
# Bag_Of_Words computed were thrown away. Stop words are filtered explicitly
# because generate_from_frequencies() uses the counts exactly as given.
word_frequencies = {word: n for word, n in all_words3.items() if word not in STOPWORDS}
wordcloud = WordCloud(background_color='white',max_font_size=40).generate_from_frequencies(word_frequencies)

# Display the generated image:
# the matplotlib way:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# number of distinct tokens (stop words included)
print("Combien de Mots !!!",len(all_words3))

In [None]:
plt.figure(figsize = (8,6))

import seaborn as sns
from sklearn.manifold import TSNE
all_words5 = Bag_Of_Words(ListWords1)
# Ten most frequent tokens and their counts, most frequent first
Words, count = [], []
for word, occurrences in all_words5.most_common(10):
    Words.append(word)
    count.append(occurrences)
sns.set_style("darkgrid")
# x and y are passed by keyword: recent seaborn releases reject them positionally
sns.barplot(x=Words, y=count)

In [None]:

# Total number of reviews for each star rating
plt.figure(figsize = (10,6))
# the column is passed as `x` by keyword so every seaborn version treats it as the x variable
sns.countplot(x=el['star_rating'])
plt.title('Total Review Numbers for Each Rating', color='r')
plt.xlabel('Rating')
plt.ylabel('Number of Reviews')
plt.show()

# Review totals for each star rating
el['star_rating'].value_counts()

In [None]:
%matplotlib inline
plt.figure(figsize = (10,6))

# Share of reviews per star rating. The second wedge is pulled out for emphasis;
# the offsets are sized from the data so the plot also works when fewer than
# five distinct ratings are present.
star_counts = el.groupby('star_rating').star_rating.count()
wedge_offsets = tuple(0.1 if position == 1 else 0 for position in range(len(star_counts)))
star_counts.plot(kind='pie',autopct='%1.1f%%',startangle=90,explode=wedge_offsets,)

In [None]:
# Number of whitespace-separated words in each cleaned review, in row order
word_count = [len(str(review_text).split()) for review_text in el.cleaned_review_bodies]

In [None]:
plt.figure(figsize = (8,6))

import seaborn as sns
import matplotlib.pyplot as plt
sns.boxplot(x="star_rating",y=word_count,data=el)
plt.xlabel('Rating')
plt.ylabel('Review Length')

plt.show()

In [None]:
#Since there are outliers in the above boxplot we are not able to clearly visualize.So remove the outliers 
plt.figure(figsize = (8,6))

sns.boxplot(x="star_rating",y=word_count,data=el,showfliers=False)
plt.xlabel('Rating')
plt.ylabel('Review Length')

plt.show()

In [None]:

# Total number of reviews for every year (all products in el)

plt.figure(figsize = (12,8))
# the column is passed as `x` by keyword so every seaborn version treats it as the x variable
sns.countplot(x=el['year'])
plt.title('Total Review Numbers for Each Year', color='r')
plt.xlabel('year')
plt.ylabel('Number of Reviews')
plt.show()

# Review totals for each year
el['year'].value_counts()

It may be of interest to understand the quantity of unique customers in the dataset. Do we see more customers year after year? How many customers leave 1-star vs 5-star ratings?

In [None]:
# How many unique customers do we have in the dataset?
print('Number of unique customers: {}'.format(len(el['customer_id'].unique())))

In [None]:
# How many unique reviews do we have in the dataset?
print('Number of unique reviews: {}'.format(len(el['cleaned_review_bodies'].unique())))

In [None]:
# How many unique customers in each year?
unique_cust = el.groupby('year')['customer_id'].nunique()

# Plot unique customer numbers in each year
plt.figure(figsize = (10,6))
unique_cust.plot(kind='bar', rot = 0, color = 'blue')
plt.title('Unique Customers in Each Year', color='gray', size = 14)
plt.xlabel('Year')
plt.ylabel('Unique Customer Numbers')
plt.show()

# Print unique customer numbers in each year
print(unique_cust)

In [None]:
# Per star rating: how many distinct customers gave it and how many reviews carry it.
# Both figures come from the same groupby, so each row is guaranteed to describe a
# single rating. Sorting the review counts by size (as before) attached counts to
# the wrong star rating whenever the counts did not grow with the rating.
per_rating = el.groupby('star_rating')['customer_id'].agg(['nunique', 'size'])

uniq_cust_rate = pd.DataFrame({'star_rating': [str(rating) for rating in per_rating.index],
                               'number_of_customers': per_rating['nunique'].tolist(),
                               'number_of_reviews': per_rating['size'].tolist()})
print(uniq_cust_rate)

We may also want to understand how many unique products there are in the dataset. How many unique products are there each year?

In [None]:
# Print number of unique electronic products in the dataset
print('Number of unique electronic products: {}'.format(len(el['product_id'].unique())))

In [None]:
# How many unique electronic products in each year?
unique_prod = el.groupby('year')['product_id'].nunique()

# Plot unique product numbers in each year
plt.figure(figsize = (10,6))
unique_prod.plot(kind='bar', color = 'blue', rot =0)
plt.title('Unique Products in Each Year', color = 'gray', size = 14)
plt.xlabel('Year')
plt.ylabel('Unique Product Numbers')
plt.show()

# Print unique product numbers in each year
print(unique_prod)

Feature engineering: helpfulness ratio. The helpfulness ratio is calculated by taking the number of helpful votes and dividing it by the total number of votes. This tells you what proportion of all votes cast on a review marked it as helpful. This feature may differ across star ratings and across products. It may also have some impact on how reviews are structured. Let's find out. 

In [None]:
el['helpfulness_proportion'] = el['helpful_votes']/el['total_votes']

In [None]:
## How helpful are star ratings?

plt.figure(figsize = (10,6))
el.groupby('star_rating')['helpfulness_proportion'].mean().plot(kind='bar', color=['black', 'red', 'green', 'blue', 'cyan']) 
plt.title("Helpfulness in rating",color='r')
plt.xlabel("Star Rating")
plt.ylabel("Helpfulness ratio")
plt.ylim([0, 1])
plt.xticks(rotation=0)
plt.show()

In [None]:
## Distribution of the average helpfulness ratio across products

plt.figure(figsize = (8,6))

el1 = el.copy()
# keep only the rows whose helpfulness ratio is a finite number
el1 = el1[np.isfinite(el1['helpfulness_proportion'])]
grp = el1.groupby('product_id')
counts = grp.product_id.count()        # number of reviews for each product
means = grp.helpfulness_proportion.mean()     # average helpfulness ratio for each product

# histogram restricted to products with more than 5 of these reviews
means[counts > 5].hist(bins=10, edgecolor='w', lw=1)
plt.xlabel("Average Helpfullness per product")
plt.ylabel("Number of products")
plt.show()

In [None]:
## Helpfulness based on length of text

plt.figure(figsize = (15,8))

el1['review_length'] = el1.cleaned_review_bodies.apply(len)
maxTextLen = max(el1.review_length)
el1.groupby(pd.cut(el1['review_length'], np.arange(0,maxTextLen+1000,1000)))['helpfulness_proportion'].mean().plot(kind='bar',color='blue')
plt.xlabel("length of review text")
plt.ylabel("Helpfulness ratio")
plt.title("Relationship between 'Helpfulness ratio' and 'Length of review text'")
plt.ylim([0, 1])

plt.show()

In [None]:
## How long are reviews typically and how many of them are there?

# Length of every review in characters (len() of the cleaned text)
review_length = el["cleaned_review_bodies"].dropna().map(len)
# One figure only: the extra plt.figure() call that used to precede this one
# just produced an empty canvas in the output.
plt.figure(figsize=(12,8))
# reviews of 2000 characters or more are left out so the bulk of the distribution is visible
review_length.loc[review_length < 2000].hist()
plt.title("Distribution of Review Length")
plt.xlabel('Review length')
plt.ylabel('Number of Reviews')

In [None]:
## check helpfulness ratio for reviews up to 2000 characters long

plt.figure(figsize = (15,8))

# review length in characters
el1['review_length'] = el1.cleaned_review_bodies.apply(len)
maxTextLen = 2000
# Bin edges 0, 100, ..., 2000. The stop value of np.arange is exclusive, so one
# step is added to it; without that the last edge is 1900 and reviews between
# 1900 and 2000 characters fall outside every bin.
length_bins = np.arange(0, maxTextLen + 100, 100)
el1.groupby(pd.cut(el1['review_length'], length_bins))['helpfulness_proportion'].mean().plot(kind='bar',color='blue')
plt.xlabel("length of review text")
plt.ylabel("Helpfulness ratio")
plt.title("Relationship between 'Helpfulness ratio' and 'Length of review text < 2000'")
plt.ylim([0, 1])

In [None]:
#label helpfulness
def label_Helpfulness(row):
    """Classify one row by its helpfulness ratio.

    Returns 'helpful' when row['helpfulness_proportion'] is strictly greater
    than 0.75 and 'unhelpful' in every other case.
    """
    return 'helpful' if row['helpfulness_proportion'] > 0.75 else 'unhelpful'

el1['helpIndx'] = el1.apply(label_Helpfulness, axis=1)

el1.head(2)

In [None]:
#Text length

plt.figure(figsize = (15,8))

subset_el1 = el1[(el1.star_rating>=2) & (el1.review_length>=2)]

subset_el1.review_length[subset_el1.review_length<2000].plot(bins=50, kind='hist',alpha=0.2,color='blue')
dhelp = subset_el1.review_length[(subset_el1['helpIndx']=='helpful') & (subset_el1.review_length<2000)]
dunhelp = subset_el1.review_length[(subset_el1['helpIndx']=='unhelpful') & (subset_el1.review_length<2000)]

dhelp.plot(bins=50, kind='hist', alpha=0.7)
dunhelp.plot(bins=50, kind='hist',alpha=0.7)

plt.legend(['Overall', 'Helpfulness', 'Unhelpfulness'])
plt.xlabel("length of text")
plt.ylabel("Frequency")
plt.title("Frequency of text length for helpfulness and unhelpfulness")
plt.show()

Feature engineering: rating_class. Rating class can help us see trends in reviews over time. We will have to calculate it based on star reviews. This feature will be used to examine the good ratings year after year. 

In [None]:
def sentiments(star_rating):
    """Map a star rating to a rating class.

    4 and 5 give "Positive", 3 gives "Neutral", 1 and 2 give "Negative".
    Any other value gives None.
    """
    if star_rating in (4, 5):
        return "Positive"
    if star_rating == 3:
        return "Neutral"
    if star_rating in (1, 2):
        return "Negative"
    return None
# Add sentiments to the data
el["rating_class"] = el["star_rating"].apply(sentiments)
el["rating_class"][:20]

In [None]:
# Let's see how the rating classes change for each year in a table and graph.
# Create a dataframe which shows, for each year, the number of reviews and the
# share of positively rated ones.

# Positive -> 1, Negative -> 0. 'Neutral' is not in the mapping and becomes NaN,
# so neutral reviews count towards the total but not towards the sum or mean.
el['rating_class_num'] = el['rating_class'].map({'Positive': 1, 'Negative': 0})

feature = el.groupby('year').agg({'rating_class_num':['size', 'sum', 'mean']})
feature.columns = [' '.join(col).strip() for col in feature.columns.values]
feature = feature.reset_index()
# the mean of a 0/1 column is a fraction; scale it to a percentage
feature['rating_class_num mean'] = feature['rating_class_num mean']*100
# The aggregated column is the rating class, not helpful votes, so the columns are
# labelled accordingly.
feature.columns = ['Year', 'Total Reviews', 'Positive Reviews',
                       '% of Positive Reviews in This Year']
# The percentage column is deliberately not touched again. It used to be overwritten
# with per-review helpful votes divided by a hard-coded total; those values were
# aligned to this per-year table by row label and replaced the yearly percentage
# with unrelated numbers.

feature

In [None]:
# Bars: total reviews per year. Line: percentage of positively rated reviews per year.
##################################################################

# Positive -> 1, Negative -> 0; 'Neutral' is not in the mapping and becomes NaN
el['rating_class_num'] = el['rating_class'].map({'Positive': 1, 'Negative': 0})

feature = el.groupby('year').agg({'rating_class_num':['size', 'sum', 'mean']})
feature.columns = [' '.join(col).strip() for col in feature.columns.values]
feature = feature.reset_index()
# the mean of a 0/1 column is a fraction; scale it to a percentage
feature['rating_class_num mean'] = feature['rating_class_num mean']*100
# the aggregated column is the rating class, so label the columns accordingly
feature.columns = ['Year', 'Total Reviews', 'Positive Reviews',
                       '% of Positive Reviews in This Year']
# plt.subplots() creates the figure; a preceding plt.figure() call only added an
# empty canvas to the output.
fig0, ax1 = plt.subplots(figsize = (12,6))
# second y-axis sharing the same x-axis, used for the percentage line
ax2 = ax1.twinx()
feature.set_index(feature['Year'], drop=True, inplace=True)
feature["Total Reviews"].plot(kind = 'bar', stacked = True, ax = ax1, colormap = 'summer', rot=0)
# use_index=False draws the line against positions 0..n-1, which is where the bars sit
feature['% of Positive Reviews in This Year'].plot(use_index = False,
                                                         kind = 'line', ax = ax2, colormap='Spectral')

plt.title('Good Ratings Compared to the Each Year', color='r')
ax1.set_xlabel('Year')
ax1.yaxis.label.set_color('blue')
ax1.set_ylabel("Total Reviews")
ax2.set_ylabel("Good Rating Percentage for Each Year")
ax2.yaxis.label.set_color('red')
plt.tight_layout()
plt.show()


In the early 2000's, people were still reliant on hard retail stores. This changed as the internet became a source of retail, which explains how in 2000 there was only 1 review and as time progressed, more people were giving feedback on their experiences of online retail. We see that from 2009-2015, good rating review percentages appear to stabilize around 85-86% of all reviews given. 

It's helpful to understand which numeric attributes are correlated with one another. A cross correlation heatmap was generated to provide insight as to which numeric attributes are highly correlated to one another.

In [None]:
# Plotting correlation matrix between numeric variables
plt.figure(figsize = (14,14))
# Select the numeric columns explicitly: most columns of el hold text, and recent
# pandas versions raise on them in corr() instead of silently skipping them.
numeric_cols = el.select_dtypes(include='number')
sns.heatmap(numeric_cols.corr(method="pearson"), cmap='Blues', annot = True)

Seaborn allows you to plot every pair of numeric attributes against each other to get a holistic sense of how they are correlated. 

In [None]:
sns.pairplot(el)

Due to computational considerations, I will reduce the number of observations. I will drop good rating class reviews longer than 250 characters (the length measure used in the code below), and I will drop all observations earlier than the year 2010.

In [None]:
# review length in characters (len() of the cleaned text)
el['review_length'] = el.cleaned_review_bodies.apply(len)

# Drop positively rated reviews whose length exceeds 250.
# rating_class only ever holds 'Positive', 'Neutral' or 'Negative'; the earlier
# comparison against 'good' never matched, so no row was dropped.
el1 = el.drop(el[(el['review_length'] > 250) & (el['rating_class'] == 'Positive')].index)

In [None]:
# Discard reviews from before 2010 (the comparison is made against the string
# '2010'), then keep at most the first 15000 remaining rows.
before_2010 = el1['year'] < str(2010)
el2 = el1.drop(el1[before_2010].index).head(15000)
el2.info()

In [None]:
el2.head(50)

In [None]:
el2['review_length'].describe()

In [None]:
# Create a new data frame with clean text and rating class number
el3 = el2[["cleaned_review_bodies", "rating_class_num"]].reset_index()
el3.head(10)

In [None]:
# Initialize the countervectorizer
countVec = CountVectorizer(ngram_range=(1, 2),binary=True)

In [None]:
el3.info()

In [None]:
# Fit the 'clean_text' to countvectorizer
countVec.fit(el3["cleaned_review_bodies"])

In [None]:
# Transform the matriz
transformed_matrix = countVec.transform(el3["cleaned_review_bodies"])

In [None]:
# Convert matrix to array
transformed_matrix.toarray()

In [None]:
# Extracting the feature names
# Newer scikit-learn only offers get_feature_names_out(); older releases only
# offer get_feature_names(). Either way `names` ends up as a plain list.
if hasattr(countVec, 'get_feature_names_out'):
    names = list(countVec.get_feature_names_out())
else:
    names = list(countVec.get_feature_names())

In [None]:
# Adding a 'rating' column from previous dataframe's rating value
el4 = pd.DataFrame(transformed_matrix.toarray(), columns=names)
el4['rating'] = el3['rating_class_num']

In [None]:
#el4['rating'].head()

In [None]:
# Create lists for forming a dataframe summary.
# For every n-gram: the mean rating class of the reviews that contain it, and how
# many of those reviews have a rating class. This is computed with sparse-matrix
# algebra on the binary document-term matrix instead of filtering the dense frame
# once per n-gram, which is the same arithmetic at a fraction of the cost.
ratings = el3['rating_class_num'].to_numpy(dtype=float)
has_rating = ~np.isnan(ratings)          # rows without a class are ignored, as mean()/count() ignore NaN
presence = transformed_matrix[has_rating]   # 1 where the review contains the n-gram
per_feature_count = np.asarray(presence.sum(axis=0)).ravel()
per_feature_sum = np.asarray(presence.T @ ratings[has_rating]).ravel()
with np.errstate(divide='ignore', invalid='ignore'):
    # an n-gram that occurs in no rated review gets NaN, like the mean of an empty selection
    per_feature_mean = per_feature_sum / per_feature_count

# The n-gram 'rating' is skipped, exactly as before: in el4 that column name holds
# the rating class rather than word presence.
keep = np.array([name != 'rating' for name in names], dtype=bool)
feature_names = [name for name, kept in zip(names, keep) if kept]
avg_ratings = per_feature_mean[keep].tolist()
rating_counts = per_feature_count[keep].tolist()

In [None]:
##### Create a new dataframe from words, average ratings, and rating counts
el_summary = pd.DataFrame({'feature_name':feature_names, 'avg_rating': avg_ratings, 'rating_count':rating_counts})

In [None]:
# Let's see the new dataframe
el_summary

In [None]:
# Words that are commonly used in the reviews which have good ratings
el_good = el_summary.query("rating_count > 20").sort_values(by='avg_rating', ascending=False)[4:50]
el_good.head(50)

In [None]:
wc_good = dict(zip(el_good['feature_name'].tolist(), el_good['avg_rating'].tolist()))

In [None]:
# Generate a word cloud image
wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                min_font_size = 10).generate_from_frequencies(wc_good)
 
# plot the WordCloud image                       
plt.figure(figsize = (20, 20), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

In [None]:
# Words that are commonly used in the reviews which have bad ratings
el_bad = el_summary.query("rating_count > 10").sort_values(by= 'avg_rating', ascending=True)[:12]

In [None]:
el_bad.sort_values(by=['avg_rating'],ascending=False,inplace=True)
el_bad

In [None]:
# Pair each n-gram with its average rating. Both columns come from el_bad; the
# name `df_bad` used here before is not defined anywhere in the notebook.
wc_bad = dict(zip(el_bad['feature_name'].tolist(), el_bad['avg_rating'].tolist()))

In [None]:
# Generate a word cloud image

wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                min_font_size = 10).generate_from_frequencies(wc_bad)
 
# plot the WordCloud image                       
plt.figure(figsize = (20, 20), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

In [None]:
el.info()

The result of doing this cleaning, pre-processing, and analytical work will allow me to then perform sentiment analysis and product recommendation in the second portion of this class by using Naive Bayes, Random Forest, Logistic Regression, and some unsupervised learning methods. 

In [None]:
#Sources and guides for techniques employed for this project. 
#https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/
#https://towardsdatascience.com/sentiment-analysis-and-product-recommendation-on-amazons-electronics-dataset-reviews-part-1-6b340de660c2
#https://towardsdatascience.com/sentiment-analysis-on-amazon-reviews-45cd169447ac