## Customer Sentiment Analysis

Name: Sau Yee Yiu

Python Version: Python 3

In this program, a data exploration is conducted on the collected Myer customer reviews dataset ("myer_customer_reviews.csv") to get an initial understanding of customer sentiment. Then a sentiment analysis model is implemented and applied to the actual "review" text in the collected data to classify customer reviews as positive, negative or neutral.

In [None]:
import findspark
findspark.init()

from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import pandas as pd
import string

Step 1: Create a Spark Session

Create a SparkSession that loads settings from system properties.

In [None]:
# Reuse the active SparkSession if one exists; otherwise build a new one.
spark = SparkSession.builder.getOrCreate()

Step 2: Load the dataset 


In [None]:
# Explicit schema for the reviews CSV; these column names are the ones the
# later cells select on.
schema = StructType([
    StructField("reviewer name", StringType(), True),
    StructField("number of reviews by reviewer", IntegerType(), True),
    StructField("review title", StringType(), True),
    StructField("review rating", StringType(), True),
    StructField("review", StringType(), True),
])

# .csv() already selects the CSV data source and an explicit schema switches
# schema inference off, so the previous .format(...) and inferSchema options
# had no effect and are omitted here.
reviews_data = (
    spark.read
    .option("delimiter", ",")
    .option("header", "True")
    .schema(schema)
    .csv("/content/drive/My Drive/Colab Notebooks/myer_customer_reviews.csv")
)

Step 3: Initial Data Exploration  - Fetch “Number of Reviews Written”

Fetch column: “Number of Reviews Written” to analyse the number of reviews written by every customer who submitted a review of the Myer store on the TrustPilot website.

In [None]:
# Flatten the single selected column into an RDD of plain values.
reviewscount_rdd = reviews_data.select("number of reviews by reviewer").rdd.flatMap(lambda x: x)
# Let Spark compute the maximum rather than collecting every value to the driver.
max_count = reviewscount_rdd.max()

#visualise the output
# histogram(n) splits the value range into n evenly spaced buckets and
# returns (bucket boundaries, per-bucket counts).
bins, counts = reviewscount_rdd.histogram(max_count)
print(counts)
plt.hist(bins[:-1], bins=bins, weights=counts)
plt.xlabel('number of reviews posted')
plt.ylabel('number of reviewers')

Fetch “review rating”
    
Fetch column: “review rating” to analyse customers' overall rating to Myer.

In [None]:
# Flatten the "review rating" column into an RDD of plain values.
reviewrate_rdd = reviews_data.select("review rating").rdd.flatMap(lambda x: x)

#visualise the output
# countByValue returns a {rating: occurrences} mapping on the driver.
rrate = reviewrate_rdd.countByValue()
print(rrate)

# One bar per distinct rating, labelled with the rating value.
positions = range(len(rrate))
plt.bar(positions, list(rrate.values()), align='center')
plt.xticks(positions, list(rrate.keys()))
plt.xlabel('customer review rating')
plt.ylabel('number of reviewers')

Step 4: Fetch column “review" and prepare the text corpus for sentiment analysis


In [None]:
reviews_rdd = reviews_data.select("review").rdd.flatMap(lambda x: x)
# The column is nullable; skip missing reviews because None has no .lower()
# and a single null row would otherwise fail the whole job.
lowerCase_reviewsrdd = (
    reviews_rdd
    .filter(lambda x: x is not None)
    .map(lambda x: x.lower())
)
# Preview the first few normalised reviews (displayed as the cell output).
lowerCase_reviewsrdd.take(10)


Step 5: Sentiment Analysis

In [None]:
# The VADER lexicon must be available before the analyzer is constructed.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# One dict of polarity scores per review; the transformation is lazy, so no
# Spark job runs until a later action asks for results.
sentimentRDD = lowerCase_reviewsrdd.map(lambda x: analyzer.polarity_scores(x))

# Keys of the polarity-score dict that are kept for the analysis.
field_list = ['compound']


def filterlist(x):
    """Return a new dict holding only the entries of ``x`` whose key is in ``field_list``."""
    return {key: x.get(key) for key in x if key in field_list}

#use the compound scores only
rdd_subset = sentimentRDD.map(filterlist)
# take(3) fetches just a small sample; collecting the full RDD is not needed here.
print(rdd_subset.take(3))

#use the compound score to determine if the review is positive, negative or neutral
def analyse_sentiment(polarityscore, neutral_band=0.0):
    """Label a polarity-score dict as "positive", "neutral" or "negative".

    Args:
        polarityscore: dict with a numeric 'compound' entry.
        neutral_band: half-width of the interval around zero that is still
            labelled "neutral". The default 0.0 keeps the original rule
            (only an exact 0 is neutral); the VADER documentation uses
            cut-offs around 0.05.

    Returns:
        "positive" when compound > neutral_band, "neutral" when compound is
        within [-neutral_band, neutral_band], otherwise "negative".
    """
    compound = polarityscore.get('compound')  # read once, reused by every branch
    if compound > neutral_band:
        return "positive"
    if -neutral_band <= compound <= neutral_band:
        return "neutral"
    return "negative"

# Map every score dict to its sentiment label.
sentiment_rate = rdd_subset.map(analyse_sentiment)

#visualise the output
# countByValue returns a {label: occurrences} mapping on the driver.
sentiment_results = sentiment_rate.countByValue()
print(sentiment_results)

# One bar per sentiment label found in the data.
positions = range(len(sentiment_results))
plt.bar(positions, list(sentiment_results.values()), align='center')
plt.xticks(positions, list(sentiment_results.keys()))
plt.xlabel('customer review categories')
plt.ylabel('number of reviewers')

Step 6: Extract keywords from the review title for further analyses (optional)

First, tokenise each review title into sentences.

In [None]:
reviewst_rdd = reviews_data.select("review title").rdd.flatMap(lambda x: x)
# The column is nullable; skip missing titles because None has no .lower()
# and a single null row would otherwise fail the whole job.
lowerCase_reviewstrdd = (
    reviewst_rdd
    .filter(lambda x: x is not None)
    .map(lambda x: x.lower())
)
# Tokenizer models used by nltk.sent_tokenize.
nltk.download('punkt')

#define a function for sentence tokenization
def sent_Tokenize(x):
    """Split the text ``x`` into sentences using ``nltk.sent_tokenize``."""
    sentences = nltk.sent_tokenize(x)
    return sentences

# One list of sentences per review title.
sentenceTokensRDD = lowerCase_reviewstrdd.map(sent_Tokenize)

# Preview a small sample (displayed as the cell output); a full collect is not needed.
sentenceTokensRDD.take(10)

Then tokenize each word in all sentences



In [None]:
#define function for tokenise each word in a sentence
def word_Tokenize(x):
    """Flatten an iterable of sentences ``x`` into one list of whitespace-separated words."""
    words = []
    for sentence in x:
        words.extend(sentence.split())
    return words

# One flat list of word tokens per review title.
wordTokensRDD = sentenceTokensRDD.map(word_Tokenize)

# Preview a small sample (displayed as the cell output); a full collect is not needed.
wordTokensRDD.take(10)

Remove all stop words and punctuation from the word token lists.

In [None]:
def removeStopWords(x):
    """Return the tokens of ``x`` that are not English stop words.

    The NLTK English stop-word list is extended with the '…' ellipsis
    character, so a token that is exactly '…' is removed as well.

    Args:
        x: iterable of word tokens (strings).

    Returns:
        A list with the remaining tokens in their original order.
    """
    # Imported here so the name is resolved wherever the function executes.
    from nltk.corpus import stopwords
    # A set gives constant-time membership tests; scanning a list for every
    # token was linear in the size of the stop-word list.
    stop_words = set(stopwords.words('english'))
    stop_words.add('…')
    return [w for w in x if w not in stop_words]

# Apply the stop-word filter to each token list.
stopwordRDD = wordTokensRDD.map(removeStopWords)


def removePunctuations(x):
    """Strip ``string.punctuation`` characters from every token in ``x``.

    Tokens that become empty after stripping are dropped from the result.
    """
    strip_table = str.maketrans('', '', string.punctuation)
    stripped = (token.translate(strip_table) for token in x)
    return [token for token in stripped if token]

# Apply the punctuation filter to each stop-word-free token list.
filteredPunctRDD = stopwordRDD.map(removePunctuations)


Extract the top 30 keywords from filteredPunctRDD

In [None]:
# sortBy can launch a sampling job immediately to work out partition
# boundaries, and that job executes removeStopWords; make sure the stop-word
# corpus is present before that happens (the download is a no-op if it is
# already up to date).
nltk.download('stopwords')

# Count words within each token list, sum the counts per word across all
# lists, then order the (word, count) pairs from most to least frequent.
freqDistRDD = (
    filteredPunctRDD
    .flatMap(lambda x: nltk.FreqDist(x).most_common())
    .reduceByKey(lambda x, y: x + y)
    .sortBy(lambda x: x[1], ascending=False)
)

Step 7: Visualize the output



In [None]:
# Stop-word corpus needed by removeStopWords when the pipeline is evaluated.
nltk.download('stopwords')
df_fDist = freqDistRDD.toDF() #converting RDD to spark dataframe
df_fDist.createOrReplaceTempView("myTable")
# LIMIT alone does not promise which rows come back, so order by frequency
# explicitly to be sure these really are the 30 most frequent keywords.
df2 = spark.sql(
    "SELECT _1 AS Keywords, _2 AS Frequency FROM myTable "
    "ORDER BY Frequency DESC LIMIT 30"
) #renaming columns
Top30words = df2.toPandas() #converting spark dataframes to pandas dataframes
Top30words.plot.barh(x='Keywords', y='Frequency', rot=1, figsize=(10,8))