##### Social Media Analytics
### Introduction to Text Mining
## Sentiment Analysis
 Best Buy by Eva

### Initial setup

In [1]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [2]:
dtypes = {
    'device': 'category',
    'user': 'category',
    'rating': 'float64',
    'text': 'string',
    'data': 'category',
    'ownership_length': 'category',
}

ds = pd.read_excel(
    "ExtractedReviewsDataCollection_bestbuy.xlsx",
    dtype=dtypes,
 )

### Functions

In [3]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    if type(rawText) != str:
        return rawText
    procText = rawText
        
    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText,'html.parser').get_text()

    # Remove punctuation and other special characters
    if len(charsToRemove)>0:
        procText = re.sub(charsToRemove,' ',procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r'\d+',' ',procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace('\n',' ').replace('\r', '')

    # Remove special characters
    if len(specialCharsToRemove)>0:
        procText = re.sub(specialCharsToRemove,' ',procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower() 

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(' +', ' ', procText)

    return procText

In [4]:
# Tokenize words
def tokenize_words(words):
    if (type(words) != str) or (word_tokenize(words) == ''):
        return np.nan
    else:
        return word_tokenize(words)

In [5]:
# Function to create text from words
def recreateText(words):
    if type(words) == list:
        temp_str = (' ').join(words)
        return temp_str
    else:
        return np.nan

In [6]:
# Function to break texts into sentences
def tokenize_sentences(texts):
    s_token = sent_tokenize(texts)
    return s_token

In [7]:
# Function to remove stop words
def removeStopWords(t, stop_words):
    if type(t) == list:
        return [w for w in t if not w in stop_words]
    else:
        return np.nan

### Analysis

In [8]:
def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return ". ".join(sentences)

In [9]:
listOfSentences = ds.text.apply(tokenize_sentences)

In [10]:
# Create a dataframe with only the description
processedReviews = pd.DataFrame(data=ds.text.apply(textPreProcess,charsToRemove ='', removeLineBreaks=False, removeNumbers=False).values, index=ds.index, columns=['PreProcessedText'])



In [11]:
# Check first review
ds.text[0]

'Apple makes the best cellphone on the market hands down'

In [12]:
# Sentences of first review
listOfSentences[0]

'Apple makes the best cellphone on the market hands down'

In [13]:
# Create DataFrame for sentences
sentences = pd.DataFrame(data=[item for elem in listOfSentences for item in elem], columns=['BaseText'])

In [14]:
# Add a column with the review ID
sentencesPerReview = []
for elem in listOfSentences:
  sentencesPerReview.append(len(elem))
sentences['user'] = np.repeat(ds['user'].values,sentencesPerReview)

In [15]:
# Preprocess text 
sentences['PreProcessedText'] = sentences['BaseText'].apply(textPreProcess)



In [16]:
# Get words
sentences['Words'] =  sentences['PreProcessedText'].apply(tokenize_words)

In [17]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Adventure.4.Eva.r\AppData\Roaming\nltk_data..
[nltk_data]     .
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
# Remove stopwords
stop_words = set(stopwords.words('english'))
sentences['WordsCleaned'] = sentences['Words'].apply(removeStopWords,stop_words=stop_words)

In [19]:
# Recreate sentence without stopwords
sentences['ProcessedText'] = sentences['WordsCleaned'].apply(recreateText)

In [20]:
# Create sentiment analysis object
analyser = SentimentIntensityAnalyzer()

In [21]:
# To test, let's evaluate first sentence of first review
# Scales:
#   compound: -1:most extreme negative, 1:most extreme positive
#     positive: compound >=0.05
#     neutral: -0.05<compound<0.05
#     negative: compound <= -0.05
#   pos, neu, neg: proportion of text that are positive, neutral or negative
score = analyser.polarity_scores(sentences['ProcessedText'][0])
print(sentences['ProcessedText'][0],score)

 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound': 0.0}


In [22]:
# Process sentiment for all sentences
all_scores = []
for t in (sentences['ProcessedText'][:]):
  score = analyser.polarity_scores(t)
  all_scores.append(score)
sentences['Sentiment'] = [c['compound'] for c in all_scores]

In [23]:
# Compute review's sentiment as the mean sentiment from its sentences
meanByReview = sentences.groupby('user')['Sentiment'].mean()

# Consider sentences with no result as neutral (0)
meanByReview = meanByReview.fillna(0)

# Add column Sentiment to reviews Dataframe
ds['Sentiment'] = meanByReview[ds['user']].values

In [24]:
bins = pd.IntervalIndex.from_tuples([(-1.1, -0.05), (-0.05, 0.05), (0.05, 1)], closed='right')
x = pd.cut(ds['Sentiment'].to_list(), bins)
x = x.set_categories(['Negative','Neutral','Positive'])
ds['Polarity'] = x

In [25]:
# Analysis examples:
# Mean by hotel 
ex1 = ds.groupby('device')['Sentiment'].mean().to_frame()
ex1

Unnamed: 0_level_0,Sentiment
device,Unnamed: 1_level_1
Apple - iPhone 14 128GB - Midnight (AT&T),0.013088
Apple - iPhone 14 128GB - Midnight (Verizon),0.015183
Apple - iPhone 14 128GB - Purple (T-Mobile),0.014373
Apple - iPhone 14 256GB - Midnight (AT&T),0.014143
Apple - iPhone 14 256GB - Midnight (T-Mobile),0.016579
Apple - iPhone 14 256GB - Purple (Verizon),0.01149
Apple - iPhone 14 512GB - Midnight (AT&T),0.017878
Apple - iPhone 14 512GB - Yellow (Verizon),0.016684


In [37]:
# Analysis examples:
# Mean by hotel stars and type
ex2 = ds[['device','rating','Sentiment']].groupby(['device','rating'], as_index=False).mean()
ex2

Unnamed: 0,device,rating,Sentiment
0,Apple - iPhone 14 128GB - Midnight (AT&T),1.0,0.0074
1,Apple - iPhone 14 128GB - Midnight (AT&T),2.0,0.022272
2,Apple - iPhone 14 128GB - Midnight (AT&T),3.0,0.01119
3,Apple - iPhone 14 128GB - Midnight (AT&T),4.0,0.011075
4,Apple - iPhone 14 128GB - Midnight (AT&T),5.0,0.013437
5,Apple - iPhone 14 128GB - Midnight (Verizon),1.0,0.014204
6,Apple - iPhone 14 128GB - Midnight (Verizon),2.0,0.011615
7,Apple - iPhone 14 128GB - Midnight (Verizon),3.0,0.014149
8,Apple - iPhone 14 128GB - Midnight (Verizon),4.0,0.012105
9,Apple - iPhone 14 128GB - Midnight (Verizon),5.0,0.015766
