Sentiment Reddit by Abrar

In [1]:
# Import packages
import csv
import pandas as pd
import numpy as np
import nltk 
import re
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 

In [2]:
# Load dataset
dtypes = {'Id':'category','type':'category','subreddit':'category','score':'float64','title':'category','reviews':'category','author':'category'}
ds= pd.read_excel("reddit_FINAL_v12.xlsx", sheet_name="Sheet1", engine='openpyxl', dtype=dtypes)


In [3]:
# Check first rows
ds.head()

Unnamed: 0,Id,type,subreddit,title,reviews,author,date_published,score
0,1,post,iphone,Iphone 14 leather cases like andar brand but i...,Are there any? I don't wanna wait a month to r...,New-Analysis8054,2023-04-29 07:22:04,1.0
1,2,comment,iphone,Iphone 14 leather cases like andar brand but i...,Mujjo and Solo Pelle both make great leather c...,ShortOnCoffee,2023-04-29 07:52:29,1.0
2,3,comment,iphone,Iphone 14 leather cases like andar brand but i...,I use Mujjo. Another brand is Decoded.,uwGrootsheid,2023-04-29 10:29:29,1.0
3,4,post,iphone,Why is my screen time incorrect?,Barely used my phone today but it shows 3 hours.,tyler_ness,2023-04-28 21:07:35,1.0
4,5,comment,iphone,Why is my screen time incorrect?,It shows your usage from your other devices li...,lovekorra,2023-04-29 05:45:35,1.0


In [4]:
# Text preprocessing
def textPreProcess(rawText, removeHTML=True, charsToRemove = r'\?|\.|\!|\;|\.|\"|\,|\(|\)|\&|\:|\-', removeNumbers=True, removeLineBreaks=False, specialCharsToRemove = r'[^\x00-\xfd]', convertToLower=True, removeConsecutiveSpaces=True):
    if type(rawText) != str:
        return rawText
    procText = rawText
        
    # Remove HTML
    if removeHTML:
        procText = BeautifulSoup(procText,'html.parser').get_text()

    # Remove punctuation and other special characters
    if len(charsToRemove)>0:
        procText = re.sub(charsToRemove,' ',procText)

    # Remove numbers
    if removeNumbers:
        procText = re.sub(r'\d+',' ',procText)

    # Remove line breaks
    if removeLineBreaks:
        procText = procText.replace('\n',' ').replace('\r', '')

    # Remove special characters
    if len(specialCharsToRemove)>0:
        procText = re.sub(specialCharsToRemove,' ',procText)

    # Normalize to lower case
    if convertToLower:
        procText = procText.lower() 

    # Replace multiple consecutive spaces with just one space
    if removeConsecutiveSpaces:
        procText = re.sub(' +', ' ', procText)

    return procText

In [5]:
# Tokenize words
def tokenize_words(words):
    if (type(words) != str) or (word_tokenize(words) == ''):
        return np.nan
    else:
        return word_tokenize(words)

In [6]:
# Function to create text from words
def recreateText(words):
    if type(words) == list:
        temp_str = (' ').join(words)
        return temp_str
    else:
        return np.nan

In [7]:
# Function to break texts into sentences
def tokenize_sentences(texts):
    s_token = sent_tokenize(texts)
    return s_token

In [8]:
# Function to remove stop words
def removeStopWords(t, stop_words):
    if type(t) == list:
        return [w for w in t if not w in stop_words]
    else:
        return np.nan

In [9]:
## Analysis

In [10]:
def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return ". ".join(sentences)

In [11]:
# Because a review can express multiple opinions, let's analyze opinions by sentence

# Break reviews' into a list of lists sentences
listOfSentences = ds.reviews.apply(tokenize_sentences)

In [12]:
# Create a dataframe with only the description
processedReviews = pd.DataFrame(data=ds.reviews.apply(textPreProcess,charsToRemove ='', removeLineBreaks=False, removeNumbers=False).values, index=ds.index, columns=['PreProcessedText'])



In [13]:
# Check first review
ds.reviews[0]

"Are there any? I don't wanna wait a month to receive it, i live in italy if that matters"

In [14]:
listOfSentences[0]

"Are there any?. I don't wanna wait a month to receive it, i live in italy if that matters"

In [17]:
# Split sentences and count the number of sentences per review
reviews_str = ds['reviews'].astype(str)
sentences = pd.DataFrame(reviews_str.str.split('.').tolist(), index=ds['Id']).stack()
sentencesPerReview = [len(elem) for elem in reviews_str.str.split('.')]
 
# Create a new DataFrame with the sentences and their respective IDs
sentences.name = 'sentence'
sentences.index.names = ['Id', 'sentence_no']
sentences = sentences.reset_index().set_index('Id')











In [18]:
import re
import string

def text_preprocess(text):
    """
    Preprocess text data by converting to lowercase, removing punctuation and digits, and removing extra whitespaces.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Preprocess text 
sentences['PreProcessedText'] = sentences['sentence'].apply(text_preprocess)


In [19]:
# Get words
sentences['Words'] =  sentences['PreProcessedText'].apply(tokenize_words)



In [20]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asifa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
# Remove stopwords
stop_words = set(stopwords.words('english'))
sentences['WordsCleaned'] = sentences['Words'].apply(removeStopWords,stop_words=stop_words)

In [22]:
# Recreate sentence without stopwords
sentences['ProcessedText'] = sentences['WordsCleaned'].apply(recreateText)


In [23]:
# Create sentiment analysis object
analyser = SentimentIntensityAnalyzer()

In [24]:
# To test, let's evaluate first sentence of first review
# Scales:
#   compound: -1:most extreme negative, 1:most extreme positive
#     positive: compound >=0.05
#     neutral: -0.05<compound<0.05
#     negative: compound <= -0.05
#   pos, neu, neg: proportion of text that are positive, neutral or negative
score = analyser.polarity_scores(sentences['ProcessedText'][0])
print(sentences['ProcessedText'][0],score)

dont wan na wait month receive live italy matters {'neg': 0.0, 'neu': 0.879, 'pos': 0.121, 'compound': 0.0258}


In [25]:
# Process sentiment for all sentences
all_scores = []
for t in (sentences['ProcessedText'][:]):
  score = analyser.polarity_scores(t)
  all_scores.append(score)
sentences['Sentiment'] = [c['compound'] for c in all_scores]

In [26]:
# Compute review's sentiment as the mean sentiment from its sentences
meanByReview = sentences.groupby('Id')['Sentiment'].mean()

# Consider sentences with no result as neutral (0)
meanByReview = meanByReview.fillna(0)

# Add column Sentiment to reviews Dataframe
ds['Sentiment'] = meanByReview[ds['Id']].values

In [27]:
bins = pd.IntervalIndex.from_tuples([(-1.1, -0.05), (-0.05, 0.05), (0.05, 1)], closed='right')
x = pd.cut(ds['Sentiment'].to_list(), bins)
x = x.set_categories(['Negative','Neutral','Positive'])
ds['Polarity'] = x

In [28]:
# Analysis examples:
# By title
ex1 = ds.groupby('title')['Sentiment'].mean().to_frame()
ex1

Unnamed: 0_level_0,Sentiment
title,Unnamed: 1_level_1
Apple 20w charger or Anker 20w Nano,0.129564
Apple care,0.000000
"Applecare+, is it worth it?",0.034008
BEST CASE FOR IPHONE 14,0.206869
Black Bar at Top,0.133020
...,...
iPhone not turning on,0.203391
"my iphone 14 camera automatically edits my photos, i cant figure out how to stop it",0.044548
phone getting really hot,0.002800
weird group chat text issue,-0.000131


In [33]:
# Analysis examples:
#By comment type and score
ex2 = ds[['type','score','Sentiment']].groupby(['type','score'], as_index=False).mean()
ex2

Unnamed: 0,type,score,Sentiment
0,comment,-222.0,0.0258
1,comment,-82.0,0.0000
2,comment,-32.0,-0.2083
3,comment,-31.0,0.0000
4,comment,-25.0,-0.5106
...,...,...,...
137,post,260.0,
138,post,283.0,
139,post,310.0,
140,post,375.0,0.0000
