# Loading and Preprocessing Data

In [1]:
import pandas as pd

# Location of the reviews file, kept in one named constant so it can be
# changed in a single place when the notebook runs on another machine.
DATA_PATH = "C:\\Users\\tejas\\Downloads\\Arts_Crafts_and_Sewing_5.json"
# Fixed seed so the preview below shows the same rows on every
# Restart Kernel -> Run All instead of a different sample each time.
SAMPLE_SEED = 42

# lines=True: read the file as one JSON object per line.
df = pd.read_json(DATA_PATH, lines=True)
# Preview 10 randomly chosen rows (reproducible thanks to the seed).
df.sample(10, random_state=SAMPLE_SEED)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
152735,5,False,"11 9, 2014",A8VWKNNK7KBCN,B001LQZOLW,,Pamela Berube,"Great quality, easy to work with, melts down e...",Five Stars,1415491200,,
441464,5,True,"04 10, 2016",A18KHKRH2RT2SK,B004KYQ3BC,,Debbie,Love it,Five Stars,1460246400,,
221266,4,True,"08 31, 2014",A5QT5T1XK2GAB,B004YIEQHC,{'Size:': ' 10 Needles'},Gina Darling &amp;amp; Fila Service Crew,H aven't needed to change needles but through ...,Does what its supposed to...,1409443200,,
385614,5,True,"10 31, 2017",A3G4WY29D96VN8,B013XHWJHI,{'Size:': ' 2 Pack'},Virginia Gal,Love these for the price. Others were too expe...,Five Stars,1509408000,,
426715,5,True,"06 8, 2017",A17YDFAA7KA043,B001682BTS,,constance manista,Great,Five Stars,1496880000,,
322010,5,True,"05 25, 2015",A2SGYP5YN66TXO,B00HJTOV3C,,Donna E Reese,Great addition to my quilling supplies.,Five Stars,1432512000,,
157972,5,True,"09 15, 2017",A2QR500LKU56BR,B001R4T4VE,,Sherlock Holmes Fan,"Great product! So light, and compact, love them!!",Five Stars,1505433600,,
320710,5,True,"11 23, 2016",A58KVCCLMANJL,B00H1NTVZE,{'Color:': ' 6mm'},Lucy,Nice beads from China,Five Stars,1479859200,,[https://images-na.ssl-images-amazon.com/image...
124037,3,True,"05 8, 2010",A39NEMK0UIVUI4,B0016N6CMU,"{'Size:': ' 1.75 lb', 'Color:': ' White'}",James D. Atchley,It is very easy to manipulate from the start. ...,Easy to use,1273276800,57.0,
161746,5,True,"01 31, 2017",A11GNGD7MD4Y74,B001UCDQJ4,,J&amp;S B,I like the clear bobbins because they show the...,Very Nice!!,1485820800,,


# Sentiment Analysis Setup

# Get the Bing Liu lexicon

In [2]:
# All imports for the sentiment-analysis setup, grouped in one place.
import nltk
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize
from sklearn import preprocessing

# Fetch the opinion lexicon data so that opinion_lexicon can be read
# by the cells below.
nltk.download('opinion_lexicon')

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\opinion_lexicon.zip.


In [3]:
# Gather the figures first, then report them.
lexicon_size = len(opinion_lexicon.words())
positive_examples = opinion_lexicon.positive()[:10]
negative_examples = opinion_lexicon.negative()[:10]

# Overall size of the lexicon, followed by the first ten entries of the
# positive and negative word lists.
print('Total number of words in opinion lexicon', lexicon_size)
print('Examples of positive words in opinion lexicon', positive_examples)
print('Examples of negative words in opinion lexicon', negative_examples)

Total number of words in opinion lexicon 6789
Examples of positive words in opinion lexicon ['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation']
Examples of negative words in opinion lexicon ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted']


# Applying Sentiment Analysis

In [4]:
# Set up everything needed to score review text against the lexicon.

# Tokenizer data required by word_tokenize. Older NLTK releases load the
# 'punkt' package, while recent releases look for 'punkt_tab' instead and
# raise LookupError without it, so fetch both to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Rename 'reviewText' to 'text'. The result is rebound to df rather than
# mutated with inplace=True, so the statement reads as a plain assignment
# and is safe to re-run (a missing 'reviewText' column is simply ignored).
df = df.rename(columns={"reviewText": "text"})

# Score contributed by each positive / negative lexicon word.
pos_score = 1
neg_score = -1

# word -> score lookup; filled from the lexicon by the following cells.
word_dict = {}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tejas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


# Iterating Through Positive Words in Opinion Lexicon

In [6]:
# Register every positive lexicon word under the positive score.
word_dict.update(dict.fromkeys(opinion_lexicon.positive(), pos_score))

# Iterating Through Negative Words in Opinion Lexicon

In [7]:
# Register every negative lexicon word under the negative score.
# A word that is already present in word_dict is overwritten.
word_dict.update({term: neg_score for term in opinion_lexicon.negative()})

# Function to Calculate Sentiment Score using Bing Liu's Lexicon

In [8]:
def bing_liu_score(text):
    """Return the net lexicon score of ``text`` using Bing Liu's lexicon.

    The text is lower-cased and split into tokens with ``word_tokenize``.
    Each token found in ``word_dict`` contributes its stored score; tokens
    that are not in the dictionary contribute nothing.

    Parameters
    ----------
    text : str
        The review text to score.

    Returns
    -------
    int
        Sum of the scores of all matching tokens (0 when none match).
    """
    tokens = word_tokenize(text.lower())
    # Unknown tokens fall back to 0, so they leave the total unchanged.
    return sum(word_dict.get(token, 0) for token in tokens)

# Data Preprocessing and Sentiment Analysis

In [9]:
# Replace missing review text with the placeholder 'no review' so that
# bing_liu_score always receives a string.
# The result is assigned back to the column: calling
# fillna(..., inplace=True) on df['text'] mutates an intermediate Series,
# which triggers a chained-assignment warning on pandas 2.x and leaves df
# unchanged once Copy-on-Write is enabled.
df['text'] = df['text'].fillna('no review')

# Score every review and store the result in a new 'Bing_Liu_Score' column.
df['Bing_Liu_Score'] = df['text'].apply(bing_liu_score)

# Displaying Selected Columns for Analysis

In [10]:
# Show the star rating, review text and lexicon score side by side
# for the first 10 rows.
preview_columns = ['overall', 'text', 'Bing_Liu_Score']
df[preview_columns].head(10)

Unnamed: 0,overall,text,Bing_Liu_Score
0,4,Contains some interesting stitches.,1
1,5,I'm a fairly experienced knitter of the one-co...,22
2,4,Great book but the index is terrible. Had to w...,0
3,5,I purchased the Kindle edition which is incred...,4
4,5,Very well laid out and very easy to read.\n\nT...,5
5,5,"Beginning her career as a freelance knitter, M...",15
6,5,This is a terrific stitch handbook (and I have...,9
7,4,The book needs to be coil bound. The content i...,1
8,5,I really am enjoying this book! I like the siz...,12
9,5,Just received this book and looked over it cov...,6


# Grouping and Aggregating Sentiment Scores

In [11]:
# Mean 'Bing_Liu_Score' for each value of 'overall'.
# Selecting with a list keeps the result a one-column DataFrame.
df.groupby('overall')[['Bing_Liu_Score']].mean()

Unnamed: 0_level_0,Bing_Liu_Score
overall,Unnamed: 1_level_1
1,-0.255049
2,0.566098
3,1.158796
4,2.028146
5,2.130005
