## Open the dataset

In [4]:
# pandas supplies the DataFrame used by every later cell
import pandas as pd

# Location of the reviews file.
# NOTE: this is a machine-specific absolute path; point it at your local copy of the file.
DATA_PATH = 'C:\\Users\\himav\\Downloads\\Arts_Crafts_and_Sewing_5.json'

# lines=True reads the file as one JSON object per line
df = pd.read_json(DATA_PATH, lines=True)

# Preview 10 random rows. The fixed random_state makes the preview identical on every
# run, so the displayed output can be reproduced with Restart & Run All.
df.sample(10, random_state=42)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
297029,5,True,"04 17, 2017",A6BONAW80O4P3,B00D05ZZOS,,gins moss,love the colors,nice,1492387200,,
369578,5,True,"01 21, 2017",A1B4W9762MFXYO,B00USFI9W8,{'Color:': ' Salt & Pepper'},Shirley,Great thread. Nice assortment of colors.,Great thread.,1484956800,,
276732,5,True,"10 30, 2016",A2ZGA89FE61NFO,B00AM495JG,,WWCitizen,Excellent quality. Perfect for a number of cra...,Excellent quality,1477785600,,
95043,5,False,"09 15, 2017",A1N0ZK1K8FY8WF,B000Y00DDK,,Tamarra Strauss,Great!,Five Stars,1505433600,,
352936,5,True,"06 26, 2017",A2YNF6Q9GM6DJ2,B00NU55HGE,,Dannette Herring,nice,Five Stars,1498435200,,
461085,5,True,"03 13, 2018",A1QEW4ZH8IKVXX,B00CE0PH66,"{'Size:': ' 9 SM 9 LG', 'Color:': ' White SM M...",Carol the Bookworm,These have worked well for every skein size I ...,Works extremely well,1520899200,,
480101,5,True,"05 12, 2018",A1Z5SL05N79VLK,B00P04JNRM,,Rita Brand,I love this! It holds the fabric in place whil...,Accurate strips in no time!,1526083200,,
95218,4,True,"04 22, 2017",A28EGJJAGBQOJ5,B000Y00DDK,,FiestyBerry,Great value.,Four Stars,1492819200,,
219306,5,True,"02 12, 2016",A3JDNMUI86DTTS,B004WFUH2U,"{'Size:': ' 5-Inch by 7-Inch', 'Style Name:': ...",Loretta Dillon,This is the finest watercolor paper I've ever ...,Excellent Quality,1455235200,8.0,
186782,1,True,"07 17, 2015",A3I84697ERH6KR,B002U6YFUS,,Abraham G.,martha steward punches never cut right.. alway...,frustrating!!!,1437091200,,


## Get the Bing Liu lexicon

In [5]:
# scikit-learn preprocessing utilities
from sklearn import preprocessing

# NLTK itself, plus the Bing Liu opinion lexicon reader and the word tokenizer
import nltk
from nltk.corpus import opinion_lexicon
from nltk.tokenize import word_tokenize

# Fetch the opinion lexicon data used by the reader imported above
nltk.download('opinion_lexicon')

# Size of the full lexicon, followed by the first ten entries of each polarity list
lexicon_size = len(opinion_lexicon.words())
positive_examples = opinion_lexicon.positive()[:10]
negative_examples = opinion_lexicon.negative()[:10]

print('Total number of words in opinion lexicon', lexicon_size)
print('Examples of positive words in opinion lexicon', positive_examples)
print('Examples of negative words in opinion lexicon', negative_examples)

Total number of words in opinion lexicon 6789
Examples of positive words in opinion lexicon ['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation']
Examples of negative words in opinion lexicon ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted']


[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\himav\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


## Create the "Dictionary"

In [7]:
# Build the word -> score lookup used to score the review text.

# word_tokenize needs the Punkt tokenizer data. Older NLTK releases load it from the
# 'punkt' package, while newer releases (3.9+) look for 'punkt_tab' and raise a
# LookupError without it, so fetch both to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Rename the 'reviewText' column to 'text'. Reassigning (rather than inplace=True)
# keeps the cell safe to re-run: a second run finds no 'reviewText' and changes nothing.
df = df.rename(columns={"reviewText": "text"})

# Score contributed by each positive / negative lexicon word
pos_score = 1
neg_score = -1

# Start with every positive word mapped to the positive score ...
word_dict = {word: pos_score for word in opinion_lexicon.positive()}

# ... then add the negative words. They are written last, so if a word appears in both
# lists it ends up with the negative score (same result as inserting them in this order
# one at a time).
word_dict.update({word: neg_score for word in opinion_lexicon.negative()})

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\himav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Function to compute the sentiment score

In [8]:
def bing_liu_score(text):
    """Return the lexicon-based sentiment score of a piece of text.

    The text is lower-cased and tokenized with ``word_tokenize``; every token
    found in the module-level ``word_dict`` adds that word's score to the
    total, and tokens not in the dictionary contribute nothing.

    Parameters
    ----------
    text : str or missing value
        The review text. ``None`` and float ``NaN`` (how pandas represents a
        missing string) are treated as "no text".

    Returns
    -------
    int
        Sum of the scores of all matched tokens; ``0`` when nothing matches
        or when ``text`` is missing.
    """
    # Missing values have no .lower() and would raise AttributeError; score them as
    # neutral instead. NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return 0

    tokens = word_tokenize(text.lower())
    # dict.get with a 0 default folds the membership test and the lookup into one step
    return sum(word_dict.get(token, 0) for token in tokens)

## Make sure every text is a non-empty string, then call the function

In [9]:
# Replace missing review text with the placeholder 'no review' so every row holds a string.
# The result is assigned back to the column instead of calling fillna(inplace=True) on
# df['text']: that chained in-place call acts on an intermediate object, triggers a
# FutureWarning on pandas 2.2 and leaves df unchanged once Copy-on-Write is enabled.
df['text'] = df['text'].fillna('no review')

# Score every review and store the result in a new 'Bing_Liu_Score' column
df['Bing_Liu_Score'] = df['text'].apply(bing_liu_score)

## Output

In [10]:
# Show the star rating, review text and computed score for the first 10 rows
df.head(10)[['overall', 'text', 'Bing_Liu_Score']]

Unnamed: 0,overall,text,Bing_Liu_Score
0,4,Contains some interesting stitches.,1
1,5,I'm a fairly experienced knitter of the one-co...,22
2,4,Great book but the index is terrible. Had to w...,0
3,5,I purchased the Kindle edition which is incred...,4
4,5,Very well laid out and very easy to read.\n\nT...,5
5,5,"Beginning her career as a freelance knitter, M...",15
6,5,This is a terrific stitch handbook (and I have...,9
7,4,The book needs to be coil bound. The content i...,1
8,5,I really am enjoying this book! I like the siz...,12
9,5,Just received this book and looked over it cov...,6


## Calculating the mean Bing Liu score per rating

In [11]:
# Average 'Bing_Liu_Score' within each 'overall' star rating (one row per rating)
df.groupby('overall')[['Bing_Liu_Score']].mean()

Unnamed: 0_level_0,Bing_Liu_Score
overall,Unnamed: 1_level_1
1,-0.255049
2,0.566098
3,1.158796
4,2.028146
5,2.130005
