# LEXICON SCORING ALGORITHM - 2/7/24

## Importing all necessary libraries

In [31]:
# Import all the necessary Libraries
import pandas as pd

## Reading the data

In [32]:
# Load the books dataset from CSV and preview its first ten rows.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
books_csv_path = "C:/Users/LENOVO/OneDrive/Desktop/TextBasedAnalysis/books_data.csv"
df = pd.read_csv(books_csv_path)
df.head(10)

Unnamed: 0.1,Unnamed: 0,Books,Authors,Language,First_Published,Sales_in_millions
0,0,A Tale of Two Cities,Charles Dickens,English,1859,200.0
1,1,The Little Prince (Le Petit Prince),Antoine de Saint-Exupéry,French,1943,200.0
2,2,Harry Potter and the Philosopher's Stone,J. K. Rowling,English,1997,120.0
3,3,And Then There Were None,Agatha Christie,English,1939,100.0
4,4,Dream of the Red Chamber (紅樓夢),Cao Xueqin,Chinese,1791,100.0
5,5,The Hobbit,J. R. R. Tolkien,English,1937,100.0
6,6,She: A History of Adventure,H. Rider Haggard,English,1887,83.0
7,7,The Da Vinci Code,Dan Brown,English,2003,80.0
8,8,Harry Potter and the Chamber of Secrets,J. K. Rowling,English,1998,77.0
9,9,Harry Potter and the Prisoner of Azkaban,J. K. Rowling,English,1999,65.0


## Exploring Opinion Lexicon in NLTK Library

In [34]:
# Import necessary Libraries
from sklearn import preprocessing
import nltk
nltk.download('opinion_lexicon')
# Importing the opinion_lexicon corpus, which contains positive&negative opinion words
from nltk.corpus import opinion_lexicon
# Importing word_tokenize function, which is used for tokenizing words
from nltk.tokenize import word_tokenize

# Summarise the opinion lexicon: its total size, then the first ten
# positive and the first ten negative entries.
lexicon_size = len(opinion_lexicon.words())
positive_sample = opinion_lexicon.positive()[:10]
negative_sample = opinion_lexicon.negative()[:10]

print('Total number of words in opinion lexicon', lexicon_size)
print('Examples of positive words in opinion lexicon', positive_sample)
print('Examples of negative words in opinion lexicon', negative_sample)

Total number of words in opinion lexicon 6789
Examples of positive words in opinion lexicon ['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation']
Examples of negative words in opinion lexicon ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted']


[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


## Creation of Dictionary for Sentiment Analysis

In [35]:
# Build the word -> score lookup that bing_liu_score uses to score text.
# The 'punkt' tokenizer data is downloaded here, presumably for the
# word_tokenize call made when scoring.
nltk.download('punkt')
pos_score = 1
neg_score = -1

# Positive words are inserted first and negative words second, so a word that
# appears in both lists ends up with the negative score.
word_dict = dict.fromkeys(opinion_lexicon.positive(), pos_score)
word_dict.update(dict.fromkeys(opinion_lexicon.negative(), neg_score))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Calculating Sentiment Score with Bing Liu Lexicon

In [36]:
# Scoring function: sums the lexicon scores of the words found in a text
def bing_liu_score(text, lexicon=None, tokenizer=None):
    """Return the summed lexicon score of the words in ``text``.

    The text is lowercased, split into tokens, and every token present in the
    lexicon contributes its score; tokens not in the lexicon contribute 0.

    Parameters
    ----------
    text : str
        Text to score. Non-string values (e.g. ``None`` or a NaN float from a
        DataFrame column with missing values) are treated as having no words.
    lexicon : mapping of str -> number, optional
        Word-to-score lookup. Defaults to the module-level ``word_dict``.
    tokenizer : callable, optional
        Function that takes a string and returns an iterable of tokens.
        Defaults to ``word_tokenize``.

    Returns
    -------
    int
        Sum of the scores of all matching tokens; 0 when nothing matches or
        when ``text`` is not a string.
    """
    # Missing or non-string input has no ``.lower()`` and no words to score.
    if not isinstance(text, str):
        return 0

    # Resolve defaults at call time so the globals are only needed when used.
    if lexicon is None:
        lexicon = word_dict
    if tokenizer is None:
        tokenizer = word_tokenize

    # Lowercase before tokenizing so lookups match the lowercase lexicon keys.
    return sum(lexicon.get(token, 0) for token in tokenizer(text.lower()))

In [40]:
# Replace missing titles in the 'Books' column with a placeholder string.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: the chained in-place form operates on a temporary object
# under pandas Copy-on-Write and would leave df unchanged.
df['Books'] = df['Books'].fillna('no review')
# Score every book title and store the result in the new 'Bing_Liu_Score' column
df['Bing_Liu_Score'] = df['Books'].apply(bing_liu_score)

In [41]:
# Preview the title and its lexicon score for the first ten rows
score_preview_columns = ["Books", 'Bing_Liu_Score']
df[score_preview_columns].head(10)

Unnamed: 0,Books,Bing_Liu_Score
0,A Tale of Two Cities,0
1,The Little Prince (Le Petit Prince),0
2,Harry Potter and the Philosopher's Stone,0
3,And Then There Were None,0
4,Dream of the Red Chamber (紅樓夢),0
5,The Hobbit,0
6,She: A History of Adventure,0
7,The Da Vinci Code,0
8,Harry Potter and the Chamber of Secrets,0
9,Harry Potter and the Prisoner of Azkaban,-1


## Calculating mean sentiment score

In [43]:
# Group the rows by book title and take the mean Bing Liu score within each group
df.groupby('Books')[['Bing_Liu_Score']].mean()

Unnamed: 0_level_0,Bing_Liu_Score
Books,Unnamed: 1_level_1
A Brief History of Time,0.0
A Child's First Library Of Learning,0.0
A Message to Garcia,0.0
A Series of Unfortunate Events,-1.0
A Song of Ice and Fire,0.0
...,...
竜馬がゆく (Ryoma ga Yuku),0.0
连环画 铁道游击队 (Picture-and-story book Railway Guerilla),0.0
銀河英雄伝説 (Legend of the Galactic Heroes),0.0
青春の門 (The Gate of Youth),0.0
