# Sentiment Analysis

## Reading the Data File

In [5]:
import pandas as pd #Importing pandas as pd
# Location of the CSV file that holds the data set
file = "C:/Users/prave/OneDrive/Documents/archive/test.csv"
# Load the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer=file)
# Preview the first 20 rows
df.head(n=20)

Unnamed: 0,test_id,description_x,description_y,same_security
0,0,semtech corp,semtech corporation,
1,1,vanguard mid cap index,vanguard midcap index - a,
2,2,spdr gold trust gold shares,spdr gold trust spdr gold shares,
3,3,vanguard total bond index adm,vanguard total bond market index,
4,4,oakmark international fund class i,oakmark international cl i,
5,5,pfizer inc div: 1.200,pfizer inc com,
6,6,spartan global ex us index fid adv cl,sptn glb xus idx adv,
7,7,vanguard total bond market idx-adm,vanguard total bond market index fund investor...,
8,8,banco latinoamericano de exportacio class e co...,banco latinoamericano come-e,
9,9,baidu inc fadr 1 adr reps 0.1 ord shs,baidu inc spons ads repr 0.10 ord cls a us0.00005,


## Exploration of the Opinion Lexicon in the NLTK Library

In [6]:
from sklearn import preprocessing #Importing preprocessing module from sklearn
import nltk #Importing Natural Language Toolkit library 
nltk.download('opinion_lexicon')#Downloading the opinion lexicon dataset from NLTK
from nltk.corpus import opinion_lexicon#Importing the opinion_lexicon corpus, which contains positive&negative opinion words
from nltk.tokenize import word_tokenize#Importing word_tokenize function, which is used for tokenizing words

# Size of the complete opinion lexicon
lexicon_size = len(opinion_lexicon.words())
print('Total number of words in opinion lexicon', lexicon_size)

# First ten entries of the positive word list
positive_examples = opinion_lexicon.positive()[:10]
print('Examples of positive words in opinion lexicon', positive_examples)

# First ten entries of the negative word list
negative_examples = opinion_lexicon.negative()[:10]
print('Examples of negative words in opinion lexicon', negative_examples)

Total number of words in opinion lexicon 6789
Examples of positive words in opinion lexicon ['a+', 'abound', 'abounds', 'abundance', 'abundant', 'accessable', 'accessible', 'acclaim', 'acclaimed', 'acclamation']
Examples of negative words in opinion lexicon ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted']


[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\prave\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


## Creation of Dictionary for Sentiment Analysis

In [7]:
# Build the word -> score lookup that is used to score each description.

# Download the 'punkt' package from NLTK
nltk.download('punkt')

# If the frame has a 'reviewText' column, rename it to 'description_y';
# a frame without that column is left unchanged.
df.rename(columns={"reviewText": "description_y"}, inplace=True)

# Score contributed by each positive / negative lexicon word
pos_score = 1
neg_score = -1

# Positive words are inserted first and negative words are merged afterwards,
# so a word listed under both polarities ends up with the negative score.
word_dict = dict.fromkeys(opinion_lexicon.positive(), pos_score)
word_dict.update(dict.fromkeys(opinion_lexicon.negative(), neg_score))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prave\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Calculating Sentiment Score with Bing Liu Lexicon

In [8]:
# Defining a function that scores a text using the sentiment dictionary
def bing_liu_score(Modules):
    """Return the lexicon-based sentiment score of a piece of text.

    The text is lower-cased and split into tokens with ``word_tokenize``.
    Every token found in the module-level ``word_dict`` contributes its
    stored score; tokens missing from the dictionary contribute nothing.

    Parameters
    ----------
    Modules : str
        The text to score.

    Returns
    -------
    The sum of the dictionary scores of all matched tokens; 0 when no
    token matches.
    """
    tokens = word_tokenize(Modules.lower())
    # Unknown tokens fall back to 0, so only lexicon words move the total.
    return sum(word_dict.get(token, 0) for token in tokens)

In [9]:
# Replace missing values in 'description_y' with a placeholder string so that
# every row can be lower-cased and tokenized by bing_liu_score.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form
# operates on an intermediate object, triggers a FutureWarning in recent
# pandas 2.x releases and no longer updates `df` under Copy-on-Write.
df['description_y'] = df['description_y'].fillna('no review')
# Create the 'Bing_Liu_Score' column by applying bing_liu_score to every
# value of 'description_y'.
df['Bing_Liu_Score'] = df['description_y'].apply(bing_liu_score)

In [10]:
# Show the description next to its sentiment score for the first 5 rows
df.loc[:, ["description_y", "Bing_Liu_Score"]].head(n=5)

Unnamed: 0,description_y,Bing_Liu_Score
0,semtech corporation,0
1,vanguard midcap index - a,0
2,spdr gold trust spdr gold shares,3
3,vanguard total bond market index,0
4,oakmark international cl i,0


## Calculating Mean Sentiment Score

In [11]:
# Average Bing Liu score for each distinct description
df.groupby('description_y')[['Bing_Liu_Score']].mean()

Unnamed: 0_level_0,Bing_Liu_Score
description_y,Unnamed: 1_level_1
accenture plc ireland,0.0
adobe systems incorporated,0.0
advanced micro devices inc,1.0
af europac growth r6,0.0
afs international growth & income fund cl f1,0.0
...,...
xinyuan real estate co ltd spon adr,0.0
yamana gold inc com,1.0
yandex n.v. com usd0.01 cl a,0.0
zimmer biomet holdings inc com,0.0
