In [None]:
import pandas as pd
from operator import itemgetter
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
import html2text
import numpy as np
import re
import nltk

from spacy.lang.en import STOP_WORDS
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords

In [None]:
#Long stop-word list: the union of three different Python stop-word libraries
#(NLTK, scikit-learn, spaCy) plus the token "http".
#NOTE(review): recent scikit-learn releases no longer ship the
#`sklearn.feature_extraction.stop_words` module (ENGLISH_STOP_WORDS is exposed by
#`sklearn.feature_extraction.text`) -- confirm against the installed version.
stops = list(
    set(stopwords.words('english'))
    | set(ENGLISH_STOP_WORDS)
    | set(STOP_WORDS)
    | {"http"}
)

In [None]:
#This block loads the lexicon and builds the "word#emotion" -> intensity lookup

fileEmotion = "emotion_itensity.txt"
# Tab-separated file read without a header row; columns are named here.
# NOTE(review): read_csv turns tokens such as "null"/"NA" into NaN by default; if the
# lexicon contains such a word the string concatenation below raises TypeError -- confirm.
table = pd.read_csv(fileEmotion,  names=["word", "emotion", "itensity"], sep='\t')
#create the dictionary with the word/emotion/score
emotion_dic = dict()
lmtzr = WordNetLemmatizer()

# Pass 1: register every entry exactly as it is given in the lexicon.
for word, emotion, itensity in zip(table["word"], table["emotion"], table["itensity"]):
    emotion_dic[word + '#' + emotion] = itensity

# Pass 2: add the noun-lemma and verb-lemma spellings as aliases.
# setdefault() keeps an alias from overwriting an exact lexicon entry (or an earlier
# alias), so the score of a base word no longer depends on the row order of the file.
for word, emotion, itensity in zip(table["word"], table["emotion"], table["itensity"]):
    for lemma in (lmtzr.lemmatize(word), lmtzr.lemmatize(word, 'v')):
        emotion_dic.setdefault(lemma + '#' + emotion, itensity)

In [None]:
#create h, the shared html2text converter used to clean descriptions in case they
#are in html format (other cells call h.handle(<raw text>) on each description)
h = html2text.HTML2Text()
# html2text option: do not emit link markup for anchors in the converted text
h.ignore_links = True

In [None]:
#function that gets the emotion intensity of a word
def getEmotionItensity(word,emotion):
    """Return the lexicon intensity stored for ``word`` under ``emotion``.

    The lookup key is ``word + "#" + emotion`` in the module-level ``emotion_dic``.

    Returns:
        The stored intensity, or ``0.0`` when the pair is not in the lexicon.

    Raises:
        NameError: if ``emotion_dic`` has not been built yet. This used to be
            swallowed by a bare ``except`` and silently reported as ``0.0``.
    """
    # dict.get covers the only expected failure (missing key) without hiding others.
    return emotion_dic.get(word + "#" + emotion, 0.0)

In [None]:
#Check if the word is in the Lexicon
def isWordInEmotionFile(word):
    """Return True when at least one ``emotion_dic`` key starts with ``word + "#"``.

    In other words: the word has an entry for some emotion in the lexicon.
    """
    prefix = word + "#"
    # any() stops at the first hit instead of materialising every matching key.
    return any(key.startswith(prefix) for key in emotion_dic)

In [None]:
#Stop-word checker
def isStopWord(word):
    """Tell whether ``word`` appears in the module-level ``stops`` collection."""
    return word in stops

In [None]:
#Accumulate the emotion intensities of one word into the running totals
def calculateEmotion(emotions, word):
    """Add the intensity of ``word`` for each of the eight emotions to ``emotions``.

    ``emotions`` is updated in place; each capitalised key receives the value that
    ``getEmotionItensity`` reports for the matching lower-case lexicon label.
    """
    key_to_label = (
        ("Anger", "anger"),
        ("Anticipation", "anticipation"),
        ("Disgust", "disgust"),
        ("Fear", "fear"),
        ("Joy", "joy"),
        ("Sadness", "sadness"),
        ("Surprise", "surprise"),
        ("Trust", "trust"),
    )
    for key, label in key_to_label:
        emotions[key] += getEmotionItensity(word, label)

In [None]:
#get the emotion vector of a given text
def _lexiconForms(word):
    """Yield the spellings to try against the lexicon: as-is, noun lemma, verb lemma."""
    # A generator keeps the lemmatizer calls lazy: a form is only computed when
    # every earlier form has already missed.
    yield word
    yield lmtzr.lemmatize(word)
    yield lmtzr.lemmatize(word, 'v')


def getEmotionVector(text):
    """Compute the normalised emotion vector of ``text``.

    The text is reduced to lower-case alphabetic words. Stop words are skipped.
    Every remaining word is looked up in the lexicon (surface form first, then
    noun lemma, then verb lemma); the first form found has its intensities added
    via ``calculateEmotion``. Words found in no form add 1 to ``"Objective"``.

    Args:
        text: the string to analyse.

    Returns:
        A dict with the keys Anger, Anticipation, Disgust, Fear, Joy, Sadness,
        Surprise, Trust and Objective. Values are divided by their sum; when the
        sum is zero (e.g. empty text) every value is the integer ``0``.
    """
    emotions = {"Anger": 0.0,
                "Anticipation": 0.0,
                "Disgust": 0.0,
                "Fear": 0.0,
                "Joy": 0.0,
                "Sadness": 0.0,
                "Surprise": 0.0,
                "Trust": 0.0,
                "Objective": 0.0}

    # One pass is enough: every run of non-letters becomes a single space.
    words = re.sub("[^a-zA-Z]+", " ", text).lower().split()

    for word in words:
        if isStopWord(word):
            continue
        match = next(
            (form for form in _lexiconForms(word) if isWordInEmotionFile(form)),
            None,
        )
        if match is None:
            emotions["Objective"] += 1
        else:
            calculateEmotion(emotions, match)

    total = sum(emotions.values())
    if total == 0:
        # Nothing to normalise by; integer 0 is kept so str() of the result is
        # the same as what this function has always produced for this case.
        return dict.fromkeys(emotions, 0)
    scale = 1.0 / total
    return {name: scale * value for name, value in emotions.items()}

In [None]:
#This block just tests the functions above on a single description file
import matplotlib.pyplot as plt
from pylab import rcParams


def _plotEmotionVector(vector, title):
    """Show a horizontal bar chart with one bar per key of ``vector``."""
    positions = range(len(vector))
    plt.barh(positions, list(vector.values()), align='center')
    plt.yticks(positions, list(vector.keys()))
    plt.title(title)
    plt.show()


#open description file; `with` closes the handle even if read() raises
with open("description.txt","r") as file:
    str_f = file.read()
formatedDescription = h.handle(str_f)
results = getEmotionVector(formatedDescription)

print(results)

_plotEmotionVector(results, 'Emotion Itensity with Objective')

# Second chart: the same vector without the "Objective" share.
del results['Objective']
_plotEmotionVector(results, 'Emotion Itensity')



In [None]:
#This block runs over all the books in the CSV file and stores the stringified
#emotion vector of each description in the "emotion_NRC_objective" column
#(the column is created if the input file does not have it yet)
csv_file = 'complete_pl.csv'
books = pd.read_csv(csv_file)


def _describeEmotions(description):
    """Return ``str(getEmotionVector(...))`` for one raw, possibly HTML, description."""
    # Empty CSV cells are read back as NaN; score them as empty text instead of
    # letting the HTML converter fail on a float.
    text = "" if pd.isna(description) else str(description)
    return str(getEmotionVector(h.handle(text)))


# Assign the whole column at once: avoids per-row .loc writes and avoids putting
# strings into a column that read_csv may have typed as float.
books['emotion_NRC_objective'] = [
    _describeEmotions(description) for description in books['description']
]
#output file
books.to_csv('complete_pl_output.csv')