In [13]:
import pandas as pd
import numpy as np
import hazm
from tabulate import tabulate

# Read Persian NRC .v1

In this section we read the latest version of the NRC Emotion Intensity Lexicon and create a column for each emotion.

In [19]:
# Load the NRC Emotion Intensity Lexicon (tab-separated text file).
NRC_file = 'NRC-Emotion-Intensity-Lexicon-v1.txt'
NRC_df = pd.read_csv(NRC_file, sep='\t')
NRC_df

Unnamed: 0,word,emotion,emotion-intensity-score
0,outraged,anger,0.964
1,brutality,anger,0.959
2,hatred,anger,0.953
3,hateful,anger,0.940
4,terrorize,anger,0.939
...,...,...,...
9894,fugitive,trust,0.141
9895,divorce,trust,0.133
9896,mistakes,trust,0.133
9897,bait,trust,0.133


In [20]:
# Distinct emotion labels present in the lexicon
# (expected: anger, anticipation, disgust, fear, joy, sadness, surprise, trust).
emotions = NRC_df['emotion'].unique()

# Create one column per emotion: a row keeps its intensity score in the column
# of its own emotion and gets 0 in every other emotion column.
# Series.where does this in one vectorized pass per emotion instead of calling
# a Python lambda on every row.
intensity = NRC_df['emotion-intensity-score']
for emotion in emotions:
    NRC_df[emotion] = intensity.where(NRC_df['emotion'] == emotion, 0)

# NOTE(review): the frame still has one row per (word, emotion) pair, so a word
# that carries several emotions spans several rows, each with a single non-zero
# emotion column. Any later per-word mean will average in those zeros --
# confirm this is intended, or collapse to one row per word first.

# Drop the original long-format columns; they are now encoded in the emotion columns.
NRC_df = NRC_df.drop(labels=['emotion-intensity-score', 'emotion'], axis=1)

NRC_df


Unnamed: 0,word,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
0,outraged,0.964,0.0,0.0,0.0,0.0,0.0,0.0,0.000
1,brutality,0.959,0.0,0.0,0.0,0.0,0.0,0.0,0.000
2,hatred,0.953,0.0,0.0,0.0,0.0,0.0,0.0,0.000
3,hateful,0.940,0.0,0.0,0.0,0.0,0.0,0.0,0.000
4,terrorize,0.939,0.0,0.0,0.0,0.0,0.0,0.0,0.000
...,...,...,...,...,...,...,...,...,...
9894,fugitive,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.141
9895,divorce,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.133
9896,mistakes,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.133
9897,bait,0.000,0.0,0.0,0.0,0.0,0.0,0.0,0.133


# Read SentiWordNet

SentiWordNet v3.0.0 (1 June 2010)

Andrea Esuli

SentiWordNet is distributed under the Attribution-ShareAlike 4.0 Unported (CC BY-SA 4.0) license.

http://creativecommons.org/licenses/by-sa/4.0/

 For any information about SentiWordNet

 Web: http://sentiwordnet.isti.cnr.it

 SentiWordNet v3.0 is based on WordNet version 3.0.

 WordNet website: http://wordnet.princeton.edu/

 The pair (POS,ID) uniquely identifies a WordNet (3.0) synset.
 The values PosScore and NegScore are the positivity and negativity
 score assigned by SentiWordNet to the synset.
 
 The objectivity score can be calculated as:
 
 ObjScore = 1 - (PosScore + NegScore)

 SynsetTerms column reports the terms, with sense number, belonging
 to the synset (separated by spaces).

In [6]:
# Load SentiWordNet. Every column is read as a string so that the synset ID
# keeps its textual form; it is concatenated with POS into a join key later.
SentiWordNet_file = 'SentiWordNet_3.0.0.txt'
princetoneSWN_df = pd.read_csv(SentiWordNet_file, sep='\t', dtype=str)

# Only the two score columns need to be numeric.
# (The former `princetoneSWN_df['ID'].astype('object')` statement discarded its
# result and therefore did nothing; it has been removed.)
princetoneSWN_df = princetoneSWN_df.astype({'PosScore': 'float', 'NegScore': 'float'})

# ObjScore = 1 - (PosScore + NegScore), as defined in the SentiWordNet notes above.
princetoneSWN_df['ObjScore'] = 1 - (princetoneSWN_df['PosScore'] + princetoneSWN_df['NegScore'])
princetoneSWN_df.head()

Unnamed: 0,POS,ID,PosScore,NegScore,SynsetTerms,Gloss,ObjScore
0,a,1740,0.125,0.0,able#1,(usually followed by `to') having the necessar...,0.875
1,a,2098,0.0,0.75,unable#1,(usually followed by `to') not having the nece...,0.25
2,a,2312,0.0,0.0,dorsal#2 abaxial#1,facing away from the axis of an organ or organ...,1.0
3,a,2527,0.0,0.0,ventral#2 adaxial#1,nearest to or facing toward the axis of an org...,1.0
4,a,2730,0.0,0.0,acroscopic#1,facing or on the side toward the apex,1.0


In [7]:
# drop NaN values
# Remove every row that has at least one missing value, modifying the frame in place.
princetoneSWN_df.dropna(axis=0, how='any', inplace=True)

# Sanity check: the per-column count of missing values should now be 0.
princetoneSWN_df.isnull().sum()

POS            0
ID             0
PosScore       0
NegScore       0
SynsetTerms    0
Gloss          0
ObjScore       0
dtype: int64

# Read Persian-SentiWordNet
This file comes from the Persian-Sentiment-Resources project (https://github.com/Text-Mining/Persian-Sentiment-Resources)
and was downloaded from their GitHub repository.

In [9]:
# The PersianSWN file is read without a header row, so the column names are
# supplied here.
persian_swn_columns = ['synset_id', 'word', 'confidence_value', 'pos_val', 'neg_val']

persianSWN_df = pd.read_csv('PersianSWN.csv', sep='\t', header=None)
persianSWN_df.columns = persian_swn_columns
persianSWN_df

Unnamed: 0,synset_id,word,confidence_value,pos_val,neg_val
0,00001740-a,توانا,1.00,0.125,0.000
1,00051373-a,توانا,0.45,0.375,0.250
2,00306314-a,توانا,1.00,0.125,0.000
3,00306663-a,توانا,0.53,0.125,0.000
4,00308015-a,توانا,0.53,0.000,0.000
...,...,...,...,...,...
259500,02768702-v,سایه انداختن,0.91,0.000,0.000
259501,02771020-v,ابر کردن,0.24,0.000,0.250
259502,02771756-v,عملیات جنگی,0.24,0.000,0.000
259503,02771888-v,مه تا,0.60,0.000,0.125


# join PersianSWN with PrincetoneSWN  

In this section we obtain, for each synset, the tuples (SynsetTerms, Persian word, confidence value).

In [11]:
# create 'synset_id' column for PrincetoneSWN
# Build the join key "<ID>-<POS>" on the Princeton side so that it has the same
# form as PersianSWN's 'synset_id' (e.g. "00001740-a").
princetoneSWN_df['synset_id'] = princetoneSWN_df['ID'].str.cat(princetoneSWN_df['POS'], sep='-')

# Inner-join the two dataframes on 'synset_id' and keep only the columns needed
# downstream.
joined_columns = ['synset_id', 'SynsetTerms', 'word', 'confidence_value', 'pos_val', 'neg_val']
SWN_joined_df = princetoneSWN_df.merge(persianSWN_df, on='synset_id', how='inner')[joined_columns]
SWN_joined_df

Unnamed: 0,synset_id,SynsetTerms,word,confidence_value,pos_val,neg_val
0,00001740-a,able#1,توانا,1.00,0.125,0.000
1,00001740-a,able#1,قادر,0.24,0.125,0.000
2,00002098-a,unable#1,عاجز,1.00,0.000,0.750
3,00002098-a,unable#1,ناتوان,0.75,0.000,0.750
4,00002098-a,unable#1,ضعیف,0.67,0.000,0.750
...,...,...,...,...,...,...
259500,02771888-v,fog_up#1,مه تا,0.60,0.000,0.125
259501,02771997-v,coal#1 char#1,انجام دادن,0.24,0.000,0.000
259502,02771997-v,coal#1 char#1,زغال کردن,1.00,0.000,0.000
259503,02772202-v,haze#1,گرفته بودن,0.60,0.125,0.250


# Assign row to each persian word , english term

In some rows of the Princeton SWN, more than one term is assigned to the synset ID.

In this part we separate these terms and create one row for each English-Persian word pair.

In [16]:
# Expand each joined row into one record per English term of its synset.
# 'SynsetTerms' holds space-separated entries such as "coal#1 char#1"; the part
# after '#' is the sense number and is discarded.
#
# itertuples(index=False, name=None) yields plain tuples straight from the
# columns, which avoids building a pandas Series for every row as iterrows does.
source_columns = ['synset_id', 'SynsetTerms', 'word', 'confidence_value', 'pos_val', 'neg_val']
synset_terms = [
    [synset_id, term.split('#')[0], persian_word, confidence_value, pos_val, neg_val]
    for synset_id, terms, persian_word, confidence_value, pos_val, neg_val
    in SWN_joined_df[source_columns].itertuples(index=False, name=None)
    for term in terms.split(' ')
]

SWN_df = pd.DataFrame(synset_terms, columns=['synset_id', 'english_word', 'persian_word','confidence_value','pos_val', 'neg_val'])
SWN_df

Unnamed: 0,synset_id,english_word,persian_word,confidence_value,pos_val,neg_val
0,00001740-a,able,توانا,1.00,0.125,0.00
1,00001740-a,able,قادر,0.24,0.125,0.00
2,00002098-a,unable,عاجز,1.00,0.000,0.75
3,00002098-a,unable,ناتوان,0.75,0.000,0.75
4,00002098-a,unable,ضعیف,0.67,0.000,0.75
...,...,...,...,...,...,...
652141,02771997-v,char,انجام دادن,0.24,0.000,0.00
652142,02771997-v,coal,زغال کردن,1.00,0.000,0.00
652143,02771997-v,char,زغال کردن,1.00,0.000,0.00
652144,02772202-v,haze,گرفته بودن,0.60,0.125,0.25


# Join the table with NRC dataframe 
By merging the NRC table from the first section with the final SWN table created above, we can find the emotions for the Persian words that we have.

In [39]:
# Attach the NRC emotion columns to every (English term, Persian word) pair by
# matching the English term against the NRC 'word' column; unmatched rows on
# either side are discarded (inner join).
SWN_NRC_joined_df = SWN_df.merge(
    NRC_df,
    left_on='english_word',
    right_on='word',
    how='inner',
)
SWN_NRC_joined_df

Unnamed: 0,synset_id,english_word,persian_word,confidence_value,pos_val,neg_val,word,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
0,00002098-a,unable,عاجز,1.00,0.0,0.75,unable,0.0,0.000,0.0,0.0,0.0,0.359,0.000,0.000
1,00002098-a,unable,ناتوان,0.75,0.0,0.75,unable,0.0,0.000,0.0,0.0,0.0,0.359,0.000,0.000
2,00002098-a,unable,ضعیف,0.67,0.0,0.75,unable,0.0,0.000,0.0,0.0,0.0,0.359,0.000,0.000
3,00002098-a,unable,سست,0.35,0.0,0.75,unable,0.0,0.000,0.0,0.0,0.0,0.359,0.000,0.000
4,00002098-a,unable,بیحال,0.24,0.0,0.75,unable,0.0,0.000,0.0,0.0,0.0,0.359,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178361,02673965-v,excel,برجسته بودن,0.14,0.0,0.25,excel,0.0,0.000,0.0,0.0,0.0,0.000,0.000,0.609
178362,02673965-v,excel,برتری داشتن بر,0.30,0.0,0.25,excel,0.0,0.461,0.0,0.0,0.0,0.000,0.000,0.000
178363,02673965-v,excel,برتری داشتن بر,0.30,0.0,0.25,excel,0.0,0.000,0.0,0.0,0.5,0.000,0.000,0.000
178364,02673965-v,excel,برتری داشتن بر,0.30,0.0,0.25,excel,0.0,0.000,0.0,0.0,0.0,0.000,0.445,0.000


In [40]:
# drop the useless columns
# 'word' was the right-hand merge key matched against 'english_word'
# (inner join), so it is redundant.
SWN_NRC_joined_df = SWN_NRC_joined_df.drop(columns=['word'])

In [41]:
# example of rows for a persian word
# Show every joined row belonging to one sample Persian word.
SWN_NRC_joined_df.loc[SWN_NRC_joined_df['persian_word'].eq('توانا')]

Unnamed: 0,synset_id,english_word,persian_word,confidence_value,pos_val,neg_val,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
14662,02038994-a,hardy,توانا,0.26,0.500,0.250,0.000,0.0,0.0,0.000,0.250,0.0,0.0,0.000
14663,02038994-a,hardy,توانا,0.26,0.500,0.250,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.492
20781,02295098-a,classical,توانا,0.56,0.500,0.125,0.000,0.0,0.0,0.000,0.106,0.0,0.0,0.000
25060,02295098-a,definitive,توانا,0.56,0.500,0.125,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.609
27946,01830403-a,important,توانا,0.47,0.000,0.125,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54007,00032299-r,mighty,توانا,0.47,0.125,0.000,0.172,0.0,0.0,0.000,0.000,0.0,0.0,0.000
54008,00032299-r,mighty,توانا,0.47,0.125,0.000,0.000,0.0,0.0,0.203,0.000,0.0,0.0,0.000
54009,00032299-r,mighty,توانا,0.47,0.125,0.000,0.000,0.0,0.0,0.000,0.438,0.0,0.0,0.000
54010,00032299-r,mighty,توانا,0.47,0.125,0.000,0.000,0.0,0.0,0.000,0.000,0.0,0.0,0.469


# Weighted average of values for each persian word

In Persian-SWN we have a confidence value for each translation, and in NRC we have the emotion intensity score for each English word. We can therefore average the confidence-weighted scores of the English words that share the same Persian translation to obtain the emotion intensity score for each Persian word.




In [44]:
### calculate the weighted scores for sentiment and emotion values
# Scale every sentiment and emotion score by the translation confidence of its row.
# Caution: the score columns are overwritten, so running this cell a second time
# applies the confidence weight again.
score_columns = ['pos_val','neg_val','anger','anticipation','disgust','fear','joy','sadness','surprise','trust']
SWN_NRC_joined_df[score_columns] = SWN_NRC_joined_df[score_columns].mul(
    SWN_NRC_joined_df['confidence_value'], axis=0
)

SWN_NRC_joined_df[SWN_NRC_joined_df['persian_word'] == 'توانا']

Unnamed: 0,synset_id,english_word,persian_word,confidence_value,pos_val,neg_val,anger,anticipation,disgust,fear,joy,sadness,surprise,trust
14662,02038994-a,hardy,توانا,0.26,0.13000,0.06500,0.00000,0.0,0.0,0.00000,0.06500,0.0,0.0,0.00000
14663,02038994-a,hardy,توانا,0.26,0.13000,0.06500,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.12792
20781,02295098-a,classical,توانا,0.56,0.28000,0.07000,0.00000,0.0,0.0,0.00000,0.05936,0.0,0.0,0.00000
25060,02295098-a,definitive,توانا,0.56,0.28000,0.07000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.34104
27946,01830403-a,important,توانا,0.47,0.00000,0.05875,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.29751
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54007,00032299-r,mighty,توانا,0.47,0.05875,0.00000,0.08084,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000
54008,00032299-r,mighty,توانا,0.47,0.05875,0.00000,0.00000,0.0,0.0,0.09541,0.00000,0.0,0.0,0.00000
54009,00032299-r,mighty,توانا,0.47,0.05875,0.00000,0.00000,0.0,0.0,0.00000,0.20586,0.0,0.0,0.00000
54010,00032299-r,mighty,توانا,0.47,0.05875,0.00000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.22043


In [48]:
# Average each score column over all rows that share the same Persian word.
# The column order here determines the column order of the aggregated frame.
aggregated_columns = [
    'pos_val', 'neg_val',
    'anticipation', 'anger', 'disgust', 'fear',
    'joy', 'sadness', 'surprise', 'trust',
]
SWN_NRC_emotion_aggregated_df = (
    SWN_NRC_joined_df
    .groupby('persian_word')[aggregated_columns]
    .mean()
)

# Copy the group key (the Persian word) from the index into a regular column.
SWN_NRC_emotion_aggregated_df['word'] = SWN_NRC_emotion_aggregated_df.index

In [37]:
# save file
# Write the aggregated lexicon to disk. The index is omitted because the
# Persian word is also stored in the 'word' column.
lexicon_output_file = 'Persian_Emotion_Intensity_Lexicons.csv'
SWN_NRC_emotion_aggregated_df.to_csv(lexicon_output_file, index=False)

In [51]:
# test the words 
# Spot-check the aggregated scores for a handful of Persian words.
sample_words = ['خوب','بد','زشت','زیبا','مهربان','خشن']
SWN_NRC_emotion_aggregated_df.loc[SWN_NRC_emotion_aggregated_df['word'].isin(sample_words)]

Unnamed: 0_level_0,pos_val,neg_val,anticipation,anger,disgust,fear,joy,sadness,surprise,trust,word
persian_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
بد,0.02691,0.24858,0.005054,0.046322,0.057487,0.053182,0.0,0.052354,0.002038,0.0,بد
خشن,0.05213,0.154722,0.004011,0.047863,0.069231,0.026935,0.009379,0.01947,0.0,0.020109,خشن
خوب,0.17895,0.020533,0.030831,0.00011,7e-05,0.000403,0.065335,0.000268,0.014419,0.05716,خوب
زشت,0.024471,0.225506,0.00173,0.038034,0.077494,0.039411,0.000809,0.036605,0.003276,0.000546,زشت
زیبا,0.19907,0.035872,0.026642,0.0,0.0,0.000663,0.114674,0.000105,0.008379,0.040937,زیبا
مهربان,0.337629,0.067004,0.032492,0.008838,0.002294,0.007414,0.10324,0.013715,0.005429,0.09706,مهربان
