In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score as kappa
from itertools import combinations as com

In [None]:
path= r'C:\Users\Lampros\A1\HW2'
files = os.listdir(path) # Get the files from the path

In [3]:
filenames = [f for f in files if f[-4:] == 'xlsx'] # Choose only excel files.
print(filenames)

['f3662001.xlsx', 'f3662003.xlsx', 'f3662006.xlsx', 'f3662009.xlsx', 'f3662010.xlsx', 'f3662011.xlsx', 'f3662013.xlsx', 'f3662014.xlsx', 'f3662015.xlsx', 'f3662016.xlsx', 'f3662018.xlsx', 'f3662020.xlsx', 'f3662021.xlsx', 'f3662023.xlsx']


In [4]:
os.chdir(path) # Change Directory so that it can read the excel files.
coders = pd.DataFrame() # Initiate the Data Frame
for f in filenames:
    df = pd.read_excel(f)
    df['ID'] = int(f[1:-5]) # Create the ID column with the file's names as the annotators
    if int(f[-6]) < 5:
        df['Group'] = 'G2'
    else:
        df['Group'] = 'G1'
    coders = coders.append(df) # Create the Data Frame
coders

Unnamed: 0,Id_verse,Text,Subjectivity,Sentiment,Primary Emotion,Seocondary Emotions,ID,Group,Emotion Primary,Emotion Secondary,Secondary Emotion
0,Z_376,"Ελάτε, σεις οι σκλάβες, πέστε μου την πάσα αλή...",0.0,0,,,3662001,G2,,,
1,Z_377,Που πηγεν η Αντρομάχη φεύγοντας η χιονοβραχιον...,1.0,-,suspense,,3662001,G2,,,
2,Z_378,"Μήνα σε κάποια συννυφάδα της, μη σε κουνιάδα ε...",0.0,0,,,3662001,G2,,,
3,Z_379_380,"για ανέβη με τις καλοπλέξουδες αρχόντισσες, πο...",0.0,0,,,3662001,G2,,,
4,Z_382_387,"Έχτορα, τώρα αφού με πρόσταξες να πω την πάσα ...",0.0,0,,,3662001,G2,,,
...,...,...,...,...,...,...,...,...,...,...,...
96,,,,,,,3662023,G2,,,
97,,,,,,,3662023,G2,,,
98,,Mνησικακία: η προσθήκη του συγκεκριμένου συναι...,,,,,3662023,G2,,,
99,,Νοσταλγία για την πατρίδα/ το σπίτι/ τους δικο...,,,,,3662023,G2,,,


In [5]:
# Check what unique data each wanted column has.
print('Sentiment unique data: ', coders.Sentiment.unique(), '\nSubjectivity unique data: ', coders.Subjectivity.unique(), '\nPrimary Emotion unique data:\n', coders['Primary Emotion'].unique())

Sentiment unique data:  [0 '-' 'm' '+' nan ' +' ' m' ' -' ' - ' '  -' '_' '              -'
 '             -' '     -' '       -' '          +' '         m'
 '        -' '  +' '    +' '      m' '              +' '    m' '   +'
 '   m' '  m' '    -' '               -'] 
Subjectivity unique data:  [ 0.  1. nan] 
Primary Emotion unique data:
 [nan 'suspense' 'empathy' 'fury' 'compassion' 'admiration' 'compasssion'
 'mercy' 'relief' 'empathy ' 'sadness' 'question' 'pride' 'irony'
 'distress' 'sorrow' 'worry' 'awe' 'surprise' 'pity' 'hope' 'loneliness'
 ' hope' 'grief' 'injustice' 'respect' 'pain' 'longing' 'anger'
 'willingness' 'certainty' 'fear' 'sorrow ' 'reproach' 'hate' 'love' 'joy'
 'jealousy' 'complaint' 'guilt' 'satisfaction' 'αγωνία' 'απορία' 'φόβος'
 'άγχος' 'λύπη' 'αποτροπιασμός' 'οργή' 'αγάπη' 'λαχτάρα' 'δυσφορία'
 'ντροπή' 'υπερηφάνεια' 'σιγουριά' 'απάθεια' 'καημός' 'δυστυχία'
 'απαξίωση' 'δέος' 'ενσυναίσθηση' 'επίπληξη' 'έλεος' 'παράπονο' 'ειρωνεία'
 'μετάνοια' 'πένθος' 'συμπ

In [6]:
 # Strip the column of unwanted whitespace. Zero's become NaNs but later we will turn all NaNs back to zeros.
coders.Sentiment = coders.Sentiment.str.strip()
# Replace the sentiments given with values  -1 0 1. The symbol _ is taken into account as a minus so it is converted to -1
coders.Sentiment.replace(['m', '+', '-', '_'], [1, 1, -1, -1], inplace=True)
# Check if there are NaNs in Id_verse before making it an index.
coders['Id_verse'].isnull().sum()

5

In [7]:
# Delete all the rows that contain missing values as Text.
coders = coders.loc[coders['Text'].notnull()]
# Delete all the rows that contain missing vallues as Id_verse since there is no data for these rows except for Text.
coders = coders.loc[coders['Id_verse'].notnull()]
# Drop all the unwanted columns.
coders.drop(['Seocondary Emotions', 'Emotion Primary', 'Emotion Secondary', 'Secondary Emotion'], axis=1, inplace=True)

In [8]:
# Check if there are missing values in Subjectivity and Sentiment Columns.
coders.isna().sum()

Id_verse             0
Text                 0
Subjectivity         8
Sentiment          346
Primary Emotion    412
ID                   0
Group                0
dtype: int64

In [9]:
# Fill all the Sentiment NaNs with 0. Furthermore, assuming that NaNs are equivalent to a zero for the annotator (We assume that the annotator left the cell blank instead of putting zero) we replace NaNs with 0s for Subjectivity as well. Finally, replace NaN's with neutral emotion in Primary Emotions.
coders.Subjectivity.fillna(0, inplace=True)
coders.Sentiment.fillna(0, inplace=True)
coders['Primary Emotion'].fillna('neutral', inplace=True)

In [10]:
# Check that the missing values problem is resolved in each Column.
coders.isna().sum()

Id_verse           0
Text               0
Subjectivity       0
Sentiment          0
Primary Emotion    0
ID                 0
Group              0
dtype: int64

In [11]:
coders.sample(10)

Unnamed: 0,Id_verse,Text,Subjectivity,Sentiment,Primary Emotion,ID,Group
78,Ω_602_604,Κι η Νιόβη ακόμα η καλοπλέξουδη στερνά να φάει...,0.0,0.0,neutral,3662021,G2
88,Ω_641_642,Μα τώρα και κρασί κατέβασα φλογάτο απ᾿ το λαιμ...,1.0,1.0,compassion - συμπόνια,3662018,G1
45,Ω_509_512,"Μαζί τους έπνιξαν οι θύμησες, τον έναν του αντ...",1.0,-1.0,grief,3662009,G1
51,Ω_525_526,Τέτοια οι θεοί μαθές στους άμοιρους θνητούς έκ...,1.0,1.0,αποδοχή,3662023,G2
69,Ω_563_564,"Κι εγώ το νιώθω, Πρίαμε, σίγουρα, το νου μου δ...",1.0,-1.0,reproach,3662010,G2
22,Z_450_454,Μα τόσο για των Τρωών δε νοιάζομαι τα πάθη όπο...,0.0,0.0,neutral,3662001,G2
91,Ω_656_658,"Μον᾿ έλα τώρα, δωσ᾿ μου απόκριση και πες την π...",0.0,0.0,neutral,3662003,G2
47,Ω_518,Άμοιρε εσύ και που ποτίστηκες πικρά φαρμάκια τ...,1.0,1.0,compassion,3662021,G2
21,Z_448_449,Θα ξημερώσει μέρα κάποτε πού θα χαθεί το κάστρ...,1.0,-1.0,fear,3662021,G2
7,Z_410_413,"Μ᾿ αν είναι να σε χάσω,\nν᾿ ανοίξει η γη να μπ...",1.0,-1.0,απελπισία,3662023,G2


In [12]:
# Check the unique ID's and their type.
coders.ID.unique()

array([3662001, 3662003, 3662006, 3662009, 3662010, 3662011, 3662013,
       3662014, 3662015, 3662016, 3662018, 3662020, 3662021, 3662023],
      dtype=int64)

In [13]:
# Convert the whole ID
coders.ID = coders.ID.astype('str')
# Set Group, ID and Id_verse as indexes
coders.set_index(['Group', 'ID', 'Id_verse'], inplace=True)

In [14]:
coders

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Text,Subjectivity,Sentiment,Primary Emotion
Group,ID,Id_verse,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
G2,3662001,Z_376,"Ελάτε, σεις οι σκλάβες, πέστε μου την πάσα αλή...",0.0,0.0,neutral
G2,3662001,Z_377,Που πηγεν η Αντρομάχη φεύγοντας η χιονοβραχιον...,1.0,-1.0,suspense
G2,3662001,Z_378,"Μήνα σε κάποια συννυφάδα της, μη σε κουνιάδα ε...",0.0,0.0,neutral
G2,3662001,Z_379_380,"για ανέβη με τις καλοπλέξουδες αρχόντισσες, πο...",0.0,0.0,neutral
G2,3662001,Z_382_387,"Έχτορα, τώρα αφού με πρόσταξες να πω την πάσα ...",0.0,0.0,neutral
G2,...,...,...,...,...,...
G2,3662023,Ω_656_658,"Μον᾿ έλα τώρα, δωσ᾿ μου απόκριση και πες την π...",1.0,1.0,συμπόνια
G2,3662023,Ω_660_661,Αν να με αφήσεις θες τον Έχτορα τον αντρειανό ...,1.0,1.0,αγωνία
G2,3662023,Ω_662_663,"Κλεισμένοι στο καστρί βρισκόμαστε, το ξέρεις, ...",1.0,-1.0,φόβος
G2,3662023,Ω_664_667,Μέρες εννιά μες στο παλάτι μου θα τον μοιρολογ...,1.0,-1.0,πένθος


In [15]:
coders.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Text,Subjectivity,Sentiment,Primary Emotion
Group,ID,Id_verse,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
G2,3662021,Z_388_389,Πήγε λοιπόν τρεχάτη κι έφτασε στο καστροτείχι ...,1.0,-1.0,worry
G1,3662016,Ω_556_558,"Κι έτσι να γύρεις στην πατρίδα σου γερός, αφού...",0.0,0.0,neutral
G1,3662006,Z_414_419,Τον κύρη μου ο Αχιλλέας τον σκότωσεν ο αρχοντο...,1.0,1.0,irony
G2,3662014,Ω_553_554,ακόμα ο γιος μου\nπαραριγμένος κάπου κοίτεται ...,1.0,1.0,grief
G2,3662021,Z_441_443,"Κι εγώ όλα τούτα τα στοχάζομαι, καλή μου, αλή...",1.0,1.0,pride
G1,3662018,Ω_566_567,οι φυλάκτορες θα τόνε βλέπαν κι ούτε\nτο σύρτη...,1.0,-1.0,disdain - απαξίωση
G1,3662015,Ω_529_530,Κι αν δώσει ο Δίας ο κεραυνόχαρος μαζί απ᾿ τα ...,1.0,-1.0,αδικία
G1,3662006,Ω_637_640,τι μάτι ακόμα εγώ δε σφάλιξα στα βλέφαρα από κ...,1.0,-1.0,compassion
G2,3662014,Ω_572_575,Τότε ο Αχιλλέας σα λιόντας πήδηξε κι όξω απ᾿ τ...,1.0,1.0,grief
G2,3662010,Ω_641_642,Μα τώρα και κρασί κατέβασα φλογάτο απ᾿ το λαιμ...,1.0,1.0,satisfaction


In [16]:
# Confirm that the indices were changed correctly
coders.index.names

FrozenList(['Group', 'ID', 'Id_verse'])

In [17]:
# Make lists of each available combination of annotators for each group using the combinations tool from the library "itertools".
# Reminder that we imported combinations as com, hence the com after the list.
G1comb = list(com(coders.loc['G1'].index.get_level_values(0).unique(), 2))
G2comb = list(com(coders.loc['G2'].index.get_level_values(0).unique(), 2))

# Check if the combinations are correct.
print('G1 Combinations:')
for i in range(len(G1comb)):
    print(G1comb[i][0], 'with' ,G1comb[i][1])
print('G2 Combinations:')
for i in range(len(G2comb)):
    print(G2comb[i][0], 'with' , G2comb[i][1])

G1 Combinations:
3662006 with 3662009
3662006 with 3662015
3662006 with 3662016
3662006 with 3662018
3662009 with 3662015
3662009 with 3662016
3662009 with 3662018
3662015 with 3662016
3662015 with 3662018
3662016 with 3662018
G2 Combinations:
3662001 with 3662003
3662001 with 3662010
3662001 with 3662011
3662001 with 3662013
3662001 with 3662014
3662001 with 3662020
3662001 with 3662021
3662001 with 3662023
3662003 with 3662010
3662003 with 3662011
3662003 with 3662013
3662003 with 3662014
3662003 with 3662020
3662003 with 3662021
3662003 with 3662023
3662010 with 3662011
3662010 with 3662013
3662010 with 3662014
3662010 with 3662020
3662010 with 3662021
3662010 with 3662023
3662011 with 3662013
3662011 with 3662014
3662011 with 3662020
3662011 with 3662021
3662011 with 3662023
3662013 with 3662014
3662013 with 3662020
3662013 with 3662021
3662013 with 3662023
3662014 with 3662020
3662014 with 3662021
3662014 with 3662023
3662020 with 3662021
3662020 with 3662023
3662021 with 3662023


In [18]:
# Initialize lists for each cohen kappa.
k11 = []
k12 = []
k21 = []
k22 = []
#Use for loops in order to run through the combinations of the annotators and get a kappa value for each combination.
for i in range (0, len(G1comb)):
    k11.append(kappa(coders.loc['G1', G1comb[i][0]].Sentiment, coders.loc['G1', G1comb[i][1]].Sentiment))
    k12.append(kappa(coders.loc['G1', G1comb[i][0]].Subjectivity, coders.loc['G1', G1comb[i][1]].Subjectivity))

for j in range (0, len(G2comb)):
    k21.append(kappa(coders.loc['G2', G2comb[j][0]].Sentiment, coders.loc['G2', G2comb[j][1]].Sentiment))
    k22.append(kappa(coders.loc['G2', G2comb[j][0]].Subjectivity, coders.loc['G2', G2comb[j][1]].Subjectivity))

k11 = np.array(k11)
k12 = np.array(k12)
k21 = np.array(k21)
k22 = np.array(k22)
# Compute the mean value for each kappa in the list of kappas and print it to compare the values.
print(f'G1 Sentiment Cohen kappa: {np.mean(k11):.4f}\nG1 Subjectivity Cohen kappa: {np.mean(k12):.4f}\nG2 Sentiment Cohen kappa: {np.mean(k21):.4f}\nG2 Subjectivity Cohen kappa: {np.mean(k22):.4f}')

G1 Sentiment Cohen kappa: 0.1275
G1 Subjectivity Cohen kappa: 0.0815
G2 Sentiment Cohen kappa: 0.1802
G2 Subjectivity Cohen kappa: 0.1607


In [19]:
# Again, initialize lists for each group for sentiment and subjectivity. These lists will contain the agreement for each pair in the combinations list.
G1Sen = []
G1Sub = []
G2Sen = []
G2Sub = []
# Use for loops in order to go throught the lists of combinations. For each group/sentiment/subjectivity pair, a dataframe is computed. Then using percentage agreement with logical not xor as in the data_annotations pdf, we append each agreement percentage of each pair of the combinations list into the lists initialized above.
for i in range (0, len(G1comb)):
    # First make the data frame consisting of each pair's choices. 
    annots11 = pd.DataFrame(pd.concat([coders.loc['G1', G1comb[i][0]].Sentiment, coders.loc['G1', G1comb[i][1]].Sentiment], axis=1))
    # Change column names so that not both columns of the Data Frame are called by the same name (Sentiment).
    annots11.columns = ['Sentiment11', 'Sentiment12']
    # Use not logical XOR to find the agreements between each row. After that, the matrix is appended to the list.
    G1Sen.append(annots11.apply(lambda r: not np.logical_xor(r.Sentiment11, r.Sentiment12), axis=1))
    
    annots12 = pd.DataFrame(pd.concat([coders.loc['G1', G1comb[i][0]].Subjectivity, coders.loc['G1', G1comb[i][1]].Subjectivity], axis=1))
    annots12.columns = ['Subjectivity11', 'Subjectivity12']
    G1Sub.append(annots12.apply(lambda r: not np.logical_xor(r.Subjectivity11, r.Subjectivity12), axis=1))

for j in range (0, len(G2comb)):
    annots21 = pd.DataFrame(pd.concat([coders.loc['G2', G2comb[j][0]].Sentiment, coders.loc['G2', G2comb[j][1]].Sentiment], axis=1))
    annots21.columns = ['Sentiment21', 'Sentiment22']
    G2Sen.append(annots21.apply(lambda r: not np.logical_xor(r.Sentiment21, r.Sentiment22), axis=1))
    
    annots22 = pd.DataFrame(pd.concat([coders.loc['G2', G2comb[j][0]].Subjectivity, coders.loc['G2', G2comb[j][1]].Subjectivity], axis=1))
    annots22.columns = ['Subjectivity21', 'Subjectivity22']
    G2Sub.append(annots22.apply(lambda r: not np.logical_xor(r.Subjectivity21, r.Subjectivity22), axis=1))
# Compute mean agreement percentage for each Group.
print(f'Percentage Agreement of G1 for Sentiment: {np.mean(G1Sen)*100:.2f}%\nPercentage Agreement of G1 for Subjecctivity: {np.mean(G1Sub)*100:.2f}%\nPercentage Agreement of G2 for Sentiment: {np.mean(G2Sen)*100:.2f}%\nPercentage Agreement of G2 for Subjectivity: {np.mean(G2Sub)*100:.2f}%')

Percentage Agreement of G1 for Sentiment: 56.04%
Percentage Agreement of G1 for Subjecctivity: 56.25%
Percentage Agreement of G2 for Sentiment: 71.47%
Percentage Agreement of G2 for Subjectivity: 72.11%


Cohen kappa results shows us that there is a higher chance that the annotators of group G2 would agree between them by chance than that of group G1. This is confirmed when comparing the percentage agreement values between them. It is obvious that the group which tried to predict the feelings that the author wanted to provoke agree more between one another as shown in the percentages.