In [None]:
# Calculating inter-annotator agreement during the annotation

In [36]:
import pandas as pd
import numpy as np

In [37]:
from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa

In [38]:
# Count how many times each category was chosen for each item
def count_categories(ratings, num_categories):
    """Build the items-by-categories count table that is passed to ``fleiss_kappa``.

    Parameters
    ----------
    ratings : array-like of shape (n_items, n_raters)
        One row per item, one column per annotator; each cell holds the
        label that annotator assigned to that item.
    num_categories : int
        Number of columns of the returned table. Must be at least the
        number of distinct labels present in ``ratings``.

    Returns
    -------
    numpy.ndarray of shape (n_items, num_categories), dtype int
        ``counts[i, j]`` is the number of annotators who gave item ``i`` the
        ``j``-th smallest distinct label found in ``ratings``. Columns beyond
        the number of distinct labels stay at zero.

    Raises
    ------
    ValueError
        If ``ratings`` holds more distinct labels than ``num_categories``.
    """
    ratings = np.asarray(ratings)
    # Labels are mapped to columns by rank rather than used as `label - 1`
    # offsets: a 0 label would otherwise index column -1 (the last column)
    # and could be added to another label's tally (e.g. labels {0, 2}).
    # Flattening first keeps `return_inverse` one-dimensional on every NumPy
    # version.
    labels, codes = np.unique(ratings.ravel(), return_inverse=True)
    if labels.size > num_categories:
        raise ValueError(
            "ratings contain {} distinct labels but num_categories is {}".format(
                labels.size, num_categories
            )
        )
    codes = codes.reshape(ratings.shape)
    counts = np.zeros((ratings.shape[0], num_categories), dtype=int)
    for column in range(labels.size):
        counts[:, column] = (codes == column).sum(axis=1)
    return counts

def FleissKappa(annotation_round, corpus):
    """Print Fleiss' kappa for the annotator columns 'A1', 'A2' and 'A3'.

    A positive ``annotation_round`` restricts the computation to the rows of
    ``corpus`` whose 'annotation_round' column equals it; any other value
    scores every row of ``corpus``. The score is printed, nothing is returned.
    """
    per_round = annotation_round > 0
    if per_round:
        rows = corpus[corpus['annotation_round'] == annotation_round]
    else:
        rows = corpus

    label_matrix = rows[['A1','A2','A3']].to_numpy()
    # The number of categories is the number of distinct labels in the selection.
    n_labels = len(set(label_matrix.flatten()))
    score = fleiss_kappa(count_categories(label_matrix, n_labels))

    if per_round:
        print("Fleiss' Kappa score : {} for round {}".format(score, annotation_round))
    else:
        print("Fleiss' Kappa score : {} for the whole dataset".format(score))


In [39]:
def kappa(annotation_round, corpus):
    """Print Cohen's kappa for each pair among annotators A1, A2 and A3.

    A positive ``annotation_round`` restricts the comparison to the rows of
    ``corpus`` whose 'annotation_round' column equals it; any other value
    compares the annotators over every row. Scores are printed, not returned.
    """
    if annotation_round > 0:
        rows = corpus[corpus['annotation_round'] == annotation_round]
        first, second, third = (rows[name].tolist() for name in ('A1', 'A2', 'A3'))
        print("Kappa score between A1 and A2 : {} for round {}".format(cohen_kappa_score(first, second), annotation_round))
        print("Kappa score between A1 and A3 : {} for round {}".format(cohen_kappa_score(first, third), annotation_round))
        print("Kappa score between A2 and A3 : {} for round {}".format(cohen_kappa_score(second, third), annotation_round))
        return

    first, second, third = (corpus[name].tolist() for name in ('A1', 'A2', 'A3'))
    print("Kappa score between A1 and A2 : {} for the whole dataset".format(cohen_kappa_score(first, second)))
    print("Kappa score between A1 and A3 : {} for the whole dataset".format(cohen_kappa_score(first, third)))
    print("Kappa score between A2 and A3 : {} for the whole dataset".format(cohen_kappa_score(second, third)))


def kappa_final(annotation_round, corpus):
    """Print Cohen's kappa between each annotator (A1, A2, A3) and the final label.

    Parameters
    ----------
    annotation_round : int
        A positive value restricts the comparison to the rows of ``corpus``
        whose 'annotation_round' column equals it; any other value compares
        over every row.
    corpus : pandas.DataFrame
        Must provide the columns 'A1', 'A2', 'A3' and the final label in
        'relevance_label' ('FINAL' is accepted when 'relevance_label' is
        absent). 'annotation_round' is required for a positive round.

    Returns
    -------
    None
        The three scores are printed.
    """
    # Both branches read the same final-label column. The per-round path
    # used to read 'FINAL' only, which raises KeyError on a corpus that
    # stores the final label in 'relevance_label'.
    final_column = 'relevance_label' if 'relevance_label' in corpus.columns else 'FINAL'

    if annotation_round > 0:
        rows = corpus[corpus['annotation_round'] == annotation_round]
        scope = "for round {}".format(annotation_round)
    else:
        rows = corpus
        scope = "for the whole dataset"

    final = rows[final_column].tolist()
    for annotator in ('A1', 'A2', 'A3'):
        score = cohen_kappa_score(rows[annotator].tolist(), final)
        print("Kappa score between {} and final label : {} {}".format(annotator, score, scope))


In [40]:
def agreement_score(annotation_round, corpus):
    """Print the percentage of rows on which A1, A2 and A3 all gave the same label.

    A positive ``annotation_round`` restricts the count to the rows of
    ``corpus`` whose 'annotation_round' column equals it; any other value
    uses every row. The percentage is printed, nothing is returned.
    """
    if annotation_round > 0:
        rows = corpus[corpus['annotation_round'] == annotation_round]
        unanimous = rows[(rows['A1'] == rows['A2']) & (rows['A2'] == rows['A3'])]
        n_unanimous, n_rows = len(unanimous.index), len(rows.index)
        print("Agreement score : {} for round {}".format(n_unanimous*100/n_rows, annotation_round))
        return

    unanimous = corpus[(corpus['A1'] == corpus['A2']) & (corpus['A2'] == corpus['A3'])]
    n_unanimous, n_rows = len(unanimous.index), len(corpus.index)
    print("Agreement score : {} for the whole dataset".format(n_unanimous*100/n_rows))

In [41]:
def agreement_score2(corpus):
    """Print the percentage of rows of ``corpus`` on which A2 and A3 gave the same label."""
    same_label = corpus['A2'] == corpus['A3']
    n_agree = len(corpus[same_label].index)
    n_rows = len(corpus.index)
    print("Agreement score : {}".format(n_agree*100/n_rows))

def kappa_2(corpus):
    """Print Cohen's kappa between A2 and A3, and between each of them and 'relevance_label'.

    All rows of ``corpus`` are used. Scores are printed, nothing is returned.
    """
    second, third, adjudicated = (
        corpus[name].tolist() for name in ('A2', 'A3', 'relevance_label')
    )
    print("Kappa score between A2 and A3 : {}".format(cohen_kappa_score(second, third)))
    print("Kappa score between A2 and final label : {}".format(cohen_kappa_score(second, adjudicated)))
    print("Kappa score between A3 and final label : {}".format(cohen_kappa_score(third, adjudicated)))
    

## 1. Inter-annotator agreement, 3 annotators

In [42]:
# Load the corpus labelled by three annotators from the Excel file and preview
# its first rows.
excel = "corpus_three_annot.xlsx"
corpus_three_annot = pd.read_excel(excel)
corpus_three_annot.head()

Unnamed: 0,id_segment,id_segment_old,text_segment,annotation_round,A1,A2,A3,relevance_label,relevance_type_norm
0,1-s2.0-S0301479717300713-main_99,1-s2.0-S0301479717300713-main.pdf.tei_48,"In total, 12 documents were identified as rele...",1,1,0,0,0,
1,1-s2.0-S0304387816300670-main_355,1-s2.0-S0304387816300670-main.pdf.tei_119,While our model and the specification below fo...,1,1,0,0,0,
2,1-s2.0-S0140196313000608-main_210,1-s2.0-S0140196313000608-main.pdf.tei_110,Other areas have experienced a decrease in [cr...,1,1,2,1,1,LULCC
3,1-s2.0-S0169204617300270-main_74,1-s2.0-S0169204617300270-main.pdf.tei_35,"We use the term ""[urban] sprawl"" to describe a...",1,2,0,0,0,
4,1-s2.0-S095937809800003X-main_240,1-s2.0-S095937809800003X-main.pdf.tei_69,"Thus, changes in [land use] strategies result ...",1,1,2,2,2,"PRACTICES, LULCC"


In [43]:
# Merged-class variant: work on a copy so the original frame keeps labels 1 and 2
# apart, and rewrite label 2 to 1 in the annotator and final-label columns.
corpus_three_annot_12 = corpus_three_annot.copy()
corpus_three_annot_12[['A1', 'A2', 'A3','relevance_label']] = corpus_three_annot_12[['A1', 'A2', 'A3','relevance_label']].replace(2,1)

In [44]:
# Convert every annotator label to a Python int, element by element.
corpus_three_annot['A1'] = corpus_three_annot['A1'].map(int)
corpus_three_annot['A2'] = corpus_three_annot['A2'].map(int)
corpus_three_annot['A3'] = corpus_three_annot['A3'].map(int)

### 1.1. Classes 1 & 2 separated

#### Fleiss Kappa

In [45]:
# Fleiss' kappa for annotation rounds 1 to 3; round 0 selects the whole dataset.
FleissKappa(1, corpus = corpus_three_annot)
FleissKappa(2, corpus = corpus_three_annot)
FleissKappa(3, corpus = corpus_three_annot)
FleissKappa(0, corpus = corpus_three_annot)

Fleiss' Kappa score : 0.29908376963350775 for round 1
Fleiss' Kappa score : 0.4191406970311634 for round 2
Fleiss' Kappa score : 0.4184073107049608 for round 3
Fleiss' Kappa score : 0.3929715117488043 for the whole dataset


#### Agreement score

In [46]:
# Percentage of round-1 rows on which A1, A2 and A3 chose the same label.
agreement_score(annotation_round = 1, corpus = corpus_three_annot)

Agreement score : 33.333333333333336 for round 1


In [47]:
# Percentage of round-2 rows on which A1, A2 and A3 chose the same label.
agreement_score(annotation_round = 2, corpus = corpus_three_annot)

Agreement score : 60.0 for round 2


In [48]:
# Percentage of round-3 rows on which A1, A2 and A3 chose the same label.
agreement_score(annotation_round = 3, corpus = corpus_three_annot)

Agreement score : 49.20634920634921 for round 3


In [49]:
# Round 0 selects every row: full three-way agreement over the whole dataset.
agreement_score(annotation_round = 0, corpus = corpus_three_annot)

Agreement score : 47.5609756097561 for the whole dataset


### 1.2. Classes 1 & 2 merged

#### Fleiss Kappa

In [50]:
# Same Fleiss' kappa runs on the frame where label 2 was rewritten to 1;
# round 0 selects the whole dataset.
FleissKappa(1, corpus = corpus_three_annot_12)
FleissKappa(2, corpus = corpus_three_annot_12)
FleissKappa(3, corpus = corpus_three_annot_12)
FleissKappa(0, corpus = corpus_three_annot_12)

Fleiss' Kappa score : 0.3045454545454547 for round 1
Fleiss' Kappa score : 0.4485294117647058 for round 2
Fleiss' Kappa score : 0.5321782178217822 for round 3
Fleiss' Kappa score : 0.45908848614072506 for the whole dataset


#### Agreement score

In [51]:
# Round-1 three-way agreement on the merged-class frame.
agreement_score(annotation_round = 1, corpus = corpus_three_annot_12)

Agreement score : 49.01960784313726 for round 1


In [52]:
# Round-2 three-way agreement on the merged-class frame.
agreement_score(annotation_round = 2, corpus = corpus_three_annot_12)

Agreement score : 64.0 for round 2


In [53]:
# Round-3 three-way agreement on the merged-class frame.
agreement_score(annotation_round = 3, corpus = corpus_three_annot_12)

Agreement score : 65.07936507936508 for round 3


In [54]:
# Round 0 selects every row: three-way agreement over the whole merged-class frame.
agreement_score(annotation_round = 0, corpus = corpus_three_annot_12)

Agreement score : 59.75609756097561 for the whole dataset


## 2. Inter-annotator agreement, 2 annotators

In [55]:
# Load the corpus labelled by two annotators from the Excel file and preview its
# first rows. Note that `excel` is reassigned here.
excel = "corpus_two_annot.xlsx"
corpus_two_annot = pd.read_excel(excel)
corpus_two_annot.head()

Unnamed: 0,id_segment,id_segment_old,text_segment,A2,relevance_type_A2,A3,relevance_type_A3,relevance_type,relevance_label,relevance_type_norm
0,1-s2.0-S0304387816300670-main_112,1-s2.0-S0301479717300713-main.pdf.tei_82,"(2004)'}], '#text': 'With regard to population...",1,DRIVERS,2,Drivers,DRIVERS,1,DRIVERS
1,1-s2.0-S235198941500102X-main_233,1-s2.0-S235198941500102X-main.pdf.tei_62,"(%)', '100', '37.25', '93.25', '75.58', '94.24...",0,,0,TABLE,,0,
2,s10113-015-0891-1_24,1-s2.0-S0168192311001122-main.pdf.tei_80,"On the other hand, whereas the combination of ...",0,,0,,,0,
3,1-s2.0-S0167880913003502-main_358,1-s2.0-S0167880913003502-main.pdf.tei_123,Whilst Blanco-Canqui and stress those similar ...,0,,0,,,0,
4,1-s2.0-S030438781000043X-mainext_610,1-s2.0-S030438781000043X-mainext.pdf.tei_75,"In addition to land ownership, the regressions...",0,,0,,,0,


### 2.1. Classes 1 & 2 separated

In [57]:
# Cohen's kappa between A2 and A3, and between each of them and the final label.
kappa_2(corpus_two_annot)

Kappa score between A2 and A3 : 0.6989473684210525
Kappa score between A2 and final label : 0.8519079104596511
Kappa score between A3 and final label : 0.8484304932735426


In [58]:
# Percentage of rows on which A2 and A3 chose the same label.
agreement_score2(corpus_two_annot)

Agreement score : 86.98224852071006


### 2.2. Classes 1 & 2 merged

In [59]:
# Merged-class variant: work on a copy so the original frame keeps labels 1 and 2
# apart, and rewrite label 2 to 1 in the annotator and final-label columns.
corpus_two_annot_12 = corpus_two_annot.copy()
corpus_two_annot_12[['A2', 'A3','relevance_label']] = corpus_two_annot_12[[ 'A2', 'A3','relevance_label']].replace(2,1)

In [60]:
# Cohen's kappa and raw agreement for A2 vs A3 on the merged-class frame.
kappa_2(corpus=corpus_two_annot_12)
agreement_score2(corpus=corpus_two_annot_12)

Kappa score between A2 and A3 : 0.8208163986570065
Kappa score between A2 and final label : 0.9115800488315312
Kappa score between A3 and final label : 0.9104081993285033
Agreement score : 92.89940828402366
