In [34]:
import json
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import MultiLabelBinarizer

## ASSIGNMENT TASK 3 : INTER ANNOTATOR AGREEMENT

Task 3: Implementing Inter-Annotator Agreement
Export the annotations in either JSON or CSV files.
Using Pyt hon code, calculate Cohen’s Kappa & Fleiss Kappa.
Use Cohen’s Kappa for the NLP Dataset Task.
Use Fleiss Kappa for the CV Dataset Task. Get the third annotation from any other teams and then calculate the Fleiss Kappa.
Output the agreement score and interpret its significance.





---
# Cohen's Kappa for NLP Annotations


In [51]:
def load_annotations1(file_path):
    """
    this function loads annotations from a JSON file, extract and sorts POS tags by "start".

    Parameters:
        file_path (str): Path to the JSON file.

    Returns:
        pos_tags (list): List of lists containing POS tags for each sentence.
        data (list): List of the original data with sorted POS tags.
    """
    import json

    with open(file_path, "r") as file:
        data = json.load(file)  # Load the JSON file

    pos_tags = []
    for item in data:
        if "pos" in item:  # Check if the key "pos" exists
            # Sort the "pos" list by the "start" field
            item["pos"].sort(key=lambda x: x["start"])

            # Extract the POS tags after sorting
            tags = [pos["labels"][0] for pos in item["pos"]]
            pos_tags.append(tags)
        else:
            print(f"Skipping item with missing 'pos' key: {item}")

    return pos_tags


def load_annotations2(file_path):
    """
    Load annotations from a JSON file, extract and sort POS tags by "start".

    Parameters:
        file_path (str): Path to the JSON file.

    Returns:
        pos_tags (list): List of lists containing POS tags for each sentence.
        data (list): List of the original data with sorted POS tags.
    """
    import json

    with open(file_path, "r") as file:
        data = json.load(file)  # Load the JSON file

    pos_tags = []
    for item in data:
        if "label" in item:  # Check if the key "pos" exists
            # Sort the "pos" list by the "start" field
            item["label"].sort(key=lambda x: x["start"])

            # Extract the POS tags after sorting
            tags = [label["labels"][0] for label in item["label"]]
            pos_tags.append(tags)
        else:
            print(f"Skipping item with missing 'pos' key: {item}")

    return pos_tags




def align_annotations(annotator1_data, annotator2_data):
    """
    Align the POS tags of two annotators for comparison.
    Assumes both annotators annotate the same data points in the same order.
    """
    aligned_annotator1 = []
    aligned_annotator2 = []
    for tags1, tags2 in zip(annotator1_data, annotator2_data):
        min_len = min(len(tags1), len(tags2))  # Handle differing tag lengths
        aligned_annotator1.extend(tags1[:min_len])
        aligned_annotator2.extend(tags2[:min_len])
    return aligned_annotator1, aligned_annotator2



def calculate_cohens_kappa(annotator1_tags, annotator2_tags):
    """
    Calculate Cohen's Kappa score between two annotators' POS tag sequences.

    Parameters:
        annotator1_tags (list of lists): POS tags from Annotator 1.
        annotator2_tags (list of lists): POS tags from Annotator 2.

    Returns:
        float: Cohen's Kappa score.
    """
    # Ensure both annotators have the same number of sentences
    if len(annotator1_tags) != len(annotator2_tags):
        raise ValueError("Annotator data lengths do not match!")

    # Calculate Cohen's Kappa score
    kappa = cohen_kappa_score(annotator1_tags, annotator2_tags)

    return kappa








In [54]:
# File paths for the two annotators
annotator1_file = "/content/NLP-TEAM2 - OM.json"
annotator2_file = "/content/NLP-TEAM2-PRANAV.json"
# Load annotations
annotator1_data = load_annotations1(annotator1_file)
annotator2_data = load_annotations2(annotator2_file)


In [55]:
aligned_annotator1, aligned_annotator2 = align_annotations(annotator1_data, annotator2_data)

In [57]:
for i in range(len(aligned_annotator1)):
  print(f'tag no. : {i} ----> {aligned_annotator1[i]}   :    {aligned_annotator2[i]}')        # Aligned tags for both the annotatora=s have been printed against each other



tag no. : 0 ----> X   :    X
tag no. : 1 ----> PROPN   :    PROPN
tag no. : 2 ----> ADP   :    PRON
tag no. : 3 ----> NOUN   :    NOUN
tag no. : 4 ----> ADP   :    PROPN
tag no. : 5 ----> PROPN   :    PRON
tag no. : 6 ----> ADP   :    NOUN
tag no. : 7 ----> NOUN   :    X
tag no. : 8 ----> X   :    X
tag no. : 9 ----> X   :    PROPN
tag no. : 10 ----> PROPN   :    X
tag no. : 11 ----> X   :    PROPN
tag no. : 12 ----> PROPN   :    X
tag no. : 13 ----> PROPN   :    PROPN
tag no. : 14 ----> X   :    X
tag no. : 15 ----> PROPN   :    NOUN
tag no. : 16 ----> PROPN   :    PROPN
tag no. : 17 ----> ADP   :    NOUN
tag no. : 18 ----> NOUN   :    NOUN
tag no. : 19 ----> X   :    X
tag no. : 20 ----> PROPN   :    PROPN
tag no. : 21 ----> PROPN   :    PROPN
tag no. : 22 ----> CONJ   :    CONJ
tag no. : 23 ----> PROPN   :    PROPN
tag no. : 24 ----> PROPN   :    PROPN
tag no. : 25 ----> ADP   :    PRON
tag no. : 26 ----> ADV   :    ADJ
tag no. : 27 ----> PROPN   :    PROPN
tag no. : 28 ----> X   : 

In [66]:
#Calculating the score
kappa_score = calculate_cohens_kappa(aligned_annotator1, aligned_annotator2)

# Output the result
print(f"Cohen's Kappa Score: {kappa_score:.4f}")
if kappa_score > 0.8:
    interpretation = "Almost Perfect Agreement"
elif kappa_score > 0.6:
    interpretation = "Substantial Agreement"
elif kappa_score > 0.4:
    interpretation = "Moderate Agreement"
elif kappa_score > 0.2:
    interpretation = "Fair Agreement"
elif kappa_score > 0:
    interpretation = "Slight Agreement"
else:
    interpretation = "No Agreement"

print(f"Interpretation: {interpretation}")



Cohen's Kappa Score: 0.5991
Interpretation: Moderate Agreement


---
The obtained Cohen’s Kappa score is 0.5991, which falls into the moderate agreement category,

A score of 0.5991 indicates that there is a moderate level
of agreement between the two raters or models. In other words, while the two models or raters do agree on the categorization of the data fairly often, they also have some level of disagreement.



---

# Fleiss's Kappa for CV Annotations



In [74]:
import json
import numpy as np
from statsmodels.stats.inter_rater import fleiss_kappa
from collections import defaultdict
import statsmodels

In [89]:
import json
import numpy as np

# Load all the json files (assuming the files are named file1.json, file2.json, file3.json)
files = ['/content/CV-TEAM2-PRANAV.json', '/content/CV-TEAM2-OM.json', '/content/CV-TEAM2-3.json']


def fleiss_kappa_score(files):

    # Data structure to hold the results
    annotations = []

    # Iterates through each file and collect the annotations
    for idx, file in enumerate(files):
        img_id = []
        annote = []
        with open(file, 'r') as f:
            data = json.load(f)
            annotator = idx + 1  # Annotator number starts from 1
            for entry in data:
                classification = entry["choice"]
                annote.append(classification)
                img_id.append(entry["image"])
        annotations.append (annote)
    for i in range(len(annote)) :
        print(f'{img_id[i]}---> ann1:{annotations[0][i]},  ann2:{annotations[1][i]},  ann3:{annotations[2][i]}')
        ## This part loads the annotations files from three annotators and creates
        ## a 2-D array containing category assignment with subjects in rows and raters in columns.
    table = statsmodels.stats.inter_rater.aggregate_raters(annotations)
    ## function to convert the 2-D array into a compatible format for the fleiss kappa calculator function
    print(table)
    score = statsmodels.stats.inter_rater.fleiss_kappa(table[0], method='fleiss')
    ## This calculates the fleiss kappa score
    return score


In [90]:
score = fleiss_kappa_score(files)


/data/upload/5/bde75fcc-img_20.jpg---> ann1:No Truck,  ann2:No Truck,  ann3:No Truck
/data/upload/5/f3565be8-img_21.jpg---> ann1:No Truck,  ann2:No Truck,  ann3:Truck
/data/upload/5/fef703fc-img_22.jpg---> ann1:No Truck,  ann2:No Truck,  ann3:No Truck
/data/upload/5/c2c304f9-img_23.jpg---> ann1:No Truck,  ann2:No Truck,  ann3:No Truck
/data/upload/5/23489a8b-img_24.jpg---> ann1:No Truck,  ann2:Truck,  ann3:No Truck
/data/upload/5/0fe71ede-img_25.jpg---> ann1:No Truck,  ann2:No Truck,  ann3:No Truck
/data/upload/5/01c5ba37-img_26.jpg---> ann1:Truck,  ann2:Truck,  ann3:Truck
/data/upload/5/3143fd82-img_27.jpg---> ann1:Truck,  ann2:Truck,  ann3:Truck
/data/upload/5/344f540d-img_28.jpg---> ann1:No Truck,  ann2:No Truck,  ann3:No Truck
/data/upload/5/05703423-img_29.jpg---> ann1:Truck,  ann2:Truck,  ann3:Truck
/data/upload/5/d76e87d3-img_30.jpg---> ann1:No Truck,  ann2:Truck,  ann3:Truck
/data/upload/5/d92ba013-img_31.jpg---> ann1:No Truck,  ann2:Truck,  ann3:No Truck
/data/upload/5/9738303

In [91]:
print(f'Fleiss kappa Score =  {score}')

Fleiss kappa Score =  -0.015037593984962468




---
The Fleiss' Kappa score of -0.0150 indicates poor agreement between the raters, even worse than what would be expected by chance. This negative value suggests that the raters are consistently in disagreement, possibly due to unclear or ambiguous categories, inconsistent raters, or a flawed classification system. In this case, the score indicates a need for revising the categorization process, improving rater consistency, or addressing issues with the quality of the data to ensure better agreement.


---

---

