In [None]:
import os
import pandas as pd
# NOTE(review): presumably lets the kernel keep running when more than one
# OpenMP runtime gets loaded by the scientific stack — confirm it is still needed.
os.environ.update({"KMP_DUPLICATE_LIB_OK": "TRUE"})


# Loading Data

In [None]:
# Workbook paths, one per annotator; the same sheets are read from each of them.
annotator1_file, annotator2_file = (
    "data/annotated/FoSE2026-MR.xlsx",
    "data/annotated/FoSE2026-Mehil.xlsx",
)

In [102]:
def get_worked_for_se_data(file_path, sheet_name, *, skiprows=6, nrows=500, usecols='A:D'):
    """Read one annotation sheet of an Excel workbook into a DataFrame.

    The notebook calls this with the "Q7" sheet. The layout defaults match the
    values that were previously hard-coded, so existing calls are unaffected.

    Args:
        file_path: Path to the ``.xlsx`` workbook.
        sheet_name: Name of the sheet to read.
        skiprows: Number of leading rows to skip before the header row.
        nrows: Maximum number of data rows to read. Sheets longer than this are
            truncated; pass ``None`` to read every row.
        usecols: Excel-style column range to keep.

    Returns:
        pandas.DataFrame with the selected columns, named after the first row
        that follows the skipped ones.
    """
    return pd.read_excel(
        file_path,
        sheet_name=sheet_name,
        header=0,  # counted after skiprows: the first non-skipped row holds the column names
        skiprows=skiprows,
        nrows=nrows,
        usecols=usecols,
        engine='openpyxl',
    )

In [103]:
def get_not_worked_for_se_data(file_path, sheet_name, *, skiprows=6, nrows=500, usecols='A:D'):
    """Read one annotation sheet of an Excel workbook into a DataFrame.

    The notebook calls this with the "Q8" sheet. The layout defaults match the
    values that were previously hard-coded, so existing calls are unaffected.

    Args:
        file_path: Path to the ``.xlsx`` workbook.
        sheet_name: Name of the sheet to read.
        skiprows: Number of leading rows to skip before the header row.
        nrows: Maximum number of data rows to read. Sheets longer than this are
            truncated; pass ``None`` to read every row.
        usecols: Excel-style column range to keep.

    Returns:
        pandas.DataFrame with the selected columns, named after the first row
        that follows the skipped ones.
    """
    return pd.read_excel(
        file_path,
        sheet_name=sheet_name,
        header=0,  # counted after skiprows: the first non-skipped row holds the column names
        skiprows=skiprows,
        nrows=nrows,
        usecols=usecols,
        engine='openpyxl',
    )

In [104]:
def get_one_change_to_make_data(file_path, sheet_name, *, skiprows=6, nrows=500, usecols='A:D'):
    """Read one annotation sheet of an Excel workbook into a DataFrame.

    The notebook calls this with the "Q9" sheet. The layout defaults match the
    values that were previously hard-coded, so existing calls are unaffected.

    Args:
        file_path: Path to the ``.xlsx`` workbook.
        sheet_name: Name of the sheet to read.
        skiprows: Number of leading rows to skip before the header row.
        nrows: Maximum number of data rows to read. Sheets longer than this are
            truncated; pass ``None`` to read every row.
        usecols: Excel-style column range to keep.

    Returns:
        pandas.DataFrame with the selected columns, named after the first row
        that follows the skipped ones.
    """
    return pd.read_excel(
        file_path,
        sheet_name=sheet_name,
        header=0,  # counted after skiprows: the first non-skipped row holds the column names
        skiprows=skiprows,
        nrows=nrows,
        usecols=usecols,
        engine='openpyxl',
    )

In [105]:
# "Q7" sheet as read from each workbook: annotator 1 first, then annotator 2.
worked_for_se, worked_for_se_annotated = (
    get_worked_for_se_data(workbook, "Q7")
    for workbook in (annotator1_file, annotator2_file)
)

In [106]:
# "Q8" sheet as read from each workbook: annotator 1 first, then annotator 2.
not_worked_for_se, not_worked_for_se_annotated = (
    get_not_worked_for_se_data(workbook, "Q8")
    for workbook in (annotator1_file, annotator2_file)
)

In [None]:
# "Q9" sheet as read from each workbook: annotator 1 first, then annotator 2.
one_change, one_change_annotated = (
    get_one_change_to_make_data(workbook, "Q9")
    for workbook in (annotator1_file, annotator2_file)
)

# Encoding the responses

In [None]:
def extract_responses_of_interest(response_df):
    """Encode each row of an annotation sheet as a 'yes'/'no' label.

    The response ID is taken from the second column and the annotator's label
    from the fourth column (by position, not by name).

    A row is encoded as 'yes' only when its label, converted to text, stripped
    of surrounding whitespace and lower-cased, equals 'yes'. Missing, blank,
    non-text and any other labels are encoded as 'no'.

    Rows whose response ID is missing are skipped: a NaN key cannot be matched
    against the other annotator's sheet.

    Args:
        response_df: DataFrame with at least four columns.

    Returns:
        dict mapping response ID -> 'yes' or 'no'. If an ID occurs more than
        once, the last occurrence wins.
    """
    response_id_text_dict = {}
    for row in response_df.itertuples(index=False, name=None):
        response_id, raw_label = row[1], row[3]

        if pd.isna(response_id):
            continue

        # str() first so numeric/boolean cells do not raise on .lower(), and
        # strip() so a padded value such as " Yes " is still recognised.
        is_yes = pd.notna(raw_label) and str(raw_label).strip().lower() == 'yes'
        response_id_text_dict[response_id] = 'yes' if is_yes else 'no'

    return response_id_text_dict

In [None]:
# Turn both annotators' "Q7" frames into response-ID -> 'yes'/'no' lookups.
worked_for_se_map, worked_for_se_annotated_map = map(
    extract_responses_of_interest,
    (worked_for_se, worked_for_se_annotated),
)

In [110]:
# Turn both annotators' "Q8" frames into response-ID -> 'yes'/'no' lookups.
not_worked_for_se_map, not_worked_for_se_annotated_map = map(
    extract_responses_of_interest,
    (not_worked_for_se, not_worked_for_se_annotated),
)

In [111]:
# Turn both annotators' "Q9" frames into response-ID -> 'yes'/'no' lookups.
one_change_map, one_change_annotated_map = map(
    extract_responses_of_interest,
    (one_change, one_change_annotated),
)

In [112]:
def prepare_for_agreement_analysis(ground_map, annotated_map):
    """Align two label dictionaries into parallel lists for an agreement metric.

    Every key that appears in either mapping contributes one position to both
    lists. A key missing from one mapping is filled with 'no' on that side.

    The order is deterministic: keys of ``ground_map`` in insertion order,
    followed by the keys found only in ``annotated_map``.

    Args:
        ground_map: dict of response ID -> label for the first annotator.
        annotated_map: dict of response ID -> label for the second annotator.

    Returns:
        Tuple ``(values1, values2)`` of equal-length lists, where position ``i``
        in both lists refers to the same key.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. A set of
    # string keys iterates in a different order on each interpreter run, which
    # made the returned lists non-reproducible.
    keys = dict.fromkeys([*ground_map, *annotated_map])

    values1 = [ground_map.get(key, 'no') for key in keys]
    values2 = [annotated_map.get(key, 'no') for key in keys]
    return values1, values2

# Cohen's Kappa score calculation

In [None]:
from sklearn.metrics import cohen_kappa_score

In [118]:
# Cohen's kappa between the two annotators' labels for the "Q7" sheet.
values1, values2 = prepare_for_agreement_analysis(
    ground_map=worked_for_se_map,
    annotated_map=worked_for_se_annotated_map,
)
print(cohen_kappa_score(values1, values2))

0.9352850539291218


In [119]:
# Cohen's kappa between the two annotators' labels for the "Q8" sheet.
values1, values2 = prepare_for_agreement_analysis(
    ground_map=not_worked_for_se_map,
    annotated_map=not_worked_for_se_annotated_map,
)
print(cohen_kappa_score(values1, values2))

0.9130674761970471


In [120]:
# Cohen's kappa between the two annotators' labels for the "Q9" sheet.
values1, values2 = prepare_for_agreement_analysis(
    ground_map=one_change_map,
    annotated_map=one_change_annotated_map,
)
print(cohen_kappa_score(values1, values2))

0.9752628324056896
