# Kappa score calculation

In [2]:
# Project imports
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score

# Project variables
inputFile = "input.csv"

##### Import CSV
Import the CSV and drop collums that are not relevant for this script.

In [3]:
data = pd.read_csv(inputFile)

# Drop the index, thread and email ID
data.drop("#", inplace=True, axis=1)
data.drop("Tread Id", inplace=True, axis=1)
data.drop("Email ID", inplace=True, axis=1)
data.drop("Agreement?", inplace=True, axis=1)
data.drop("Decided", inplace=True, axis=1)

# Drop NaN value rows
data.dropna(inplace=True)

# Display
data.head(5)

Unnamed: 0,Marten,Rick,Krishan
0,not-ak,not-ak,not-ak
1,technology,not-ak,"technology,existence,existence-behavioral"
2,technology,not-ak,"technology,existence,existence-behavioral"
3,not-ak,not-ak,not-ak
4,process,process,process


In [4]:
# Get unique labels used in the input set.
# Used to verify validate validity of the data.
uniques = pd.unique(data.values.ravel("K"))
uniques = pd.Series(uniques).map(lambda x: x.split(',')).explode().unique()

# Display unqiues
for label in uniques:
    print(label)

not-ak
technology
process
existence
existence-structural
existence-behavioral
property
existence-behaviour


##### Convert to AK and Not-AK
Used for comparisons of Kappa score for checking if the group is on the same line for this property.

In [7]:
# Hierachy for most important to least important labels
labelHierachy = ["technology", "process", "property", "existence", "not-ak"]

def f(row):
    # This can be nicer in one loop probably (or with a 3 one-liners) but I dont really care tbh.
    for label in labelHierachy:
        if label in row[0]:
            row[0] = label
            if label == "technology" or label == "process":
                row[0] = "executive"
            break
    for label in labelHierachy:
        if label in row[1]:
            row[1] = label
            if label == "technology" or label == "process":
                row[1] = "executive"
            break
    for label in labelHierachy:
        if label in row[2]:
            row[2] = label
            if label == "technology" or label == "process":
                row[2] = "executive"
            break
    return row

copy = data.copy()

copy.apply(f, axis=1)
print(copy)

        Marten       Rick    Krishan
0       not-ak     not-ak     not-ak
1    executive     not-ak  executive
2    executive     not-ak  executive
3       not-ak     not-ak     not-ak
4    executive  executive  executive
..         ...        ...        ...
99   existence  existence  existence
100     not-ak     not-ak     not-ak
101  existence  existence  existence
102     not-ak     not-ak     not-ak
103     not-ak     not-ak     not-ak

[99 rows x 3 columns]


In [8]:
def calculateKappa(df, start=0, end=None):
    if end == None:
        end = df.shape[0]

    colLength = df.shape[1]
    output = pd.DataFrame(columns=df.columns, index=df.columns)

    output

    for rIdx in range(0, colLength):
        part1 = df.iloc[start:end, rIdx]
        for cIdx in range(0, colLength):
            part2 = df.iloc[start:end, cIdx]
            output.iloc[cIdx, rIdx] = cohen_kappa_score(part1, part2)

    # Print the output
    print("--------------------------------------------")
    print(f"Created from rows {start} till {end}")
    print(output.to_latex())

calculateKappa(copy, end=50)
calculateKappa(copy, start=51, end = 103)

--------------------------------------------
Created from rows 0 till 50
\begin{tabular}{llll}
\toprule
{} &    Marten &      Rick &   Krishan \\
\midrule
Marten  &       1.0 &  0.469112 &    0.6557 \\
Rick    &  0.469112 &       1.0 &  0.490229 \\
Krishan &    0.6557 &  0.490229 &       1.0 \\
\bottomrule
\end{tabular}

--------------------------------------------
Created from rows 51 till 103
\begin{tabular}{llll}
\toprule
{} &    Marten &      Rick &   Krishan \\
\midrule
Marten  &       1.0 &  0.528796 &  0.425757 \\
Rick    &  0.528796 &       1.0 &  0.415426 \\
Krishan &  0.425757 &  0.415426 &       1.0 \\
\bottomrule
\end{tabular}



  print(output.to_latex())
  print(output.to_latex())
