# Kappa score calculation

In [18]:
# Project imports
import pandas as pd
import numpy as np
from sklearn.metrics import cohen_kappa_score

# Project variables
inputFile = "input.csv"

##### Import CSV
Import the CSV and drop collums that are not relevant for this script.

In [19]:
data = pd.read_csv(inputFile)

# Drop the index, thread and email ID
data.drop("#", inplace=True, axis=1)
data.drop("Tread Id", inplace=True, axis=1)
data.drop("Email ID", inplace=True, axis=1)
data.drop("Agreement?", inplace=True, axis=1)
data.drop("Decided", inplace=True, axis=1)

# Drop NaN value rows
data.dropna(inplace=True)

# Display
data.head(5)

Unnamed: 0,Marten,Rick,Krishan
0,not-ak,not-ak,not-ak
1,technology,not-ak,"technology,existence,existence-behavioral"
2,technology,not-ak,"technology,existence,existence-behavioral"
3,not-ak,not-ak,not-ak
4,process,process,process


In [20]:
# Get unique labels used in the input set.
# Used to verify validate validity of the data.
uniques = pd.unique(data.values.ravel("K"))
uniques = pd.Series(uniques).map(lambda x: x.split(',')).explode().unique()

# Display unqiues
for label in uniques:
    print(label)

not-ak
technology
process
existence
existence-structural
existence-behavioral
property
existence-behaviour


##### Convert to AK and Not-AK
Used for comparisons of Kappa score for checking if the group is on the same line for this property.

In [21]:
# Hierachy for most important to least important labels
labelHierachy = ["technology", "process", "property", "existence", "not-ak"]

def f(row):
    # This can be nicer in one loop probably (or with a 3 one-liners) but I dont really care tbh.
    for i in range(0,3):
        for label in labelHierachy:
            if label in row[i]:
                row[i] = label
                if label == "technology" or label == "process":
                    row[i] = "executive"
                break
    return row

copy = data.copy()

copy.apply(f, axis=1)
print(copy)

        Marten       Rick    Krishan
0       not-ak     not-ak     not-ak
1    executive     not-ak  executive
2    executive     not-ak  executive
3       not-ak     not-ak     not-ak
4    executive  executive  executive
5    executive  executive  executive
6       not-ak     not-ak     not-ak
7       not-ak     not-ak     not-ak
8       not-ak     not-ak     not-ak
9       not-ak     not-ak     not-ak
10      not-ak     not-ak     not-ak
11      not-ak     not-ak     not-ak
12      not-ak     not-ak     not-ak
13      not-ak     not-ak     not-ak
14      not-ak     not-ak     not-ak
15      not-ak     not-ak     not-ak
16      not-ak     not-ak     not-ak
17      not-ak     not-ak     not-ak
18      not-ak     not-ak     not-ak
19   executive     not-ak     not-ak
20      not-ak     not-ak     not-ak
21      not-ak     not-ak     not-ak
22      not-ak     not-ak     not-ak
23      not-ak     not-ak  executive
24   executive     not-ak     not-ak
25      not-ak     not-ak     not-ak
2

In [32]:
def calculateKappa(df, start=0, end=None):
    if end == None:
        end = df.shape[0]

    colLength = df.shape[1]
    output = pd.DataFrame(columns=df.columns, index=df.columns)

    output

    for rIdx in range(0, colLength):
        part1 = df.iloc[start:end, rIdx]
        for cIdx in range(0, colLength):
            part2 = df.iloc[start:end, cIdx]
            output.iloc[cIdx, rIdx] = cohen_kappa_score(part1, part2)

    # Print the output
    print("--------------------------------------------")
    print(f"Created from rows {start} till {end}")
    print(output.to_latex())

calculateKappa(copy, end=50)
calculateKappa(copy, start=51, end = 100)
calculateKappa(copy, start=101)

--------------------------------------------
Created from rows 0 till 50
\begin{tabular}{llll}
\toprule
{} &    Marten &      Rick &   Krishan \\
\midrule
Marten  &       1.0 &  0.469112 &    0.6557 \\
Rick    &  0.469112 &       1.0 &  0.490229 \\
Krishan &    0.6557 &  0.490229 &       1.0 \\
\bottomrule
\end{tabular}

--------------------------------------------
Created from rows 51 till 100
\begin{tabular}{llll}
\toprule
{} &    Marten &      Rick &   Krishan \\
\midrule
Marten  &       1.0 &   0.53274 &  0.430233 \\
Rick    &   0.53274 &       1.0 &  0.422018 \\
Krishan &  0.430233 &  0.422018 &       1.0 \\
\bottomrule
\end{tabular}

--------------------------------------------
Created from rows 101 till 150
\begin{tabular}{llll}
\toprule
{} &    Marten &      Rick &   Krishan \\
\midrule
Marten  &       1.0 &  0.833333 &  0.688663 \\
Rick    &  0.833333 &       1.0 &   0.77451 \\
Krishan &  0.688663 &   0.77451 &       1.0 \\
\bottomrule
\end{tabular}



  print(output.to_latex())
  print(output.to_latex())
  print(output.to_latex())
