# Preparation

This is a **Python Notebook**. To make it work, you need to press "play" on the code cells.  
*Remember to always run cells in the right order and never skip one!*  
In case of doubt, you can always restart from the beginning.

First, let's clone the GitHub repository.

In [None]:
# Clone the course repository into a local CMCLS folder ("!" runs the line as a shell command)
!git clone https://github.com/SimoneRebora/CMCLS

Let's update the GitHub repository, if changes have been made.

In [None]:
# Step into the cloned repository, pull any new commits, then go back to the parent folder
%cd CMCLS
!git pull
%cd ..

Then we load the dataset.

In [None]:
# load all necessary packages
import os
import pandas as pd
from sklearn.metrics import cohen_kappa_score

# Read the annotation export from the cloned repository;
# the first CSV column is used as the row index.
annotation_csv = "CMCLS/materials/annotation_dataset.csv"
df = pd.read_csv(annotation_csv, index_col=0)

# Show the loaded table as the cell output
df

We need to split the dataset by group, so that we end up with five separate dataframes, one for each group.

In [None]:
# Q02 values that identify the five groups, in A-E order
value_types = ["1 : A", "2 : B", "3 : C", "4 : D", "5 : E"]

# Column-name pattern holding each group's answers (same order as value_types)
answer_patterns = dict(zip(value_types, ['^Q03', '^Q04', '^Q05', '^Q06', '^Q07']))

# One initially empty DataFrame per group, grown one row at a time below
frames_by_group = {group: pd.DataFrame() for group in value_types}

for _, record in df.iterrows():
    respondent = record["Q01"]  # Q01 becomes the index label of the new row
    group = record["Q02"]
    if group not in frames_by_group:
        continue  # rows with any other Q02 value are left out
    # Keep only this group's answer columns and turn the Series into a 1-row frame
    answers = record.filter(regex=answer_patterns[group]).to_frame().T
    answers.index = [respondent]
    frames_by_group[group] = pd.concat([frames_by_group[group], answers])

# Expose the per-group frames under the names used by the following cells
df_A, df_B, df_C, df_D, df_E = (frames_by_group[group] for group in value_types)

# Report (rows, columns) of every group
for label, frame in (
    ("Group A", df_A),
    ("Group B", df_B),
    ("Group C", df_C),
    ("Group D", df_D),
    ("Group E", df_E),
):
    print(label, frame.shape)

# Group A


Check the content of the dataset

In [None]:
# Display df_A: the Q03* answers of the rows whose Q02 is "1 : A", indexed by Q01
df_A

Calculate inter-annotator agreement (using Cohen's Kappa)

In [None]:
# Take the first two rows of df_A (one row per Q01 respondent) as integer label vectors
ratings_first = df_A.iloc[0].to_numpy().astype(int)
ratings_second = df_A.iloc[1].to_numpy().astype(int)

# Cohen's Kappa between the two label vectors (shown as the cell output)
cohen_kappa_score(ratings_first, ratings_second)

# Group B

Check the content of the dataset

In [None]:
# Display df_B: the Q04* answers of the rows whose Q02 is "2 : B", indexed by Q01
df_B

Calculate inter-annotator agreement (using Cohen's Kappa)

In [None]:
# Take the first two rows of df_B (one row per Q01 respondent) as integer label vectors
ratings_first = df_B.iloc[0].to_numpy().astype(int)
ratings_second = df_B.iloc[1].to_numpy().astype(int)

# Cohen's Kappa between the two label vectors (shown as the cell output)
cohen_kappa_score(ratings_first, ratings_second)

# Group C

Check the content of the dataset

In [None]:
# Display df_C: the Q05* answers of the rows whose Q02 is "3 : C", indexed by Q01
df_C

Calculate inter-annotator agreement (using Cohen's Kappa)

In [None]:
# Take the first two rows of df_C (one row per Q01 respondent) as integer label vectors
ratings_first = df_C.iloc[0].to_numpy().astype(int)
ratings_second = df_C.iloc[1].to_numpy().astype(int)

# Cohen's Kappa between the two label vectors (shown as the cell output)
cohen_kappa_score(ratings_first, ratings_second)

# Group D

Check the content of the dataset

In [None]:
# Display df_D: the Q06* answers of the rows whose Q02 is "4 : D", indexed by Q01
df_D

Calculate inter-annotator agreement (using Cohen's Kappa)

In [None]:
# Take the first two rows of df_D (one row per Q01 respondent) as integer label vectors
ratings_first = df_D.iloc[0].to_numpy().astype(int)
ratings_second = df_D.iloc[1].to_numpy().astype(int)

# Cohen's Kappa between the two label vectors (shown as the cell output)
cohen_kappa_score(ratings_first, ratings_second)

# Group E

Check the content of the dataset

In [None]:
# Display df_E: the Q07* answers of the rows whose Q02 is "5 : E", indexed by Q01
df_E

Calculate inter-annotator agreement (using Cohen's Kappa)

In [None]:
# Take the first two rows of df_E (one row per Q01 respondent) as integer label vectors
ratings_first = df_E.iloc[0].to_numpy().astype(int)
ratings_second = df_E.iloc[1].to_numpy().astype(int)

# Cohen's Kappa between the two label vectors (shown as the cell output)
cohen_kappa_score(ratings_first, ratings_second)

# Curation

Join all annotation datasets into a single one.

In [None]:
# Keep the first two rows of every group, renumber them 0/1 so they line up,
# and put the groups side by side
group_frames = (df_A, df_B, df_C, df_D, df_E)
concatenated_df = pd.concat(
    [frame[:2].reset_index(drop=True) for frame in group_frames],
    axis=1,
)

# Transpose: every answer column becomes a row, the two kept rows become columns
rotated_df = concatenated_df.T

# One group letter per answer column, in the same order as the concatenation above
provenance_labels = []
for letter, frame in zip(['A', 'B', 'C', 'D', 'E'], group_frames):
    provenance_labels.extend([letter] * len(frame.columns))

rotated_df['group'] = provenance_labels

# Name the two annotation columns and the group column
rotated_df.columns = ['annotator_1', 'annotator_2', 'group']

# Map the numeric codes to labels: 1 -> "evaluation", 2 -> "report"
rotated_df = rotated_df.replace({1: "evaluation", 2: "report"})

# Show the resulting table as the cell output
rotated_df

Add the annotated texts.

**Note:** Now you need to upload the dataset files to the Notebook, by using the "Files" panel on the left.  
(you can find them on Moodle, under *Hands-on 7*)

In [None]:
# Read each group's source texts and keep only as many rows as that group
# has answer columns (i.e. the texts that were actually annotated)
texts_A = pd.read_csv("Group_A.csv").head(len(df_A.columns))
texts_B = pd.read_csv("Group_B.csv").head(len(df_B.columns))
texts_C = pd.read_csv("Group_C.csv").head(len(df_C.columns))
texts_D = pd.read_csv("Group_D.csv").head(len(df_D.columns))
texts_E = pd.read_csv("Group_E.csv").head(len(df_E.columns))

# Stack the groups on top of each other with a fresh 0..n-1 index
full_df = pd.concat(
    [texts_A, texts_B, texts_C, texts_D, texts_E],
    axis=0,
    ignore_index=True,
)

# Give the annotations the same 0..n-1 index so the rows pair up by position
rotated_df = rotated_df.reset_index(drop=True)


def _agreed_label(record):
    """Return the shared label when both annotators agree, otherwise None."""
    if record['annotator_1'] == record['annotator_2']:
        return record['annotator_1']
    return None


# Put texts and annotations side by side, then pre-fill the curation column
result = pd.concat([full_df, rotated_df], axis=1)
result['curation'] = result.apply(_agreed_label, axis=1)
result

Save full annotated dataset for curation

In [None]:
# Write the curation table to curation.csv in the working directory;
# index=False leaves the DataFrame's row index out of the file

result.to_csv("curation.csv", index=False)