In [1]:
import json
from os.path import join

import pandas as pd
from lxml import html
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import MultiLabelBinarizer
import krippendorff

In [35]:
# Path of the JSON file holding the annotation records; read_json() opens it.
# NOTE(review): "iaa" presumably stands for inter-annotator agreement - confirm.
filePath = "iaa.json"

def sort_annotation_by_id(data):
    """Return a new list with the records of ``data`` in ascending ``'id'`` order.

    Args:
        data: Iterable of mappings that each provide an ``'id'`` key.

    Returns:
        A freshly built list; ``data`` itself is left untouched. Records that
        share an id keep their relative input order (the sort is stable).
    """
    ordered = list(data)
    ordered.sort(key=lambda record: record['id'])
    return ordered

def read_json(filePath, annotators=(1, 2, 3)):
    """Load the annotation file and split its records by annotator.

    Args:
        filePath: Path of a JSON file whose top level is a list of records,
            each carrying an ``'annotator'`` and an ``'id'`` key.
        annotators: Annotator ids to extract, in the order the result should
            list them. Defaults to ``(1, 2, 3)``.

    Returns:
        A list with one entry per requested annotator; each entry is that
        annotator's records sorted by ``'id'``. An annotator without any
        record yields an empty list.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(filePath, 'r', encoding='utf-8') as fh:
        jsonData = json.load(fh)
    return [
        sort_annotation_by_id([record for record in jsonData if record['annotator'] == annotator])
        for annotator in annotators
    ]

def get_labels_for_all_annotator(annotations):
    """Tabulate every record's labelled spans and collect the labels in use.

    Args:
        annotations: Iterable of per-annotator record lists (the shape
            returned by ``read_json``). Each record provides ``'id'`` and
            ``'annotator'`` plus whatever ``get_labels_and_text`` reads.

    Returns:
        A two-item list ``[df, entities]``:
          * ``df`` has one row per record with columns ``id``, ``annotator``
            and ``label_text`` (the ``[label, text]`` pairs of that record),
            on a fresh 0..n-1 index;
          * ``entities`` is the set of all labels seen across the records.
    """
    rows = []
    entities = set()
    for annotator_records in annotations:
        for record in annotator_records:
            label_text = get_labels_and_text(record)
            rows.append({
                'id': record['id'],
                'annotator': record['annotator'],
                'label_text': label_text,
            })
            entities.update(pair[0] for pair in label_text)
    # Build the frame once; concatenating one row at a time is quadratic.
    df = pd.DataFrame(rows, columns=['id', 'annotator', 'label_text'])
    return [df, entities]

def get_labels_and_text(annotations):
    """List the ``[label, text]`` pair of every span in one annotation record.

    Args:
        annotations: A single record whose ``'ner'`` entry is a list of span
            mappings, each with ``'hypertextlabels'`` and ``'text'`` keys.

    Returns:
        One two-element list per span: the first entry of the span's
        ``'hypertextlabels'`` followed by the span's ``'text'``.
    """
    pairs = []
    for span in annotations['ner']:
        # Only the first label of a span is kept.
        first_label = span['hypertextlabels'][0]
        pairs.append([first_label, span['text']])
    return pairs

def get_entities_labels_grouped(df, entities, annotators=(1, 2, 3)):
    """Collect, per annotator and entity label, the texts tagged across all documents.

    Args:
        df: Frame with ``id``, ``annotator`` and ``label_text`` columns, where
            each ``label_text`` cell is a list of ``[label, text]`` pairs.
        entities: Iterable of entity labels to report. Pairs whose label is
            not listed are ignored.
        annotators: Annotator ids to report. Defaults to ``(1, 2, 3)``.

    Returns:
        DataFrame with columns ``annotator``, ``entity`` and
        ``entity_label_text``: one row per (annotator, entity) combination,
        sorted by those two keys. ``entity_label_text`` lists the matching
        texts ordered by ascending document id, then by their order within
        the document. An annotator that has no row for a document simply
        contributes nothing for that document.
    """
    columns = ['annotator', 'entity', 'entity_label_text']
    entity_list = list(entities)
    texts_by_key = {
        (annotator, entity): []
        for annotator in annotators
        for entity in entity_list
    }
    if not df.empty:
        for doc_id in sorted(df['id'].unique()):
            doc_rows = df[df['id'] == doc_id]
            for annotator in annotators:
                annotator_rows = doc_rows[doc_rows['annotator'] == annotator]
                if annotator_rows.empty:
                    continue
                # If an annotator has several rows for one document, only the
                # first one is consulted.
                for pair in annotator_rows['label_text'].iloc[0]:
                    bucket = texts_by_key.get((annotator, pair[0]))
                    if bucket is not None:
                        bucket.append(pair[1])
    records = [
        {'annotator': annotator, 'entity': entity, 'entity_label_text': texts}
        for (annotator, entity), texts in sorted(texts_by_key.items(), key=lambda item: item[0])
    ]
    return pd.DataFrame(records, columns=columns)

def flatten_list_of_lists(list_of_lists):
    """Concatenate the items of every inner iterable into one flat list.

    Args:
        list_of_lists: Iterable whose elements are themselves iterable.

    Returns:
        A new list with the inner items in encounter order; only one level
        of nesting is removed.
    """
    flat = []
    for inner in list_of_lists:
        flat.extend(inner)
    return flat

def prepare_annotations_binary(df, entity):
    """Return the ``binary_vector`` cells of every row belonging to ``entity``.

    Args:
        df: Frame with ``entity`` and ``binary_vector`` columns.
        entity: Entity label to select.

    Returns:
        A list holding the ``binary_vector`` value of each matching row, in
        row order; empty when no row matches.
    """
    matches_entity = df['entity'] == entity
    return df.loc[matches_entity, 'binary_vector'].tolist()
def create_binary_vector(labels, all_labels):
    """Encode which entries of ``all_labels`` are present in ``labels``.

    Args:
        labels: Container of the labels that were actually used.
        all_labels: Iterable giving the positions of the vector.

    Returns:
        A list with one integer per entry of ``all_labels``: ``1`` when that
        entry is in ``labels``, otherwise ``0``.
    """
    vector = []
    for candidate in all_labels:
        vector.append(1 if candidate in labels else 0)
    return vector

def binary_each_entity(df):
    """Add a ``binary_vector`` column: one presence vector per (annotator, entity) row.

    For every entity, the vocabulary is the union of the texts in
    ``entity_label_text`` over all rows of that entity. Each row's vector has
    one position per vocabulary item and marks whether the row's own texts
    contain it.

    Args:
        df: Frame with ``entity`` and ``entity_label_text`` columns, the
            latter holding a list of texts per row.

    Returns:
        The same ``df`` object, modified in place with the extra
        ``binary_vector`` column.
    """
    vocabulary = {}
    for entity, texts in zip(df['entity'], df['entity_label_text']):
        vocabulary.setdefault(entity, set()).update(texts)
    # Sort each vocabulary so vector positions are stable from run to run;
    # plain set iteration order can change between interpreter sessions.
    ordered_vocabulary = {entity: sorted(texts) for entity, texts in vocabulary.items()}
    df['binary_vector'] = [
        create_binary_vector(texts, ordered_vocabulary[entity])
        for entity, texts in zip(df['entity'], df['entity_label_text'])
    ]
    return df

def kappa_score(df, skip_entities=('model_id', 'cancer_stage', 'host_strain')):
    """Compute an agreement score per entity from the grouped annotation frame.

    Despite the name, the score is Krippendorff's alpha (nominal level),
    obtained from ``krippendorff.alpha``; no Cohen's kappa is computed here.

    Args:
        df: Frame with ``annotator``, ``entity`` and ``entity_label_text``
            columns. It gains a ``binary_vector`` column as a side effect of
            ``binary_each_entity``.
        skip_entities: Collection of entity labels to leave out of the
            result. Defaults to the three labels this notebook excludes.
            NOTE(review): the reason for excluding them is not recorded in
            the notebook - confirm before changing the default.

    Returns:
        Dict mapping each remaining entity label to its alpha value.
    """
    df = binary_each_entity(df)
    alpha_scores = {}

    for entity in df['entity'].unique():
        if entity in skip_entities:
            continue
        print(entity)
        # One binary vector per annotator row is passed as the reliability data.
        annotations = prepare_annotations_binary(df, entity)
        alpha_scores[entity] = krippendorff.alpha(
            reliability_data=annotations,
            level_of_measurement='nominal',
        )
    return alpha_scores

In [3]:
# Agreement over every record in the annotation file.
sorted_annotations = read_json(filePath)
# split_set does not feed into the scores below; a later cell inspects it.
split_set = pd.read_csv("Corpus-split - positive.csv")
# get_labels_for_all_annotator takes the annotations only; passing split_set
# as a second argument raises TypeError.
df, entities = get_labels_for_all_annotator(sorted_annotations)
df = get_entities_labels_grouped(df, entities)
alpha_scores = kappa_score(df)

In [4]:
# Display the per-entity scores computed in the previous cell.
# NOTE(review): the saved output lists 'cancer_stage' and 'host_strain', which
# kappa_score skips, so it predates that exclusion - re-run to refresh.
alpha_scores

{'age_category': -0.07407407407407396,
 'biomarker': 0.20531222659667536,
 'cancer_grade': 0.14814814814814825,
 'cancer_stage': 0.375,
 'clinical_trial': 0.11982082866741317,
 'diagnosis': 0.23908056358381502,
 'genetic_effect': 0.1853098890135928,
 'host_strain': 0.2222222222222222,
 'model_type': 0.06412797934627268,
 'molecular_char': 0.09920987654320979,
 'response_to_treatment': 0.005229945736243802,
 'sample_type': 0.003343621399176877,
 'treatment': 0.15332657885849366,
 'tumour_type': 0.033977348434377164}

In [28]:
def get_kappa_score(split):
    """Compute the per-entity agreement scores for one corpus split.

    Reads the annotation file named by the module-level ``filePath`` and the
    split table ``"Corpus-split - positive.csv"``, keeps the annotations whose
    abstract belongs to the requested split, and scores them.

    Args:
        split: Value to match against the ``split`` column of the CSV.

    Returns:
        The dict produced by ``kappa_score`` for the selected annotations.
    """
    annotations_by_annotator = read_json(filePath)
    corpus_split = pd.read_csv("Corpus-split - positive.csv")
    in_requested_split = corpus_split['split'] == split
    selected_rows = corpus_split.loc[in_requested_split]
    selected_annotations = split_annotations(annotations_by_annotator, selected_rows)
    labels_df, entities = get_labels_for_all_annotator(selected_annotations)
    grouped_df = get_entities_labels_grouped(labels_df, entities)
    return kappa_score(grouped_df)

def split_annotations(annotations, split):
    """Keep only the records whose ``'html'`` text is listed in ``split['abstract']``.

    Args:
        annotations: Iterable of per-annotator record lists; every record
            provides an ``'html'`` key.
        split: DataFrame with an ``abstract`` column naming the texts to keep.

    Returns:
        A list parallel to ``annotations``: one filtered list per annotator,
        with the surviving records in their original order.
    """
    # Build the lookup once so each membership test is a constant-time hash
    # probe instead of a fresh scan of the whole column.
    abstracts = set(split['abstract'])
    return [
        [record for record in annotator_records if record['html'] in abstracts]
        for annotator_records in annotations
    ]

In [29]:
# Agreement scores restricted to the abstracts of the 'dev' split.
# NOTE(review): the saved output lists 'cancer_stage' and 'host_strain', which
# kappa_score skips - re-run to refresh.
get_kappa_score('dev')

{'biomarker': 0.3177215189873418,
 'cancer_grade': -0.11111111111111116,
 'cancer_stage': 0.0,
 'diagnosis': 0.24831081081081086,
 'genetic_effect': 0.46153846153846156,
 'host_strain': -0.25,
 'model_type': 0.11980216156805279,
 'molecular_char': 0.08994708994708989,
 'response_to_treatment': 0.16746411483253587,
 'sample_type': 0.375,
 'treatment': 0.1457246085909274,
 'tumour_type': 0.2222222222222222}

In [36]:
# Agreement scores restricted to the abstracts of the 'test' split.
# The entity names printed before the dict come from kappa_score's print().
get_kappa_score('test')

biomarker
clinical_trial
diagnosis
genetic_effect
model_type
molecular_char
response_to_treatment
sample_type
treatment
tumour_type


{'biomarker': 0.08504854368932024,
 'clinical_trial': 0.0,
 'diagnosis': 0.19518963922294175,
 'genetic_effect': 0.2100840336134454,
 'model_type': 0.0472934472934472,
 'molecular_char': 0.056774193548386975,
 'response_to_treatment': -0.15999999999999992,
 'sample_type': 0.034965034965035,
 'treatment': 0.002941176470588114,
 'tumour_type': 0.14814814814814814}

In [25]:
# Preview a few abstracts only; dumping the whole column floods the notebook output.
split_set['abstract'].head().to_list()

['There remains an unmet need for preclinical models to enable personalized therapy for ovarian cancer (OC) patients. Here we evaluate the capacity of patient-derived organoids (PDOs) to predict clinical drug response and functional consequences of tumor heterogeneity. We included 36 whole-genome-characterized PDOs from 23 OC patients with known clinical histories. OC PDOs maintain the genomic features of the original tumor lesion and recapitulate patient response to neoadjuvant carboplatin/paclitaxel combination treatment. PDOs display inter- and intrapatient drug response heterogeneity to chemotherapy and targeted drugs, which can be partially explained by genetic aberrations. PDO drug screening identifies high responsiveness to at least one drug for 88% of patients. PDOs are valuable preclinical models that can provide insights into drug response for individual patients with OC, complementary to genetic testing. Generating PDOs of multiple tumor locations can improve clinical decisi