In [2]:
import json
from os import mkdir
import csv
from utils import get_files, exists, join
import pandas as pd
from lxml import html
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import MultiLabelBinarizer
import krippendorff

# Generate Gold Annotations

In [3]:
def read_json(filePath):
    with open(filePath, 'r') as fh:
        jsonData = json.load(fh)
    return jsonData
    
 
def write_json(outputFolder, fileName, jsonData):
    with open(join(outputFolder, fileName), 'w') as fh:
        json.dump(jsonData, fh, sort_keys=True, indent=4)   

def make_dir(path):
    if not exists(path):
        mkdir(path)

corpus_dir = "corpus-json"
output_dir = "gold-annotations"
make_dir(output_dir)
make_dir(join(output_dir, "adjudicated_gold"))
make_dir(join(output_dir, "too_add_adjudicated_gold"))
files = get_files(corpus_dir)

In [4]:
def categorize_annotations(annotations):
    # Create dictionaries to store counts of each unique annotation
    counts = {}

    # Separate entities and relationships
    for annotation in annotations:
        if 'startOffset' in annotation:  # It's an entity
            key = (annotation['startOffset'], annotation['endOffset'], tuple(annotation['tags']), annotation['textProvided'])
        elif 'from' in annotation:  # It's a relationship
            key = (annotation['from_text'], annotation['to_text'], tuple(annotation['tags']))

        if key not in counts:
            counts[key] = []
        counts[key].append(annotation)

    matched_annotations = []
    mismatched_annotations = []

    # Separate matched and mismatched annotations
    for key, value in counts.items():
        if len(value) >=2:  # All 3 annotators agree
            matched_annotations.extend(value)
        else:
            mismatched_annotations.extend(value)

    return process_annotations(matched_annotations), mismatched_annotations

def process_annotations(annotations):
    unique_annotations = []
    seen_annotations = set()

    for annotation in annotations:
        # Create a reduced annotation without the 'annotator' key
        if 'startOffset' in annotation:  # It's an entity
            reduced_annotation = {
                'id': annotation['id'],
                'startOffset': annotation['startOffset'],
                'endOffset': annotation['endOffset'],
                'tags': annotation['tags'],
                'textProvided': annotation['textProvided']
            }
            annotation_tuple = (reduced_annotation['startOffset'], reduced_annotation['endOffset'],
                                tuple(reduced_annotation['tags']), reduced_annotation['textProvided'])
        elif 'from' in annotation:  # It's a relationship
            reduced_annotation = {
                'from': annotation['from'],
                'from_text': annotation['from_text'],
                'to': annotation['to'],
                'to_text': annotation['to_text'],
                'tags': annotation['tags']
            }
            annotation_tuple = (reduced_annotation['from_text'], reduced_annotation['to_text'],
                                tuple(reduced_annotation['tags']))

        # Check if this annotation is unique
        if annotation_tuple not in seen_annotations:
            seen_annotations.add(annotation_tuple)
            unique_annotations.append(reduced_annotation)

    return unique_annotations

def write_mismatched_to_tsv(file_path, mismatched_annotations):
    # Define TSV headers
    headers = ["id",'Annotator', 'Type', 'StartOffset', 'EndOffset', 'From', 'To', 'Tags', 'TextProvided']

    # Prepare data for TSV
    rows = []
    for annotation in mismatched_annotations:
        row = []
        if 'startOffset' in annotation:  # Entity annotation
            row = [
                annotation['id'],
                annotation['annotator'],
                'Entity',
                annotation['startOffset'],
                annotation['endOffset'],
                '',  # No 'from' for entities
                '',  # No 'to' for entities
                ','.join(annotation['tags']),
                annotation['textProvided']
            ]
        elif 'from' in annotation:  # Relationship annotation
            row = [
                "",
                annotation['annotator'],
                'Relationship',
                annotation['from'],
                annotation['to'], 
                annotation['from_text'],
                annotation['to_text'],
                ','.join(annotation['tags']),
                ''  # No 'textProvided' for relationships
            ]
        if row != []:
            rows.append(row)

    # Write to TSV
    with open(file_path, 'w', newline='') as file:
        writer = csv.writer(file, delimiter='\t')
        writer.writerow(headers)
        writer.writerows(rows)

In [35]:
for file in files:
    try:
        jsonData = read_json(join(corpus_dir, file))
        adjudicated_gold = categorize_annotations(jsonData)
        write_json(join(output_dir, "adjudicated_gold"), f"gold_{file.replace('.txt', '')}", adjudicated_gold[0])
        #write_json(join(output_dir, "too_add_adjudicated_gold"), f"to_add_gold_{file}", adjudicated_gold[1])
        write_mismatched_to_tsv(join(output_dir, "too_add_adjudicated_gold", f"to_add_{file}.tsv".replace(".txt.json", "")), adjudicated_gold[1])
    except Exception as e:
        print(f"Failed: {file} error: {e}")

Failed: .DS_Store error: 'utf-8' codec can't decode byte 0x80 in position 3131: invalid start byte


In [57]:
filePath = "iaa.json"

In [5]:
filePath = "iaa.json"

def sort_annotation_by_id(data):
    return sorted(data, key=lambda x: x['id'])

def read_json(filePath):
    with open(filePath, 'r') as fh:
        jsonData = json.load(fh)
    annotater1 = sort_annotation_by_id([i for i in jsonData if i['annotator'] == 1])
    annotater2 = sort_annotation_by_id([i for i in jsonData if i['annotator'] == 2])
    annotater3 = sort_annotation_by_id([i for i in jsonData if i['annotator'] == 3])
    return [annotater1, annotater2, annotater3]

def get_labels_for_all_annotator(annotations):
    df = pd.DataFrame()
    temp = pd.DataFrame()
    for annotator in annotations:
        for i in annotator:
            temp['id'] = [i['id']]
            temp['annotator'] = [i['annotator']]
            temp['label_text'] = [get_labels_and_text(i)]
            df = pd.concat([df, temp])
    t = list()
    for i,row in df.iterrows():
        t.append([r[0] for r in row['label_text']])
    flattened_list = [item for sublist in t for item in sublist]
    entites = set(flattened_list)
    return [df.reset_index(drop=True), entites]

def get_labels_and_text(annotations):
    return [[ann['hypertextlabels'][0], ann['text'] ] for ann in annotations['ner']]

def get_entities_labels_grouped(df, entities):
    final_df = pd.DataFrame()
    for i in sorted(df.id.unique()):
        temp = df[df['id'] == i].reset_index(drop=True)
        for annotator in [1,2,3]:
            temp_annotator = pd.DataFrame()
            annotator_row = temp[temp['annotator'] == annotator].reset_index(drop=True)
            temp_annotator['annotator'] = [annotator]
            for entity in entities:
                temp_annotator['entity'] = entity
                entity_labeled_text = list()
                for lt in annotator_row['label_text'][0]:
                    if lt[0] == entity:
                        entity_labeled_text.append(lt[1])
                temp_annotator['entity_label_text'] = [entity_labeled_text]
                final_df = pd.concat([final_df, temp_annotator]).reset_index(drop=True)
    return final_df.groupby(['annotator', 'entity'])['entity_label_text'].agg(lambda x: flatten_list_of_lists(x)).reset_index()

def flatten_list_of_lists(list_of_lists):
    return [item for sublist in list_of_lists for item in sublist]

def prepare_annotations_binary(df, entity):
    entity_df = df[df['entity'] == entity]
    return entity_df['binary_vector'].tolist()
def create_binary_vector(labels, all_labels):
    return [1 if label in labels else 0 for label in all_labels]

def binary_each_entity(df):# Create a binary vector for each annotator and entity
    unique_labels = df.groupby('entity')['entity_label_text'].apply(lambda x: set([label for sublist in x for label in sublist]))
    binary_vectors = []
    for _, row in df.iterrows():
        all_labels = unique_labels[row['entity']]
        binary_vector = create_binary_vector(row['entity_label_text'], all_labels)
        binary_vectors.append(binary_vector)
    df['binary_vector'] = binary_vectors
    return df

def kappa_score(df):
    df = binary_each_entity(df)
    entities = df['entity'].unique()
    alpha_scores = {}
    
    for entity in entities:
        if entity == 'model_id' or entity == 'cancer_stage' or entity == "host_strain":
            continue
        print(entity)
        annotations = prepare_annotations_binary(df, entity)
        alpha = krippendorff.alpha(reliability_data=annotations, level_of_measurement='nominal')
        alpha_scores[entity] = alpha
    return alpha_scores

In [60]:
sorted_annotations = read_json(filePath)
split_set = pd.read_csv("Corpus-split - positive.csv")
df, entities = get_labels_for_all_annotator(sorted_annotations, split_set)
df = get_entities_labels_grouped(df, entities)
alpha_scores = kappa_score(df)

TypeError: get_labels_for_all_annotator() takes 1 positional argument but 2 were given

In [4]:
alpha_scores

{'age_category': -0.07407407407407396,
 'biomarker': 0.20531222659667536,
 'cancer_grade': 0.14814814814814825,
 'cancer_stage': 0.375,
 'clinical_trial': 0.11982082866741317,
 'diagnosis': 0.23908056358381502,
 'genetic_effect': 0.1853098890135928,
 'host_strain': 0.2222222222222222,
 'model_type': 0.06412797934627268,
 'molecular_char': 0.09920987654320979,
 'response_to_treatment': 0.005229945736243802,
 'sample_type': 0.003343621399176877,
 'treatment': 0.15332657885849366,
 'tumour_type': 0.033977348434377164}

In [6]:
def get_kappa_score(split):
    sorted_annotations = read_json(filePath)
    split_set = pd.read_csv("Corpus-split - positive.csv")
    split_set = split_set[split_set['split'] == split]
    df, entities = get_labels_for_all_annotator(split_annotations(sorted_annotations, split_set))
    df = get_entities_labels_grouped(df, entities)
    alpha_scores = kappa_score(df)
    return alpha_scores

def split_annotations(annotations, split):
    split_anno = list()
    for annotation in annotations:
        sa = list()
        for annotater in annotation:
            if annotater['html'] in split['abstract'].to_list():
                sa.append(annotater)
        split_anno.append(sa)
    return split_anno

In [29]:
get_kappa_score('dev')

{'biomarker': 0.3177215189873418,
 'cancer_grade': -0.11111111111111116,
 'cancer_stage': 0.0,
 'diagnosis': 0.24831081081081086,
 'genetic_effect': 0.46153846153846156,
 'host_strain': -0.25,
 'model_type': 0.11980216156805279,
 'molecular_char': 0.08994708994708989,
 'response_to_treatment': 0.16746411483253587,
 'sample_type': 0.375,
 'treatment': 0.1457246085909274,
 'tumour_type': 0.2222222222222222}

In [36]:
get_kappa_score('test')

biomarker
clinical_trial
diagnosis
genetic_effect
model_type
molecular_char
response_to_treatment
sample_type
treatment
tumour_type


{'biomarker': 0.08504854368932024,
 'clinical_trial': 0.0,
 'diagnosis': 0.19518963922294175,
 'genetic_effect': 0.2100840336134454,
 'model_type': 0.0472934472934472,
 'molecular_char': 0.056774193548386975,
 'response_to_treatment': -0.15999999999999992,
 'sample_type': 0.034965034965035,
 'treatment': 0.002941176470588114,
 'tumour_type': 0.14814814814814814}

In [18]:
from itertools import combinations
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score

def create_binary_matrix(df, entity):
    """Create a binary matrix of annotations for a specific entity."""
    subset = df[df['entity'] == entity]
    mlb = MultiLabelBinarizer()
    binary_matrix = mlb.fit_transform(subset['entity_label_text'])
    return binary_matrix, mlb.classes_

# Function to calculate F1 scores for each pair of annotators
def calculate_pairwise_f1_scores(df, entity):
    binary_matrix, labels = create_binary_matrix(df, entity)
    annotator_pairs = list(combinations(range(binary_matrix.shape[0]), 2))  # Generate all pairs
    f1_scores = {}
    for (i, j) in annotator_pairs:
        f1 = f1_score(binary_matrix[i], binary_matrix[j], average='macro')
        f1_scores[(i+1, j+1)] = f1

    return f1_scores

sorted_annotations = read_json(filePath)
split_set = pd.read_csv("Corpus-split - positive.csv")
split_set = split_set[split_set['split'] == "train"]
df, entities = get_labels_for_all_annotator(split_annotations(sorted_annotations, split_set))
df = get_entities_labels_grouped(df, entities)

# Compute F1 scores for each entity
all_f1_scores = {}
entities = df['entity'].unique()

for entity in entities:
    f1_scores = calculate_pairwise_f1_scores(df, entity)
    all_f1_scores[entity] = f1_scores

# Display the F1 scores
for entity, scores in all_f1_scores.items():
    print(f"F1 Scores for entity '{entity}':")
    for (annotator1, annotator2), f1 in scores.items():
        print(f"Annotator {annotator1} vs Annotator {annotator2}: F1 Score = {f1:.2f}")


F1 Scores for entity 'age_category':
Annotator 1 vs Annotator 2: F1 Score = 0.40
Annotator 1 vs Annotator 3: F1 Score = 0.40
Annotator 2 vs Annotator 3: F1 Score = 1.00
F1 Scores for entity 'biomarker':
Annotator 1 vs Annotator 2: F1 Score = 0.57
Annotator 1 vs Annotator 3: F1 Score = 0.61
Annotator 2 vs Annotator 3: F1 Score = 0.67
F1 Scores for entity 'cancer_grade':
Annotator 1 vs Annotator 2: F1 Score = 0.33
Annotator 1 vs Annotator 3: F1 Score = 0.33
Annotator 2 vs Annotator 3: F1 Score = 1.00
F1 Scores for entity 'cancer_stage':
Annotator 1 vs Annotator 2: F1 Score = 0.83
Annotator 1 vs Annotator 3: F1 Score = 0.25
Annotator 2 vs Annotator 3: F1 Score = 0.49
F1 Scores for entity 'clinical_trial':
Annotator 1 vs Annotator 2: F1 Score = 0.71
Annotator 1 vs Annotator 3: F1 Score = 0.46
Annotator 2 vs Annotator 3: F1 Score = 0.68
F1 Scores for entity 'diagnosis':
Annotator 1 vs Annotator 2: F1 Score = 0.60
Annotator 1 vs Annotator 3: F1 Score = 0.54
Annotator 2 vs Annotator 3: F1 Sco

In [9]:
df.annotator.unique()

array([1, 2, 3])

In [10]:
entities

array(['age_category', 'biomarker', 'cancer_grade', 'cancer_stage',
       'clinical_trial', 'diagnosis', 'genetic_effect', 'host_strain',
       'model_id', 'model_type', 'molecular_char',
       'response_to_treatment', 'sample_type', 'treatment', 'tumour_type'],
      dtype=object)