## Importing and Installing libraries

In [6]:
%pip install --upgrade --quiet scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.2 -> 24.3.1
[notice] To update, run: C:\Users\TIRATH BHATHAWALA\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, cohen_kappa_score

## Creating datasets

In [1]:
data = []
annotation_keys = ['CLA', 'JUS', 'DEP', 'FAI', 'CON', 'ENG', 'ACC', 'CST', 'NOV', 'ETH']

In [2]:
import os
import re
import pandas as pd

def extract_text_and_annotations(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    annotations_pattern = r'\[\[(.*?)\]\]'
    annotations_match = re.search(annotations_pattern, content)

    if annotations_match:
        annotations = annotations_match.group(1).split(',')
        annotations = [ann.strip('[]') for ann in annotations]
        cleaned_content = re.sub(annotations_pattern, '', content).strip()
    else:
        annotations = [''] * 10  
        cleaned_content = content.strip()

    annotation_dict = dict(zip(annotation_keys, annotations))
    return cleaned_content, annotation_dict

In [None]:
def create_dataset(directory):

    for filename in os.listdir(directory):
        if filename.endswith('.txt'):

            index_pattern = r'ICLR2018-(.*?)_annotated\.txt'
            index_match = re.search(index_pattern, filename)
            index = index_match.group(1) if index_match else None 

            file_path = os.path.join(directory, filename)
            text_content, annotations = extract_text_and_annotations(file_path)

            data.append({'Index': index, 'Text': text_content, **annotations})
            df = pd.DataFrame(data)
            return df

In [4]:
true_labels_df = create_dataset('../../datasets/llama_annotated/')

In [5]:
true_labels_df.head()

Unnamed: 0,Index,Text,CLA,JUS,DEP,FAI,CON,ENG,ACC,CST,NOV,ETH
0,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,NOV-NEG,ETH-NEG


In [23]:
folders = {
    "Llama": '../../datasets/llama_annotated/',
    "Mistral": '../../datasets/mistral_annotated/',
    "Gemini": '../../datasets/gemini_annotated/',
    "GPT": '../../datasets/suggested_gpt_annotated/',
}

In [24]:
datasets = {}
for model, folder_path in folders.items():
    datasets[model] = create_dataset(folder_path)

In [25]:
all_data = pd.concat(datasets.values(), keys=datasets.keys(), names=['Model'])

In [None]:
all_data.to_csv('../../code/results/aggregated/all_models_annotated_reviews_dataset.csv', index=False)

In [27]:
llm_subset = all_data.head(100)

In [28]:
llm_subset[:100]

Unnamed: 0_level_0,Unnamed: 1_level_0,Index,Text,CLA,JUS,DEP,FAI,CON,ENG,ACC,CST,NOV,ETH
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Llama,0,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,NOV-POS,ETH-POS
Llama,1,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,NOV-NEG,ETH-NEG
Mistral,0,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,NOV-POS,ETH-POS
Mistral,1,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,NOV-NEG,ETH-NEG
Mistral,2,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-NEG,ENG-POS,ACC-POS,CST-POS,NOV-POS,ETH-NEG
Gemini,0,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,NOV-POS,ETH-POS
Gemini,1,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,NOV-NEG,ETH-NEG
Gemini,2,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-NEG,ENG-POS,ACC-POS,CST-POS,NOV-POS,ETH-NEG
Gemini,3,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-NEG,JUS-NEG,DEP-NEG,FAI-NEG,CON-NEG,ENG-NEG,ACC-NEG,CST-NEG,NOV-NEG,ETH-NEG
GPT,0,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,NOV-POS,ETH-POS


In [29]:
true_labels_df = true_labels_df.reset_index()
llm_subset = llm_subset.reset_index()

In [30]:
merged_df = pd.merge(true_labels_df, llm_subset, on='Index', suffixes=('_true', '_pred'))

In [31]:
merged_df.drop(columns=['index', 'level_1', 'Text_pred'], inplace=True)
merged_df.rename(columns={'Text_true': 'Text'}, inplace=True)

In [32]:
merged_df.head()

Unnamed: 0,Index,Text,CLA_true,JUS_true,DEP_true,FAI_true,CON_true,ENG_true,ACC_true,CST_true,...,CLA_pred,JUS_pred,DEP_pred,FAI_pred,CON_pred,ENG_pred,ACC_pred,CST_pred,NOV_pred,ETH_pred
0,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,NOV-POS,ETH-POS
1,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,NOV-NEG,ETH-NEG
2,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,NOV-POS,ETH-POS
3,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,NOV-NEG,ETH-NEG
4,B11bwYgfM-R1,The idea of using cross-task transfer performa...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-POS,ENG-POS,ACC-POS,CST-POS,...,CLA-POS,JUS-POS,DEP-POS,FAI-POS,CON-NEG,ENG-POS,ACC-POS,CST-POS,NOV-POS,ETH-NEG


## Performing analysis

In [24]:
def calculate_cohens_kappa(true_labels, predicted_labels):
    kappa = cohen_kappa_score(true_labels, predicted_labels)
    return kappa

In [87]:
def calculate_metrics(true_labels, predicted_labels):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average='binary', pos_label='CLA-POS')
    recall = recall_score(true_labels, predicted_labels, average='binary', pos_label='CLA-POS')
    f1 = f1_score(true_labels, predicted_labels, average='binary', pos_label='CLA-POS')
    return accuracy, precision, recall, f1

In [29]:
results = []
llm_models = ['Gemini', 'Llama', 'GPT', 'Mistral']
df = datasets

In [88]:
for model in llm_models:
    true_labels = merged_df['CLA_true']  
    predicted_labels = merged_df[f'CLA_pred']

    accuracy, precision, recall, f1 = calculate_metrics(true_labels, predicted_labels)
    kappa = calculate_cohens_kappa(true_labels, predicted_labels)

    results.append({
        'Model': model,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Cohen\'s Kappa': kappa
    })

In [89]:
results_df = pd.DataFrame(results)

In [90]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Cohen's Kappa
0,Gemini,0.991071,0.985507,1.0,0.992701,0.981208
1,Llama,0.991071,0.985507,1.0,0.992701,0.981208
2,GPT,0.991071,0.985507,1.0,0.992701,0.981208
3,Mistral,0.991071,0.985507,1.0,0.992701,0.981208


In [None]:
results_df.to_csv('../../code/results/aggregated/quantitative_analysis_results.csv', index=False)