For each dataset (2 HR, 2 SR, 4 SLO):

    - For each model (XLM-R-base, XLM-R-large, CSEBert, SloBERTa, BERTić, multiple versions of XLM-R-BERTić and XLM-R-SloBERTić):


        - fine-tune the model and evaluate it - 5 times

## Setup and Dataset Importing

In [1]:
# Define the gpu on the gpu machine
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=5

import evaluate
from datetime import datetime
import pandas as pd
import numpy as np
import json
from simpletransformers.ner import NERModel, NERArgs
from sklearn.metrics import classification_report, f1_score
from tqdm.autonotebook import tqdm as notebook_tqdm
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import logging
import sklearn
from numba import cuda
import argparse
import gc
import torch
import time

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=5


In [2]:
# Import the dataset

# Code for python script
"""
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", help="path to the dataset in JSON format")
    args = parser.parse_args()

# Define the path to the dataset
dataset_path = args.dataset
"""
# Define the path to the dataset
dataset_path = "datasets/hr500k.conllup_extracted.json"

# Load the json file
with open(dataset_path, "r") as file:
    json_dict = json.load(file)

# Open the train, eval and test dictionaries as DataFrames
train_df = pd.DataFrame(json_dict["train"])
test_df = pd.DataFrame(json_dict["test"])
dev_df = pd.DataFrame(json_dict["dev"])

# Change the sentence_ids to numbers
test_df['sentence_id'] = pd.factorize(test_df['sentence_id'])[0]
train_df['sentence_id'] = pd.factorize(train_df['sentence_id'])[0]
dev_df['sentence_id'] = pd.factorize(dev_df['sentence_id'])[0]

# Define the labels
LABELS = json_dict["labels"]
print(LABELS)

print(train_df.shape, test_df.shape, dev_df.shape)
print(train_df.head())



['O', 'B-loc', 'B-org', 'B-per', 'I-per', 'B-deriv-per', 'I-org', 'I-loc', 'B-misc', 'I-misc', 'I-deriv-per']
(398681, 3) (51190, 3) (49764, 3)
     sentence_id      words labels
717            0      Kazna      O
718            0  medijskom      O
719            0     mogulu      O
720            0   obnovila      O
721            0   raspravu      O


## Analysing model files

In [None]:
ner_list = dir(NERArgs)

import torch

optimizer_state = torch.load("model/checkpoint-48000/training_args.bin")

attributes = list(dir(optimizer_state))


# Find the intersection of the sets
common_elements = list(set(attributes).intersection(set(ner_list)))

print(common_elements)

In [None]:
print(optimizer_state.resume_from_checkpoint)

In [None]:
optimizer_state.warmup_ratio = 0.06
optimizer_state.learning_rate = 1e-5
optimizer_state.fp16 = True
optimizer_state.logging_steps = 50
#optimizer_state.n_gpu = 1
optimizer_state.gradient_accumulation_steps = 1
optimizer_state.output_dir = "outputs/"
optimizer_state.num_train_epochs = 1
optimizer_state.resume_from_checkpoint = True
optimizer_state.ignore_data_skip = True


# Save arguments with new attributes
torch.save(optimizer_state, "model/checkpoint-48000/training_args.bin")

# Result Analysis - Result list and creation of results summary

In [14]:
# Create a new file for results
#with open("ner-results-custom-large.txt", "w") as file:
#    file.write("Date\tModel\tRun\tDataset\tMicro F1\tMacro F1\tLabel Report\n")

In [45]:
# Import the txt with results
import pandas as pd

results = pd.read_csv("ner-results-all.txt", sep="\t")
#results = pd.read_csv("ner-results-our-models.txt", sep="\t")
#results = pd.read_csv("ner-results-all-models.txt", sep="\t")

results

Unnamed: 0,Date,Model,Run,Dataset,Micro F1,Macro F1,Label Report
0,07/09/2023 10:35:30,xlm-r-base,xlm-r-base-0,datasets/hr500k.conllup_extracted.json,0.988318,0.903373,{'B-deriv-per': {'precision': 0.97058823529411...
1,07/09/2023 10:41:24,xlm-r-base,xlm-r-base-1,datasets/hr500k.conllup_extracted.json,0.988572,0.904130,{'B-deriv-per': {'precision': 0.91428571428571...
2,07/09/2023 10:47:18,xlm-r-base,xlm-r-base-2,datasets/hr500k.conllup_extracted.json,0.988396,0.901676,"{'B-deriv-per': {'precision': 1.0, 'recall': 0..."
3,07/09/2023 10:51:39,csebert,csebert-0,datasets/hr500k.conllup_extracted.json,0.990506,0.916010,{'B-deriv-per': {'precision': 0.91891891891891...
4,07/09/2023 10:56:00,csebert,csebert-1,datasets/hr500k.conllup_extracted.json,0.990721,0.919912,{'B-deriv-per': {'precision': 0.91891891891891...
...,...,...,...,...,...,...,...
331,11/09/2023 09:48:08,xlmrl_sl-bcms-24,xlmrl_sl-bcms-24-2,datasets/hr500k.conllup_extracted.json,0.990819,0.924200,{'B-deriv-per': {'precision': 0.97297297297297...
332,11/09/2023 10:12:38,xlmrl_sl-bcms-30,xlmrl_sl-bcms-30-2,datasets/reldi-normtagner-sr.conllup_extracted...,0.986955,0.764297,{'B-deriv-per': {'precision': 0.66666666666666...
333,11/09/2023 11:09:39,xlmrl_sl-bcms-42,xlmrl_sl-bcms-42-1,datasets/reldi-normtagner-sr.conllup_extracted...,0.987283,0.805255,{'B-deriv-per': {'precision': 0.85714285714285...
334,11/09/2023 11:20:33,xlmrl_sl-bcms-42,xlmrl_sl-bcms-42-2,datasets/reldi-normtagner-sr.conllup_extracted...,0.987064,0.793350,{'B-deriv-per': {'precision': 0.85714285714285...


In [42]:
results[results.duplicated(keep=False)].sort_values(by=["Run", "Dataset"])

Unnamed: 0,Date,Model,Run,Dataset,Micro F1,Macro F1,Label Report


In [46]:
results["Macro F1"] = results["Macro F1"].round(4)

# Pivot the DataFrame to rearrange columns into rows
pivot_df = results.pivot(index='Run', columns='Dataset', values='Macro F1')

# Reset the index to have 'Model' as a column
pivot_df.reset_index(inplace=True)

pivot_df

Dataset,Run,datasets/hr500k.conllup_extracted.json,datasets/reldi-normtagner-hr.conllup_extracted.json,datasets/reldi-normtagner-sr.conllup_extracted.json,datasets/set.sr.plus.conllup_extracted.json
0,bertic-0,0.9215,0.7942,0.8118,0.9343
1,bertic-1,0.9271,0.7745,0.8212,0.9406
2,bertic-2,0.9251,0.8070,0.7605,0.9319
3,csebert-0,0.9160,0.7870,0.7493,0.9241
4,csebert-1,0.9199,0.7971,0.7400,0.9199
...,...,...,...,...,...
79,xlmrl_sl-bcms-48-1,0.9264,0.8193,0.8311,0.9450
80,xlmrl_sl-bcms-48-2,0.9267,0.7985,0.8233,0.9513
81,xlmrl_sl-bcms-6-0,0.9258,0.8046,0.7948,0.9467
82,xlmrl_sl-bcms-6-1,0.9227,0.8123,0.8182,0.9297


In [47]:
# Save the df
#pivot_df.to_csv("ner-results-summary-table.csv")
pivot_df.to_csv("ner-results-summary-table-all-models.csv")

In [34]:
# Let's analyze the df with all the predictions
import numpy as np

pred_df = pd.read_csv("datasets/hr500k.conllup_extracted.json-test_df-with-predictions.csv", index_col = 0)

# Analyze instances where models are wrong
pred_df["match"] = np.where(pred_df["labels"] != pred_df["y_pred_xlm-r-large_0"], "no", "yes")
pred_df.match.value_counts()

match
yes    50676
no       514
Name: count, dtype: int64

## Create aggregated results

In [48]:
# Join information for model and dataset
results["model-dataset"] = [x[0]+"-"+x[1] for x in list(zip(results["Model"].to_list(), results["Dataset"].to_list()))]
results.head()

Unnamed: 0,Date,Model,Run,Dataset,Micro F1,Macro F1,Label Report,model-dataset
0,07/09/2023 10:35:30,xlm-r-base,xlm-r-base-0,datasets/hr500k.conllup_extracted.json,0.988318,0.9034,{'B-deriv-per': {'precision': 0.97058823529411...,xlm-r-base-datasets/hr500k.conllup_extracted.json
1,07/09/2023 10:41:24,xlm-r-base,xlm-r-base-1,datasets/hr500k.conllup_extracted.json,0.988572,0.9041,{'B-deriv-per': {'precision': 0.91428571428571...,xlm-r-base-datasets/hr500k.conllup_extracted.json
2,07/09/2023 10:47:18,xlm-r-base,xlm-r-base-2,datasets/hr500k.conllup_extracted.json,0.988396,0.9017,"{'B-deriv-per': {'precision': 1.0, 'recall': 0...",xlm-r-base-datasets/hr500k.conllup_extracted.json
3,07/09/2023 10:51:39,csebert,csebert-0,datasets/hr500k.conllup_extracted.json,0.990506,0.916,{'B-deriv-per': {'precision': 0.91891891891891...,csebert-datasets/hr500k.conllup_extracted.json
4,07/09/2023 10:56:00,csebert,csebert-1,datasets/hr500k.conllup_extracted.json,0.990721,0.9199,{'B-deriv-per': {'precision': 0.91891891891891...,csebert-datasets/hr500k.conllup_extracted.json


In [49]:
agg_results = pd.concat([results.groupby("model-dataset")["Macro F1"].mean(), results.groupby("model-dataset")["Macro F1"].std()], axis = 1)

# Rename columns
agg_results.columns = ["Macro F1", "Std"]
agg_results

Unnamed: 0_level_0,Macro F1,Std
model-dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
bertic-datasets/hr500k.conllup_extracted.json,0.924567,0.002838
bertic-datasets/reldi-normtagner-hr.conllup_extracted.json,0.791900,0.016372
bertic-datasets/reldi-normtagner-sr.conllup_extracted.json,0.797833,0.032671
bertic-datasets/set.sr.plus.conllup_extracted.json,0.935600,0.004493
csebert-datasets/hr500k.conllup_extracted.json,0.917867,0.001955
...,...,...
xlmrl_sl-bcms-48-datasets/set.sr.plus.conllup_extracted.json,0.948767,0.003326
xlmrl_sl-bcms-6-datasets/hr500k.conllup_extracted.json,0.920267,0.007071
xlmrl_sl-bcms-6-datasets/reldi-normtagner-hr.conllup_extracted.json,0.788867,0.034137
xlmrl_sl-bcms-6-datasets/reldi-normtagner-sr.conllup_extracted.json,0.789800,0.031202


In [50]:
# Reset index
agg_results.reset_index(inplace=True)

agg_results.head()


Unnamed: 0,model-dataset,Macro F1,Std
0,bertic-datasets/hr500k.conllup_extracted.json,0.924567,0.002838
1,bertic-datasets/reldi-normtagner-hr.conllup_ex...,0.7919,0.016372
2,bertic-datasets/reldi-normtagner-sr.conllup_ex...,0.797833,0.032671
3,bertic-datasets/set.sr.plus.conllup_extracted....,0.9356,0.004493
4,csebert-datasets/hr500k.conllup_extracted.json,0.917867,0.001955


In [51]:
# Split the 'model_dataset' column into 'model' and 'dataset' columns
agg_results[['Model', 'Dataset']] = agg_results['model-dataset'].str.split('-datasets/', n=1, expand=True)

# Pivot the DataFrame to the desired structure
pivot_agg_results = agg_results.pivot(index='Model', columns='Dataset', values=['Macro F1', 'Std'])

# Flatten the column MultiIndex
pivot_agg_results.columns = [f'{col[1]}-{col[0]}' for col in pivot_agg_results.columns]

# Reset index and display the final DataFrame
final_agg_results = pivot_agg_results.reset_index()

final_agg_results.head()


Unnamed: 0,Model,hr500k.conllup_extracted.json-Macro F1,reldi-normtagner-hr.conllup_extracted.json-Macro F1,reldi-normtagner-sr.conllup_extracted.json-Macro F1,set.sr.plus.conllup_extracted.json-Macro F1,hr500k.conllup_extracted.json-Std,reldi-normtagner-hr.conllup_extracted.json-Std,reldi-normtagner-sr.conllup_extracted.json-Std,set.sr.plus.conllup_extracted.json-Std
0,bertic,0.924567,0.7919,0.797833,0.9356,0.002838,0.016372,0.032671,0.004493
1,csebert,0.917867,0.7936,0.7508,0.921567,0.001955,0.005719,0.011623,0.00223
2,xlm-r-base,0.903067,0.763233,0.733533,0.9141,0.001234,0.016102,0.023968,0.00406
3,xlm-r-large,0.919333,0.790533,0.774267,0.933367,0.005358,0.013639,0.012833,0.005121
4,xlmrb_bcms-12,0.914767,0.7684,0.7652,0.925733,0.001415,0.009752,0.004903,0.005129


In [52]:
# Change the order of the columns
final_agg_results = final_agg_results[['Model', 'hr500k.conllup_extracted.json-Macro F1', 'hr500k.conllup_extracted.json-Std', 'reldi-normtagner-hr.conllup_extracted.json-Macro F1', 'reldi-normtagner-hr.conllup_extracted.json-Std', 'reldi-normtagner-sr.conllup_extracted.json-Macro F1', 'reldi-normtagner-sr.conllup_extracted.json-Std', 'set.sr.plus.conllup_extracted.json-Macro F1', 'set.sr.plus.conllup_extracted.json-Std']]

final_agg_results.head()

Unnamed: 0,Model,hr500k.conllup_extracted.json-Macro F1,hr500k.conllup_extracted.json-Std,reldi-normtagner-hr.conllup_extracted.json-Macro F1,reldi-normtagner-hr.conllup_extracted.json-Std,reldi-normtagner-sr.conllup_extracted.json-Macro F1,reldi-normtagner-sr.conllup_extracted.json-Std,set.sr.plus.conllup_extracted.json-Macro F1,set.sr.plus.conllup_extracted.json-Std
0,bertic,0.924567,0.002838,0.7919,0.016372,0.797833,0.032671,0.9356,0.004493
1,csebert,0.917867,0.001955,0.7936,0.005719,0.7508,0.011623,0.921567,0.00223
2,xlm-r-base,0.903067,0.001234,0.763233,0.016102,0.733533,0.023968,0.9141,0.00406
3,xlm-r-large,0.919333,0.005358,0.790533,0.013639,0.774267,0.012833,0.933367,0.005121
4,xlmrb_bcms-12,0.914767,0.001415,0.7684,0.009752,0.7652,0.004903,0.925733,0.005129


In [53]:
# Save the results
final_agg_results.to_csv("aggregated-results-all-models.csv")

## Result Analysis: analysis of summary table of results

In [3]:
# Import the summary
import pandas as pd

sum_df = pd.read_csv("ner-results-summary-table.csv", index_col = 0)

sum_df.head()

Unnamed: 0,Run,datasets/hr500k.conllup_extracted.json,datasets/reldi-normtagner-hr.conllup_extracted.json,datasets/reldi-normtagner-sr.conllup_extracted.json,datasets/set.sr.plus.conllup_extracted.json
0,bertic-0,0.9,0.79,0.64,0.86
1,bertic-1,0.92,0.72,0.72,0.81
2,csebert-0,0.91,0.73,0.71,0.91
3,csebert-1,0.91,0.76,0.74,0.91
4,xlm-r-base-0,0.91,0.76,0.65,0.89
