For each dataset (2 HR, 2 SR, 4 SLO):

    - For each model (XLM-R-base, XLM-R-large, CSEBert, SloBERTa, BERTić, multiple versions of XLM-R-BERTić and XLM-R-SloBERTić):


        - fine-tune the model and evaluate it - 5 times

## Setup and Dataset Importing

In [1]:
# Define the gpu on the gpu machine
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=3

import evaluate
from datetime import datetime
import pandas as pd
import numpy as np
import json
from simpletransformers.ner import NERModel, NERArgs
from sklearn.metrics import classification_report, f1_score
from tqdm.autonotebook import tqdm as notebook_tqdm
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import logging
import sklearn
from numba import cuda
import argparse
import gc
import torch
import time

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=3


In [2]:
# Import the dataset

# Code for python script
"""
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset", help="path to the dataset in JSON format")
    args = parser.parse_args()

# Define the path to the dataset
dataset_path = args.dataset
"""
# Define the path to the dataset
dataset_path = "datasets/hr500k.conllup_extracted.json"

# Load the json file
with open(dataset_path, "r") as file:
    json_dict = json.load(file)

# Open the train, eval and test dictionaries as DataFrames
train_df = pd.DataFrame(json_dict["train"])
test_df = pd.DataFrame(json_dict["test"])
dev_df = pd.DataFrame(json_dict["dev"])

# Change the sentence_ids to numbers
test_df['sentence_id'] = pd.factorize(test_df['sentence_id'])[0]
train_df['sentence_id'] = pd.factorize(train_df['sentence_id'])[0]
dev_df['sentence_id'] = pd.factorize(dev_df['sentence_id'])[0]

# Define the labels
LABELS = json_dict["labels"]
print(LABELS)

print(train_df.shape, test_df.shape, dev_df.shape)
print(train_df.head())



['O', 'B-loc', 'B-org', 'B-per', 'I-per', 'B-deriv-per', 'I-org', 'I-loc', 'B-misc', 'I-misc', 'I-deriv-per']
(398681, 3) (51190, 3) (49764, 3)
     sentence_id      words labels
717            0      Kazna      O
718            0  medijskom      O
719            0     mogulu      O
720            0   obnovila      O
721            0   raspravu      O


## Analysing model files

In [None]:
ner_list = dir(NERArgs)

import torch

optimizer_state = torch.load("model/checkpoint-48000/training_args.bin")

attributes = list(dir(optimizer_state))


# Find the intersection of the sets
common_elements = list(set(attributes).intersection(set(ner_list)))

print(common_elements)

In [None]:
print(optimizer_state.resume_from_checkpoint)

In [None]:
optimizer_state.warmup_ratio = 0.06
optimizer_state.learning_rate = 1e-5
optimizer_state.fp16 = True
optimizer_state.logging_steps = 50
#optimizer_state.n_gpu = 1
optimizer_state.gradient_accumulation_steps = 1
optimizer_state.output_dir = "outputs/"
optimizer_state.num_train_epochs = 1
optimizer_state.resume_from_checkpoint = True
optimizer_state.ignore_data_skip = True


# Save arguments with new attributes
torch.save(optimizer_state, "model/checkpoint-48000/training_args.bin")

# Result Analysis - Result list and creation of results summary

In [3]:
# Create a new file for results
with open("ner-results-custom.txt", "w") as file:
    file.write("Date\tModel\tRun\tDataset\tMicro F1\tMacro F1\tLabel Report\n")

In [2]:
# Import the txt with results
import pandas as pd

#results = pd.read_csv("ner-results.txt", sep="\t")
#results = pd.read_csv("ner-results-our-models.txt", sep="\t")
results = pd.read_csv("ner-results-all-models.txt", sep="\t")

results

Unnamed: 0,Date,Model,Run,Dataset,Micro F1,Macro F1,Label Report
0,18/08/2023 16:39:46,xlm-r-large,xlm-r-large-0,datasets/hr500k.conllup_extracted.json,0.990291,0.918266,{'B-deriv-per': {'precision': 0.92105263157894...
1,18/08/2023 16:54:08,xlm-r-large,xlm-r-large-1,datasets/hr500k.conllup_extracted.json,0.990350,0.920143,{'B-deriv-per': {'precision': 0.92307692307692...
2,18/08/2023 17:40:13,xlm-r-base,xlm-r-base-0,datasets/hr500k.conllup_extracted.json,0.988611,0.909217,"{'B-deriv-per': {'precision': 1.0, 'recall': 0..."
3,18/08/2023 17:50:32,xlm-r-base,xlm-r-base-1,datasets/hr500k.conllup_extracted.json,0.988572,0.903684,{'B-deriv-per': {'precision': 0.91891891891891...
4,22/08/2023 09:38:41,xlmrb_bcms-12,xlmrb_bcms-12-0,datasets/hr500k.conllup_extracted.json,0.989627,0.914450,"{'B-deriv-per': {'precision': 1.0, 'recall': 0..."
...,...,...,...,...,...,...,...
459,05/09/2023 17:02:20,bertic,bertic-3,datasets/reldi-normtagner-sr.conllup_extracted...,0.987064,0.807953,{'B-deriv-per': {'precision': 0.85714285714285...
460,05/09/2023 17:04:44,bertic,bertic-0,datasets/set.sr.plus.conllup_extracted.json,0.991157,0.933824,"{'B-deriv-per': {'precision': 1.0, 'recall': 0..."
461,05/09/2023 17:07:09,bertic,bertic-1,datasets/set.sr.plus.conllup_extracted.json,0.990894,0.927756,"{'B-deriv-per': {'precision': 1.0, 'recall': 1..."
462,05/09/2023 17:09:34,bertic,bertic-2,datasets/set.sr.plus.conllup_extracted.json,0.991594,0.935986,"{'B-deriv-per': {'precision': 1.0, 'recall': 1..."


In [3]:
results["Macro F1"] = results["Macro F1"].round(2)

# Pivot the DataFrame to rearrange columns into rows
pivot_df = results.pivot(index='Run', columns='Dataset', values='Macro F1')

# Reset the index to have 'Model' as a column
pivot_df.reset_index(inplace=True)

pivot_df

Dataset,Run,datasets/hr500k.conllup_extracted.json,datasets/reldi-normtagner-hr.conllup_extracted.json,datasets/reldi-normtagner-sr.conllup_extracted.json,datasets/set.sr.plus.conllup_extracted.json
0,bertic-0,0.93,0.80,0.80,0.93
1,bertic-0-old,0.90,0.79,0.64,0.86
2,bertic-1,0.93,0.78,0.82,0.93
3,bertic-1-old,0.92,0.72,0.72,0.81
4,bertic-2,0.92,0.78,0.81,0.94
...,...,...,...,...,...
111,xlmrl_sl-bcms-48-3,0.92,0.79,0.79,0.94
112,xlmrl_sl-bcms-6-0,0.92,0.78,0.79,0.94
113,xlmrl_sl-bcms-6-1,0.93,0.78,0.80,0.94
114,xlmrl_sl-bcms-6-2,0.92,0.78,0.74,0.93


In [4]:
# Save the df
#pivot_df.to_csv("ner-results-summary-table.csv")
pivot_df.to_csv("ner-results-summary-table-all-models.csv")

In [None]:
# Let's analyze the df with all the predictions
import numpy as np

pred_df = pd.read_csv("datasets/hr500k.conllup_extracted.json-test_df-with-predictions.csv", index_col = 0)

# Analyze instances where models are wrong
pred_df["match"] = np.where(pred_df["labels"] != pred_df["y_pred_xlm-r-large_0"], "no", "yes")
pred_df.match.value_counts()

## Create aggregated results

In [5]:
# Join information for model and dataset
results["model-dataset"] = [x[0]+"-"+x[1] for x in list(zip(results["Model"].to_list(), results["Dataset"].to_list()))]
results.head()

Unnamed: 0,Date,Model,Run,Dataset,Micro F1,Macro F1,Label Report,model-dataset
0,18/08/2023 16:39:46,xlm-r-large,xlm-r-large-0,datasets/hr500k.conllup_extracted.json,0.990291,0.92,{'B-deriv-per': {'precision': 0.92105263157894...,xlm-r-large-datasets/hr500k.conllup_extracted....
1,18/08/2023 16:54:08,xlm-r-large,xlm-r-large-1,datasets/hr500k.conllup_extracted.json,0.99035,0.92,{'B-deriv-per': {'precision': 0.92307692307692...,xlm-r-large-datasets/hr500k.conllup_extracted....
2,18/08/2023 17:40:13,xlm-r-base,xlm-r-base-0,datasets/hr500k.conllup_extracted.json,0.988611,0.91,"{'B-deriv-per': {'precision': 1.0, 'recall': 0...",xlm-r-base-datasets/hr500k.conllup_extracted.json
3,18/08/2023 17:50:32,xlm-r-base,xlm-r-base-1,datasets/hr500k.conllup_extracted.json,0.988572,0.9,{'B-deriv-per': {'precision': 0.91891891891891...,xlm-r-base-datasets/hr500k.conllup_extracted.json
4,22/08/2023 09:38:41,xlmrb_bcms-12,xlmrb_bcms-12-0,datasets/hr500k.conllup_extracted.json,0.989627,0.91,"{'B-deriv-per': {'precision': 1.0, 'recall': 0...",xlmrb_bcms-12-datasets/hr500k.conllup_extracte...


In [6]:
agg_results = pd.concat([results.groupby("model-dataset")["Macro F1"].mean(), results.groupby("model-dataset")["Macro F1"].std()], axis = 1)

# Rename columns
agg_results.columns = ["Macro F1", "Std"]
agg_results

Unnamed: 0_level_0,Macro F1,Std
model-dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
bertic-datasets/hr500k.conllup_extracted.json,0.9250,0.005774
bertic-datasets/reldi-normtagner-hr.conllup_extracted.json,0.7850,0.010000
bertic-datasets/reldi-normtagner-sr.conllup_extracted.json,0.8100,0.008165
bertic-datasets/set.sr.plus.conllup_extracted.json,0.9325,0.005000
bertic-old-datasets/hr500k.conllup_extracted.json,0.9125,0.009574
...,...,...
xlmrl_sl-bcms-48-datasets/set.sr.plus.conllup_extracted.json,0.9400,0.000000
xlmrl_sl-bcms-6-datasets/hr500k.conllup_extracted.json,0.9250,0.005774
xlmrl_sl-bcms-6-datasets/reldi-normtagner-hr.conllup_extracted.json,0.7850,0.010000
xlmrl_sl-bcms-6-datasets/reldi-normtagner-sr.conllup_extracted.json,0.7825,0.028723


In [7]:
# Reset index
agg_results.reset_index(inplace=True)

agg_results.head()


Unnamed: 0,model-dataset,Macro F1,Std
0,bertic-datasets/hr500k.conllup_extracted.json,0.925,0.005774
1,bertic-datasets/reldi-normtagner-hr.conllup_ex...,0.785,0.01
2,bertic-datasets/reldi-normtagner-sr.conllup_ex...,0.81,0.008165
3,bertic-datasets/set.sr.plus.conllup_extracted....,0.9325,0.005
4,bertic-old-datasets/hr500k.conllup_extracted.json,0.9125,0.009574


In [8]:
# Split the 'model_dataset' column into 'model' and 'dataset' columns
agg_results[['Model', 'Dataset']] = agg_results['model-dataset'].str.split('-datasets/', n=1, expand=True)

# Pivot the DataFrame to the desired structure
pivot_agg_results = agg_results.pivot(index='Model', columns='Dataset', values=['Macro F1', 'Std'])

# Flatten the column MultiIndex
pivot_agg_results.columns = [f'{col[1]}-{col[0]}' for col in pivot_agg_results.columns]

# Reset index and display the final DataFrame
final_agg_results = pivot_agg_results.reset_index()

final_agg_results.head()


Unnamed: 0,Model,hr500k.conllup_extracted.json-Macro F1,reldi-normtagner-hr.conllup_extracted.json-Macro F1,reldi-normtagner-sr.conllup_extracted.json-Macro F1,set.sr.plus.conllup_extracted.json-Macro F1,hr500k.conllup_extracted.json-Std,reldi-normtagner-hr.conllup_extracted.json-Std,reldi-normtagner-sr.conllup_extracted.json-Std,set.sr.plus.conllup_extracted.json-Std
0,bertic,0.925,0.785,0.81,0.9325,0.005774,0.01,0.008165,0.005
1,bertic-old,0.9125,0.7425,0.66,0.8175,0.009574,0.03304,0.069761,0.028723
2,csebert,0.9125,0.7575,0.7275,0.9075,0.005,0.020616,0.015,0.005
3,xlm-r-base,0.905,0.7325,0.6225,0.8925,0.005774,0.01893,0.048563,0.005
4,xlm-r-large,0.92,0.7775,0.77,0.9325,0.0,0.022174,0.008165,0.005


In [9]:
# Change the order of the columns
final_agg_results = final_agg_results[['Model', 'hr500k.conllup_extracted.json-Macro F1', 'hr500k.conllup_extracted.json-Std', 'reldi-normtagner-hr.conllup_extracted.json-Macro F1', 'reldi-normtagner-hr.conllup_extracted.json-Std', 'reldi-normtagner-sr.conllup_extracted.json-Macro F1', 'reldi-normtagner-sr.conllup_extracted.json-Std', 'set.sr.plus.conllup_extracted.json-Macro F1', 'set.sr.plus.conllup_extracted.json-Std']]

final_agg_results.head()

Unnamed: 0,Model,hr500k.conllup_extracted.json-Macro F1,hr500k.conllup_extracted.json-Std,reldi-normtagner-hr.conllup_extracted.json-Macro F1,reldi-normtagner-hr.conllup_extracted.json-Std,reldi-normtagner-sr.conllup_extracted.json-Macro F1,reldi-normtagner-sr.conllup_extracted.json-Std,set.sr.plus.conllup_extracted.json-Macro F1,set.sr.plus.conllup_extracted.json-Std
0,bertic,0.925,0.005774,0.785,0.01,0.81,0.008165,0.9325,0.005
1,bertic-old,0.9125,0.009574,0.7425,0.03304,0.66,0.069761,0.8175,0.028723
2,csebert,0.9125,0.005,0.7575,0.020616,0.7275,0.015,0.9075,0.005
3,xlm-r-base,0.905,0.005774,0.7325,0.01893,0.6225,0.048563,0.8925,0.005
4,xlm-r-large,0.92,0.0,0.7775,0.022174,0.77,0.008165,0.9325,0.005


In [10]:
# Save the results
final_agg_results.to_csv("aggregated-results-all-models.csv")

## Result Analysis: analysis of summary table of results

In [3]:
# Import the summary
import pandas as pd

sum_df = pd.read_csv("ner-results-summary-table.csv", index_col = 0)

sum_df.head()

Unnamed: 0,Run,datasets/hr500k.conllup_extracted.json,datasets/reldi-normtagner-hr.conllup_extracted.json,datasets/reldi-normtagner-sr.conllup_extracted.json,datasets/set.sr.plus.conllup_extracted.json
0,bertic-0,0.9,0.79,0.64,0.86
1,bertic-1,0.92,0.72,0.72,0.81
2,csebert-0,0.91,0.73,0.71,0.91
3,csebert-1,0.91,0.76,0.74,0.91
4,xlm-r-base-0,0.91,0.76,0.65,0.89
