# Preparing the dataset

Import all libraries necessary for this step

In [10]:
import pandas as pd

We'll load the CSV files that were prepared for the ML experiments.


In [11]:
# Both splits come from the same Kaggle input dataset, so the directory is
# named once (the test path previously contained a stray double slash).
DATA_DIR = "/kaggle/input/gincodataframededuptraindevtest"

train_df = pd.read_csv(f"{DATA_DIR}/GINCO_dataframe_dedup_train_dev.csv")
test_df = pd.read_csv(f"{DATA_DIR}/GINCO_dataframe_dedup_test.csv")

# Quick sanity check of the (rows, columns) of both splits.
print("Train shape: {}, Test shape: {}.".format(train_df.shape, test_df.shape))

In [12]:
# Peek at the last five rows of the training split.
train_df.tail(n=5)

In [13]:
# Peek at the last five rows of the test split.
test_df.tail(n=5)

We will need to specify the exact number of labels, so we calculate it from our dataframe.

In [14]:
# Distinct values of the `labels` column of the training split, as a plain list.
LABELS = pd.unique(train_df["labels"]).tolist()
# Number of distinct labels; passed to the classifier later on.
NUM_LABELS = len(LABELS)
NUM_LABELS

Because we are using deduplicated text, some instances may have no text at all (`NaN` instead of a text string). We need to drop them.

In [15]:
# Remove every training row that contains a missing value in any column,
# then show the remaining (rows, columns).
train_df = train_df.dropna(axis="index", how="any")
train_df.shape

In [16]:
# Remove every test row that contains a missing value in any column,
# then show the remaining (rows, columns).
test_df = test_df.dropna(axis="index", how="any")
test_df.shape

# Training the baseline - SloBERTa

Import all libraries and install everything necessary for this step.

In [17]:
# install pytorch
!conda install --yes pytorch>=1.6 cudatoolkit=11.0 -c pytorch

In [18]:
# install simpletransformers
!pip install -q transformers
!pip install --upgrade transformers
!pip install -q simpletransformers

# check installed version
!pip freeze | grep simpletransformers

In [19]:
!pip uninstall -q torch -y
!pip install -q torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

In [23]:
from simpletransformers.classification import ClassificationModel

# Fixed seed so that repeated training runs are comparable.
SEED = 42

# Hyperparameters handed to simpletransformers.
model_args = {
    "overwrite_output_dir": True,
    "num_train_epochs": 90,
    "labels_list": LABELS,
    "learning_rate": 1e-5,
    "train_batch_size": 32,
    "no_cache": True,
    "no_save": True,
    "max_seq_length": 300,
    "save_steps": -1,
    # simpletransformers seeds its random number generators from this value.
    "manual_seed": SEED,
}

# SloBERTa checkpoint loaded through the "camembert" model type.
model = ClassificationModel(
    "camembert",
    "EMBEDDIA/sloberta",
    use_cuda=True,
    num_labels=NUM_LABELS,
    args=model_args,
)

# Fine-tune on the training dataframe.
model.train_model(train_df)

## SloBERTa Prediction

Import all libraries necessary for this step:

In [30]:
from sklearn.metrics import f1_score, ConfusionMatrixDisplay, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import numpy as np

Let's check whether the model works:

In [25]:
# Smoke test: run the trained model on a single hand-written sentence.
sample_texts = ['Danes poročamo o dogodku, ki se je zgodil 1. 1. 2020. Oseba je dejala:"To je res nenormalen dogodek"']
Instance_predictions, raw_outputs = model.predict(sample_texts)

In [26]:
# Display the first value returned by model.predict() for the sample sentence.
Instance_predictions

Let's evaluate the model:

In [27]:
# Gold labels: the `labels` column of the test split.
y_true = test_df["labels"]

# Run the model on all test texts; predict() returns a tuple whose first
# element holds the predictions.
test_texts = test_df["text"].tolist()
prediction_result = model.predict(test_texts)
y_pred = prediction_result[0]

Calculate the macro and micro F1 scores:

In [28]:
# Score the test-set predictions with both averaging schemes.
macro, micro = (
    f1_score(y_true, y_pred, labels=LABELS, average=scheme)
    for scheme in ("macro", "micro")
)
print(f"Macro f1: {macro:0.3}\nMicro f1: {micro:0.3}")

Produce the confusion matrix:

In [37]:
def plot_cm(y_true, y_pred, labels, title=None, save_path=None):
    """Plot a confusion matrix annotated with counts and report micro/macro F1.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        labels: Label values; fixes the row/column order of the matrix and
            the set of labels used for the F1 scores.
        title: Optional heading. The two F1 scores are appended on a second
            line; without a title the scores alone are used as the heading.
        save_path: Optional file path. When given, the figure is written
            there *before* it is shown, because saving after ``plt.show()``
            can produce an empty image.

    Returns:
        Tuple ``(micro_f1, macro_f1)``.
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    fig, ax = plt.subplots(figsize=(9, 9))
    ax.imshow(cm, cmap="Oranges")
    # Write each cell's count at its centre; ndenumerate yields (row, col),
    # while text() expects (x, y), hence the swapped order.
    for (i, j), count in np.ndenumerate(cm):
        ax.text(j, i, '{:d}'.format(count), ha='center', va='center')

    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    tick_marks = np.arange(len(labels))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(labels, rotation=90)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(labels)

    microF1 = f1_score(y_true, y_pred, labels=labels, average="micro")
    macroF1 = f1_score(y_true, y_pred, labels=labels, average="macro")

    print(f"Micro F1: {microF1:0.3}",f"Macro F1: {macroF1:0.3}")

    metrics = f"{microF1:0.4}, {macroF1:0.4}"
    if title:
        ax.set_title(title + ";\n" + metrics)
    else:
        ax.set_title(metrics)
    fig.tight_layout()

    # Save first: once plt.show() has run, the figure may no longer be available.
    if save_path is not None:
        fig.savefig(save_path)
    plt.show()
    return microF1, macroF1

In [39]:
# Draw the confusion matrix for the test-set predictions; the returned
# (micro F1, macro F1) tuple is displayed as the cell output.
plot_cm(y_true, y_pred, LABELS, title="SloBERTa-Initial_Setup_test_dev")
# NOTE(review): plot_cm() already calls plt.show(), so a savefig issued after
# it may write a blank image; the line below is kept disabled for that reason.
#plt.savefig("SloBERTa-Initial-Setup_test_dev.png")