In this notebook, we prepare the training code for all the transformer models, so that we can check whether the hyperparameters (especially the `max_seq_length` parameter, which causes errors) work for each of them.

## Transformers

### SloBERTa
Slovene model
https://huggingface.co/EMBEDDIA/sloberta

In [None]:
def sloberta_model(train_df, labels=None, max_seq_length_no=300):
    """Train a SloBERTa classifier with simpletransformers and return it.

    Args:
        train_df (pandas.DataFrame): A DataFrame with columns ["text", "labels"].
        labels (list(str), optional): Label names passed to the model as
            ``labels_list``. If None, the notebook-level ``LABELS`` list is used.
            Defaults to None.
        max_seq_length_no (int, optional): Value passed as the ``max_seq_length``
            model argument. Defaults to 300.

    Returns:
        simpletransformers.ClassificationModel: a trained model
    """
    from simpletransformers.classification import ClassificationModel

    # Honour the caller's labels; fall back to the notebook-wide LABELS so that
    # existing calls without ``labels`` keep working.
    label_list = LABELS if labels is None else list(labels)

    # define hyperparameters
    model_args = {
        "overwrite_output_dir": True,
        "num_train_epochs": 90,
        "labels_list": label_list,
        "learning_rate": 1e-5,
        "train_batch_size": 32,
        "no_cache": True,
        "no_save": True,
        "max_seq_length": max_seq_length_no,
        "save_steps": -1,
    }

    model = ClassificationModel(
        "camembert",
        "EMBEDDIA/sloberta",
        # Derived from the label list so the two values cannot disagree.
        num_labels=len(label_list),
        use_cuda=True,
        args=model_args,
    )
    model.train_model(train_df)
    return model


### CroSloEngual BERT
Slovene-Croatian-English model
https://huggingface.co/EMBEDDIA/crosloengual-bert

In [None]:
def crosloengualbert_model(train_df, labels=None, max_seq_length_no=300):
    """Train a CroSloEngual BERT classifier with simpletransformers and return it.

    Args:
        train_df (pandas.DataFrame): A DataFrame with columns ["text", "labels"].
        labels (list(str), optional): Label names passed to the model as
            ``labels_list``. If None, the notebook-level ``LABELS`` list is used.
            Defaults to None.
        max_seq_length_no (int, optional): Value passed as the ``max_seq_length``
            model argument. Defaults to 300.

    Returns:
        simpletransformers.ClassificationModel: a trained model
    """
    from simpletransformers.classification import ClassificationModel

    # Honour the caller's labels; fall back to the notebook-wide LABELS so that
    # existing calls without ``labels`` keep working.
    label_list = LABELS if labels is None else list(labels)

    # define hyperparameters
    model_args = {
        "overwrite_output_dir": True,
        "num_train_epochs": 90,
        "labels_list": label_list,
        "learning_rate": 1e-5,
        "train_batch_size": 32,
        "no_cache": True,
        "no_save": True,
        "max_seq_length": max_seq_length_no,
        "save_steps": -1,
    }

    model = ClassificationModel(
        "bert",
        "EMBEDDIA/crosloengual-bert",
        # Derived from the label list so the two values cannot disagree.
        num_labels=len(label_list),
        use_cuda=True,
        args=model_args,
    )
    model.train_model(train_df)
    return model

### Base-sized XLM-RoBERTa

Multilingual model
https://huggingface.co/xlm-roberta-base

In [None]:
def roberta_base_model(train_df, labels=None, max_seq_length_no=300):
    """Train a base-sized XLM-RoBERTa classifier with simpletransformers and return it.

    Args:
        train_df (pandas.DataFrame): A DataFrame with columns ["text", "labels"].
        labels (list(str), optional): Label names passed to the model as
            ``labels_list``. If None, the notebook-level ``LABELS`` list is used.
            Defaults to None.
        max_seq_length_no (int, optional): Value passed as the ``max_seq_length``
            model argument. Defaults to 300.

    Returns:
        simpletransformers.ClassificationModel: a trained model
    """
    from simpletransformers.classification import ClassificationModel

    # Honour the caller's labels; fall back to the notebook-wide LABELS so that
    # existing calls without ``labels`` keep working.
    label_list = LABELS if labels is None else list(labels)

    # define hyperparameters
    model_args = {
        "overwrite_output_dir": True,
        "num_train_epochs": 90,
        "labels_list": label_list,
        "learning_rate": 1e-5,
        "train_batch_size": 32,
        "no_cache": True,
        "no_save": True,
        "max_seq_length": max_seq_length_no,
        "save_steps": -1,
    }

    model = ClassificationModel(
        # NOTE(review): simpletransformers lists XLM-RoBERTa under the model
        # type "xlmroberta" (no hyphen) - confirm against the installed version.
        "xlmroberta",
        "xlm-roberta-base",
        # Derived from the label list so the two values cannot disagree.
        num_labels=len(label_list),
        use_cuda=True,
        args=model_args,
    )
    model.train_model(train_df)
    return model

### Large-sized XLM-RoBERTa
Multilingual model https://huggingface.co/xlm-roberta-large

In [None]:
def roberta_large_model(train_df, labels=None, max_seq_length_no=300):
    """Train a large-sized XLM-RoBERTa classifier with simpletransformers and return it.

    Args:
        train_df (pandas.DataFrame): A DataFrame with columns ["text", "labels"].
        labels (list(str), optional): Label names passed to the model as
            ``labels_list``. If None, the notebook-level ``LABELS`` list is used.
            Defaults to None.
        max_seq_length_no (int, optional): Value passed as the ``max_seq_length``
            model argument. Defaults to 300.

    Returns:
        simpletransformers.ClassificationModel: a trained model
    """
    from simpletransformers.classification import ClassificationModel

    # Honour the caller's labels; fall back to the notebook-wide LABELS so that
    # existing calls without ``labels`` keep working.
    label_list = LABELS if labels is None else list(labels)

    # define hyperparameters
    model_args = {
        "overwrite_output_dir": True,
        "num_train_epochs": 90,
        "labels_list": label_list,
        "learning_rate": 1e-5,
        "train_batch_size": 32,
        "no_cache": True,
        "no_save": True,
        "max_seq_length": max_seq_length_no,
        "save_steps": -1,
    }

    model = ClassificationModel(
        # NOTE(review): simpletransformers lists XLM-RoBERTa under the model
        # type "xlmroberta" (no hyphen) - confirm against the installed version.
        "xlmroberta",
        "xlm-roberta-large",
        # Derived from the label list so the two values cannot disagree.
        num_labels=len(label_list),
        use_cuda=True,
        args=model_args,
    )
    model.train_model(train_df)
    return model

### mDeBERTaV3
Multilingual model https://huggingface.co/microsoft/mdeberta-v3-base

In [None]:
def debertav3_model(train_df, labels=None, max_seq_length_no=300):
    """Train an mDeBERTaV3 classifier with simpletransformers and return it.

    Args:
        train_df (pandas.DataFrame): A DataFrame with columns ["text", "labels"].
        labels (list(str), optional): Label names passed to the model as
            ``labels_list``. If None, the notebook-level ``LABELS`` list is used.
            Defaults to None.
        max_seq_length_no (int, optional): Value passed as the ``max_seq_length``
            model argument. Defaults to 300.

    Returns:
        simpletransformers.ClassificationModel: a trained model
    """
    from simpletransformers.classification import ClassificationModel

    # Honour the caller's labels; fall back to the notebook-wide LABELS so that
    # existing calls without ``labels`` keep working.
    label_list = LABELS if labels is None else list(labels)

    # define hyperparameters
    model_args = {
        "overwrite_output_dir": True,
        "num_train_epochs": 90,
        "labels_list": label_list,
        "learning_rate": 1e-5,
        "train_batch_size": 32,
        "no_cache": True,
        "no_save": True,
        "max_seq_length": max_seq_length_no,
        "save_steps": -1,
    }

    model = ClassificationModel(
        # NOTE(review): simpletransformers lists the DeBERTa-v2/v3 architecture
        # under the model type "debertav2" (no hyphen) - confirm against the
        # installed version.
        "debertav2",
        "microsoft/mdeberta-v3-base",
        # Derived from the label list so the two values cannot disagree.
        num_labels=len(label_list),
        use_cuda=True,
        args=model_args,
    )
    model.train_model(train_df)
    return model

### BERTić
Model for related South Slavic languages https://huggingface.co/classla/bcms-bertic

In [None]:
def bertic_model(train_df, labels=None, max_seq_length_no=300):
    """Train a BERTić (ELECTRA-type) classifier with simpletransformers and return it.

    Args:
        train_df (pandas.DataFrame): A DataFrame with columns ["text", "labels"].
        labels (list(str), optional): Label names passed to the model as
            ``labels_list``. If None, the notebook-level ``LABELS`` list is used.
            Defaults to None.
        max_seq_length_no (int, optional): Value passed as the ``max_seq_length``
            model argument. Defaults to 300.

    Returns:
        simpletransformers.ClassificationModel: a trained model
    """
    from simpletransformers.classification import ClassificationModel

    # Honour the caller's labels; fall back to the notebook-wide LABELS so that
    # existing calls without ``labels`` keep working.
    label_list = LABELS if labels is None else list(labels)

    # define hyperparameters
    model_args = {
        "overwrite_output_dir": True,
        "num_train_epochs": 90,
        "labels_list": label_list,
        "learning_rate": 1e-5,
        "train_batch_size": 32,
        "no_cache": True,
        "no_save": True,
        "max_seq_length": max_seq_length_no,
        "save_steps": -1,
    }

    model = ClassificationModel(
        "electra",
        "classla/bcms-bertic",
        # Derived from the label list so the two values cannot disagree.
        num_labels=len(label_list),
        use_cuda=True,
        args=model_args,
    )
    model.train_model(train_df)
    return model

### BERT base model (cased)
Monolingual English model https://huggingface.co/bert-base-cased

In [None]:
def bertbase_model(train_df, labels=None, max_seq_length_no=300):
    """Train a cased BERT-base classifier with simpletransformers and return it.

    Args:
        train_df (pandas.DataFrame): A DataFrame with columns ["text", "labels"].
        labels (list(str), optional): Label names passed to the model as
            ``labels_list``. If None, the notebook-level ``LABELS`` list is used.
            Defaults to None.
        max_seq_length_no (int, optional): Value passed as the ``max_seq_length``
            model argument. Defaults to 300.

    Returns:
        simpletransformers.ClassificationModel: a trained model
    """
    from simpletransformers.classification import ClassificationModel

    # Honour the caller's labels; fall back to the notebook-wide LABELS so that
    # existing calls without ``labels`` keep working.
    label_list = LABELS if labels is None else list(labels)

    # define hyperparameters
    model_args = {
        "overwrite_output_dir": True,
        "num_train_epochs": 90,
        "labels_list": label_list,
        "learning_rate": 1e-5,
        "train_batch_size": 32,
        "no_cache": True,
        "no_save": True,
        "max_seq_length": max_seq_length_no,
        "save_steps": -1,
    }

    model = ClassificationModel(
        "bert",
        "bert-base-cased",
        # Derived from the label list so the two values cannot disagree.
        num_labels=len(label_list),
        use_cuda=True,
        args=model_args,
    )
    model.train_model(train_df)
    return model

## Training and evaluation

Import the data:

In [None]:
import pandas as pd

# Load the train(+dev) and test splits prepared for the experiments.
# Both files live in the same Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/gincodataframededuptraindevtest/GINCO_dataframe_dedup_train_dev.csv")
test_df = pd.read_csv("/kaggle/input/gincodataframededuptraindevtest/GINCO_dataframe_dedup_test.csv")

# Report the (rows, columns) shape of each split as a quick sanity check.
print("Train shape: {}, Test shape: {}.".format(train_df.shape, test_df.shape))

In [None]:
# Collect the distinct values of the "labels" column, in order of first
# appearance, as a plain Python list
LABELS = train_df["labels"].unique().tolist()

In [None]:
# Remove rows containing missing values (the instances with no text),
# then report the resulting shapes
train_df, test_df = train_df.dropna(), test_df.dropna()
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}.")

Import all necessary libraries and install everything you need for training:

In [None]:
# install pytorch
!conda install --yes pytorch>=1.6 cudatoolkit=11.0 -c pytorch

# install simpletransformers
!pip install -q transformers
!pip install --upgrade transformers
!pip install -q simpletransformers

# check installed version
!pip freeze | grep simpletransformers

# install stable torch
!pip uninstall -q torch -y
!pip install -q torch==1.6.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html

# install the libraries necessary for prediction and result analysis
from sklearn.metrics import f1_score, ConfusionMatrixDisplay, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import numpy as np

Train:

In [None]:
# Train one model at a time: keep exactly one of the calls below uncommented.
# The result is bound to `model`, which the evaluation cells below rely on.

# SloBERTa
model = sloberta_model(train_df)

# CroSloEngual BERT
#model = crosloengualbert_model(train_df)

# Base-sized XLM-RoBERTa
#model = roberta_base_model(train_df)

# Large-sized XLM-RoBERTa
#model = roberta_large_model(train_df)

# DeBERTaV3
#model = debertav3_model(train_df)

# BERTić
#model = bertic_model(train_df)

# English base-sized BERT
#model = bertbase_model(train_df)

Evaluate:

Let's check whether the model works:

In [None]:
# Smoke test: run one Slovene example sentence through the trained model.
# predict() is unpacked into the predictions and the raw model outputs.
Instance_predictions, raw_outputs = model.predict(
    [
        'Danes poročamo o dogodku, ki se je zgodil 1. 1. 2020. Oseba je dejala:"To je res nenormalen dogodek"',
    ]
)

In [None]:
# Display the prediction(s) returned for the sample sentence
Instance_predictions

Let's evaluate the model:

In [None]:
# Gold labels, taken from the "labels" column of the test set
y_true = test_df["labels"]

# Predicted labels: predict() returns a pair, of which only the first
# element (the predictions) is kept
y_pred = model.predict(
    test_df["text"].tolist()
)[0]

Calculate the macro and micro F1 scores:

In [None]:
# Macro- and micro-averaged F1 over the full label set
macro, micro = (
    f1_score(y_true, y_pred, labels=LABELS, average=averaging)
    for averaging in ("macro", "micro")
)
print(f"Macro f1: {macro:0.3}", f"Micro f1: {micro:0.3}", sep="\n")

Produce the confusion matrix:

In [None]:
def plot_cm(y_true, y_pred, labels, title=None):
    """Plot an annotated confusion matrix and report micro/macro F1.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        labels: Labels used to index the confusion matrix and as tick labels;
            also passed to ``f1_score``.
        title (str, optional): If given, it is placed above the F1 scores in
            the plot title. Defaults to None.

    Returns:
        tuple: ``(microF1, macroF1)``.
    """
    matrix = confusion_matrix(y_true, y_pred, labels=labels)
    positions = np.arange(len(labels))

    plt.figure(figsize=(9, 9))
    plt.imshow(matrix, cmap="Oranges")
    # Write every count into its cell (x is the column, y is the row).
    for (row, col), count in np.ndenumerate(matrix):
        plt.text(col, row, f"{count:d}", ha="center", va="center")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.xticks(positions, labels, rotation=90)
    plt.yticks(positions, labels)

    microF1, macroF1 = (
        f1_score(y_true, y_pred, labels=labels, average=averaging)
        for averaging in ("micro", "macro")
    )
    print(f"Micro F1: {microF1:0.3}", f"Macro F1: {macroF1:0.3}")

    # The scores always appear in the title, below the caller's title if any.
    metrics = f"{microF1:0.4}, {macroF1:0.4}"
    heading = title + ";\n" + metrics if title else metrics
    plt.title(heading)

    plt.tight_layout()
    plt.show()
    return microF1, macroF1

In [None]:
# Plot the confusion matrix for the test-set predictions; the returned
# (micro F1, macro F1) pair is the value of this cell.
plot_cm(
    y_true,
    y_pred,
    LABELS,
    title="SloBERTa-Initial_Setup_test_dev",
)
# NOTE(review): plot_cm() already calls plt.show(), so saving here may write
# an empty figure - confirm before enabling.
#plt.savefig("SloBERTa-Initial-Setup_test_dev.png")