# Training a Transformer model

To compare the fastText performance with the performance of Transformer models, I also trained the base-sized XLM-RoBERTa model on the baseline text.

## Training and testing on Transformer models

Importing the necessary libraries

In [1]:
# Import the libraries necessary for data wrangling, prediction and result analysis
import json
import numpy as np
import pandas as pd
import logging
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score,precision_score, recall_score
import torch
from numba import cuda
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier

In [2]:
# Install transformers
# (this needs to be done on Kaggle each time you start the session)
!pip install -q transformers

In [3]:
# Install the simpletransformers
!pip install -q simpletransformers
from simpletransformers.classification import ClassificationModel

In [4]:
# Install wandb
!pip install -q wandb
import wandb
# Login to wandb
wandb.login()

In [5]:
# Load the GINCO dataset (JSON file from the Kaggle input directory) into a DataFrame
GINCO_JSON_PATH = "/kaggle/input/ginco-with-additional-text-representations/Language-Processed-GINCO.json"
dataframe = pd.read_json(GINCO_JSON_PATH)


In [6]:
# Preview the first two rows of the loaded data
dataframe.head(2)

In [7]:
# Show the (rows, columns) dimensions of the loaded data
dataframe.shape

In [8]:
# Add the downcasted labels: both news subcategories of primary_level_3
# ('Opinionated News' and 'News/Reporting') are merged into a single 'News' label,
# every other value is copied over unchanged.
is_news_variant = dataframe["primary_level_3"].isin(["Opinionated News", "News/Reporting"])
dataframe["downcasted_to_5"] = np.where(is_news_variant, "News", dataframe["primary_level_3"])

In [9]:
# List the distinct label values present after downcasting
dataframe["downcasted_to_5"].unique()

In [10]:
# Preview the first two rows, which now include the downcasted_to_5 column
dataframe.head(2)

In [11]:
# Keep only the texts whose downcasted label belongs to the reduced set of 5 labels
downcasted_labels = ['Information/Explanation', 'Promotion', 'News', 'Forum', 'Opinion/Argumentation']

in_reduced_set = dataframe["downcasted_to_5"].isin(downcasted_labels)
dataframe = dataframe.loc[in_reduced_set]

# Summarise all columns of the filtered data
dataframe.describe(include="all")

In [12]:
# Count how many texts have each value of the `hard` column
dataframe.hard.value_counts()

In [13]:
# Keep only the texts that are not flagged as hard
not_hard = dataframe["hard"].ne(True)
dataframe = dataframe.loc[not_hard]

# Summarise all columns of the remaining data
dataframe.describe(include="all")

In [20]:
# Partition the data into train, dev and test parts according to the "split" column
first_train_df, first_dev_df, first_test_df = (
    dataframe[dataframe["split"] == split_name]
    for split_name in ("train", "dev", "test")
)

print(f"The train-dev-test splits consist of the following numbers of examples: {first_train_df.shape}, {first_dev_df.shape}, {first_test_df.shape}")

In [15]:
# Build the training frame with just two columns:
# "text" (the baseline text) and "labels" (the downcasted label)
train_df = first_train_df[["baseline_text", "downcasted_to_5"]].rename(
    columns={"baseline_text": "text", "downcasted_to_5": "labels"}
)

train_df.head(2)

In [16]:
# Build the test frame with just two columns:
# "text" (the baseline text) and "labels" (the downcasted label)
test_df = first_test_df[["baseline_text", "downcasted_to_5"]].rename(
    columns={"baseline_text": "text", "downcasted_to_5": "labels"}
)

test_df.head(2)

In [17]:
# Build the dev frame with just two columns:
# "text" (the baseline text) and "labels" (the downcasted label)
dev_df = first_dev_df[["baseline_text", "downcasted_to_5"]].rename(
    columns={"baseline_text": "text", "downcasted_to_5": "labels"}
)

dev_df.head(2)

In [18]:
# Collect the distinct labels of the training data, in order of first appearance
LABELS = train_df["labels"].unique().tolist()
LABELS

In [19]:
# We will use the multilingual XLM-RoBERTa model
# https://huggingface.co/xlm-roberta-base
import os
# Set TOKENIZERS_PARALLELISM to "false" for this process
# (presumably to silence the tokenizers parallelism warning - TODO confirm)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Start a Weights & Biases run for the training experiments
wandb.init(
    project="GINCO-hyperparameter-search",
    name="training_on_5_labels",
)

In [None]:
# Calculate how many steps each epoch will have:
# num steps in epoch = training samples / batch size.
# The number of training samples is taken from train_df instead of being
# hard-coded (it was written as 410 before); 8 is the train_batch_size used
# in the model arguments.
steps_per_epoch = len(train_df) // 8
steps_per_epoch

I evaluated the model after every 10th epoch, i.e. every 510 steps (10 × 51 steps per epoch). I first trained the model while evaluating it to find the optimal number of epochs.

In [None]:
# Create a TransformerModel and evaluate it on dev_df during training

# Number of epochs for this exploratory run
epoch = 30

eval_run_args = {
    "overwrite_output_dir": True,
    "num_train_epochs": epoch,
    "train_batch_size": 8,
    "learning_rate": 1e-5,
    # Settings for evaluation during training (every 10 epochs' worth of steps)
    "evaluate_during_training": True,
    "evaluate_during_training_steps": steps_per_epoch*10,
    "evaluate_during_training_verbose": True,
    "use_cached_eval_features": True,
    'reprocess_input_data': True,
    "labels_list": LABELS,
    # no_cache / no_save are set because this model is not meant to be saved;
    # remove them if you want to save the model
    "no_cache": True,
    "no_save": True,
    "max_seq_length": 512,
    "save_steps": -1,
    # Only the trained model will be saved - to prevent filling all of the space
    "save_model_every_epoch": False,
    "wandb_project": 'GINCO-hyperparameter-search',
    "silent": True,
}

roberta_base_model = ClassificationModel(
    "xlmroberta",
    "xlm-roberta-base",
    num_labels=len(LABELS),
    use_cuda=True,
    args=eval_run_args,
)

# Train the model and evaluate during training
roberta_base_model.train_model(train_df, eval_df=dev_df)

Evaluation during training showed that the eval_loss starts rising somewhere between epochs 5 and 13. I then trained the model for 5, 10, 13 and 15 epochs to find the optimum number.

In [None]:
def testing(test_df, test_name, epoch):
    """
    Apply the trained model to a dataset, report the F1 scores and store the results.

    The function reads three module-level names that must exist before it is
    called: `roberta_base_model` (the model used for prediction), `LABELS`
    (the label list used for scoring and plotting) and `previous_results`
    (the list the result dictionary is appended to).

    Args:
    - test_df (pandas DataFrame): frame with a `text` and a `labels` column
    - test_name: name of the experiment; used as the plot title and in the
      names of the output files
    - epoch: num_train_epochs; only recorded in the saved results

    Side effects:
    - prints the macro and micro F1 scores
    - shows the confusion matrix and saves it to Confusion-matrix-{test_name}.png
    - appends the result dictionary to `previous_results`
    - writes the result dictionary to backup-results-{test_name}.json
    """
    # Get the true labels
    y_true = test_df.labels

    model = roberta_base_model

    # Predict all texts with a single call (the model batches them internally)
    # instead of calling predict() once per row. The predictions are wrapped in
    # a Series that shares the index of test_df, so y_true and y_pred stay aligned.
    predictions = model.predict(test_df.text.tolist())[0]
    y_pred = pd.Series(predictions, index=test_df.index)

    # Calculate the scores
    macro = f1_score(y_true, y_pred, labels=LABELS, average="macro")
    micro = f1_score(y_true, y_pred, labels=LABELS, average="micro")
    print(f"Macro f1: {macro:0.3}, Micro f1: {micro:0.3}")

    # Plot the confusion matrix (rows: true labels, columns: predicted labels)
    cm = confusion_matrix(y_true, y_pred, labels=LABELS)
    fig, ax = plt.subplots(figsize=(9, 9))
    ax.imshow(cm, cmap="Oranges")
    for (i, j), z in np.ndenumerate(cm):
        ax.text(j, i, '{:d}'.format(z), ha='center', va='center')
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    tick_marks = np.arange(len(LABELS))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(LABELS, rotation=90)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(LABELS)
    ax.set_title(f"{test_name}")
    fig.tight_layout()

    # Save before showing, so the saved file does not depend on what the
    # backend does with the figure in plt.show(); then close the figure so
    # repeated calls (e.g. in a loop over epochs) do not accumulate open figures.
    fig.savefig(f"Confusion-matrix-{test_name}.png", dpi=100)
    plt.show()
    plt.close(fig)

    # Save the results:
    rezdict = {
        "experiment": test_name,
        "num_train_epochs": epoch,
        "train_batch_size": 8,
        "learning_rate": 1e-5,
        "microF1": micro,
        "macroF1": macro,
        "y_true": y_true.to_dict(),
        "y_pred": y_pred.to_dict(),
        }
    previous_results.append(rezdict)

    # Save intermediate results (just in case)
    backup = [rezdict]
    with open(f"backup-results-{test_name}.json", "w") as backup_file:
        json.dump(backup, backup_file, indent="")

In [None]:
# Train a fresh model for each candidate number of epochs and score it on dev_df
epochs = [5, 10, 13, 15]

# Arguments shared by all runs; only num_train_epochs differs between them
shared_args = {
    "overwrite_output_dir": True,
    "train_batch_size": 8,
    "learning_rate": 1e-5,
    "labels_list": LABELS,
    # no_cache / no_save are set because these models are not meant to be saved;
    # remove them if you want to save the model
    "no_cache": True,
    "no_save": True,
    "max_seq_length": 512,
    "save_steps": -1,
    # Only the trained model will be saved - to prevent filling all of the space
    "save_model_every_epoch": False,
    "wandb_project": 'GINCO-hyperparameter-search',
    "silent": True,
}

for epoch in epochs:
    roberta_base_model = ClassificationModel(
        "xlmroberta",
        "xlm-roberta-base",
        num_labels=len(LABELS),
        use_cuda=True,
        args={**shared_args, "num_train_epochs": epoch},
    )

    # Train the model
    roberta_base_model.train_model(train_df)

    # Test the model on dev_df
    testing(dev_df, f"Dev-epoch-search:{epoch}", epoch)

In [None]:
# Compare the results by creating a dataframe from the previous_results list
# (one row per result dictionary appended by testing()):
results_df = pd.DataFrame(previous_results)

results_df

The optimum number of epochs is 13.

In [None]:
# Train the final model with 13 epochs and save it
final_model_args = {
    "overwrite_output_dir": True,
    "num_train_epochs": 13,
    "train_batch_size": 8,
    "learning_rate": 1e-5,
    "labels_list": LABELS,
    # "no_cache" and "no_save" are deliberately not set here,
    # because this model is meant to be saved
    "max_seq_length": 512,
    "save_steps": -1,
    # Only the trained model will be saved - to prevent filling all of the space
    "save_model_every_epoch": False,
    "wandb_project": 'GINCO-hyperparameter-search',
    "silent": True,
}

# Create a TransformerModel
roberta_base_model = ClassificationModel(
    "xlmroberta",
    "xlm-roberta-base",
    num_labels=len(LABELS),
    use_cuda=True,
    args=final_model_args,
)

# Train the model
roberta_base_model.train_model(train_df)



In [None]:
# Upload the contents of the "outputs" directory to Wandb as a model artifact
artifact_description = "a model trained on the (Slovene) GINCO dataset with only the most frequent labels (5): 'Information/Explanation', 'Promotion', 'News', 'Forum', 'Opinion/Argumentation'. The model was trained as a part of the experiments for the Exploring the Impact of Lexical and Grammatical Features on Automatic Genre Identification article, mainly to see how its performance differs from the FastText. The model was trained on keep paragraphs and all instances that are not a part of the 5 labels (created from the primary_level_3 downcasted label set) and all instances marked with hard were discarded."

run = wandb.init(
    project="GINCO-hyperparameter-search",
    entity="tajak",
    name="saving-trained-model",
)
trained_model_artifact = wandb.Artifact(
    "GINCO-5-labels-classifier",
    type="model",
    description=artifact_description,
)
trained_model_artifact.add_dir("outputs")
run.log_artifact(trained_model_artifact)

Before running the testing section, import all necessary libraries and install everything you need (see the cells at the top of this notebook).

## Testing

In [21]:
# Create a file to save results into (you can find it under Data: Output).
# The file is opened in exclusive-create mode ("x"), so re-running this cell
# does not overwrite a results file that already exists.
results = []

try:
    with open("GINCO-5-categories-experiments.json", "x") as results_file:
        json.dump(results, results_file, indent="")
except FileExistsError:
    print("GINCO-5-categories-experiments.json already exists - keeping its contents.")

In [22]:
# Load the main results file; the with-block closes the file once it has been read
with open("GINCO-5-categories-experiments.json") as previous_results_file:
    previous_results = json.load(previous_results_file)

# Number of results stored so far
len(previous_results)

In [23]:
# Same setting as in the training section, repeated for the testing section:
# set TOKENIZERS_PARALLELISM to "false" for this process
# (presumably to silence the tokenizers parallelism warning - TODO confirm)
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [24]:
# Collect the distinct labels of the training data, in order of first appearance
LABELS = list(train_df["labels"].unique())
LABELS

In [25]:
# Start a Weights & Biases run for testing the trained model
run = wandb.init(
    project="GINCO-hyperparameter-search",
    entity="tajak",
    name="testing-trained-model-5-labels",
)

In [26]:
# Download the saved model artifact from Wandb
artifact = run.use_artifact('tajak/GINCO-hyperparameter-search/GINCO-5-labels-classifier:v0', type='model')
artifact_dir = artifact.download()

# Load the model from the directory the artifact was actually downloaded to,
# instead of assuming a hard-coded "artifacts/..." path
model = ClassificationModel("xlmroberta", artifact_dir)

In [27]:
def testing(test_df, test_name, epoch=13):
    """
    Apply the loaded model to a dataset, report the F1 scores and store the results.

    The function reads three module-level names that must exist before it is
    called: `model` (the model used for prediction), `LABELS` (the label list
    used for scoring and plotting) and `previous_results` (the list the result
    dictionary is appended to).

    Args:
    - test_df (pandas DataFrame): frame with a `text` and a `labels` column;
      it is not modified
    - test_name: name of the experiment; used as the plot title and in the
      names of the output files
    - epoch: num_train_epochs recorded in the saved results (default: 13)

    Side effects:
    - prints the macro and micro F1 scores
    - shows the confusion matrix and saves it to Confusion-matrix-{test_name}.png
    - appends the result dictionary to `previous_results`
    - writes the result dictionary to backup-results-{test_name}.json
    - writes test_df with an added y_pred_GINCO_5_labels column to
      {test_name}-sheet-with-predictions.csv (tab-separated)
    """
    # Get the true labels
    y_true = test_df.labels

    # Predict all texts with a single call (the model batches them internally)
    # instead of calling predict() once per row. The predictions are wrapped in
    # a Series that shares the index of test_df, so y_true and y_pred stay aligned.
    predictions = model.predict(test_df.text.tolist())[0]
    y_pred = pd.Series(predictions, index=test_df.index)

    # Add the predictions to a copy, so the caller's DataFrame is left untouched
    test_df_with_predictions = test_df.assign(y_pred_GINCO_5_labels=y_pred)

    # Calculate the scores
    macro = f1_score(y_true, y_pred, labels=LABELS, average="macro")
    micro = f1_score(y_true, y_pred, labels=LABELS, average="micro")
    print(f"Macro f1: {macro:0.3}, Micro f1: {micro:0.3}")

    # Plot the confusion matrix (rows: true labels, columns: predicted labels)
    cm = confusion_matrix(y_true, y_pred, labels=LABELS)
    fig, ax = plt.subplots(figsize=(9, 9))
    ax.imshow(cm, cmap="Oranges")
    for (i, j), z in np.ndenumerate(cm):
        ax.text(j, i, '{:d}'.format(z), ha='center', va='center')
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')
    tick_marks = np.arange(len(LABELS))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(LABELS, rotation=90)
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(LABELS)
    ax.set_title(f"{test_name}")
    fig.tight_layout()

    # Save before showing, so the saved file does not depend on what the
    # backend does with the figure in plt.show(); then close the figure so
    # repeated calls do not accumulate open figures.
    fig.savefig(f"Confusion-matrix-{test_name}.png", dpi=100)
    plt.show()
    plt.close(fig)

    # Save the results:
    rezdict = {
        "experiment": test_name,
        "num_train_epochs": epoch,
        "train_batch_size": 8,
        "learning_rate": 1e-5,
        "microF1": micro,
        "macroF1": macro,
        "y_true": y_true.to_dict(),
        "y_pred": y_pred.to_dict(),
        }
    previous_results.append(rezdict)

    # Save intermediate results (just in case)
    backup = [rezdict]
    with open(f"backup-results-{test_name}.json", "w") as backup_file:
        json.dump(backup, backup_file, indent="")

    # Save the dataframe which contains the y_pred values as well
    test_df_with_predictions.to_csv(f"{test_name}-sheet-with-predictions.csv", sep="\t")

In [28]:
# Evaluate the loaded model on the dev split
testing(test_df=dev_df, test_name="testing-GINCO-5-labels-on-dev")

print("\nTesting finished.")

In [29]:
# Evaluate the loaded model on the test split
testing(test_df=test_df, test_name="testing-GINCO-5-labels-on-test")

print("\nTesting finished.")

In [30]:
# Compare the results by creating a dataframe from the previous_results list
# (one row per result dictionary appended by testing()):
results_df = pd.DataFrame(previous_results)

results_df

In [32]:
# Save the file with updated results.
# The results are serialised first and the file is opened only afterwards, so a
# serialisation error cannot leave a truncated results file behind.
serialized_results = json.dumps(previous_results, indent="")
with open("GINCO-5-categories-experiments.json", "w") as results_file:
    results_file.write(serialized_results)