## Import Libraries

In [None]:
import datasets
import numpy as np
import pandas as pd
import random
import tensorflow as tf
import torch
from datasets import load_dataset, load_metric, DatasetDict
from IPython.display import display, HTML

# !pip install transformers datasets

## Set Parameters
Set these parameters before starting the training.

In [None]:
# Pre-trained checkpoint that is fine-tuned below; the commented-out lines are
# alternative checkpoints that can be swapped in.
BASE_MODEL = "distilbert-base-german-cased"
# BASE_MODEL = 'dbmdz/bert-base-german-cased'
# BASE_MODEL = 'bert-base-uncased'

# The fine-tuned model is stored in the directory <timestamp>_<MODEL_DIR_NAME_SUFFIX>
MODEL_DIR_NAME_SUFFIX = "BIN_DistilBERT"

## Find Computing Device

In [None]:
# Ask TensorFlow for the name of the first visible GPU and stop the notebook
# right away when it is not the expected device.
device_name = tf.test.gpu_device_name()
if device_name == "/device:GPU:0":
    print("Found GPU at: {}".format(device_name))
else:
    raise SystemError("GPU device not found")

In [None]:
# Prefer CUDA when PyTorch can use it, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# n_gpu = torch.cuda.device_count()
# Only query the GPU name when a CUDA device was actually selected:
# torch.cuda.get_device_name() raises on machines without CUDA, which would
# defeat the CPU fallback chosen on the line above.
torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu"

## Load and Preprocess Training Data

In [None]:
# Input file, column names and output directory used by the following cells.
MESSAGES_DATA_CSV = "labeled_messages_binary.csv"
CONTENT_COLUMN_NAME = "content"
LABEL_COLUMN_NAME = "relevant"
RESULT_DIRECTORY_NAME = "binary_classifier"

# Accuracy metric object that compute_metrics() calls during evaluation.
metric = load_metric("accuracy")

# Read the labelled messages from the CSV file and drop the hash column,
# which no later cell reads.
messages_dataset = load_dataset(
    "csv", data_files=[MESSAGES_DATA_CSV]
).remove_columns(["message_hash"])
messages_dataset

In [None]:
# Show random examples from the given dataset

def show_random_elements(dataset, num_examples=5):
    """Display distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Columns whose feature type is a ``ClassLabel`` are rendered with their
    label names instead of the integer class ids.

    Args:
        dataset: Object that supports ``len()``, indexing with a list of row
            numbers, and exposes a ``features`` mapping of column name to
            feature type.
        num_examples: Number of rows to show; must not exceed ``len(dataset)``.

    Raises:
        AssertionError: If more rows are requested than the dataset contains.
        ValueError: If ``num_examples`` is negative.
    """
    # Imported locally on purpose: a later cell rebinds the global name
    # `datasets` to a DatasetDict, so looking up `datasets.ClassLabel` through
    # the global would fail whenever this function is called after that cell.
    from datasets import ClassLabel

    assert num_examples <= len(
        dataset
    ), "Can't pick more elements than there are in the dataset."

    # Sampling without replacement yields distinct row indices in a single call.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            # Bind the names list as a default so the lambda does not depend on
            # the loop variable.
            df[column] = df[column].transform(lambda i, names=typ.names: names[i])
    display(HTML(df.to_html()))


# Render random rows of the "train" split (the function's default is five rows)
# as a quick visual check of the loaded data.
show_random_elements(messages_dataset["train"])

In [None]:
# Turn dataset rows into model inputs: the text column is tokenized and the
# label column is copied into a "label" field.

from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint name the model is loaded from.
# Not all models support use_fast=True; remove the argument if loading fails.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=True)


def preprocess_function(batch):
    """Tokenize the text column of ``batch`` and attach its labels.

    Args:
        batch: Mapping from column name to a list of values; it must contain
            the columns named by CONTENT_COLUMN_NAME and LABEL_COLUMN_NAME.

    Returns:
        The tokenizer output for the text column (called with padding and
        truncation enabled), extended by a "label" entry that holds a copy of
        the label column.
    """
    texts = batch[CONTENT_COLUMN_NAME]
    encoding = tokenizer(texts, padding=True, truncation=True)
    encoding["label"] = list(batch[LABEL_COLUMN_NAME])
    return encoding


# Apply preprocess_function to the dataset batch by batch, then switch the
# output format to torch for the three columns listed below.
encoded_dataset = messages_dataset.map(preprocess_function, batched=True)
encoded_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# 90% train, 10% test. Set train to 100% for final model?
# A fixed seed keeps the split identical across kernel restarts, so accuracy
# values from different runs are measured on the same held-out rows.
SPLIT_SEED = 42
train_test = encoded_dataset["train"].train_test_split(
    train_size=0.9, seed=SPLIT_SEED
)
# NOTE: this assignment rebinds the name `datasets`, which until here referred
# to the `datasets` module imported at the top of the notebook. Cells executed
# after this one see the DatasetDict under that name, not the module.
datasets = DatasetDict({"train": train_test["train"], "test": train_test["test"]})

datasets["train"].features

In [None]:
# Initialize model

from transformers import AutoModelForSequenceClassification

# Load the BASE_MODEL checkpoint as a sequence classifier configured for two
# labels (the binary task of this notebook).
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL, num_labels=2
)

In [None]:
# Define training parameters

from transformers import TrainingArguments

batch_size = 5
metric_name = "accuracy"

args = TrainingArguments(
    output_dir=RESULT_DIRECTORY_NAME,
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation and checkpoint settings
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # Logging settings
    logging_steps=1,
)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the notebook-level ``metric`` object.

    Args:
        eval_pred: A pair ``(predictions, labels)``; ``predictions`` holds one
            row of per-class scores per example.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids
        (arg-max along axis 1) compared against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.asarray(scores).argmax(axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
from transformers import Trainer

# Bundle the model, the training arguments, both data splits, the tokenizer
# and the metric callback into a single Trainer object.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Start fine-tuning with the Trainer configured in the previous cell.
trainer.train()

## Store Model

In [None]:
from datetime import datetime

# Prefix the directory name with the current timestamp so that runs started at
# different times are written to different directories.
model_dir_name = datetime.now().strftime("%Y-%m-%dT%H-%M-%S_") + MODEL_DIR_NAME_SUFFIX
model.save_pretrained(model_dir_name)
# Save the tokenizer files next to the model so the directory can be loaded on
# its own, without going back to BASE_MODEL for the tokenizer.
tokenizer.save_pretrained(model_dir_name)

In [None]:
# For when executing the notebook in Google Colab

# Create ZIP file of checkpoint for download
# import shutil
# checkpoint_name = "checkpoint-xyz"
# checkpoint_to_store = f"/content/{RESULT_DIRECTORY_NAME}/{checkpoint_name}"
# shutil.make_archive(checkpoint_name, "zip", checkpoint_to_store)

# After mounting your Google Drive, move the files to drive/MyDrive using drag and drop
# from google.colab import drive
# drive.mount('/content/drive')