# Fine-tuning Transformer Model

## Colab Section

In [None]:
# run first: data_exploration.ipynb

In [None]:
# !git clone https://github.com/XnibyH/PatentMatch-Experiment.git
# %cd PatentMatch-Experiment

In [None]:
# !pip install -U -r requirements.txt --quiet

In [None]:
# colab specific
# !pip install -U accelerate --quiet
# !pip install -U transformers --quiet

In [None]:
# from google.colab import files

# # # upload data
# %cd data
# uploaded_files = files.upload()
# %cd ..

In [None]:
# # upload .env
# uploaded_files = files.upload()

In [None]:
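# when running locally: change to the parent directory so that the `src` imports and `data/` paths used below resolve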
%cd ..

## Fine-tuning

In [None]:
import mlflow
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
from src.utils import timestamp
import torch
import numpy as np
from src.settings import (
    MLFLOW_EXPERIMENT_NAME,
    )


# Device index: 0 when a CUDA GPU is available, -1 otherwise.
# NOTE(review): `device` is not referenced by any of the cells visible here —
# confirm whether it is still needed.
device = 0 if torch.cuda.is_available() else -1

# Registry of candidate checkpoints: short alias -> model identifier passed to
# `from_pretrained`.
all_models = {
    'all-mpnet-base-v2': 'sentence-transformers/all-mpnet-base-v2',
    'stsb-roberta-large': 'cross-encoder/stsb-roberta-large',
    'stsb-roberta-base': 'cross-encoder/stsb-roberta-base',
    'Legal-BERT': 'nlpaueb/legal-bert-base-uncased',
    'EURLEX-BERT': 'nlpaueb/bert-base-uncased-eurlex',
    'SciBERT': 'allenai/scibert_scivocab_uncased',
    # fine-tuned models below
    #
}

# Checkpoint used by every following cell.
selected_model = all_models['stsb-roberta-large']

# Select the MLflow experiment and name the run after the model and the
# current timestamp. No run is started explicitly; MLflow's fluent API creates
# a new active run on `set_tag` when none is active.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
# Use the last path segment of the identifier so that hub ids ("org/name"),
# ids without a namespace ("name") and local paths ("./dir/name") all work;
# indexing with [1] raised IndexError or picked the wrong segment for the
# latter two.
mlflow.set_tag(key='mlflow.runName', value=f"Training_{selected_model.rstrip('/').split('/')[-1]}_{timestamp()}")

## Load the Dataset and Create a Train/Validation Split

In [None]:
# Load the cleaned train and test sets from local parquet files.
dataset = datasets.load_dataset(
    "parquet",
    data_files={"train": "data/train_clean.parquet", "test": "data/test_clean.parquet"},
)

# Fixed seed for the train/validation split so that every run (and every
# MLflow-tracked experiment) is evaluated on the same validation examples.
SPLIT_SEED = 42

# Hold out 20% of the training data for validation.
train_test_split = dataset['train'].train_test_split(test_size=0.20, seed=SPLIT_SEED)

# `train_test_split` names the held-out part "test"; rename it to
# "validation" to keep it distinct from the real test set.
train_test_split['validation'] = train_test_split.pop('test')

# Final dataset with three splits: train, validation and the untouched test set.
dataset = datasets.DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['validation'],
    'test': dataset['test']
})

## Tokenize the Dataset

In [None]:
# Tokenizer matching the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(selected_model)

# Sequence length every example is padded/truncated to. Tokenizers whose
# config does not define a maximum report a very large sentinel value as
# `model_max_length`; combined with `padding='max_length'` that would try to
# pad every example to that size, so the value is capped.
# NOTE(review): 512 is assumed to be a valid upper bound for all checkpoints in
# `all_models` — confirm when adding a model with a different context size.
MAX_SEQ_LENGTH = min(tokenizer.model_max_length, 512)


def preprocess_function(batch):
    """
    Tokenize a batch of text pairs and attach their labels.

    Args:
    batch (dict): batch of examples with the columns 'text', 'text_b' and 'label'.

    Returns:
    The tokenizer output for the ('text', 'text_b') pairs, padded and truncated
    to MAX_SEQ_LENGTH tokens, with the batch labels stored under 'label'.
    """
    # Tokenize the pairs of texts
    inputs = tokenizer(
        batch['text'], batch['text_b'],
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        # return_overflowing_tokens=True,
        # stride=253,
        return_tensors="pt",
        )
    inputs['label'] = batch['label']
    return inputs

# preprocess the data  # TODO check truncated for possible data loss in training
tokenized_dataset = dataset.map(preprocess_function, batched=True)

## Configure a Model

In [None]:
# Number of outputs of the classification head passed to `from_pretrained`.
# It is fixed to 1 for every model; the commented-out expression at the end of
# the line would instead use 1 only for `cross-encoder/*` checkpoints and 2
# otherwise (original note: cross-encoders support only 1 label).
# With a single output, predictions are binarized later via sigmoid +
# threshold in `logits_to_binary`.
num_labels = 1 #1 if selected_model.split('/')[0] in ['cross-encoder'] else 2

# Load the selected checkpoint with a sequence-classification head that has
# `num_labels` outputs.
model = AutoModelForSequenceClassification.from_pretrained(selected_model, num_labels=num_labels)


In [None]:
def logits_to_binary(logits, threshold: float=0.50):
    """
    Convert single-logit model outputs to binary predictions.

    The logits are mapped to probabilities with the sigmoid function; a
    probability less than or equal to `threshold` becomes 0, anything else 1.

    Args:
    logits (torch.Tensor, np.ndarray or sequence of numbers): one logit per
        example, e.g. shape (N,) or (N, 1).
    threshold (float): default 0.50

    Returns:
    list: binary predictions (Python ints), one per example

    Raises:
    ValueError: if `logits` holds more than one value per example.
    """
    # `as_tensor` accepts tensors, numpy arrays and plain Python sequences.
    logits = torch.as_tensor(logits)

    # Only one logit per example can be thresholded; fail with a clear message
    # instead of an opaque "ambiguous boolean" error from a per-row comparison.
    if logits.dim() > 1 and logits.numel() != logits.shape[0]:
        raise ValueError(
            f"expected one logit per example, got shape {tuple(logits.shape)}"
        )

    probabilities = torch.sigmoid(logits)

    # Vectorized binarization. Written as "not (p <= threshold)" rather than
    # "p > threshold" so that NaN probabilities keep mapping to 1.
    above_threshold = ~(probabilities <= threshold)

    return above_threshold.reshape(-1).to(torch.int64).tolist()

In [None]:
# Cache of loaded `evaluate` metrics, keyed by metric name. Evaluation runs
# every few training steps, so the metrics are loaded once and reused instead
# of being re-loaded on every call to `compute_metrics`.
_METRICS = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use."""
    if name not in _METRICS:
        _METRICS[name] = evaluate.load(name)
    return _METRICS[name]


def compute_metrics(eval_pred):
    """
    Compute F1 and Matthews correlation for one evaluation pass.

    Args:
    eval_pred: pair of (logits, labels) as passed by the Trainer.

    Returns:
    dict: {"f1": ..., "mcc": ...} computed on the binarized predictions.
    """
    logits, labels = eval_pred

    # Single-logit outputs -> 0/1 predictions (sigmoid + default threshold).
    predictions = logits_to_binary(logits)

    f1 = _get_metric("f1").compute(predictions=predictions, references=labels)
    mcc = _get_metric("matthews_correlation").compute(predictions=predictions, references=labels)

    return {
        "f1": f1["f1"],
        "mcc": mcc["matthews_correlation"]
    }


# Directory for checkpoints and logs of this model. The last path segment of
# the identifier is used so that hub ids ("org/name"), ids without a namespace
# and local paths all yield a usable directory name; indexing with [1] raised
# IndexError or picked the wrong segment for the latter two.
run_output_dir = f"./fine_tuning_results/{selected_model.rstrip('/').split('/')[-1]}"

# set training arguments
training_args = TrainingArguments(
    output_dir=run_output_dir,
    num_train_epochs=5,
    per_device_train_batch_size=8,  # RTX 3090 32
    per_device_eval_batch_size=16,  # RTX 3090 128
    # per_device_train_batch_size=1,  # local
    # per_device_eval_batch_size=1,  # local
    warmup_steps=20,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    learning_rate=2e-5,  # learning rate
    save_total_limit=5,  # limit the total amount of checkpoints, delete the older checkpoints
    logging_dir=f"{run_output_dir}/logs",  # directory for storing logs
    logging_steps=50,
    eval_strategy="steps",  # evaluate on the validation set every `eval_steps` steps
    eval_steps=50,  # 50
    # save_strategy="epoch",
    # save_steps=1,
    # report_to=None,
)

# init trainer: trains on the tokenized train split, evaluates on the
# validation split and reports the metrics returned by `compute_metrics`
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

## Start Training

In [None]:
# Status the MLflow run is closed with; only switched to "FINISHED" once
# training, evaluation and saving have all succeeded.
run_status = "FAILED"
try:
    # Train the model
    trainer.train()

    # Evaluate the model on the validation split and show the metrics
    # (the bare call discarded the result, so nothing was displayed).
    eval_metrics = trainer.evaluate()
    print(eval_metrics)

    # Save the model. The last path segment of the identifier is used so that
    # hub ids, ids without a namespace and local paths all yield a usable name.
    trainer.save_model(f"./saved_models/{selected_model.rstrip('/').split('/')[-1]}_FT")

    run_status = "FINISHED"
finally:
    # end experiment: always close the MLflow run, even when training fails or
    # is interrupted, so that a stale active run is not reused by later cells
    mlflow.end_run(status=run_status)