# Fine-tuning Transformer Model

## Colab Section

In [1]:
# run first: data_exploration.ipynb

In [2]:
# !git clone https://github.com/XnibyH/PatentMatch-Experiment.git
# %cd PatentMatch-Experiment

In [3]:
# !pip install -U -r requirements.txt --quiet

In [4]:
# colab specific
# !pip install -U accelerate --quiet
# !pip install -U transformers --quiet

In [5]:
# from google.colab import files

# # # upload data
# %cd data
# uploaded_files = files.upload()
# %cd ..

In [6]:
# # upload .env
# uploaded_files = files.upload()

In [7]:
# locally: step up one directory so the relative references used further down
# ("data/..." parquet files, the `src` package) resolve; the recorded output
# shows the kernel ending up in the repository root.
# NOTE: not idempotent - re-running this cell moves up one more level.
%cd ..

/data_science/projects/LOGOSAI.TECH_external/CHALLENGE/Experiment-repository-template


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Fine-tuning

In [8]:
import mlflow
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate
from src.utils import timestamp
import torch
import numpy as np
from src.settings import (
    MLFLOW_EXPERIMENT_NAME,
    )


# Pick the device index: 0 when CUDA is available, -1 otherwise
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Registry of candidate checkpoints: short alias -> model id
all_models = dict([
    ('all-mpnet-base-v2', 'sentence-transformers/all-mpnet-base-v2'),
    ('stsb-roberta-large', 'cross-encoder/stsb-roberta-large'),
    ('stsb-roberta-base', 'cross-encoder/stsb-roberta-base'),
    ('Legal-BERT', 'nlpaueb/legal-bert-base-uncased'),
    ('EURLEX-BERT', 'nlpaueb/bert-base-uncased-eurlex'),
    ('SciBERT', 'allenai/scibert_scivocab_uncased'),
    # fine-tuned models below
    #
])

# checkpoint used for this run
selected_model = all_models['all-mpnet-base-v2']

# set mlflow parameters and start the experiment
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# Name the run after the LAST path component of the model id. Indexing [1]
# only works for "org/name" ids: a local checkpoint path such as
# "./saved_models/<name>_FT" would yield "saved_models", and an id without
# any slash would raise IndexError.
model_short_name = selected_model.rstrip('/').split('/')[-1]

# NOTE(review): there is no explicit mlflow.start_run(); presumably set_tag
# opens the run implicitly (the recorded output shows system-metrics
# monitoring starting at this cell) - confirm.
mlflow.set_tag(key='mlflow.runName', value=f"Training_{model_short_name}_{timestamp()}")

2024/05/31 21:17:14 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


## Load the Dataset and Create a Train/Validation Split

In [9]:
# Fixed seed so the train/validation partition is the same on every run;
# without it each execution trains and validates on different rows, which
# makes runs impossible to compare.
SPLIT_SEED = 42

# loading train and test datasets
dataset = datasets.load_dataset(
    "parquet",
    data_files={"train": "data/train_clean.parquet", "test": "data/test_clean.parquet"},
)

# hold out 20% of the training data for validation
train_test_split = dataset['train'].train_test_split(test_size=0.20, seed=SPLIT_SEED)

# train_test_split() names the held-out part 'test'; rename it to 'validation'
# so it cannot be confused with the real test set
train_test_split['validation'] = train_test_split.pop('test')

# full dataset: train, validation, test
dataset = datasets.DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['validation'],
    'test': dataset['test'],
})

## Tokenize the Dataset

In [10]:
# Load the tokenizer that belongs to the selected checkpoint
tokenizer = AutoTokenizer.from_pretrained(
    selected_model,
)

def preprocess_function(batch, max_length_cap=512):
    """Tokenize a batch of text pairs and attach their labels.

    Args:
        batch: mapping with the columns 'text', 'text_b' and 'label', as
            handed over by `dataset.map(..., batched=True)`.
        max_length_cap: upper bound for the padded/truncated sequence length.
            Tokenizers whose checkpoint does not configure a limit report a
            very large sentinel as `model_max_length`; padding to that value
            is not feasible, so the length is capped here.

    Returns:
        The tokenizer output for the pairs with an added 'label' entry.
    """
    max_length = min(tokenizer.model_max_length, max_length_cap)

    # Tokenize the pairs of texts. No `return_tensors` here: the result is
    # stored by `datasets.map`, so plain lists are sufficient.
    inputs = tokenizer(
        batch['text'], batch['text_b'],
        padding='max_length',
        truncation=True,
        max_length=max_length,
        # return_overflowing_tokens=True,
        # stride=253,
    )
    inputs['label'] = batch['label']
    return inputs

# Tokenize every split, one batch at a time
# TODO check truncated for possible data loss in training
tokenized_dataset = dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/2329 [00:00<?, ? examples/s]

Map:   0%|          | 0/583 [00:00<?, ? examples/s]

## Configure a Model

In [11]:
# One output logit is used for every checkpoint (cross-encoder heads support only 1 label).
# Disabled alternative: 2 labels for checkpoints outside the 'cross-encoder' namespace.
num_labels = 1

# load the pretrained weights with a sequence-classification head of that size
model = AutoModelForSequenceClassification.from_pretrained(
    selected_model,
    num_labels=num_labels,
)


Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Metric objects are loaded once and reused; compute_metrics runs on every
# evaluation pass, and re-loading the metrics each time is wasted work.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred, threshold=0.5):
    """Compute F1 and Matthews correlation for one evaluation pass.

    F1 and MCC are defined on discrete class ids, so the raw model outputs are
    converted first: a single-logit head is thresholded, a multi-logit head is
    reduced with argmax.

    Args:
        eval_pred: pair `(logits, labels)` as supplied by the Trainer.
        threshold: score above which a single-logit output counts as class 1.
            NOTE(review): 0.5 assumes binary 0/1 labels - confirm against the
            dataset.

    Returns:
        dict with the keys "f1" and "mcc".
    """
    logits, labels = eval_pred
    scores = np.asarray(logits)

    if scores.ndim > 1 and scores.shape[-1] > 1:
        # one logit per class -> most likely class
        predictions = np.argmax(scores, axis=-1)
    else:
        # single score per example (cross-encoder style output)
        predictions = (scores.reshape(-1) > threshold).astype(int)

    references = np.asarray(labels).reshape(-1).astype(int)

    f1 = _get_metric("f1").compute(
        predictions=predictions.tolist(), references=references.tolist()
    )
    mcc = _get_metric("matthews_correlation").compute(
        predictions=predictions.tolist(), references=references.tolist()
    )

    return {
        "f1": f1["f1"],
        "mcc": mcc["matthews_correlation"]
    }


# set training arguments
# Evaluation runs once per epoch. The previous eval_strategy="steps" with
# eval_steps=1 triggered a complete validation pass after every single
# optimizer step, which multiplies the run time by the validation-set size.
# NOTE(review): the recorded run below stopped with CUDA out-of-memory on the
# first training step even at batch size 1 - check free GPU memory before
# re-running.
training_args = TrainingArguments(
    output_dir="./fine_tuning_results",
    num_train_epochs=1,  # 5
    per_device_train_batch_size=1,  # local (colab: 18)
    per_device_eval_batch_size=1,  # local (colab: 64)
    warmup_steps=1,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    learning_rate=2e-5,  # learning rate
    save_total_limit=1,  # limit the total amount of checkpoints, delete the older checkpoints
    # logging_dir="./logs",  # directory for storing logs
    # logging_steps=100,
    eval_strategy="epoch",
    # save_strategy="epoch",
)

# Wire the model, its arguments, both data splits and the metric callback
# into a single Trainer
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_dataset["validation"],
    train_dataset=tokenized_dataset["train"],
    args=training_args,
    model=model,
)

## Start Training

In [13]:
# Train the model
trainer.train()

# Evaluate the model and keep the metrics (previously the result was discarded)
eval_metrics = trainer.evaluate()

# Save the model under the LAST path component of the model id. Indexing [1]
# only works for "org/name" ids; a local checkpoint path such as
# "./saved_models/<name>_FT" would yield "saved_models".
model_short_name = selected_model.rstrip('/').split('/')[-1]
trainer.save_model(f"./saved_models/{model_short_name}_FT")

# last expression of the cell, so the notebook displays the final metrics
eval_metrics

  0%|          | 0/2329 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 