In [None]:
# Mount Google Drive at /content/gdrive for this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
%%capture
# Install pinned dependencies; %%capture silences the installer output of this cell.
# NOTE(review): a failed install is silenced as well — confirm that the transformers
# git ref below actually resolves before relying on it.
!pip install fsspec==2023.6.0
!python -m pip install seqeval==1.2.2
!pip install git+https://github.com/huggingface/transformers@4.54.0.dev0
!python -m pip install matplotlib==3.10.0 ipywidgets==7.7.1
!pip install iterative-stratification==0.1.9

In [None]:
import torch
import os
import sys
import datasets
import json
import matplotlib.pyplot as plt
import torch
import transformers
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, TrainingArguments, Trainer, EarlyStoppingCallback

from transformers import set_seed
# Fix the global random seed (0) up front, before any dataset or model work happens.
set_seed(0)

In [None]:
# ModernBERT overview: https://huggingface.co/blog/modernbert

# Checkpoint name shared by the tokenizer loaded here and the classifier built in model_init().
model_id = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Project root: the dataset file, the k-fold splits and the output folder are all
# resolved relative to it. The value looks like a placeholder — presumably it must be
# replaced with the real project path before running (confirm).
BASE_DIR = "<Project Folder>"

In [None]:
# Load the dataset from its JSON Lines file; it is used below to collect the label vocabulary.
hf_dataset = Dataset.from_json(os.path.join(BASE_DIR, "data/dataset.jsonl"))

In [None]:
# Label vocabulary: every distinct string that appears in any example's "str_label" list.
# The column is read once instead of indexing the dataset row by row (which builds a full
# row dict, all fields included, for every example), and the result is sorted so the list
# order is the same on every kernel restart rather than depending on set iteration order.
labels = sorted({label for row_labels in hf_dataset["str_label"] for label in row_labels})

In [None]:
# Fit the binarizer on the whole vocabulary, passed as one single "sample", so that
# mlb.classes_ holds every label; transform() below relies on this fitted instance.
mlb = MultiLabelBinarizer()
mlb.fit([labels])

In [None]:
# Index <-> label lookups following the order of mlb.classes_; both are handed to the
# model configuration in model_init().
id2label = dict(enumerate(mlb.classes_))
label2id = {name: position for position, name in id2label.items()}

In [None]:
# Load the five pre-split folds ("0-fold" ... "4-fold") stored under data/k_fold_ds.
kfold_datasets = [
  DatasetDict.load_from_disk(os.path.join(BASE_DIR, "data/k_fold_ds", f"{fold_no}-fold"))
  for fold_no in range(5)
]

In [None]:
def transform(example):
  """Encode one example's string labels as a multi-hot vector of floats.

  Args:
    example: a single dataset row; its "str_label" entry is the collection of
      label strings for that row.

  Returns:
    A dict with one key, "label", holding the row produced by the fitted
    binarizer `mlb`, converted element by element to Python floats.
  """
  multi_hot = mlb.transform([example["str_label"]])[0]
  return {"label": list(map(float, multi_hot))}

In [None]:
def tokenize(batch, text_field="text"):
  """Tokenize the texts of a batch with the module-level `tokenizer`.

  Args:
    batch: mapping that holds the raw texts under `text_field`.
    text_field: name of the entry in `batch` that contains the texts.

  Returns:
    The tokenizer output for the batch, padded to the longest sequence in it.
  """
  # truncation=True caps each sequence at the tokenizer's maximum length. Without it an
  # over-long text is encoded in full and, because of padding="longest", stretches every
  # other sequence of the batch to that same length.
  return tokenizer(batch[text_field], padding="longest", truncation=True)

In [None]:
# For every fold: first attach the multi-hot "label" column, then tokenize each split in a
# single batch (batch_size=None), so padding="longest" is computed over the whole split.
kfold_datasets = [
  fold.map(transform).map(tokenize, batched=True, batch_size=None, fn_kwargs={"text_field": "text"})
  for fold in kfold_datasets
]

In [None]:
from transformers import AutoModelForSequenceClassification
import torch

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def model_init():
    """Build a fresh multi-label sequence classifier from the `model_id` checkpoint.

    Passed to the Trainer as `model_init`, so each Trainer constructs its own model
    instead of reusing one instance. The classification head is sized from the label
    vocabulary and carries the id <-> label mappings.
    """
    classifier_options = dict(
        problem_type="multi_label_classification",
        num_labels=len(labels),
        id2label=id2label,
        label2id=label2id,
    )
    return AutoModelForSequenceClassification.from_pretrained(model_id, **classifier_options)

In [None]:
import datetime
import os

def create_timestamped_folder(base_directory, timestamp, purpose_token="training_run"):
    """Create `<base_directory>/<purpose_token>_<timestamp>` and return its path.

    Args:
        base_directory: directory under which the new folder is created.
        timestamp: string appended to the folder name after an underscore.
        purpose_token: prefix of the folder name.

    Returns:
        The folder path on success (an already existing folder also counts as
        success), or None when the directory could not be created; the error is
        printed rather than raised.
    """
    target_dir = os.path.join(base_directory, f"{purpose_token}_{timestamp}")

    try:
        os.makedirs(target_dir, exist_ok=True)
    except OSError as err:
        print(f"Error creating directory {target_dir}: {err}")
        return None

    print(f"Successfully created folder: {target_dir}")
    return target_dir

In [None]:
# One timestamp shared by both run folders so they can be matched to the same run.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

output_folder = os.path.join(BASE_DIR, "output")
if not os.path.exists(output_folder):
    os.makedirs(output_folder, exist_ok=True)
    print(f"Output folder created at: {output_folder}")

# create_timestamped_folder() returns None on failure. Stop right here in that case:
# otherwise the problem only shows up after a whole fold has been trained, when the
# results are about to be written into a path built from None.
trainer_checkpoints = create_timestamped_folder(output_folder, timestamp, "trainer_checkpoints")
if trainer_checkpoints is None:
    raise RuntimeError(f"Could not create the trainer checkpoint folder under {output_folder}")
print(f"Trainer Checkpoints created at: {trainer_checkpoints}")

kfold_best_checkpoints = create_timestamped_folder(output_folder, timestamp, "kfold_checkpoints")
if kfold_best_checkpoints is None:
    raise RuntimeError(f"Could not create the k-fold checkpoint folder under {output_folder}")
print(f"K-fold checkpoints created at: {kfold_best_checkpoints}")

# Train

In [None]:
# Per-device batch size; the training loop uses it for both training and evaluation.
batch_size = 1
# Passed to TrainingArguments as gradient_accumulation_steps.
gradient_accumulation_steps = 8
# Upper bound on epochs per fold; the early-stopping callback in the loop may end a fold sooner.
num_epochs = 50
# Metric handed to TrainingArguments as metric_for_best_model.
metric_name_for_early_stopping = "eval_loss"

In [None]:
import pickle  # used for the per-fold history dump; it is not imported anywhere above

# Train one model per fold. For each fold: fine-tune with early stopping on eval_loss,
# dump the trainer's log history, and export the final model into kfold_best_checkpoints.
for idx, d in enumerate(kfold_datasets):
  # Each fold writes its intermediate checkpoints into its own sub-folder. With a single
  # shared directory the "checkpoint-<step>" folders of different folds collide, and the
  # save_total_limit rotation would mix stale checkpoints of earlier folds with the
  # current ones.
  fold_output_dir = os.path.join(trainer_checkpoints, f"fold-{idx}")

  args = TrainingArguments(
    output_dir=fold_output_dir,

    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,

    learning_rate=2e-5,
    weight_decay=0.01,
    gradient_accumulation_steps=gradient_accumulation_steps,

    # Checkpoint every epoch, keeping at most two on disk.
    save_total_limit=2,
    save_strategy="epoch",

    # Lower eval_loss is better; the best checkpoint is reloaded when training ends,
    # so the model exported below is the best one rather than the last one.
    metric_for_best_model=metric_name_for_early_stopping,
    greater_is_better=False,
    load_best_model_at_end=True,

    eval_strategy="epoch",
    logging_strategy="epoch",
    log_level="error",

    disable_tqdm=False,
    report_to="none"
  )

  early_stopping_callback = EarlyStoppingCallback(
      early_stopping_patience=5,    # wait 5 epochs for improvement
      early_stopping_threshold=0.001 # loss must decrease by at least 0.001 to count as an improvement
  )

  trainer = Trainer(
      model_init=model_init,  # a fresh model for every fold
      args=args,
      train_dataset=d["train"],
      eval_dataset=d["valid"],
      # `processing_class` is the current name of the deprecated `tokenizer` argument.
      processing_class=tokenizer,
      callbacks=[early_stopping_callback]
  )

  trainer.train()

  # Keep the per-epoch logs of this fold next to the exported models.
  with open(os.path.join(kfold_best_checkpoints, f"fold{idx}_training_history.pkl"), 'wb') as f:
    pickle.dump(trainer.state.log_history, f)

  best_ckpt_path = trainer.state.best_model_checkpoint
  print(f"Fold {idx}: best checkpoint was {best_ckpt_path}")
  trainer.save_model(os.path.join(kfold_best_checkpoints, f"fold-{idx}"))