In [None]:
### Raphael Mourad
### Associate Professor
### University Paul Sabatier / INRAE MIAT Lab Toulouse
### 20/12/2024

In [None]:
# Script to fine tune GPT-NEO on vignettes to predict recommendation for surgery.
# GPT-Neo 1.3B is a transformer model designed using EleutherAI's replication of the GPT-3 architecture. 

In [1]:
# IMPORT LIBRARIES
import torch
from transformers import AutoTokenizer, GPTNeoForSequenceClassification, GPT2ForSequenceClassification
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from peft import PeftModel, PeftConfig, LoftQConfig
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
)

2024-01-11 16:53:05.234563: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-11 16:53:05.543554: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-11 16:53:06.184183: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvrtc.so.11.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/usr/lib/x86_64-linux-gnu
2024-01-11 16:53:06.185214: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvrtc.so.11.1: cannot o

In [2]:
torch.backends.cudnn.benchmark=True
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32 "

In [31]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig, AutoModelForPreTraining
from transformers import AutoModelForMaskedLM, CharacterTokenizer

In [26]:
#model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
config = AutoConfig.from_pretrained("google/switch-base-8")
model = AutoModelForPreTraining.from_config(config)
#model = AutoModelForCausalLM.from_config(config)
model

SwitchTransformersForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): SwitchTransformersStack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): SwitchTransformersBlock(
        (layer): ModuleList(
          (0): SwitchTransformersLayerSelfAttention(
            (SelfAttention): SwitchTransformersAttention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): SwitchTransformersLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): SwitchTransformersLayerFF(
            (mlp): SwitchTransformersDenseActDense(
              (wi): Linear(in_features=768, out_features=3072

In [28]:
# DNABERT2
config = AutoConfig.from_pretrained("/media/mourad/SSD2/MistralDNA/data/DNABERT-2-117M/")
#model = AutoModelForPreTraining.from_config(config)
model = AutoModelForMaskedLM.from_config(config)
model

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(4096, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [29]:
pytorch_total_params = sum(p.numel() for p in model.parameters())
pytorch_total_params

32489728

In [36]:
#@title Tokenizer

"""
Just a simple character level tokenizer.

From: https://github.com/dariush-bahrami/character-tokenizer/blob/master/charactertokenizer/core.py

CharacterTokenzier for Hugging Face Transformers.
This is heavily inspired from CanineTokenizer in transformers package.
"""
import json
import os
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Union

from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer


class CharacterTokenizer(PreTrainedTokenizer):
    def __init__(self, characters: Sequence[str], model_max_length: int, padding_side: str='left', **kwargs):
        """Character tokenizer for Hugging Face transformers.
        Args:
            characters (Sequence[str]): List of desired characters. Any character which
                is not included in this list will be replaced by a special token called
                [UNK] with id=6. Following are list of all of the special tokens with
                their corresponding ids:
                    "[CLS]": 0
                    "[SEP]": 1
                    "[BOS]": 2
                    "[MASK]": 3
                    "[PAD]": 4
                    "[RESERVED]": 5
                    "[UNK]": 6
                an id (starting at 7) will be assigned to each character.
            model_max_length (int): Model maximum sequence length.
        """
        self.characters = characters
        self.model_max_length = model_max_length
        bos_token = AddedToken("[BOS]", lstrip=False, rstrip=False)
        eos_token = AddedToken("[SEP]", lstrip=False, rstrip=False)
        sep_token = AddedToken("[SEP]", lstrip=False, rstrip=False)
        cls_token = AddedToken("[CLS]", lstrip=False, rstrip=False)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        unk_token = AddedToken("[UNK]", lstrip=False, rstrip=False)

        mask_token = AddedToken("[MASK]", lstrip=True, rstrip=False)

        super().__init__(
            bos_token=bos_token,
            eos_token=sep_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            unk_token=unk_token,
            add_prefix_space=False,
            model_max_length=model_max_length,
            padding_side=padding_side,
            **kwargs,
        )

        self._vocab_str_to_int = {
            "[CLS]": 0,
            "[SEP]": 1,
            "[BOS]": 2,
            "[MASK]": 3,
            "[PAD]": 4,
            "[RESERVED]": 5,
            "[UNK]": 6,
            **{ch: i + 7 for i, ch in enumerate(characters)},
        }
        self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}

    @property
    def vocab_size(self) -> int:
        return len(self._vocab_str_to_int)

    def _tokenize(self, text: str) -> List[str]:
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        return self._vocab_str_to_int.get(token, self._vocab_str_to_int["[UNK]"])

    def _convert_id_to_token(self, index: int) -> str:
        return self._vocab_int_to_str[index]

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        result = cls + token_ids_0 + sep
        if token_ids_1 is not None:
            result += token_ids_1 + sep
        return result

    def get_special_tokens_mask(
        self,
        token_ids_0: List[int],
        token_ids_1: Optional[List[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        result = [1] + ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            result += ([0] * len(token_ids_1)) + [1]
        return result

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        result = len(cls + token_ids_0 + sep) * [0]
        if token_ids_1 is not None:
            result += len(token_ids_1 + sep) * [1]
        return result

    def get_config(self) -> Dict:
        return {
            "char_ords": [ord(ch) for ch in self.characters],
            "model_max_length": self.model_max_length,
        }

    @classmethod
    def from_config(cls, config: Dict) -> "CharacterTokenizer":
        cfg = {}
        cfg["characters"] = [chr(i) for i in config["char_ords"]]
        cfg["model_max_length"] = config["model_max_length"]
        return cls(**cfg)

    def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs):
        cfg_file = Path(save_directory) / "tokenizer_config.json"
        cfg = self.get_config()
        with open(cfg_file, "w") as f:
            json.dump(cfg, f, indent=4)

    @classmethod
    def from_pretrained(cls, save_directory: Union[str, os.PathLike], **kwargs):
        cfg_file = Path(save_directory) / "tokenizer_config.json"
        with open(cfg_file) as f:
            cfg = json.load(f)
        return cls.from_config(cfg)

In [42]:
import string
#from charactertokenizer import CharacterTokenizer

import sys
sys.path.append("/media/mourad/SSD2/MistralDNA/data/character-tokenizer/")
import charactertokenizer

chars = string.ascii_letters # This character vocab!
model_max_length = 2048
tokenizer = CharacterTokenizer(chars, model_max_length)


NotImplementedError: 

In [None]:
CharacterTokenizer

In [None]:
##tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

#prompt = "My favourite condiment is"

#model_inputs = tokenizer([prompt], return_tensors="pt").to(device)

#model.to(device)

#generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)

#tokenizer.batch_decode(generated_ids)[0]
#"The expected output"

In [None]:
#Â IMPORT AND BUILD GPT-NEO
# Define the task
task_name = "news_classification"
num_labels = 2
model_name="GPTNEO2.7B" # "GPTNEO125m" "GPTNEO2.7B" "BioMedLM"

# Load the tokenizer
if model_name=="GPTNEO125m":
    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125m") 
    model = GPTNeoForSequenceClassification.from_pretrained('EleutherAI/gpt-neo-125m')
elif model_name=="GPTNEO2.7B":
    tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B") 
    model = GPTNeoForSequenceClassification.from_pretrained('EleutherAI/gpt-neo-2.7B')
elif model_name=="BioMedLM":
    tokenizer = AutoTokenizer.from_pretrained("stanford-crfm/BioMedLM") 
    model = GPT2ForSequenceClassification.from_pretrained('stanford-crfm/BioMedLM')
tokenizer.pad_token = tokenizer.eos_token

# Load the pre-trained GPT-3 model
model.config.pad_token_id = model.config.eos_token_id

# Add a classification head on top of the model
model.resize_token_embeddings(len(tokenizer))
model.classifier = torch.nn.Linear(model.config.hidden_size, num_labels)

model

In [None]:
# PARAMETERS FOR FINE-TUNING
if model_name!="GPTNEO125m":
    training_args = TrainingArguments(
        output_dir='./results/'+model_name,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        num_train_epochs=50,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        learning_rate=1e-5,
        weight_decay=0.01,
        logging_dir='./logs',
        load_best_model_at_end=True,
        fp16=True,
        gradient_accumulation_steps=10,
    )
elif model_name=="GPTNEO125m": # For small models
    training_args = TrainingArguments(
        output_dir='./results/'+model_name,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        num_train_epochs=50,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        learning_rate=1e-5,
        weight_decay=0.01,
        logging_dir='./logs',
        load_best_model_at_end=True,
        fp16=True,
    )

print(training_args)

#loftq_config = LoftQConfig(loftq_bits=4)
lora_config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
    inference_mode=False,
    #init_lora_weights="loftq", 
    #loftq_config=loftq_config,
)
model = get_peft_model(model, lora_config)

In [None]:
# FINE-TUNE MODEL ON VIGNETTES
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()

# Deploy the model
model.save_pretrained(model_name+'/vignette_classification_model')
tokenizer.save_pretrained(model_name+'/vignette_classification_tokenizer')