### Runner which uses the Trainer from the transformers library

Unlike the other code in this repo, this runner uses the Trainer and TrainingArguments classes from HuggingFace. The goal is to compare performance with qlora_runner.py.

In [None]:
import logging
import gc
import torch
from peft import LoraConfig

from config import (
    UserConfiguration,
    LogConfiguration,
    TorchConfiguration,
    TokenizerConfiguration,
    TextGenConfiguration,
    SystemConfiguration,
    TrainerConfiguration,
)

from os_environment_manager import OSEnvironmentManager
from package_path_manager import PackagePathManager
from model_manager import ModelManager
from system_monitor import SystemMonitor

from tokenization_manager import TokenizationManager
from data_manager import DataManager

# TODO: These should be picked up from command line
from trainer import Trainer

from transformers import (
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    IntervalStrategy,
)

# Module-level bitsandbytes settings: 4-bit loading with the "nf4" quant type,
# "float16" as the compute dtype, and double quantization switched off.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)


# Run-level settings for this experiment (hard-coded for now).
NET_ID = "vgn2004"  # passed to UserConfiguration as net_id
ENV = "qlora"  # passed to UserConfiguration as env
NUM_WORKERS = 8  # passed to SystemConfiguration as num_workers
MAX_TOKENS = 64  # passed to TokenizerConfiguration as max_tokens
MIN_GENERATION = 64  # passed to TextGenConfiguration as min_tokens_to_generate
MODEL_NAME = "facebook/opt-125m"
DATASET_NAME = "NIH_ExPORTER_awarded_grant_text"
TOKENIZER_NAME = "speedup"
BATCH_SIZE = 64  # batch size requested from DataManager.fetch_dataloaders

# Constants
# Environment variables handed to OSEnvironmentManager.update_from_dict.
# Every value is a string: os.environ only accepts str values, so the dict
# stays valid even if it is applied to os.environ directly.
OS_ENV_DICT = {
    "CUDA_VISIBLE_DEVICES": "0",
    "TRANSFORMERS_NO_ADVISORY_WARNINGS": "true",
    "TORCHDYNAMO_DISABLE": "1",
    "TOKENIZERS_PARALLELISM": "false",
}

if __name__ == "__main__":
    # Script entry point: configure the environment, load the tokenizer,
    # datasets and 4-bit model, log one sample generation from the model
    # before fine-tuning, then wrap the model with LoRA adapters.

    # Clear the GPU
    torch.cuda.empty_cache()
    gc.collect()

    # Configure the logger, needed for initial utilization checks
    LogConfiguration.setup_logging()
    logger = logging.getLogger(__name__)

    # Get initial RAM and GPU utilization
    monitor = SystemMonitor()
    logger.info(f"RAM Usage: {monitor.get_ram_usage()} MB")
    logger.info(f"GPU Utilization: {monitor.get_gpu_utilization()} MB")

    # Configurations

    # Setup folder/file path related configurations
    user_config = UserConfiguration(net_id=NET_ID, env=ENV)
    system_config = SystemConfiguration(num_workers=NUM_WORKERS)
    tokenizer_config = TokenizerConfiguration(
        max_tokens=MAX_TOKENS, tokenizer_name=TOKENIZER_NAME
    )
    torch_config = TorchConfiguration()
    torch_config.commit()

    # System configurations

    # Add Python packages to sys path
    package_path_manager = PackagePathManager(user_config)
    package_path_manager.add_package_paths_to_system()

    # Add environment variables to OS env
    # NOTE(review): this runs after torch has been imported and the GPU has
    # been queried above; confirm CUDA_VISIBLE_DEVICES still takes effect.
    os_env_manager = OSEnvironmentManager()
    os_env_manager.update_from_dict(OS_ENV_DICT)

    # Tokenization
    tokenization_manager = TokenizationManager(user_config, tokenizer_config)
    tokenization_manager.load_for_model(MODEL_NAME)

    # Datasets
    data_manager = DataManager(user_config, system_config, tokenizer_config)
    data_manager.dataset_name = DATASET_NAME
    data_manager.set_data_collator(tokenization_manager.tokenizer)

    # Load the train/validation split from disk; when it is missing, build the
    # dataset from the raw .jsonl.zst file, tokenize it, and split it instead.
    try:
        (
            training_dataset,
            validation_dataset,
        ) = data_manager.fetch_train_validation_split_from_disk()
    except FileNotFoundError as fe:
        logger.warning(f"{fe!r}")
        data_manager.create_dataset_from_jsonl_zst_file(
            name=DATASET_NAME,
            jsonl_zst_file_path="/scratch/vgn2004/fine_tuning/datasets/NIH_ExPORTER_awarded_grant_text.jsonl.zst",
        )
        data_manager.create_tokenized_dataset(tokenization_manager.tokenize)
        (
            training_dataset,
            validation_dataset,
        ) = data_manager.fetch_train_validation_split()

    # Dataloaders
    # NOTE(review): the HuggingFace Trainer further down is given the datasets,
    # not these dataloaders; the call is kept in case it has side effects.
    training_dataloader, validation_dataloader = data_manager.fetch_dataloaders(
        training_dataset=training_dataset,
        validation_dataset=validation_dataset,
        batch_size=BATCH_SIZE,
    )

    # Model
    # Reuse the module-level 4-bit NF4 settings rather than repeating them.
    model_manager = ModelManager(system_config)
    model_manager.load(MODEL_NAME, quantization_config=quantization_config)

    logger.info(model_manager.model)

    # Text Generation
    text_gen_config = TextGenConfiguration(
        tokenization_manager.tokenizer, min_tokens_to_generate=MIN_GENERATION
    )
    prompt = tokenization_manager.encode("This")
    sequence = model_manager.infer(prompt, text_gen_config)
    text = tokenization_manager.decode(sequence, text_gen_config)
    # Log through the configured module logger, like every other message here.
    logger.info(f"Generated Text Before Fine-Tuning:\n{text}")

    # Existing Trainer
    from peft import prepare_model_for_kbit_training

    #     model_manager.model.gradient_checkpointing_enable()
    model_manager.model = prepare_model_for_kbit_training(model_manager.model)
    model_manager.lorify(
        LoraConfig(
            r=64, lora_alpha=16, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
        )
    )

    from transformers import TrainerCallback, TrainerControl

    class SampleTextCallback(TrainerCallback):
        """Trainer callback that periodically generates and records sample text.

        Whenever ``state.global_step`` is a positive multiple of
        ``sample_every``, the callback generates one continuation of
        ``prompt_text``, prints it, and appends it to ``training_samples.txt``
        inside ``output_dir``.

        Args:
            model: Model used for generation; must provide ``device``,
                ``generate`` and the ``torch.nn.Module`` train/eval switches.
            tokenizer: Tokenizer used to encode the prompt and decode output.
            output_dir: Directory that receives ``training_samples.txt``.
            prompt_text: Prompt fed to the model for every sample.
            max_length: ``max_length`` passed to ``model.generate``.
            sample_every: Number of global steps between two samples.
        """

        def __init__(
            self,
            model,
            tokenizer,
            output_dir,
            prompt_text="This",
            max_length=64,
            sample_every=50,
        ):
            self.model = model
            self.tokenizer = tokenizer
            self.output_dir = output_dir
            self.prompt_text = prompt_text
            self.max_length = max_length
            self.sample_every = sample_every

        def on_step_begin(self, args, state, control, **kwargs):
            """Generate, print and persist one sample on sampling steps."""
            import os

            step = state.global_step
            if step <= 0 or step % self.sample_every != 0:
                return

            input_ids = self.tokenizer.encode(
                self.prompt_text, return_tensors="pt"
            ).to(self.model.device)

            # Generate in eval mode so dropout does not perturb the sample,
            # then restore whatever mode the training loop had set.
            was_training = self.model.training
            self.model.eval()
            try:
                sample_outputs = self.model.generate(
                    input_ids=input_ids,
                    max_length=self.max_length,
                    num_return_sequences=1,
                    temperature=1.0,
                )
            finally:
                self.model.train(was_training)

            decoded = self.tokenizer.decode(
                sample_outputs[0], skip_special_tokens=True
            )
            text = f"\n{step}: {decoded}"
            print(text)

            # The directory may not exist yet when the first sample is written.
            os.makedirs(self.output_dir, exist_ok=True)
            sample_file_path = os.path.join(self.output_dir, "training_samples.txt")
            with open(sample_file_path, "a", encoding="utf-8") as file:
                file.write(text)

    # Callback that prints and records sample generations during training.
    sample_callback = SampleTextCallback(
        model_manager.model,
        tokenization_manager.tokenizer,
        "/scratch/vgn2004/fine_tuning/standard",
    )
    trainer_callbacks = [sample_callback]

    # Hyper-parameters and logging/eval/save schedule for the HuggingFace Trainer.
    training_args = TrainingArguments(
        gradient_accumulation_steps=4,
        warmup_steps=2,
        num_train_epochs=50,
        learning_rate=2e-4,
        logging_strategy=IntervalStrategy.STEPS,
        logging_steps=25,
        evaluation_strategy=IntervalStrategy.STEPS,
        eval_steps=500,
        save_strategy=IntervalStrategy.STEPS,
        save_steps=1000,
        lr_scheduler_type="linear",
        output_dir="/scratch/vgn2004/fine_tuning/standard",
        optim="paged_adamw_8bit",
    )

    # mlm=False selects causal-LM collation; True would be for a masked LM
    # such as BERT.
    causal_lm_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenization_manager.tokenizer,
        mlm=False,
    )

    trainer = Trainer(
        model=model_manager.model,
        train_dataset=training_dataset,
        eval_dataset=validation_dataset,
        callbacks=trainer_callbacks,
        args=training_args,
        data_collator=causal_lm_collator,
    )

    # Silence the warnings during training. Please re-enable for inference!
    model_manager.model.config.use_cache = False
    trainer.train()

INFO: At 08:59 PM 10s, from <ipython-input-3-c200f21b3bd6>:<module>:58 => RAM Usage: 2050.92578125 MB
INFO: At 08:59 PM 10s, from <ipython-input-3-c200f21b3bd6>:<module>:58 => RAM Usage: 2050.92578125 MB
INFO: At 08:59 PM 10s, from <ipython-input-3-c200f21b3bd6>:<module>:58 => RAM Usage: 2050.92578125 MB
INFO: At 08:59 PM 10s, from <ipython-input-3-c200f21b3bd6>:<module>:59 => GPU Utilization: 1022 MB
INFO: At 08:59 PM 10s, from <ipython-input-3-c200f21b3bd6>:<module>:59 => GPU Utilization: 1022 MB
INFO: At 08:59 PM 10s, from <ipython-input-3-c200f21b3bd6>:<module>:59 => GPU Utilization: 1022 MB
INFO: At 08:59 PM 10s, from user_configuration.py:__init__:18 => The base directory is set to /scratch.
INFO: At 08:59 PM 10s, from user_configuration.py:__init__:18 => The base directory is set to /scratch.
INFO: At 08:59 PM 10s, from user_configuration.py:__init__:18 => The base directory is set to /scratch.
INFO: At 08:59 PM 10s, from torch_configuration.py:commit:17 => Flash attention is en

INFO: At 08:59 PM 11s, from <ipython-input-3-c200f21b3bd6>:<module>:120 => OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear4bit(in_features=768, out_features=768, bias=True)
            (v_proj): Linear4bit(in_features=768, out_features=768, bias=True)
            (q_proj): Linear4bit(in_features=768, out_features=768, bias=True)
            (out_proj): Linear4bit(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear4bit(in_features=768, out_features=3072, bias=True)
          (fc2): Linear4bi

Step,Training Loss,Validation Loss



50: This study is the first to examine the role of the human immune system in the development of a novel immune system. The immune system is a complex system of cells and tissues that is composed of a number of different types of cells. The immune system is a complex system of cells and tissues that is composed of a number

100: This study is the first of a series of studies to investigate the relationship between the development of the human immune system and the development of the immune system. The study will focus on the development of the immune system in the human body and the development of the immune system in the human body. The study will focus on the
