In [4]:
import argparse
import os
import torch
import gc
import bitsandbytes
from datasets import load_dataset, Dataset
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

from transformers import set_seed
from transformers import LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM, AutoConfig
from transformers import (
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
    DataCollatorForSeq2Seq
)
from transformers import BitsAndBytesConfig
from peft import (
    get_peft_model,
    prepare_model_for_kbit_training,
    LoraConfig,
    TaskType,
)
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
from accelerate import Accelerator, FullyShardedDataParallelPlugin
from psutil import Process
from pynvml import (
    nvmlInit,
    nvmlDeviceGetHandleByIndex,
    nvmlDeviceGetMemoryInfo,
    nvmlDeviceGetCount,
)

class SystemMonitor:
    def __init__(self):
        # Initialize NVML for GPU monitoring
        self.nvml_initialized = SystemMonitor._initialize_nvml()

    @classmethod
    def _initialize_nvml(cls):
        try:
            nvmlInit()
            return True
        except Exception as e:
            print(f"Error initializing NVML: {e}")
            return False

    def get_ram_usage(self):
        return Process().memory_info().rss / (1024 * 1024)

    def get_gpu_memory_usage(self):
        if not self.nvml_initialized:
            print("NVML not initialized.")
            return None

        gpu_memory_usage = []
        try:
            gpu_count = nvmlDeviceGetCount()
            for i in range(gpu_count):
                handle = nvmlDeviceGetHandleByIndex(i)
                info = nvmlDeviceGetMemoryInfo(handle)
                gpu_memory_usage.append(info.used // 1024 ** 3)
        except Exception as e:
            print(f"Error retrieving GPU memory info: {e}")
            return None

        return gpu_memory_usage

    def get_gpu_utilization(self):
        gpu_memory_usages = self.get_gpu_memory_usage()
        return gpu_memory_usages if gpu_memory_usages is not None else None

In [11]:
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
INSTRUCTION_KEY = "### Instruction:"
INPUT_KEY = "Input:"
RESPONSE_KEY = "### Response:"
PROMPT_NO_INPUT_FORMAT = """{intro}
{instruction_key}
{instruction}
{response_key}""".format(
    intro=INTRO_BLURB,
    instruction_key=INSTRUCTION_KEY,
    instruction="{instruction}",
    response_key=RESPONSE_KEY,
)

PROMPT_WITH_INPUT_FORMAT = """{intro}
{instruction_key}
{instruction}
{input_key}
{input}
{response_key}""".format(
    intro=INTRO_BLURB,
    instruction_key=INSTRUCTION_KEY,
    instruction="{instruction}",
    input_key=INPUT_KEY,
    input="{input}",
    response_key=RESPONSE_KEY,
)

ROOT_PATH = "/scratch/vgn2004"
DEFAULT_TRAINING_DATASET = "databricks/databricks-dolly-15k"
DEFAULT_SEED = 68
MAX_SEQ_LEN = 512


def load_training_dataset(
        tokenizer,
        path_or_dataset: str = DEFAULT_TRAINING_DATASET,
        seed: int = DEFAULT_SEED
) -> Dataset:
    """
    This function is used for preprocessing the databricks-dolly-15k dataset.
    To fine-tune on your own dataset, you would need to customize the function.
    """

    print(f"Loading dataset from {path_or_dataset}")
    dataset = load_dataset(path_or_dataset)["train"]
    print(f"Found {dataset.num_rows} rows", )

    def _reformat_data(rec):
        # Each row of databricks-dolly-15k contains fields "instruction", "response", and optionally the "context" field
        instruction = rec["instruction"]
        response = rec["response"]
        context = rec.get("context")

        if context:
            questions = PROMPT_WITH_INPUT_FORMAT.format(instruction=instruction, input=context)
        else:
            questions = PROMPT_NO_INPUT_FORMAT.format(instruction=instruction)

        return {"text": f"{questions}\n{response}"}

    dataset = dataset.map(_reformat_data)

    def tokenize_function(allEntries):
        return tokenizer(allEntries['text'], truncation=True, max_length=MAX_SEQ_LEN)

    dataset = dataset.map(tokenize_function)

    split_dataset = dataset.train_test_split(test_size=1000, seed=seed)
    train_tokenized_dataset = split_dataset['train']
    eval_tokenized_dataset = split_dataset['test']

    return train_tokenized_dataset, eval_tokenized_dataset

def load_model(
        pretrained_model_name_or_path: str = "NousResearch/Llama-2-7b-hf",
        bf16: bool = False,
) -> AutoModelForCausalLM:
    print(f"Loading model for {pretrained_model_name_or_path}")
    config = AutoConfig.from_pretrained(
        pretrained_model_name_or_path,
        trust_remote_code=True,
    )

    torch_dtype = torch.bfloat16 if bf16 else torch.float16

    model = transformers.AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path,
        config=config,
        torch_dtype=torch_dtype,
    )

    model.config.use_cache = False

    return model

def get_tokenizer(
        pretrained_tokenizer_name_or_path: str = "NousResearch/Llama-2-7b-hf",
):
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_tokenizer_name_or_path,
        trust_remote_code=True,
        revision=REVISION,
    )
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer

def train(
        *,
        input_model: str,
        local_output_dir: str,
        dbfs_output_dir: str,
        epochs: int,
        per_device_train_batch_size: int,
        per_device_eval_batch_size: int,
        lr: float,
        seed: int,
        gradient_checkpointing: bool,
        gradient_accumulation_steps: int,
        local_rank: str,
        bf16: bool,
        logging_steps: int,
        save_steps: int,
        max_steps: int,
        eval_steps: int,
        save_total_limit: int,
        warmup_steps: int,
):
    set_seed(seed)
    # Enable tf32 for better performance
    torch.backends.cuda.matmul.allow_tf32 = True

    tokenizer = get_tokenizer()
    train_dataset, val_dataset = load_training_dataset(tokenizer, seed=seed)

    model = load_model(pretrained_model_name_or_path=input_model, bf16=bf16)

    training_args = TrainingArguments(
        output_dir=local_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        gradient_checkpointing=gradient_checkpointing,
        gradient_accumulation_steps=gradient_accumulation_steps,
        learning_rate=lr,
        num_train_epochs=epochs,
        weight_decay=1,
        do_eval=True,
        evaluation_strategy="epoch",
        eval_steps=eval_steps,
        fp16=True,
        bf16=False,
        logging_strategy="steps",
        logging_steps=logging_steps,
        save_strategy="steps",
        save_steps=save_steps,
        save_total_limit=save_total_limit,
        max_steps=max_steps,
        local_rank=local_rank,
        warmup_steps=warmup_steps,
        report_to=[],
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    print("Training the model")
    trainer.train()

    print(f"Saving Model to {local_output_dir}")
    trainer.save_model(output_dir=local_output_dir)
    tokenizer.save_pretrained(local_output_dir)

    if dbfs_output_dir:
        print(f"Saving Model to {dbfs_output_dir}")
        trainer.save_model(output_dir=dbfs_output_dir)
        tokenizer.save_pretrained(dbfs_output_dir)

    print("Training finished.")

if __name__ == "__main__":
    kwargs = {
    "optimizer": {
      "type": "AdamW",
      "params": {
        "lr": "auto",
        "betas": "auto",
        "eps": "auto",
        "weight_decay": "auto"
      }
    },
    "scheduler": {
      "type": "WarmupLR",
      "params": {
        "warmup_min_lr": "auto",
        "warmup_max_lr": "auto",
        "warmup_num_steps": "auto"
      }
    },
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 2000,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": False
    }
    train(**kwargs)

TypeError: train() got an unexpected keyword argument 'optimizer'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name_or_path)
tokenizer.model_max_length = config.seq_length
tokenizer.padding_side = "right" 
tokenizer.pad_token, tokenizer.eos_token