In [1]:
from collections import defaultdict
import copy
import json
import os
from os.path import exists, join, isdir
from dataclasses import dataclass, field
import sys
from typing import Optional, Dict, Sequence
import numpy as np
from tqdm import tqdm
import logging
import bitsandbytes as bnb
import pandas as pd

from common.utils import *

import torch
import transformers
from torch.nn.utils.rnn import pad_sequence
import argparse
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    set_seed,
    Seq2SeqTrainer,
    BitsAndBytesConfig,
    LlamaTokenizer

)
from datasets import load_dataset, Dataset
import evaluate

from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)
from peft.tuners.lora import LoraLayer
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

In [2]:
if torch.cuda.is_available():   
    torch.backends.cuda.matmul.allow_tf32 = True

In [3]:
logger = logging.getLogger(__name__)

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"

In [4]:
## We set the ModelArguments
@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(
        default = "EleutherAI/pythia-12b"
    )
    trust_remote_code: Optional[bool] = field(
        default = False,
        metadata = {"help": "Enable unpickling of arbitrary code in AutoModelForCausalLM#from_pretrained."}
    )
    use_auth_token: Optional[bool] = field(
        default = False,
        metadata = {"help": "Enables using Huggingface auth token from Git Credentials."}
    )

In [5]:
## We set the DataArguments
@dataclass
class DataArguments:
    eval_dataset_size: int = field(
        default = 1024, metadata={"help": "Size of validation dataset."}
    )
    max_train_samples: Optional[int] = field(
        default = None,
        metadata = {
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_eval_samples: Optional[int] = field(
        default = None,
        metadata = {
            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
            "value if set."
        },
    )
    source_max_len: int = field(
        default = 1024,
        metadata = {"help": "Maximum source sequence length. Sequences will be right padded (and possibly truncated)."},
    )
    target_max_len: int = field(
        default = 256,
        metadata = {"help": "Maximum target sequence length. Sequences will be right padded (and possibly truncated)."},
    )
    dataset: str = field(
        default = 'alpaca',
        metadata = {"help": "Which dataset to finetune on. See datamodule for options."}
    )
    dataset_format: Optional[str] = field(
        default = None,
        metadata = {"help": "Which dataset format is used. [alpaca|chip2|self-instruct|hh-rlhf]"}
    )

In [6]:
@dataclass
class TrainingArguments(transformers.Seq2SeqTrainingArguments):
    cache_dir: Optional[str] = field(default = None)
    train_on_source: Optional[bool] = field(
        default = False,
        metadata = {"help": "Whether to train on the input in addition to the target text."}
    )
    mmlu_split: Optional[str] = field(default = 'eval', metadata = {"help": "The MMLU split to run on"})
    mmlu_dataset: Optional[str] = field(
        default = 'mmlu-fs',
        metadata = {"help": "MMLU dataset to use: options are `mmlu-zs` for zero-shot or `mmlu-fs` for few shot."}
    )
    do_mmlu_eval: Optional[bool] = field(default = False, metadata = {"help": "Whether to run the MMLU evaluation."})
    max_mmlu_samples: Optional[int] = field(
        default = None,
        metadata = {"help": "If set, only evaluates on `max_mmlu_samples` of the MMMLU dataset."}
    )
    mmlu_source_max_len: int = field(default = 2048, metadata = {"help": "Maximum source sequence length for mmlu."})
    full_finetune: bool = field(default = False, metadata = {"help": "Finetune the entire model without adapters."})
    adam8bit: bool = field(default = False, metadata = {"help": "Use 8-bit adam."})
    double_quant: bool = field(
        default = True,
        metadata = {"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default = "nf4",
        metadata = {"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    bits: int = field(default = 4, metadata = {"help": "How many bits to use."})
    lora_r: int = field(default = 64, metadata = {"help": "Lora R dimension."})
    lora_alpha: float = field(default = 16, metadata = {"help": " Lora alpha."})
    lora_dropout: float = field(default = 0.0, metadata = {"help":"Lora dropout."})
    max_memory_MB: int = field(default = 80000, metadata = {"help": "Free memory per gpu."})
    report_to: str = field(default = 'none', metadata = {"help": "To use wandb or something else for reporting."})
    output_dir: str = field(default = './output', metadata = {"help": 'The output dir for logs and checkpoints'})
    optim: str = field(default = 'paged_adamw_32bit', metadata = {"help": 'The optimizer to be used'})
    per_device_train_batch_size: int = field(default = 2, metadata = {"help": 'The training batch size per GPU. Increase for better speed.'})
    gradient_accumulation_steps: int = field(default = 16, metadata = {"help": 'How many gradients to accumulate before to perform an optimizer step'})
    max_steps: int = field(default = 10000, metadata = {"help": 'How many optimizer update steps to take'})
    weight_decay: float = field(default = 0.0, metadata = {"help": 'The L2 weight decay rate of AdamW'}) # use lora dropout instead for regularization if needed
    learning_rate: float = field(default = 0.0002, metadata = {"help": 'The learnign rate'})
    remove_unused_columns: bool = field(default = False, metadata = {"help": 'Removed unused columns. Needed to make this codebase work.'})
    max_grad_norm: float = field(default = 0.3, metadata = {"help": 'Gradient clipping max norm. This is tuned and works well for all models tested.'})
    gradient_checkpointing: bool = field(default = True, metadata = {"help": 'Use gradient checkpointing. You want to use this.'})
    do_train: bool = field(default = True, metadata = {"help": 'To train or not to train, that is the question?'})
    lr_scheduler_type: str = field(default = 'constant', metadata = {"help": 'Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis'})
    warmup_ratio: float = field(default = 0.03, metadata = {"help": 'Fraction of steps to do a warmup for'})
    logging_steps: int = field(default = 10, metadata = {"help": 'The frequency of update steps after which to log the loss'})
    group_by_length: bool = field(default = True, metadata = {"help": 'Group sequences into batches with same length. Saves memory and speeds up training considerably.'})
    save_strategy: str = field(default = 'steps', metadata = {"help": 'When to save checkpoints'})
    save_steps: int = field(default = 250, metadata = {"help": 'How often to save a model'})
    save_total_limit: int = field(default = 40, metadata = {"help": 'How many checkpoints to save before the oldest is overwritten'})

In [7]:
@dataclass
class GenerationArguments:
    # For more hyperparameters check:
    # https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
    # Length arguments
    max_new_tokens: Optional[int] = field(
        default = 256,
        metadata = {"help": "Maximum number of new tokens to be generated in evaluation or prediction loops"
                            "if predict_with_generate is set."}
    )
    min_new_tokens : Optional[int] = field(default = None, metadata = {"help": "Minimum number of new tokens to generate."})

    # Generation strategy
    do_sample: Optional[bool] = field(default = False)
    num_beams: Optional[int] = field(default = 1)
    num_beam_groups: Optional[int] = field(default = 1)
    penalty_alpha: Optional[float] = field(default = None)
    use_cache: Optional[bool] = field(default = True)

    # Hyperparameters for logit manipulation
    temperature: Optional[float] = field(default = 1.0)
    top_k: Optional[int] = field(default = 50)
    top_p: Optional[float] = field(default = 1.0)
    typical_p: Optional[float] = field(default = 1.0)
    diversity_penalty: Optional[float] = field(default = 0.0)
    repetition_penalty: Optional[float] = field(default = 1.0)
    length_penalty: Optional[float] = field(default = 1.0)
    no_repeat_ngram_size: Optional[int] = field(default = 0)

### Model functions

In [8]:
if torch.cuda.is_available():
    n_gpus = torch.cuda.device_count()
if is_ipex_available() and torch.xpu.is_available():
    n_gpus = torch.xpu.device_count()

In [9]:
torch.cuda.is_bf16_supported()

True

In [11]:
## sets the maximum memory that can be used per device and automatically determines a device mapping for model parts
max_memory = f'{80000}MB'
max_memory = {i: max_memory for i in range(2)}
device_map = "auto"

# if we are in a distributed setting, we need to set the device map and max memory per device
if os.environ.get('LOCAL_RANK') is not None:
    local_rank = int(os.environ.get('LOCAL_RANK', '0'))
    device_map = {'': local_rank}
    max_memory = {'': max_memory[local_rank]}

In [10]:
model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.2"

In [12]:
## model is loaded from a pre-trained state, with parameters for quantization and other configurations determined by the arguments.
print(f'loading base model {model_name_or_path}...')
compute_dtype = torch.bfloat16
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    cache_dir = None,
    device_map = device_map,
    max_memory = max_memory,
    quantization_config = BitsAndBytesConfig(
        load_in_4bit = True,
        load_in_8bit = False,
        llm_int8_threshold = 6.0,
        llm_int8_has_fp16_weight = False,
        bnb_4bit_compute_dtype = compute_dtype,
        bnb_4bit_use_double_quant = True,
        bnb_4bit_quant_type = "nf4",
    ),
    torch_dtype = torch.bfloat16,
    trust_remote_code = False,
    use_auth_token = False
)

loading base model mistralai/Mistral-7B-Instruct-v0.2...




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [13]:
## flags the model as parallelizable and model-parallel ready
setattr(model, 'model_parallel', True)
setattr(model, 'is_parallelizable', True)

In [14]:
model.config.torch_dtype = torch.bfloat16

In [15]:
# Initializing the Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    cache_dir = None,
    padding_side = "right",
    use_fast = False, # Fast tokenizer giving issues.
    tokenizer_type = 'llama' if 'llama' in model_name_or_path else None, # Needed for HF name change
    trust_remote_code = False,
    use_auth_token = False,
)



tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

In [16]:
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))
    
    if num_new_tokens > 0:
        input_embeddings_data = model.get_input_embeddings().weight.data
        output_embeddings_data = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)

        input_embeddings_data[-num_new_tokens:] = input_embeddings_avg
        output_embeddings_data[-num_new_tokens:] = output_embeddings_avg

In [17]:
if tokenizer._pad_token is None:
    smart_tokenizer_and_embedding_resize(
        special_tokens_dict = dict(pad_token = DEFAULT_PAD_TOKEN),
        tokenizer = tokenizer,
        model = model,
    )

In [18]:
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing = True)

In [19]:
# used to find and list the names of all the linear modules
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])


    if 'lm_head' in lora_module_names: # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

In [20]:
print(f'adding LoRA modules...')
modules = find_all_linear_names(model)
config = LoraConfig(
    r = 64,
    lora_alpha = 16,
    target_modules = modules,
    lora_dropout = 0.0,
    bias = "none",
    task_type = "CAUSAL_LM",
)
model = get_peft_model(model, config)

adding LoRA modules...


In [21]:
## iterates through the named modules of the model to perform type casting to the appropriate data types
for name, module in model.named_modules():
    if isinstance(module, LoraLayer):
        module = module.to(torch.bfloat16)
    if 'norm' in name:
        module = module.to(torch.float32)
    if 'lm_head' in name or 'embed_tokens' in name:
        if hasattr(module, 'weight'):
            if module.weight.dtype == torch.float32:
                module = module.to(torch.bfloat16)

In [22]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    if 4 == 4: trainable_params /= 2
    print(
        f"trainable params: {trainable_params} || "
        f"all params: {all_param} || "
        f"trainable: {100 * trainable_params / all_param}"
    )

In [23]:
print_trainable_parameters(model)

trainable params: 83886080.0 || all params: 3919851520 || trainable: 2.140032079582443


### Data Functions

### Testing train fn

In [24]:
hfparser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments, GenerationArguments))

In [27]:
hfparser

HfArgumentParser(prog='ipykernel_launcher.py', usage=None, description=None, formatter_class=<class 'argparse.ArgumentDefaultsHelpFormatter'>, conflict_handler='error', add_help=True)

: 

In [26]:
model_args, data_args, training_args, generation_args, extra_args = hfparser.parse_args_into_dataclasses(return_remaining_strings=True)

usage: ipykernel_launcher.py [-h] [--model_name_or_path MODEL_NAME_OR_PATH]
                             [--trust_remote_code [TRUST_REMOTE_CODE]]
                             [--use_auth_token [USE_AUTH_TOKEN]]
                             [--eval_dataset_size EVAL_DATASET_SIZE]
                             [--max_train_samples MAX_TRAIN_SAMPLES]
                             [--max_eval_samples MAX_EVAL_SAMPLES]
                             [--source_max_len SOURCE_MAX_LEN]
                             [--target_max_len TARGET_MAX_LEN]
                             [--dataset DATASET]
                             [--dataset_format DATASET_FORMAT]
                             [--output_dir OUTPUT_DIR]
                             [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]]
                             [--do_train [DO_TRAIN]] [--no_do_train]
                             [--do_eval [DO_EVAL]] [--do_predict [DO_PREDICT]]
                             [--evaluation_strategy {no,steps,epo

SystemExit: 2