In [1]:
import os
import json
import torch
import random
import logging
import argparse
import numpy as np
import transformers
from pathlib import Path
# import accelerate.utils
import torch.backends.mps
import torch.backends.cudnn
from torch.cuda import (
    max_memory_allocated,
    reset_peak_memory_stats,
    reset_max_memory_allocated,
    memory_allocated,
)
from loader.logger import get_logger
from transformers import ( 
    set_seed,
    Seq2SeqTrainer,
    PreTrainedTokenizer,
    TrainerCallback,
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
)
# from accelerate import Accelerator
from os.path import exists, join, isdir
from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence

from transformers.utils.logging import (
    set_verbosity_error as transformers_vb_err,
)
from datasets.utils.logging import (
    set_verbosity_error as datasets_vb_err,
)

import evaluate
from tqdm import tqdm  
from datasets import load_dataset

import copy
import pandas as pd
from datasets import load_dataset, Dataset
from torch.nn.utils.rnn import pad_sequence

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"

  from .autonotebook import tqdm as notebook_tqdm
2024-02-22 22:48:07.200209: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-22 22:48:07.200257: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-22 22:48:07.201911: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-22 22:48:07.210798: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler fl

In [2]:
os.environ["TRANSFORMERS_CACHE"] = "/rscratch/tpang/kinshuk/cache"
os.environ["HF_DATASETS_CACHE"]="/rscratch/tpang/kinshuk/cache"

In [3]:
@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(
        default="meta-llama/Llama-2-7b-hf"
    )
    trust_remote_code: Optional[bool] = field(
        default=False,
        metadata={"help": "Enable unpickling of arbitrary code in AutoModelForCausalLM#from_pretrained."}
    )
    use_auth_token: Optional[bool] = field(
        default=False,
        metadata={"help": "To use Huggingface auth token from Git Credentials."}
    )

@dataclass
class DataArguments:
    eval_dataset_size: int = field(
        default=1024, metadata={"help": "Size of validation dataset."}
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging, truncate the number of train examples."
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging, truncate the number of eval examples."
        },
    )
    source_max_len: int = field(
        default=1024,
        metadata={"help": "Maximum source sequence length."},
    )
    target_max_len: int = field(
        default=256,
        metadata={"help": "Maximum target sequence length."},
    )
    dataset: str = field(
        default='alpaca',
        metadata={"help": "Which dataset to finetune on. See datamodule for options."}
    )
    dataset_format: Optional[str] = field(
        default=None,
        metadata={"help": "Dataset format being used. [alpaca|chip2|self-instruct|hh-rlhf]"}
    )

@dataclass
class TrainingArguments(transformers.Seq2SeqTrainingArguments):
    seed: Optional[int] = field(
        default=7,
        metadata={"help": "Random seed for reproducibility."}
    )
    cache_dir: Optional[str] = field(
        default='/rscratch/tpang/kinshuk/cache',
    )
    verbose: Optional[bool] = field(
        default=True,
        metadata={"help": "Whether to print verbose output."}
    )
    memlog: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to log memory usage."}
    )
    freeze: Optional[bool] = field(
        default=True,
        metadata={"help": "Whether to freeze the model."}
    )
    sortby: str = field(
        default='random',
        metadata={"help": "Layer sorting method. [random|alpha|layer]"}
    )
    num_layers: int = field(
        default=0,
        metadata={"help": "Number of layers to train."}
    )
    sort_ascending: bool = field(
        default=False,
        metadata={"help": "Whether to train in ascending order of layer sorting method."}
    )
    add_layer_norm: bool = field(
        default=False,
        metadata={"help": "Whether to add layer norm to the layers being trained."}
    )
    train_on_source: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to train on the input in addition to the target text."}
    )
    mmlu_split: Optional[str] = field(
        default='eval',
        metadata={"help": "The MMLU split to run on"}
    )
    mmlu_dataset: Optional[str] = field(
        default='mmlu-fs',
        metadata={"help": "MMLU dataset to use: [`mmlu-zs`:zero-shot|`mmlu-fs`:few-shot]."}
    )
    do_mmlu_eval: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to run the MMLU evaluation."}
    )
    max_mmlu_samples: Optional[int] = field(
        default=None,
        metadata={"help": "If set, only evaluates on `max_mmlu_samples` of the MMMLU dataset."}
    )
    mmlu_source_max_len: int = field(
        default=2048,
        metadata={"help": "Maximum source sequence length for MMLU."}
    )
    full_finetune: bool = field(
        default=False,
        metadata={"help": "Finetune the entire model without adapters."}
    )
    max_memory_MB: int = field(
        default=12000,
        metadata={"help": "Free memory per gpu."}
    )
    report_to: str = field(
        default='none',
        metadata={"help": "To use wandb or something else for reporting."}
    )
    output_dir: str = field(default='./output', metadata={"help": 'The output dir for logs and checkpoints'})
    optim: str = field(default='paged_adamw_32bit', metadata={"help": 'The optimizer to be used'})
    per_device_train_batch_size: int = field(default=1, metadata={"help": 'The training batch size per GPU.'})
    gradient_accumulation_steps: int = field(default=16, metadata={"help": 'Gradients to accumulate before performing an optimizer step'})
    max_steps: int = field(default=10000, metadata={"help": 'How many optimizer update steps to take'})
    weight_decay: float = field(default=0.0, metadata={"help": 'The L2 weight decay rate of AdamW'}) # use lora dropout instead for regularization if needed
    learning_rate: float = field(default=0.0002, metadata={"help": 'The learnign rate'})
    remove_unused_columns: bool = field(default=False, metadata={"help": 'Removed unused columns. Needed to make this codebase work.'})
    max_grad_norm: float = field(default=0.3, metadata={"help": 'Gradient clipping max norm. This is tuned and works well for all models tested.'})
    gradient_checkpointing: bool = field(default=True, metadata={"help": 'Use gradient checkpointing. You want to use this.'})
    do_train: bool = field(default=True, metadata={"help": 'To train or not.'})
    lr_scheduler_type: str = field(default='constant', metadata={"help": 'Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis'})
    warmup_ratio: float = field(default=0.03, metadata={"help": 'Fraction of steps to do a warmup for'})
    logging_steps: int = field(default=10, metadata={"help": 'The frequency of update steps after which to log the loss'})
    group_by_length: bool = field(default=True, metadata={"help": 'Group sequences into batches with same length. Saves memory and speeds up training considerably.'})
    save_strategy: str = field(default='steps', metadata={"help": 'When to save checkpoints'})
    save_steps: int = field(default=250, metadata={"help": 'How often to save a model'})
    save_total_limit: int = field(default=40, metadata={"help": 'How many checkpoints to save before the oldest is overwritten'})

@dataclass
class GenerationArguments:
    # For more hyperparameters check:
    # https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
    # Length arguments
    max_new_tokens: Optional[int] = field(
        default=256,
        metadata={"help": "Max number of new tokens to be generated in eval or prediction loops"
                          "if predict_with_generate is set."}
    )
    min_new_tokens : Optional[int] = field(
        default=None,
        metadata={"help": "Min number of new tokens to generate."}
    )

    # Generation strategy
    do_sample: Optional[bool] = field(default=False)
    num_beams: Optional[int] = field(default=1)
    num_beam_groups: Optional[int] = field(default=1)
    penalty_alpha: Optional[float] = field(default=None)
    use_cache: Optional[bool] = field(default=True)

    # Hyperparameters for logit manipulation
    temperature: Optional[float] = field(default=1.0)
    top_k: Optional[int] = field(default=50)
    top_p: Optional[float] = field(default=1.0)
    typical_p: Optional[float] = field(default=1.0)
    diversity_penalty: Optional[float] = field(default=0.0)
    repetition_penalty: Optional[float] = field(default=1.0)
    length_penalty: Optional[float] = field(default=1.0)
    no_repeat_ngram_size: Optional[int] = field(default=0)

In [4]:
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: PreTrainedTokenizer,
    model: PreTrainedModel,
):
    """Borrowed from qlora codebase
    Resize tokenizer and embedding.

    Note: This is the unoptimized version that may make embedding size not be divisible by 64.
    """
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))
    
    if num_new_tokens > 0:
        input_embeddings_data = model.get_input_embeddings().weight.data
        output_embeddings_data = model.get_output_embeddings().weight.data

        input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean( # type: ignore
            dim=0, keepdim=True)
        output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean( # type: ignore
            dim=0, keepdim=True)

        input_embeddings_data[-num_new_tokens:] = input_embeddings_avg # type: ignore
        output_embeddings_data[-num_new_tokens:] = output_embeddings_avg # type: ignore

In [5]:
def get_model(args):

    if torch.cuda.is_available():
        n_gpus = torch.cuda.device_count()
    
    max_memory = f'{args.max_memory_MB}MB'
    max_memory = {i: max_memory for i in range(n_gpus)}
    device_map = "auto"

    if os.environ.get('LOCAL_RANK') is not None:
        local_rank = int(os.environ.get('LOCAL_RANK', '0'))
        device_map = {'': local_rank}
        max_memory = {'': max_memory[local_rank]}

    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path, 
        token="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",
        device_map=device_map,
        max_memory=max_memory,
    )

    model.config.use_cache = False
    setattr(model, 'model_parallel', True)
    setattr(model, 'is_parallelizable', True)

    # Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name_or_path,
        token="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",
        cache_dir=args.cache_dir,
        padding_side="right",
        use_fast=False, # Fast tokenizer giving issues.
        tokenizer_type='llama' if 'llama' in args.model_name_or_path else None, # Needed for HF name change
        trust_remote_code=args.trust_remote_code,
        # use_auth_token=args.use_auth_token,
    )
    if tokenizer._pad_token is None:
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=tokenizer, # type: ignore
            model=model,
        )
    if 'llama' in args.model_name_or_path or isinstance(tokenizer, LlamaTokenizer):
        # LLaMA tokenizer may not have correct special tokens set.
        # Check and add them if missing to prevent them from being parsed into different tokens.
        # Note that these are present in the vocabulary.
        # Note also that `model.config.pad_token_id` is 0 which corresponds to `<unk>` token.
        print('Adding special tokens.')
        tokenizer.add_special_tokens({
                "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
                "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
                "unk_token": tokenizer.convert_ids_to_tokens(
                    model.config.pad_token_id
                    if model.config.pad_token_id != -1
                    else tokenizer.pad_token_id # type: ignore
                ),
        })

    for name, module in model.named_modules():
        if 'norm' in name:
            module = module.to(torch.float32)
        if 'lm_head' in name or 'embed_tokens' in name:
            if hasattr(module, 'weight'):
                if args.bf16 and module.weight.dtype == torch.float32:
                    module = module.to(torch.bfloat16) 
    
    # SELECTIVE FINETUNING >>>------------------------------------->

    if args.freeze:
        for name, param in model.named_parameters():
            param.requires_grad = False
            if "lm_head" in name:
                param.requires_grad = True
    else:
        for name, param in model.named_parameters():  # type: ignore
            param.requires_grad = True
        return model, tokenizer
    
    # if "lora" not in args.sortby.lower():
    #     # Save WeightWatcher Metrics
    #     watcher = ww.WeightWatcher(model=model)
    #     ww_details = watcher.analyze(min_evals=10)

    # if not args.debug and "lora" not in args.sortby.lower():
    #     ww_details.to_csv(os.path.join(stats_path, f"epoch_{epoch}.csv"))  # type: ignore

    ww_details = pd.read_csv("./llama_ww.csv")
    # CHOOSING LAYERS TO TRAIN BASED ON WEIGHTWATCHER METRICS/SORTBY
    if "lora" not in args.sortby.lower():
        filtered = ww_details[  # type: ignore
            ww_details["longname"].str.contains("embed_tokens") == False  # type: ignore
        ]
        sortby = "alpha"
        if args.num_layers > len(filtered):
            args.num_layers = len(filtered)
        if "random" in (args.sortby).lower():
            train_names = random.sample(filtered["longname"].to_list(), args.num_layers)
        else:
            if "alpha" in (args.sortby).lower():
                sortby = "alpha"
            elif "layer" in (args.sortby).lower():
                sortby = "layer_id"
            else:
                sortby = "random"
            train_names = (
                filtered.sort_values(by=[sortby], ascending=args.sort_ascending)[
                    "longname"
                ]
                .iloc[: args.num_layers]
                .to_list()
            )
        if args.verbose:
            print("Sorted by ", sortby)
            print("Training layers:", train_names)
        layer_to_train = []
        for layer in train_names:
            layer_to_train.append(layer + ".weight")
            layer_to_train.append(layer + ".bias")
            # Add Layer Norm
            if args.add_layer_norm:
                if "output" in layer:
                    layer_to_train.append(
                        layer.replace("dense", "LayerNorm") + ".weight"
                    )
                    layer_to_train.append(layer.replace("dense", "LayerNorm") + ".bias")
        layer_to_train = list(set(layer_to_train))
        # print("Final Training layers:", layer_to_train)
        for name, param in model.named_parameters():
            if name in layer_to_train:
                if args.verbose:
                    print(f"Enabling {name} parameter")
                param.requires_grad = True
    
    for name, module in model.named_modules():
        if 'norm' in name:
            module = module.to(torch.float32)
        if 'lm_head' in name or 'embed_tokens' in name:
            if hasattr(module, 'weight'):
                if args.bf16 and module.weight.dtype == torch.float32:
                    module = module.to(torch.bfloat16) 
    
    return model, tokenizer

In [6]:
@dataclass
class DataCollatorForCausalLM(object):
    """Borrowed from qlora codebase."""
    tokenizer: PreTrainedTokenizer
    source_max_len: int
    target_max_len: int
    train_on_source: bool
    predict_with_generate: bool

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        # Extract elements
        sources = [f"{self.tokenizer.bos_token}{example['input']}" for example in instances]
        targets = [f"{example['output']}{self.tokenizer.eos_token}" for example in instances]
        # Tokenize
        tokenized_sources_with_prompt = self.tokenizer(
            sources,
            max_length=self.source_max_len,
            truncation=True,
            add_special_tokens=False,
        )
        tokenized_targets = self.tokenizer(
            targets,
            max_length=self.target_max_len,
            truncation=True,
            add_special_tokens=False,
        )
        # Build the input and labels for causal LM
        input_ids = []
        labels = []
        for tokenized_source, tokenized_target in zip(
            tokenized_sources_with_prompt['input_ids'], # type: ignore
            tokenized_targets['input_ids'] # type: ignore
        ):
            if not self.predict_with_generate:
                input_ids.append(torch.tensor(tokenized_source + tokenized_target))
                if not self.train_on_source:
                    labels.append(
                        torch.tensor([IGNORE_INDEX for _ in range(len(tokenized_source))] + copy.deepcopy(tokenized_target))
                    )
                else:
                    labels.append(torch.tensor(copy.deepcopy(tokenized_source + tokenized_target)))
            else:
                input_ids.append(torch.tensor(tokenized_source))
        # Apply padding
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id) # type: ignore
        labels = pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX) if not self.predict_with_generate else None
        data_dict = {
            'input_ids': input_ids,
            'attention_mask':input_ids.ne(self.tokenizer.pad_token_id), # type: ignore
        }
        if labels is not None:
            data_dict['labels'] = labels
        return data_dict

ALPACA_PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response: "
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response: "
    ),
}

def extract_alpaca_dataset(example):
    if example.get("input", "") != "":
        prompt_format = ALPACA_PROMPT_DICT["prompt_input"]
    else:
        prompt_format = ALPACA_PROMPT_DICT["prompt_no_input"]
    return {'input': prompt_format.format(**example)}

def local_dataset(dataset_name):
    if dataset_name.endswith('.json') or dataset_name.endswith('.jsonl'):
        full_dataset = Dataset.from_json(path_or_paths=dataset_name)
    elif dataset_name.endswith('.csv'):
        full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name))
    elif dataset_name.endswith('.tsv'):
        full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name, delimiter='\t'))
    else:
        raise ValueError(f"Unsupported dataset format: {dataset_name}")

    split_dataset = full_dataset.train_test_split(test_size=0.1) # type: ignore
    return split_dataset

def make_data_module(tokenizer: PreTrainedTokenizer, args) -> Dict:
    """
    Make dataset and collator for supervised fine-tuning.
    Datasets are expected to have the following columns: { `input`, `output` }

    Available datasets to be selected with `dataset` argument:
        - alpaca, 52002 examples
        - alpaca cleaned, 51942 examples
        - chip2 (OIG), 210289 examples
        - self-instruct, 82612 examples
        - hh-rlhf (Anthropic), 160800 examples
        - longform, 23.7k examples
        - oasst1 (OpenAssistant) primary message tree only, 9,846 examples

    Coming soon:
        - unnatural instructions core, 66010 examples
        - unnatural instructions full, 240670 examples
        - alpaca-gpt4, 52002 examples
        - unnatural-instructions-gpt4, 9000 examples
        - supernatural-instructions, 69624 examples (same as paper with 100 ex/task more can be used)
        - flan (FLAN v2), up to 20M examples available
        - vicuna

    """
    def load_data(dataset_name):
        if dataset_name == 'alpaca':
            return load_dataset("tatsu-lab/alpaca")
        elif dataset_name == 'alpaca-clean':
            return load_dataset("yahma/alpaca-cleaned")
        elif dataset_name == 'chip2':
            return load_dataset("laion/OIG", data_files='unified_chip2.jsonl')
        elif dataset_name == 'self-instruct':
            return load_dataset("yizhongw/self_instruct", name='self_instruct')
        elif dataset_name == 'hh-rlhf':
            return load_dataset("Anthropic/hh-rlhf")
        elif dataset_name == 'longform':
            return load_dataset("akoksal/LongForm")
        elif dataset_name == 'oasst1':
            return load_dataset("timdettmers/openassistant-guanaco")
        elif dataset_name == 'vicuna':
            raise NotImplementedError("Vicuna data was not released.")
        else:
            if os.path.exists(dataset_name):
                try:
                    args.dataset_format = args.dataset_format if args.dataset_format else "input-output"
                    full_dataset = local_dataset(dataset_name)
                    return full_dataset
                except:
                    raise ValueError(f"Error loading dataset from {dataset_name}")
            else:
                raise NotImplementedError(f"Dataset {dataset_name} not implemented yet.")

    def format_dataset(dataset, dataset_format):
        if (
            dataset_format == 'alpaca' or dataset_format == 'alpaca-clean' or
            (dataset_format is None and args.dataset in ['alpaca', 'alpaca-clean'])
        ):
            dataset = dataset.map(extract_alpaca_dataset, remove_columns=['instruction'])
        elif dataset_format == 'chip2' or (dataset_format is None and args.dataset == 'chip2'):
            dataset = dataset.map(lambda x: {
                'input': x['text'].split('\n<bot>: ')[0].replace('<human>: ', ''),
                'output': x['text'].split('\n<bot>: ')[1],
            })
        elif dataset_format == 'self-instruct' or (dataset_format is None and args.dataset == 'self-instruct'):
            for old, new in [["prompt", "input"], ["completion", "output"]]:
                dataset = dataset.rename_column(old, new)
        elif dataset_format == 'hh-rlhf' or (dataset_format is None and args.dataset == 'hh-rlhf'):
            dataset = dataset.map(lambda x: {
                'input': '',
                'output': x['chosen']
            })
        elif dataset_format == 'oasst1' or (dataset_format is None and args.dataset == 'oasst1'):
            dataset = dataset.map(lambda x: {
                'input': '',
                'output': x['text'],
            })
        elif dataset_format == 'input-output':
            # leave as is
            pass
        # Remove unused columns.
        dataset = dataset.remove_columns(
            [col for col in dataset.column_names['train'] if col not in ['input', 'output']]
        )
        return dataset

     # Load dataset.
    dataset = load_data(args.dataset)
    dataset = format_dataset(dataset, args.dataset_format)

    # Split train/eval, reduce size
    if args.do_eval or args.do_predict:
        if 'eval' in dataset:
            eval_dataset = dataset['eval']
        else:
            print('Splitting train dataset in train and validation according to `eval_dataset_size`')
            dataset = dataset["train"].train_test_split(
                test_size=args.eval_dataset_size, shuffle=True, seed=42
            )
            eval_dataset = dataset['test']
        if args.max_eval_samples is not None and len(eval_dataset) > args.max_eval_samples:
            eval_dataset = eval_dataset.select(range(args.max_eval_samples))
        if args.group_by_length:
            eval_dataset = eval_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})
    if args.do_train:
        train_dataset = dataset['train']
        if args.max_train_samples is not None and len(train_dataset) > args.max_train_samples:
            train_dataset = train_dataset.select(range(args.max_train_samples))
        if args.group_by_length:
            train_dataset = train_dataset.map(lambda x: {'length': len(x['input']) + len(x['output'])})

    data_collator = DataCollatorForCausalLM(
        tokenizer=tokenizer,
        source_max_len=args.source_max_len,
        target_max_len=args.target_max_len,
        train_on_source=args.train_on_source,
        predict_with_generate=args.predict_with_generate,
    )
    return dict(
        train_dataset=train_dataset if args.do_train else None,
        eval_dataset=eval_dataset if args.do_eval else None,
        predict_dataset=eval_dataset if args.do_predict else None,
        data_collator=data_collator
    )

In [7]:
# Train
def train_func(args, logger, trainer, all_metrics):
    if args.do_train:
        logger.info("*** Train ***")
        # Note: `resume_from_checkpoint` not supported for adapter checkpoints by HF.
        # Currently adapter checkpoint is reloaded as expected but optimizer/scheduler states are not.
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        # trainer.save_state()
        all_metrics.update(metrics)
    return all_metrics

In [8]:
# Eval
def eval_func(args, logger, trainer, all_metrics):
    if args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(metric_key_prefix="eval")
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)
        all_metrics.update(metrics)
    return all_metrics

In [9]:
def mmlu_callback(args, tokenizer, trainer):
    if args.do_mmlu_eval:
        if args.mmlu_dataset == 'mmlu-zs':
            mmlu_dataset = load_dataset("json", data_files={
                'eval': 'data/mmlu/zero_shot_mmlu_val.json',
                'test': 'data/mmlu/zero_shot_mmlu_test.json',
            })
            mmlu_dataset = mmlu_dataset.remove_columns('subject')
        # MMLU Five-shot (Eval/Test only)
        elif args.mmlu_dataset == 'mmlu' or args.mmlu_dataset == 'mmlu-fs':
            mmlu_dataset = load_dataset("json", data_files={
                'eval': 'data/mmlu/five_shot_mmlu_val.json',
                'test': 'data/mmlu/five_shot_mmlu_test.json',
            })
            # mmlu_dataset = mmlu_dataset.remove_columns('subject')
        mmlu_dataset = mmlu_dataset[args.mmlu_split] # type: ignore
        if args.max_mmlu_samples is not None:
            mmlu_dataset = mmlu_dataset.select(range(args.max_mmlu_samples))
        abcd_idx = [
            tokenizer("A", add_special_tokens=False).input_ids[0],
            tokenizer("B", add_special_tokens=False).input_ids[0],
            tokenizer("C", add_special_tokens=False).input_ids[0],
            tokenizer("D", add_special_tokens=False).input_ids[0],
        ]
        accuracy = evaluate.load("accuracy")
        class MMLUEvalCallback(TrainerCallback):
            def on_evaluate(self, args, state, control, model, **kwargs):
                data_loader = trainer.get_eval_dataloader(mmlu_dataset)
                source_max_len = trainer.data_collator.source_max_len
                trainer.data_collator.source_max_len = args.mmlu_source_max_len
                trainer.model.eval()
                preds, refs = [], []
                loss_mmlu = 0
                for batch in tqdm(data_loader, total=len(data_loader)):
                    (loss, logits, labels) = trainer.prediction_step(trainer.model,batch,prediction_loss_only=False,)
                    # There are two tokens, the output, and eos token.
                    for i, logit in enumerate(logits):
                        label_non_zero_id = (batch['labels'][i] != -100).nonzero()[0][0]
                        logit_abcd = logit[label_non_zero_id-1][abcd_idx]
                        preds.append(torch.argmax(logit_abcd).item())
                    labels = labels[labels != IGNORE_INDEX].view(-1, 2)[:,0]
                    refs += [abcd_idx.index(label) for label in labels.tolist()]
                    loss_mmlu += loss.item()
                # Extract results by subject.
                results = {'mmlu_loss':loss_mmlu/len(data_loader)}
                subject = mmlu_dataset['subject']
                subjects = {s:{'refs':[], 'preds':[]} for s in set(subject)}
                for s,p,r in zip(subject, preds, refs):
                    subjects[s]['preds'].append(p)
                    subjects[s]['refs'].append(r)
                subject_scores = []
                for subject in subjects:
                    subject_score = accuracy.compute(
                        references=subjects[subject]['refs'],
                        predictions=subjects[subject]['preds']
                    )['accuracy'] # type: ignore
                    results[f'mmlu_{args.mmlu_split}_accuracy_{subject}'] = subject_score
                    subject_scores.append(subject_score)
                results[f'mmlu_{args.mmlu_split}_accuracy'] = np.mean(subject_scores)
                trainer.log(results)
                trainer.data_collator.source_max_len = source_max_len

        trainer.add_callback(MMLUEvalCallback)
    return trainer

In [11]:
# Setting up the arguments

model_args = ModelArguments(
    model_name_or_path="meta-llama/Llama-2-7b-hf"
)

data_args = DataArguments(
    eval_dataset_size=1024,
    max_eval_samples=50,
    dataset="oasst1",
)

training_args = TrainingArguments(
    output_dir="./output",
    logging_steps=10,
    data_seed=42,
    save_strategy="steps",
    evaluation_strategy="steps",
    logging_strategy="steps",
    do_eval=False,
    max_steps=5,
    eval_steps=187,
    adam_beta2=0.999,
    seed=7,
    sortby="random",
    num_layers=15,
    memlog=False,
)

generation_args = GenerationArguments(
    # Define generation-specific arguments here, if any are required
)

# If you need to use GenerationConfig or similar for generation_args
training_args.generation_config = transformers.GenerationConfig(
    **vars(generation_args)
)

# Combine arguments into a single Namespace object (if needed)
args = argparse.Namespace(
    **vars(model_args), **vars(data_args), **vars(training_args),
)

In [12]:
def main():
    logger = logging.getLogger(__name__)

    print(args)
    os.environ["TRANSFORMERS_CACHE"] = args.cache_dir
    cuda_device = torch.cuda.current_device()
    gpus = torch.cuda.device_count()
    sby = args.sortby
    if "alpha" in (args.sortby).lower():
        sby = "alpha"
    elif "layer" in (args.sortby).lower():
        sby = "layer"
    else:
        sby = "rand"

    # Memory Log Path
    mempath = (
        f"/rscratch/tpang/kinshuk/RpMKin/llama_ft/{args.dataset}/"
        + f"{sby}"
    )
    
    # Control randomness
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # accelerate.utils.set_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    set_seed(args.seed)  # transformers seed
    
    start_memory = [0] * gpus
    end_memory = [0] * gpus
    peek_memory = 0
    # Memory Stats Initialization
    for device in range(gpus):
        reset_peak_memory_stats(device=device)
        reset_max_memory_allocated(device=device)
        start_memory[device] = memory_allocated(device=device)

    if args.verbose:
        task_info = (
            f"\n\n\nSeed: {args.seed}\n\n"
            + f"Dataset: {args.dataset}\n\n"
            + f"Sort by: {args.sortby}\n\n"
            + f"Sort Descending: {not args.sort_ascending}\n\n"
            + f"Layers to train: {args.num_layers}\n\n\n"
        )
        print(task_info)
    else:
        datasets_vb_err()
        transformers_vb_err()
        global _tqdm_active
        _tqdm_active = False

    # WIP >>>------------------------------------------>

    model, tokenizer = get_model(args)

    data_module = make_data_module(tokenizer=tokenizer, args=args) # type: ignore

    trainer = Seq2SeqTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        **{k:v for k,v in data_module.items() if k != 'predict_dataset'},
    )

    if args.do_mmlu_eval:
        trainer = mmlu_callback(args, tokenizer, trainer)

    all_metrics = {"run_name": args.run_name}

    # Train
    if args.do_train:
        all_metrics = train_func(args, logger, trainer, all_metrics)
    
    # Eval
    if args.do_eval:
        all_metrics = eval_func(args, logger, trainer, all_metrics)

    for device in range(gpus):
        end_memory[device] = memory_allocated(device=device)
        peek_memory += max_memory_allocated(device=device)
    print(
        f"\n\n\nMemory usage before: {int(sum(start_memory)/1e6)} MB\n"\
        +f"Memory usage after: {int(sum(end_memory)/1e6)} MB"
    )
    print(f"\nPeak Memory usage: {int(peek_memory/1e6)} MB\n\n\n")

    # WIP <-----------------------------------------<<<

    if args.memlog: # Memory Logging
        log_info = (
            f"\n\n{args.dataset} "
            + f"{args.num_layers} Layers "
            + f"{args.sortby} "
            + f"Ascending {args.sort_ascending}"
        )
        Path(mempath).mkdir(parents=True, exist_ok=True)
        logger = get_logger(mempath, "memlog.log")
        logger.info(log_info)
        logger.info(
            f"\nMemory usage before: {int(sum(start_memory)/1e6)} MB\n"
            + f"Memory usage after: {int(sum(end_memory)/1e6)} MB"
        )
        logger.info(f"\nPeak Memory usage: {int(peek_memory/1e6)} MB\n\n")

    if (args.do_train or args.do_eval or args.do_predict):
        metrics_file_path = os.path.join(args.output_dir,
                                    f'trainseed_{args.seed}',
                                    args.dataset,
                                    f"{sby}_asc_{args.sort_ascending}",
                                    f"layers_{args.num_layers}",
                                    "metrics.json")

        os.makedirs(os.path.dirname(metrics_file_path), exist_ok=True)
        with open(metrics_file_path, "w") as fout:
            fout.write(json.dumps(all_metrics))


if __name__ == "__main__":
    main()

  "transformers_version": "4.31.0"
}
, cache_dir='/rscratch/tpang/kinshuk/cache', verbose=True, memlog=False, freeze=True, sortby='random', num_layers=15, sort_ascending=False, add_layer_norm=False, train_on_source=False, mmlu_split='eval', mmlu_dataset='mmlu-fs', do_mmlu_eval=False, max_mmlu_samples=None, mmlu_source_max_len=2048, full_finetune=False, max_memory_MB=12000, distributed_state=Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
, _n_gpu=8, __cached__setup_devices=device(type='cuda', index=0), deepspeed_plugin=None)



Seed: 7

Dataset: oasst1

Sort by: random

Sort Descending: True

Layers to train: 15





Loading checkpoint shards: 100%|██████████| 2/2 [00:15<00:00,  7.66s/it]


Adding special tokens.
Sorted by  alpha
Training layers: ['model.layers.11.mlp.up_proj', 'model.layers.5.self_attn.o_proj', 'model.layers.14.self_attn.o_proj', 'model.layers.23.mlp.up_proj', 'model.layers.1.mlp.up_proj', 'model.layers.2.mlp.gate_proj', 'model.layers.30.self_attn.q_proj', 'model.layers.19.mlp.gate_proj', 'model.layers.3.self_attn.o_proj', 'model.layers.13.self_attn.v_proj', 'model.layers.21.self_attn.v_proj', 'model.layers.2.self_attn.q_proj', 'model.layers.18.self_attn.o_proj', 'model.layers.7.mlp.up_proj', 'model.layers.1.self_attn.v_proj']
Enabling model.layers.1.self_attn.v_proj.weight parameter
Enabling model.layers.1.mlp.up_proj.weight parameter
Enabling model.layers.2.self_attn.q_proj.weight parameter
Enabling model.layers.2.mlp.gate_proj.weight parameter
Enabling model.layers.3.self_attn.o_proj.weight parameter
Enabling model.layers.5.self_attn.o_proj.weight parameter
Enabling model.layers.7.mlp.up_proj.weight parameter
Enabling model.layers.11.mlp.up_proj.weigh



Splitting train dataset in train and validation according to `eval_dataset_size`




Step,Training Loss,Validation Loss


***** train metrics *****
  epoch                    =       0.01
  total_flos               =   759104GF
  train_loss               =     1.5364
  train_runtime            = 0:00:38.88
  train_samples_per_second =      2.057
  train_steps_per_second   =      0.129


***** eval metrics *****
  epoch                   =       0.01
  eval_loss               =     1.5692
  eval_runtime            = 0:00:17.54
  eval_samples_per_second =       2.85
  eval_steps_per_second   =      0.399



Memory usage before: 0 MB
Memory usage after: 27156 MB

Peak Memory usage: 30715 MB



