In [1]:
cachedir = '/rscratch/tpang/kinshuk/cache'
from model import get_model
from traineval.eval import eval_func
from traineval.train import train_func
from loader.callbacks import mmlu_callback
from loader.data_module import make_data_module
import os
import json
import torch
import random
import logging
import argparse
import numpy as np
import transformers
from pathlib import Path
import torch.backends.mps
import torch.backends.cudnn
from torch.cuda import (
    max_memory_allocated,
    reset_peak_memory_stats,
    reset_max_memory_allocated,
    memory_allocated,
)
from loader.logger import get_logger
from transformers import ( 
    set_seed,
    Seq2SeqTrainer,
)
# from accelerate import Accelerator
from os.path import exists, join, isdir
from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence

from transformers.utils.logging import (
    set_verbosity_error as transformers_vb_err,
)
from datasets.utils.logging import (
    set_verbosity_error as datasets_vb_err,
)

logger = logging.getLogger(__name__)

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"

  from .autonotebook import tqdm as notebook_tqdm
2024-02-22 23:21:23.981934: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-22 23:21:23.981966: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-22 23:21:23.983739: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-22 23:21:23.991969: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler fl

In [2]:
# Point both Hugging Face caches at the shared cache directory defined once
# in the first cell (`cachedir`) instead of repeating the path literal.
# NOTE(review): `transformers` and `datasets` are already imported in the
# first cell; both libraries appear to resolve their cache locations at
# import time, so these variables may be set too late to take effect —
# confirm, and consider moving this above the imports.
os.environ["TRANSFORMERS_CACHE"] = cachedir
os.environ["HF_DATASETS_CACHE"] = cachedir

In [3]:
from llamaft import ModelArguments, DataArguments, TrainingArguments, GenerationArguments

In [10]:
# Build the argument objects that main() reads from module scope.

# Base checkpoint to fine-tune.
model_args = ModelArguments(model_name_or_path="meta-llama/Llama-2-7b-hf")

# Dataset selection and evaluation-set sizing.
data_args = DataArguments(
    dataset="oasst1",
    eval_dataset_size=1024,
    max_eval_samples=50,
)

training_args = TrainingArguments(
    # Output location and reproducibility.
    output_dir="./output",
    seed=7,
    data_seed=42,
    # Schedule: a very short run.
    # NOTE(review): eval_steps is larger than max_steps, so step-based
    # evaluation presumably never triggers during training — confirm intent.
    max_steps=5,
    eval_steps=187,
    logging_steps=10,
    save_strategy="steps",
    evaluation_strategy="steps",
    logging_strategy="steps",
    # NOTE(review): do_eval=False is combined with evaluation_strategy="steps";
    # the recorded run below still prints eval metrics, so this flag looks
    # like it is overridden when the arguments are initialised — confirm.
    do_eval=False,
    adam_beta2=0.999,
    # Layer-selection options (project-specific fields).
    sortby="random",
    num_layers=15,
    memlog=False,
)

# No generation-specific overrides; the defaults of GenerationArguments apply.
generation_args = GenerationArguments()

# Attach a GenerationConfig built from the generation arguments to the
# training arguments.
training_args.generation_config = transformers.GenerationConfig(
    **vars(generation_args)
)

# Flatten the three argument groups into the single namespace that main()
# reads as `args`.
args = argparse.Namespace(
    **vars(model_args),
    **vars(data_args),
    **vars(training_args),
)

In [11]:
def main():
    """Run one fine-tuning / evaluation pass configured by the module-level ``args``.

    Reads the notebook-level ``args`` namespace and ``training_args`` object.
    Steps, in order:

    1. Seed ``random``, NumPy, PyTorch (CPU and all CUDA devices) and
       transformers, and force deterministic cuDNN behaviour.
    2. Reset per-GPU peak-memory statistics and record the memory allocated
       on every visible GPU.
    3. Build the model/tokenizer (``get_model``), the data module
       (``make_data_module``) and a ``Seq2SeqTrainer``; optionally attach the
       MMLU callback.
    4. Train and/or evaluate depending on ``args.do_train`` / ``args.do_eval``.
    5. Print (and, when ``args.memlog`` is set, also log to ``memlog.log``)
       the memory usage before/after and the summed peak usage.
    6. When any of ``do_train`` / ``do_eval`` / ``do_predict`` is set, write
       the collected metrics to ``metrics.json`` under ``args.output_dir``.

    Returns:
        None.
    """
    logger = logging.getLogger(__name__)

    print(args)
    # Only export the cache location when one is configured; assigning None
    # to os.environ would raise TypeError.
    if args.cache_dir:
        os.environ["TRANSFORMERS_CACHE"] = args.cache_dir

    # The returned index is not needed, but the call is kept on purpose.
    # NOTE(review): it presumably also forces CUDA initialisation before the
    # per-device memory-stat calls below — confirm before removing.
    torch.cuda.current_device()
    gpus = torch.cuda.device_count()

    # Short label for the sort criterion, used in the log and metrics paths.
    sortby_lower = args.sortby.lower()
    if "alpha" in sortby_lower:
        sby = "alpha"
    elif "layer" in sortby_lower:
        sby = "layer"
    else:
        sby = "rand"

    # Memory Log Path
    mempath = (
        f"/rscratch/tpang/kinshuk/RpMKin/llama_ft/{args.dataset}/"
        + f"{sby}"
    )

    # Control randomness
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # accelerate.utils.set_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    set_seed(args.seed)  # transformers seed

    start_memory = [0] * gpus
    end_memory = [0] * gpus
    peak_memory = 0
    # Memory Stats Initialization
    # reset_peak_memory_stats already resets every peak counter, so the
    # deprecated reset_max_memory_allocated alias is not called in addition.
    for device in range(gpus):
        reset_peak_memory_stats(device=device)
        start_memory[device] = memory_allocated(device=device)

    if args.verbose:
        task_info = (
            f"\n\n\nSeed: {args.seed}\n\n"
            + f"Dataset: {args.dataset}\n\n"
            + f"Sort by: {args.sortby}\n\n"
            + f"Sort Descending: {not args.sort_ascending}\n\n"
            + f"Layers to train: {args.num_layers}\n\n\n"
        )
        print(task_info)
    else:
        datasets_vb_err()
        transformers_vb_err()
        # Turn off the transformers progress bars through the public API.
        # The previous `global _tqdm_active = False` only created an unused
        # global in this notebook and did not reach the library's own flag.
        transformers.utils.logging.disable_progress_bar()

    # WIP >>>------------------------------------------>

    model, tokenizer = get_model(args)

    data_module = make_data_module(tokenizer=tokenizer, args=args) # type: ignore

    # The trainer receives every data-module entry except the prediction set.
    trainer_kwargs = {
        key: value
        for key, value in data_module.items()
        if key != 'predict_dataset'
    }
    trainer = Seq2SeqTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        **trainer_kwargs,
    )

    if args.do_mmlu_eval:
        trainer = mmlu_callback(args, tokenizer, trainer)

    all_metrics = {"run_name": args.run_name}

    # Train
    if args.do_train:
        all_metrics = train_func(args, logger, trainer, all_metrics)

    # Eval
    if args.do_eval:
        all_metrics = eval_func(args, logger, trainer, all_metrics)

    # Peak usage is summed over all GPUs.
    for device in range(gpus):
        end_memory[device] = memory_allocated(device=device)
        peak_memory += max_memory_allocated(device=device)

    # Convert once (bytes -> MB, truncated) and reuse for print and log.
    before_mb = int(sum(start_memory)/1e6)
    after_mb = int(sum(end_memory)/1e6)
    peak_mb = int(peak_memory/1e6)

    print(
        f"\n\n\nMemory usage before: {before_mb} MB\n"
        + f"Memory usage after: {after_mb} MB"
    )
    print(f"\nPeak Memory usage: {peak_mb} MB\n\n\n")

    # WIP <-----------------------------------------<<<

    if args.memlog: # Memory Logging
        log_info = (
            f"\n\n{args.dataset} "
            + f"{args.num_layers} Layers "
            + f"{args.sortby} "
            + f"Ascending {args.sort_ascending}"
        )
        Path(mempath).mkdir(parents=True, exist_ok=True)
        # Separate name so the run logger above is not shadowed.
        mem_logger = get_logger(mempath, "memlog.log")
        mem_logger.info(log_info)
        mem_logger.info(
            f"\nMemory usage before: {before_mb} MB\n"
            + f"Memory usage after: {after_mb} MB"
        )
        mem_logger.info(f"\nPeak Memory usage: {peak_mb} MB\n\n")

    if (args.do_train or args.do_eval or args.do_predict):
        metrics_file_path = os.path.join(
            args.output_dir,
            f'trainseed_{args.seed}',
            args.dataset,
            f"{sby}_asc_{args.sort_ascending}",
            f"layers_{args.num_layers}",
            "metrics.json",
        )

        os.makedirs(os.path.dirname(metrics_file_path), exist_ok=True)
        with open(metrics_file_path, "w") as fout:
            json.dump(all_metrics, fout)


if __name__ == "__main__":
    main()

  "max_new_tokens": 256,
  "transformers_version": "4.31.0"
}
, cache_dir='/rscratch/tpang/kinshuk/cache', verbose=True, memlog=False, freeze=True, sortby='random', num_layers=15, sort_ascending=False, add_layer_norm=False, train_on_source=False, mmlu_split='eval', mmlu_dataset='mmlu-fs', do_mmlu_eval=False, max_mmlu_samples=None, mmlu_source_max_len=2048, full_finetune=False, max_memory_MB=12000, distributed_state=Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
, _n_gpu=8, __cached__setup_devices=device(type='cuda', index=0), deepspeed_plugin=None)



Seed: 7

Dataset: oasst1

Sort by: random

Sort Descending: True

Layers to train: 15







Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.41s/it]


Adding special tokens.
Sorted by  alpha
Training layers: ['model.layers.11.mlp.up_proj', 'model.layers.5.self_attn.o_proj', 'model.layers.14.self_attn.o_proj', 'model.layers.23.mlp.up_proj', 'model.layers.1.mlp.up_proj', 'model.layers.2.mlp.gate_proj', 'model.layers.30.self_attn.q_proj', 'model.layers.19.mlp.gate_proj', 'model.layers.3.self_attn.o_proj', 'model.layers.13.self_attn.v_proj', 'model.layers.21.self_attn.v_proj', 'model.layers.2.self_attn.q_proj', 'model.layers.18.self_attn.o_proj', 'model.layers.7.mlp.up_proj', 'model.layers.1.self_attn.v_proj']
Enabling model.layers.1.self_attn.v_proj.weight parameter
Enabling model.layers.1.mlp.up_proj.weight parameter
Enabling model.layers.2.self_attn.q_proj.weight parameter
Enabling model.layers.2.mlp.gate_proj.weight parameter
Enabling model.layers.3.self_attn.o_proj.weight parameter
Enabling model.layers.5.self_attn.o_proj.weight parameter
Enabling model.layers.7.mlp.up_proj.weight parameter
Enabling model.layers.11.mlp.up_proj.weigh



Splitting train dataset in train and validation according to `eval_dataset_size`




Step,Training Loss,Validation Loss


***** train metrics *****
  epoch                    =       0.01
  total_flos               =   759104GF
  train_loss               =     1.5364
  train_runtime            = 0:00:38.85
  train_samples_per_second =      2.059
  train_steps_per_second   =      0.129


***** eval metrics *****
  epoch                   =       0.01
  eval_loss               =     1.5692
  eval_runtime            = 0:00:17.51
  eval_samples_per_second =      2.855
  eval_steps_per_second   =        0.4



Memory usage before: 0 MB
Memory usage after: 27156 MB

Peak Memory usage: 30715 MB



