In [1]:
import os
cachedir = '/rscratch/tpang/kinshuk/cache'
os.environ["CUDA_VISIBLE_DEVICES"] = "5,6,7"
os.environ["TRANSFORMERS_CACHE"] = cachedir
os.environ["HF_DATASETS_CACHE"]= cachedir
from model import get_model
from loader.data_module import make_data_module
import json
import torch
import random
import logging
import argparse
import numpy as np
import transformers
from pathlib import Path
import torch.backends.mps
import torch.backends.cudnn
from torch.cuda import (
    max_memory_allocated,
    reset_peak_memory_stats,
    reset_max_memory_allocated,
    memory_allocated,
)
from loader.logger import get_logger
from transformers import set_seed
# from accelerate import Accelerator
from os.path import exists, join, isdir
from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence

from transformers.utils.logging import (
    set_verbosity_error as transformers_vb_err,
)
from datasets.utils.logging import (
    set_verbosity_error as datasets_vb_err,
)
logger = logging.getLogger(__name__)
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
from llamaft import ModelArguments, DataArguments, TrainingArguments, GenerationArguments

  from .autonotebook import tqdm as notebook_tqdm
2024-03-29 12:25:05.698668: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-29 12:25:05.700555: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-29 12:25:05.702294: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-29 12:25:05.736896: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler fl

In [2]:
# Setting up the arguments

model_args = ModelArguments(
    model_name_or_path="meta-llama/Llama-2-7b-hf"
)

data_args = DataArguments(
    eval_dataset_size=1024,
    max_eval_samples=50,
    dataset="alpaca",
)

training_args = TrainingArguments(
    output_dir="./output",
    logging_steps=10,
    data_seed=42,
    save_strategy="steps",
    evaluation_strategy="steps",
    logging_strategy="steps",
    do_eval=False,
    max_steps=5,
    eval_steps=187,
    adam_beta2=0.999,
    seed=7,
    sortby="alpha",
    num_layers=100,
    memlog=False,
)

generation_args = GenerationArguments(
    # Define generation-specific arguments here, if any are required
)

# If you need to use GenerationConfig or similar for generation_args
training_args.generation_config = transformers.GenerationConfig(
    **vars(generation_args)
)

# Combine arguments into a single Namespace object (if needed)
args = argparse.Namespace(
    **vars(model_args), **vars(data_args), **vars(training_args),
)

# Control randomness
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
# accelerate.utils.set_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
set_seed(args.seed)  # transformers seed

In [3]:
logger = logging.getLogger(__name__)
print(args)
gpus = torch.cuda.device_count()
# Memory Log Path
mempath = (
    f"/rscratch/tpang/kinshuk/RpMKin/llama_ft/{args.dataset}/"
    + f"{args.sortby}"
)
start_memory = [0] * gpus
end_memory = [0] * gpus
peek_memory = 0

if args.verbose:
        task_info = (
            f"\nSeed: {args.seed}\n\n"
            + f"Dataset: {args.dataset}\n\n"
            + f"Sort by: {args.sortby}\n\n"
            + f"Layers to train: {args.num_layers}\n"
        )
        print(task_info)
else:
    datasets_vb_err()
    transformers_vb_err()
    global _tqdm_active
    _tqdm_active = False

  "max_new_tokens": 256,
  "transformers_version": "4.31.0"
}
, cache_dir='/rscratch/tpang/kinshuk/cache', verbose=True, memlog=False, freeze=True, sortby='alpha', num_layers=100, sort_ascending=False, add_layer_norm=False, train_on_source=False, mmlu_split='eval', mmlu_dataset='mmlu-fs', do_mmlu_eval=False, max_mmlu_samples=None, mmlu_source_max_len=2048, adam8bit=False, double_quant=False, quant_type='nf4', bits=16, lora_r=8, lora_alpha=16, lora_dropout=0.0, full_finetune=False, max_memory_MB=45000, distributed_state=Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
, _n_gpu=3, __cached__setup_devices=device(type='cuda', index=0), deepspeed_plugin=None)

Seed: 7

Dataset: alpaca

Sort by: alpha

Layers to train: 100



In [4]:
def memall(gpus=gpus):
    for i in range(gpus):
        start_memory[i] = torch.cuda.memory_allocated(i)
    return sum(start_memory)

In [5]:
model, tokenizer = get_model(args)

for device in range(gpus):
    reset_peak_memory_stats(device=device)
    reset_max_memory_allocated(device=device)

weight_memory = memall()

Loading checkpoint shards: 100%|██████████| 2/2 [00:33<00:00, 16.67s/it]


Adding special tokens.
Sorted by  alpha
Training layers: ['model.layers.21.self_attn.v_proj', 'model.layers.22.self_attn.v_proj', 'model.layers.16.self_attn.v_proj', 'model.layers.19.self_attn.v_proj', 'model.layers.18.self_attn.v_proj', 'model.layers.30.self_attn.v_proj', 'model.layers.28.self_attn.v_proj', 'model.layers.17.self_attn.v_proj', 'model.layers.27.self_attn.v_proj', 'model.layers.11.self_attn.v_proj', 'model.layers.23.self_attn.o_proj', 'model.layers.3.mlp.up_proj', 'model.layers.21.self_attn.o_proj', 'model.layers.25.mlp.up_proj', 'model.layers.24.self_attn.o_proj', 'model.layers.20.self_attn.v_proj', 'model.layers.26.mlp.up_proj', 'model.layers.27.self_attn.o_proj', 'model.layers.25.self_attn.o_proj', 'model.layers.22.self_attn.o_proj', 'model.layers.24.mlp.up_proj', 'model.layers.19.self_attn.o_proj', 'model.layers.23.self_attn.v_proj', 'model.layers.27.mlp.up_proj', 'model.layers.22.mlp.down_proj', 'model.layers.20.self_attn.o_proj', 'model.layers.18.self_attn.o_proj',

In [6]:
data_module = make_data_module(tokenizer=tokenizer, args=args) # type: ignore

Splitting train dataset in train and validation according to `eval_dataset_size`


In [7]:
dataset = {k:v for k,v in data_module.items()}
dataset

{'train_dataset': Dataset({
     features: ['input', 'output', 'length'],
     num_rows: 50978
 }),
 'eval_dataset': Dataset({
     features: ['input', 'output', 'length'],
     num_rows: 50
 }),
 'predict_dataset': None,
 'data_collator': DataCollatorForCausalLM(tokenizer=LlamaTokenizer(name_or_path='meta-llama/Llama-2-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[PAD]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False), source_max_len=1024, target_max_len=256, train_on_source=False, predict_with_generate=False)}

In [8]:
dataset['train_dataset']['input'][1]

'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nComplete the following sentence with a proper verb:\nThe teacher ____ the results of the exam.\n\n### Response: '

In [9]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    dataset['train_dataset'], # type: ignore
    batch_size=1,
    collate_fn=dataset['data_collator']
)

input_memory = memall()- weight_memory

In [10]:
def loss_fn(x, y):
    "A Flat CrossEntropy" 
    return torch.nn.functional.cross_entropy(x.view(-1, x.shape[-1]), y.view(-1))

from tqdm.auto import tqdm
optimizer = torch.optim.Adam(model.parameters(), lr=2e-4)

for step, batch in enumerate(tqdm(train_dataloader)):
    batch = {k: v.to(model.device) for k, v in batch.items()}
    output = model(**batch)
    activation_memory = memall() - weight_memory
    # loss = loss_fn(out.logits, batch["labels"]) / config.gradient_accumulation_steps
    loss = output.loss
    loss.backward()
    gradient_memory = memall() - weight_memory
    optimizer.step()
    optimizer_memory = memall() - gradient_memory - weight_memory 
    optimizer.zero_grad()
    if step == 5:
        break

total_memory = memall()

  0%|          | 5/50978 [00:05<15:43:31,  1.11s/it]


In [11]:
memory_string = (
    f"Weight memory    : {weight_memory / 1e6} MB\n"
    f"Input memory     : {input_memory / 1e6} MB\n"
    f"Activation memory: {activation_memory / 1e6} MB\n"
    f"Gradient memory  : {gradient_memory / 1e6} MB\n"
    f"Optimizer memory : {optimizer_memory / 1e6} MB\n"
    f"Total memory     : {total_memory / 1e6} MB\n"
)

print(memory_string)


Weight memory    : 28136.505344 MB
Input memory     : 0.0 MB
Activation memory: 27918.253056 MB
Gradient memory  : 39917.12512 MB
Optimizer memory : 0.0 MB
Total memory     : 54770.253312 MB



In [12]:
if args.memlog: # Memory Logging
    log_info = (
        f"\n\n{args.dataset} "
        + f"{args.sortby} "
        + f"{args.num_layers} Layers "
    )
    Path(mempath).mkdir(parents=True, exist_ok=True)
    logger = get_logger(mempath, "memlog.log")
    logger.info(log_info)
    logger.info(
        f"\n{memory_string}\n"
    )
    logger.info(f"\nPeak Memory usage: {int(peek_memory/1e6)} MB\n\n")

In [13]:
if False and (args.do_train or args.do_eval or args.do_predict):
    metrics_file_path = os.path.join(args.output_dir,
                                f'trainseed_{args.seed}',
                                args.dataset,
                                f"{args.sortby}_asc_{args.sort_ascending}",
                                f"layers_{args.num_layers}",
                                "metrics.json")

    os.makedirs(os.path.dirname(metrics_file_path), exist_ok=True)
    with open(metrics_file_path, "w") as fout:
        fout.write(json.dumps(all_metrics))