In [1]:
import os
cachedir = '/rscratch/tpang/kinshuk/cache'
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
os.environ["TRANSFORMERS_CACHE"] = cachedir
os.environ["HF_DATASETS_CACHE"]= cachedir
from model import get_model
from loader.layers import param_count
from loader.data_module import make_data_module
import json
import torch
import random
import logging
import argparse
import numpy as np
import transformers
from pathlib import Path
import torch.backends.mps
import torch.backends.cudnn
from torch.cuda import (
    max_memory_allocated,
    reset_peak_memory_stats,
    reset_max_memory_allocated,
    memory_allocated,
)
from loader.logger import get_logger
from transformers import set_seed
# from accelerate import Accelerator
from os.path import exists, join, isdir
from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence

from transformers.utils.logging import (
    set_verbosity_error as transformers_vb_err,
)
from datasets.utils.logging import (
    set_verbosity_error as datasets_vb_err,
)
from transformers import Seq2SeqTrainer
from traineval.eval import eval_func
logger = logging.getLogger(__name__)
IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"
from llamaft import ModelArguments, DataArguments, TrainingArguments, GenerationArguments

  from .autonotebook import tqdm as notebook_tqdm
2024-04-03 03:20:20.413405: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-03 03:20:20.413439: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-03 03:20:20.414840: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-03 03:20:20.423192: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler fl

In [2]:
# Setting up the arguments

model_args = ModelArguments(
    model_name_or_path="meta-llama/Llama-2-7b-hf"
)

data_args = DataArguments(
    eval_dataset_size=1024,
    max_eval_samples=1000,
    source_max_len = 1024,
    target_max_len = 256,

    dataset="alpaca", # DATASET [alpaca|chip2|self-instruct|hh-rlhf|oasst1|longform]
)

training_args = TrainingArguments(
    output_dir="./output",
    data_seed=7,
    save_strategy="steps",
    evaluation_strategy="steps",
    logging_strategy="steps",
    do_eval=True,
    eval_steps=187,
    adam_beta2=0.999,

    learning_rate=2e-6,     # LEARNING RATE
    
    max_steps=2000,         # NUMBER OF STEPS

    sortby="alpha",         # CAN DO "alpha" or "lora"

    num_layers=4,           # NUMBER OF LAYERS FOR FULL FINE-TUNING

    per_device_train_batch_size = 2, # BATCH-SIZE
    memlog=False,
)

generation_args = GenerationArguments(
    max_new_tokens=128 # default is 256
)

# If you need to use GenerationConfig or similar for generation_args
training_args.generation_config = transformers.GenerationConfig(
    **vars(generation_args)
)

# Combine arguments into a single Namespace object (if needed)
args = argparse.Namespace(
    **vars(model_args), **vars(data_args), **vars(training_args),
)

# Control randomness
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
set_seed(args.seed)  # transformers seed

In [3]:
if 'lora' in args.sortby:
    args.num_layers = 0
logger = logging.getLogger(__name__)
gpus = torch.cuda.device_count()
mempath = (
    f"/rscratch/tpang/kinshuk/RpMKin/llama_ft/{args.dataset}/"
    + f"{args.sortby}"
)
start_memory = [0] * gpus
end_memory = [0] * gpus
peek_memory = 0

if args.verbose:
        task_info = (
            f"\nSeed: {args.seed}\n"
            + f"Dataset: {args.dataset}\n"
            + f"Sort by: {args.sortby}\n"
            + f"Layers to train: {args.num_layers}\n"
        )
        print(task_info)
else:
    datasets_vb_err()
    transformers_vb_err()
    global _tqdm_active
    _tqdm_active = False


Seed: 7
Dataset: alpaca
Sort by: alpha
Layers to train: 4



In [4]:
def memall(gpus=gpus):
    for i in range(gpus):
        start_memory[i] = torch.cuda.memory_allocated(i)
    return sum(start_memory)

model, tokenizer = get_model(args)

for device in range(gpus):
    reset_peak_memory_stats(device=device)
    reset_max_memory_allocated(device=device)

weight_memory = memall()

Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.43s/it]


Adding special tokens.
Sorted by  alpha
Training layers: ['model.layers.21.self_attn.v_proj', 'model.layers.22.self_attn.v_proj', 'model.layers.16.self_attn.v_proj', 'model.layers.19.self_attn.v_proj']
Enabling model.layers.16.self_attn.v_proj.weight parameter
Enabling model.layers.19.self_attn.v_proj.weight parameter
Enabling model.layers.21.self_attn.v_proj.weight parameter
Enabling model.layers.22.self_attn.v_proj.weight parameter


In [5]:
data_module = make_data_module(tokenizer=tokenizer, args=args) # type: ignore

Splitting train dataset in train and validation according to `eval_dataset_size`


In [6]:
dataset = {k:v for k,v in data_module.items()}
dataset

{'train_dataset': Dataset({
     features: ['input', 'output', 'length'],
     num_rows: 50978
 }),
 'eval_dataset': Dataset({
     features: ['input', 'output', 'length'],
     num_rows: 1000
 }),
 'predict_dataset': None,
 'data_collator': DataCollatorForCausalLM(tokenizer=LlamaTokenizer(name_or_path='meta-llama/Llama-2-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[PAD]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False), source_max_len=1024, target_max_len=256, train_on_source=False, predict_with_generate=False)}

In [7]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    dataset['train_dataset'], # type: ignore
    batch_size=args.per_device_train_batch_size,
    collate_fn=dataset['data_collator'],
    shuffle=True,
)

input_memory = memall()- weight_memory

train_dataloader.__len__()

25489

In [8]:
def loss_fn(x, y):
    "A Flat CrossEntropy" 
    return torch.nn.functional.cross_entropy(x.view(-1, x.shape[-1]), y.view(-1))

from tqdm.auto import tqdm
optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
train_losses = []
val_losses = []
val_accs = []

model.train()
for epoch in range(1):
        train_loss = 0
        tr_steps = 0
        for step, batch in enumerate(tqdm(train_dataloader)):
            optimizer.zero_grad()
            batch = {k: v.to(model.device) for k, v in batch.items()}
            output = model(**batch)
            activation_memory = memall() - weight_memory
            # loss = loss_fn(out.logits, batch["labels"]) / args.gradient_accumulation_steps
            loss = output.loss
            train_loss += loss.item()
            loss.backward()
            gradient_memory = memall() - weight_memory
            optimizer.step()
            optimizer_memory = memall() - gradient_memory - weight_memory 
            tr_steps += 1
            train_losses.append(train_loss/tr_steps)
            if step % 50 == 0:
                print(f'Step: {step}, Train Loss: {train_loss/tr_steps}')
            if step == args.max_steps:
                model.eval()
                break

total_memory = memall()

  0%|          | 2/25489 [00:01<3:39:12,  1.94it/s]

Step: 0, Train Loss: 1.2829551696777344


  0%|          | 51/25489 [00:11<1:23:37,  5.07it/s]

Step: 50, Train Loss: 1.633158242001253


  0%|          | 102/25489 [00:22<1:30:09,  4.69it/s]

Step: 100, Train Loss: 1.5235318393990545


  1%|          | 151/25489 [00:33<1:36:27,  4.38it/s]

Step: 150, Train Loss: 1.4431651913567094


  1%|          | 201/25489 [00:44<1:18:53,  5.34it/s]

Step: 200, Train Loss: 1.4104549600117242


  1%|          | 251/25489 [00:54<1:24:44,  4.96it/s]

Step: 250, Train Loss: 1.3686336842903577


  1%|          | 301/25489 [01:05<1:28:38,  4.74it/s]

Step: 300, Train Loss: 1.3517712794269041


  1%|▏         | 352/25489 [01:17<1:38:55,  4.24it/s]

Step: 350, Train Loss: 1.3536584560687726


  2%|▏         | 402/25489 [01:27<1:18:26,  5.33it/s]

Step: 400, Train Loss: 1.3303687304184026


  2%|▏         | 451/25489 [01:37<1:37:43,  4.27it/s]

Step: 450, Train Loss: 1.3162758380108557


  2%|▏         | 501/25489 [01:49<1:36:30,  4.32it/s]

Step: 500, Train Loss: 1.3065542754774797


  2%|▏         | 551/25489 [02:00<1:39:11,  4.19it/s]

Step: 550, Train Loss: 1.2942384045483197


  2%|▏         | 601/25489 [02:11<1:32:45,  4.47it/s]

Step: 600, Train Loss: 1.2926947735311982


  3%|▎         | 651/25489 [02:22<1:27:08,  4.75it/s]

Step: 650, Train Loss: 1.2878620744483995


  3%|▎         | 701/25489 [02:33<1:31:00,  4.54it/s]

Step: 700, Train Loss: 1.2831243690682546


  3%|▎         | 752/25489 [02:44<1:18:41,  5.24it/s]

Step: 750, Train Loss: 1.2760449403056133


  3%|▎         | 801/25489 [02:56<1:43:51,  3.96it/s]

Step: 800, Train Loss: 1.269196626510513


  3%|▎         | 851/25489 [03:06<1:28:55,  4.62it/s]

Step: 850, Train Loss: 1.2716259036375128


  4%|▎         | 901/25489 [03:17<1:32:59,  4.41it/s]

Step: 900, Train Loss: 1.2671615946107646


  4%|▎         | 951/25489 [03:29<1:49:41,  3.73it/s]

Step: 950, Train Loss: 1.2636506722928598


  4%|▍         | 1001/25489 [03:40<1:26:33,  4.72it/s]

Step: 1000, Train Loss: 1.260070458128974


  4%|▍         | 1051/25489 [03:51<1:29:59,  4.53it/s]

Step: 1050, Train Loss: 1.258226943940463


  4%|▍         | 1102/25489 [04:03<1:23:05,  4.89it/s]

Step: 1100, Train Loss: 1.2594866162397773


  5%|▍         | 1151/25489 [04:14<1:28:49,  4.57it/s]

Step: 1150, Train Loss: 1.2579274865127874


  5%|▍         | 1201/25489 [04:26<1:29:43,  4.51it/s]

Step: 1200, Train Loss: 1.2549936870278964


  5%|▍         | 1251/25489 [04:37<1:28:05,  4.59it/s]

Step: 1250, Train Loss: 1.2506046679070433


  5%|▌         | 1301/25489 [04:48<1:33:00,  4.33it/s]

Step: 1300, Train Loss: 1.2501974799122653


  5%|▌         | 1351/25489 [04:59<1:29:02,  4.52it/s]

Step: 1350, Train Loss: 1.2474244391031393


  5%|▌         | 1401/25489 [05:10<1:35:32,  4.20it/s]

Step: 1400, Train Loss: 1.2450046826374692


  6%|▌         | 1451/25489 [05:22<1:17:55,  5.14it/s]

Step: 1450, Train Loss: 1.240957829518453


  6%|▌         | 1501/25489 [05:33<1:39:05,  4.03it/s]

Step: 1500, Train Loss: 1.2393660918424163


  6%|▌         | 1552/25489 [05:44<1:13:24,  5.43it/s]

Step: 1550, Train Loss: 1.2358210859838261


  6%|▋         | 1602/25489 [05:55<1:25:44,  4.64it/s]

Step: 1600, Train Loss: 1.2333336866214677


  6%|▋         | 1652/25489 [06:05<1:14:46,  5.31it/s]

Step: 1650, Train Loss: 1.2292978589387462


  7%|▋         | 1702/25489 [06:17<1:18:04,  5.08it/s]

Step: 1700, Train Loss: 1.2268903006146334


  7%|▋         | 1752/25489 [06:28<1:15:32,  5.24it/s]

Step: 1750, Train Loss: 1.2231265308344996


  7%|▋         | 1802/25489 [06:39<1:02:35,  6.31it/s]

Step: 1800, Train Loss: 1.2217645296813513


  7%|▋         | 1851/25489 [06:50<1:36:08,  4.10it/s]

Step: 1850, Train Loss: 1.218809972560966


  7%|▋         | 1901/25489 [07:01<1:26:53,  4.52it/s]

Step: 1900, Train Loss: 1.215531432804216


  8%|▊         | 1951/25489 [07:12<1:24:35,  4.64it/s]

Step: 1950, Train Loss: 1.2118963566133758


  8%|▊         | 2000/25489 [07:23<1:26:42,  4.51it/s]

Step: 2000, Train Loss: 1.2110853766885654





In [9]:
optimizer.zero_grad()
model.eval()
trainer=Seq2SeqTrainer(
                model=model,
                tokenizer=tokenizer,
                args=training_args,
                **{k:v for k,v in data_module.items() if k != 'predict_dataset'},
            )
all_metrics = {"run_name": args.run_name}
if args.do_eval:
    all_metrics = eval_func(args, logger, trainer, all_metrics)
total_memory = memall()
peek_memory = max([max_memory_allocated(i) for i in range(gpus)])


INFO:__main__:*** Evaluate ***


***** eval metrics *****
  eval_loss               =     1.1706
  eval_runtime            = 0:01:07.61
  eval_samples_per_second =     14.791
  eval_steps_per_second   =      1.849


In [10]:
all_metrics

{'run_name': './output',
 'eval_loss': 1.1705690622329712,
 'eval_runtime': 67.611,
 'eval_samples_per_second': 14.791,
 'eval_steps_per_second': 1.849}

In [11]:
param_count(model)
memory_string = (
    f"Method           : {args.sortby}\n"
    f"Layers           : {args.num_layers}\n"
    f"Learning Rate    : {args.learning_rate}\n"
    f"Batch size       : {args.per_device_train_batch_size}\n"
    f"Weight memory    : {weight_memory / 1e6} MB\n"
    f"Activation memory: {activation_memory / 1e6} MB\n"
    f"Gradient memory  : {gradient_memory / 1e6} MB\n"
    f"Optimizer memory : {optimizer_memory / 1e6} MB\n"
    f"Total memory     : {total_memory / 1e6} MB\n"
    f"Peak memory      : {peek_memory / 1e6} MB\n"
)
print(memory_string)

Total params: 6738.42M, Trainable: 198.18M
Method           : alpha
Layers           : 4
Learning Rate    : 2e-06
Batch size       : 2
Weight memory    : 27087.929344 MB
Activation memory: 3427.019264 MB
Gradient memory  : 2445.723648 MB
Optimizer memory : 0.0 MB
Total memory     : 28740.913152 MB
Peak memory      : 37007.917056 MB



In [12]:
# SAVE TRAINING HISTORY
base = {"train_loss": train_loss,}
savepath = f"./output/{args.dataset}/lr_{args.learning_rate}/batch_{args.per_device_train_batch_size}/{args.sortby}/layers_{args.num_layers}"
if True:
    Path(savepath).mkdir(parents=True, exist_ok=True)
    np.save(os.path.join(savepath, "finetune.npy"), base) # type: ignore
    with open(os.path.join(savepath, "metrics.json"), "w") as fout:
        fout.write(json.dumps(all_metrics))
    log_info = (
        f"\n\n{args.dataset} "
        + f"Batch Size {args.per_device_train_batch_size} "
        + f"{args.sortby} fine-tuning "
        + f"{args.num_layers} Layers"
    )
    logger = get_logger(savepath, "memlog.log")
    logger.info(log_info)
    logger.info(f"\n{memory_string}\n")
    if (args.do_train or args.do_eval or args.do_predict):
        with open(os.path.join(savepath, "metrics.json"), "w") as fout:
            fout.write(json.dumps(all_metrics))

INFO:root:

alpaca Batch Size 2 alpha fine-tuning 4 Layers


2024-04-03 03:29:40;INFO;

alpaca Batch Size 2 alpha fine-tuning 4 Layers


INFO:root:
Method           : alpha
Layers           : 4
Learning Rate    : 2e-06
Batch size       : 2
Weight memory    : 27087.929344 MB
Activation memory: 3427.019264 MB
Gradient memory  : 2445.723648 MB
Optimizer memory : 0.0 MB
Total memory     : 28740.913152 MB
Peak memory      : 37007.917056 MB




2024-04-03 03:29:40;INFO;
Method           : alpha
Layers           : 4
Learning Rate    : 2e-06
Batch size       : 2
Weight memory    : 27087.929344 MB
Activation memory: 3427.019264 MB
Gradient memory  : 2445.723648 MB
Optimizer memory : 0.0 MB
Total memory     : 28740.913152 MB
Peak memory      : 37007.917056 MB


