In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "5"

from loader.logger import get_logger
from model import get_model, smart_tokenizer_and_embedding_resize
from llamaft import ModelArguments, DataArguments, TrainingArguments, GenerationArguments
from loader.callbacks import mmlu_callback
from loader.data_module import make_data_module
from traineval.train import train_func
from traineval.eval import eval_func

import sys
import json
import torch
import random
import logging
import argparse
import numpy as np
import transformers
from pathlib import Path
# import accelerate.utils
import torch.backends.mps
import torch.backends.cudnn
from torch.cuda import (
    max_memory_allocated,
    reset_peak_memory_stats,
    reset_max_memory_allocated,
    memory_allocated,
)
from transformers import ( 
    set_seed,
    Seq2SeqTrainer,
    PreTrainedTokenizer,
    TrainerCallback,
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
)
# from accelerate import Accelerator
from os.path import exists, join, isdir
from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence

from transformers.utils.logging import (
    set_verbosity_error as transformers_vb_err,
)
from datasets.utils.logging import (
    set_verbosity_error as datasets_vb_err,
)

import evaluate
from tqdm import tqdm  
from datasets import load_dataset

import copy
import pandas as pd
from datasets import load_dataset, Dataset
from torch.nn.utils.rnn import pad_sequence

IGNORE_INDEX = -100
DEFAULT_PAD_TOKEN = "[PAD]"

logdir = "/scratch/vipul/"
os.environ["TRANSFORMERS_CACHE"] = os.path.join(logdir, "cache")
os.environ["HF_DATASETS_CACHE"]= os.path.join(logdir, "cache")

INFO:datasets:PyTorch version 2.1.2 available.


In [2]:
# Setting up the arguments

model_args = ModelArguments(
    model_name_or_path="meta-llama/Llama-2-7b-hf"
)

data_args = DataArguments(
    eval_dataset_size=1024,
    max_eval_samples=50,
    dataset="oasst1",
)

training_args = TrainingArguments(
    output_dir="./output",
    logging_steps=10,
    data_seed=42,
    save_strategy="steps",
    evaluation_strategy="steps",
    logging_strategy="steps",
    do_eval=False,
    max_steps=10,
    eval_steps=187,
    adam_beta2=0.999,
    seed=7,
    sortby="lora",
    num_layers=15,
    memlog=False,
    per_device_train_batch_size=1,
)

training_args.bits = 4
training_args.per_device_train_batch_size = 1

generation_args = GenerationArguments(
    # Define generation-specific arguments here, if any are required
)

# If you need to use GenerationConfig or similar for generation_args
training_args.generation_config = transformers.GenerationConfig(
    **vars(generation_args)
)

# Combine arguments into a single Namespace object (if needed)
args = argparse.Namespace(
    **vars(model_args), **vars(data_args), **vars(training_args),
)
args.do_eval = False

In [3]:
args.lora_modules = ['v_proj']

In [4]:

logger = logging.getLogger(__name__)
os.environ["TRANSFORMERS_CACHE"] = args.cache_dir
cuda_device = torch.cuda.current_device()
gpus = torch.cuda.device_count()

# Control randomness
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
# accelerate.utils.set_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
set_seed(args.seed)  # transformers seed

start_memory = [0] * gpus
end_memory = [0] * gpus
peek_memory = 0
# Memory Stats Initialization
for device in range(gpus):
    reset_peak_memory_stats(device=device)
    reset_max_memory_allocated(device=device)
    start_memory[device] = memory_allocated(device=device)

In [5]:
model, tokenizer = get_model(args)
# Memory usage is 6314 MB

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Adding special tokens.
adding LoRA modules...


In [6]:
data_module = make_data_module(tokenizer=tokenizer, args=args) # type: ignore
# Memory usage is still 6314 MB

In [7]:
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    **{k:v for k,v in data_module.items() if k != 'predict_dataset'},
)

if args.do_mmlu_eval:
    trainer = mmlu_callback(args, tokenizer, trainer)

all_metrics = {"run_name": args.run_name}
# Memory usage is still 6314 MB

In [12]:
torch.cuda.memory_allocated()/(1024**2)

5095.171875

In [13]:
torch.cuda.memory_reserved()/(1024**2)

5810.0

In [8]:
# # Function to register the hooks
# def get_activation(name):
#     def hook(model, input, output):
#         activations[name] = output.detach()
#     return hook

# activations = {}
# model.base_model.model.model.layers[0].self_attn.q_proj.register_forward_hook(get_activation('q_proj_0'))
# model.base_model.model.model.layers[0].self_attn.v_proj.register_forward_hook(get_activation('v_proj_0'))

<torch.utils.hooks.RemovableHandle at 0x7fabd07cd750>

In [9]:
# Train
if args.do_train:
    all_metrics = train_func(args, logger, trainer, all_metrics)

# Eval
if args.do_eval:
    all_metrics = eval_func(args, logger, trainer, all_metrics)

for device in range(gpus):
    end_memory[device] = memory_allocated(device=device)
    peek_memory += max_memory_allocated(device=device)
print(f"\nPeak Memory usage: {int(peek_memory/1e6)} MB")

INFO:__main__:*** Train ***


Step,Training Loss,Validation Loss


***** train metrics *****
  epoch                    =        0.0
  total_flos               =    94918GF
  train_loss               =       1.86
  train_runtime            = 0:00:03.97
  train_samples_per_second =      2.515
  train_steps_per_second   =      2.515

Peak Memory usage: 7928 MB


In [14]:
# for name, param in model.named_parameters():
#     print(f"{name}: {param.requires_grad}")

In [23]:
data_module.items()

dict_items([('train_dataset', Dataset({
    features: ['input', 'output', 'length'],
    num_rows: 9846
})), ('eval_dataset', None), ('predict_dataset', None), ('data_collator', DataCollatorForCausalLM(tokenizer=LlamaTokenizer(name_or_path='meta-llama/Llama-2-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[PAD]', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, source_max_len=1024

In [22]:
model(data_module['train_dataset']['output'][0])

AttributeError: 'str' object has no attribute 'shape'