In [1]:
from tqdm import tqdm
from datasets import load_dataset
import torch
from torch.utils.data import DataLoader

from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
)
from transformers import default_data_collator, Trainer, TrainingArguments
import torch.nn as nn
from short_hf import ShortHFModel
from transformers import AutoModelForCausalLM
import argparse
import shutil
import json
import os
from samsum import get_preprocessed_samsum, create_peft_config

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def count_parameters(module: nn.Module) -> int:
    return sum(p.numel() for p in module.parameters())

In [3]:
PARAMETER_BUDGET = 0

MODEL_PATH_DICT = {
    "llama3": "/nfs/ukrc_roma_ait/models/huggingface/meta-llama/Meta-Llama-3-8B",
    "llama3_instruct": "/nfs/ukrc_roma_ait/models/huggingface/meta-llama/Meta-Llama-3-8B-Instruct",
    "gemma2b": "google/gemma-2-2b-it",
    "llama3.2_1b": "/home/rdutt/Llama-3.2-1B/",
    "llama3.2_3b": "/home/rdutt/Llama-3.2-3B/",
}

model_name = 'llama3'

In [4]:
data = load_dataset("arcee-ai/sec-data-mini", split="train")
data = data.select(range(100))

In [5]:
len(data)

100

In [6]:
dataloader = DataLoader(
        data,
        batch_size=8,
        shuffle=True,
    )

In [7]:
MAX_SEQ_LEN = 1024
short_model = ShortHFModel(
    model_name=MODEL_PATH_DICT[model_name],
    # model_name="google/gemma-2-2b-it",
    layers_path="model.layers",
    n_prune_layers=10,
)

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:20<00:00,  5.17s/it]


In [10]:
original_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH_DICT[model_name], device_map='cpu')

Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.27s/it]


In [11]:
# original_model = short_model.model
tokenizer = short_model.tokenizer

In [None]:
for i, batch in enumerate(tqdm(dataloader)):
    prompts = batch['text']

    short_model.eval_importance(
        prompts=prompts,
        max_seq_len=MAX_SEQ_LEN,
        stride=256,
        max_gen_len=0
    )

In [13]:
layers_to_remove = list(range(20, 30))
layers_to_remove

[20, 21, 22, 23, 24, 25, 26, 27, 28, 29]

In [14]:
layers_to_remove = short_model.remove_layers(layers_to_remove)

In [15]:
len(original_model.model.layers), len(short_model.model.model.layers)

(32, 22)

In [16]:
PARAMETER_BUDGET = 0

for _layer_idx in layers_to_remove:
    PARAMETER_BUDGET += count_parameters(original_model.model.layers[_layer_idx])

PARAMETER_BUDGET

2181120000

In [60]:
peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules =["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",]
    )

peft_model = get_peft_model(short_model.model, peft_config)
peft_model.print_trainable_parameters()

trainable params: 14,417,920 || all params: 5,863,559,168 || trainable%: 0.2459


In [55]:
ROOT_SAVEPATH = "/nfs/ukrc_roma_ait/models/"
model_name = "TEST_Pruned_"+model_name+"_dataset_"+"arcee-ai/sec-data-mini".split("/")[-1]+"_layers_"+str(10)
MODEL_SAVEPATH = os.path.join(ROOT_SAVEPATH, "Pruned_Models", model_name)
MODEL_SAVEPATH

'/nfs/ukrc_roma_ait/models/Pruned_Models/TEST_Pruned_TEST_Pruned_TEST_Pruned_llama3_dataset_sec-data-mini_layers_10_dataset_sec-data-mini_layers_10_dataset_sec-data-mini_layers_10'

In [21]:
output_dir = os.path.join(MODEL_SAVEPATH, "logs")

In [61]:
config = {
        'lora_config': peft_config,
        'learning_rate': 1e-6,
        'num_train_epochs': 1,
        'gradient_checkpointing': False,
    }

training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            # logging strategies
            logging_strategy="steps",
            logging_steps=10,
            save_strategy="no",
            optim="adamw_torch",
            **{k:v for k,v in config.items() if k != 'lora_config'}
        )

In [62]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return { "text" : texts, }
pass

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [65]:
from trl import SFTTrainer

training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            # logging strategies
            logging_strategy="steps",
            logging_steps=100,
            per_device_train_batch_size=2,
            save_strategy="no",
            optim="adamw_torch",
            **{k:v for k,v in config.items() if k != 'lora_config'}
        )


trainer = SFTTrainer(
            model=peft_model,
            args=training_args,
            # train_dataset=get_preprocessed_samsum(tokenizer),
            train_dataset = dataset,
            dataset_text_field = "text",
            # data_collator=default_data_collator,
            callbacks=[],
        )


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [66]:
trainer.train()

Step,Training Loss
100,3.3699
200,0.0
300,0.0
400,0.0
500,0.0
600,0.0
700,0.0
800,0.0
900,0.0
1000,0.0


OutOfMemoryError: CUDA out of memory. Tried to allocate 1022.00 MiB. GPU 0 has a total capacty of 31.73 GiB of which 200.19 MiB is free. Including non-PyTorch memory, this process has 31.53 GiB memory in use. Of the allocated memory 30.59 GiB is allocated by PyTorch, and 584.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF