In [1]:
DEVICE = "cuda"
DATASET = "iamtarun/python_code_instructions_18k_alpaca"

####### SHORTEND_LLM #######
MODEL_NAME="deepseek-ai/LASER_PRUNED_LORA_deepseek-coder-7b-instruct-v1.5" # path to model
NUM_CALIB_DATA=10 # Что-то с датасетом
NUM_PRUNED_BLOCKS=6 # Количество блоков для прунинга с минимальной метрикой
OUTPUT_SENSITIVITY=f"cache/shortend_llm/output_block_sensitivity/{MODEL_NAME}/taylor_n{NUM_CALIB_DATA}"
OUTPUT_PRUNE=f"cache/shortend_llm/output_prune/{MODEL_NAME}/taylor_n{NUM_CALIB_DATA}/rm_{NUM_PRUNED_BLOCKS}_blocks"
OUTPUT_TUNE=f"cache/shortend_llm/output_tune/{MODEL_NAME}/taylor_n{NUM_CALIB_DATA}/rm_{NUM_PRUNED_BLOCKS}_blocks"
CHECKPOINT_PATH = f"cache/shortend_llm/output_tune/{MODEL_NAME}/checkpoints"
NORM_POWER = 1
WEIGHT_REDUCTION = "sum"
BLOCK_REDUCTION = "sum"
MODEL_TYPE = "pretrain"
NUM_CALIB_DATA = 10
MAX_SEQ_LEN = 256
BATCH_SIZE_SHORTEND = 10
# DATASET_PATH = 'python_code_instructions_18k_alpaca_ru_prompt'
UPDATE_FIRST_STEP_SHORTEND = True

######## SVD LLM #########
PROFILING_MAT_PATH = None # Путь с предрасчетом значений для SVD
WHITENING_NSAMPLES = 256
RATIO = 0.6
MAX_LEN_WHITENING = 256
OUTPUT_SVD_SAVE = f"cache/svd_llm/output_whitening/{MODEL_NAME.split('/')[0]}"
MODEL_ADD_PATH  = f"/{MODEL_NAME.split('/')[1]}_{RATIO}_{MAX_LEN_WHITENING}.pt"

####### LORA #######
LORA_R = 8 # Rank of lora 
LORA_ALPHA = 32
NUM_EPOCHS = 2 # Number of fine tune epochs
LEARNING_RATE = 2e-4
BATCH_SIZE = 84
MICRO_BATCH_SIZE = 4 # number of different forwards before accumulation grads in each batch 
CUTOFF_LENGTH = 500
TRAIN_ON_INPUTS = False # Should model learn on input text also or not

LORA_TARGET_MODULES = "q_v_proj,q_u_proj,k_v_proj,k_u_proj,v_u_proj,\
v_v_proj,o_u_proj,o_v_proj,gate_u_proj,gate_v_proj,down_u_proj,down_v_proj,up_u_proj,up_v_proj,\
q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj"

LORA_DROPOUT = 0.05
VAL_SIZE = 1000
EVAL_STEPS = 50
DATA_PATH = 'LORA/python_code_instructions_18k_alpaca_ru'
OUTPUT_DIR = f"../cache/lora/lora_finetuned_{MODEL_NAME.split('/')[1]}/"

####### HUMAN_EVAL ########
MAX_NEW_TOKENS = 512
NUM_SAMPLES_PER_TASK = 1
SAVE_METRIC_PATH = f"{MODEL_NAME.split('/')[1]}_test_{MAX_NEW_TOKENS}_{NUM_SAMPLES_PER_TASK}_samples.jsonl"

SEED = 42

In [3]:
import random
import os
import re
import torch
import transformers
import gzip
import json

from tqdm import tqdm, trange
from datasets import load_dataset, load_from_disk, DatasetDict
from torch.utils.data.dataset import Dataset
from final_combination.utils_shortend import *
from final_combination.utils_svd_llm import *
from typing import Iterable, Dict
from final_combination.LLMPruner.peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoConfig

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from LORA.Prompter import Prompter, ZeroPrompter

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
config = AutoConfig.from_pretrained(MODEL_NAME)

In [5]:
config

LlamaConfig {
  "_name_or_path": "deepseek-ai/deepseek-coder-7b-instruct-v1.5",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "bos_token_id": 100000,
  "eos_token_id": 100015,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 30,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.35.2",
  "use_cache": true,
  "vocab_size": 102400
}

## SHORTEND LLM

In [None]:
def get_model(
    base_model=None,
    ckpt=None,
    lora_ckpt=None,
    tokenizer=None,
    model_type="pretrain",
    device="cuda",
    fix_decapoda_config=False,
    use_bfloat=False,
):
    tokenizer = base_model if tokenizer is None else tokenizer
    if model_type == "pretrain":
        config = AutoConfig.from_pretrained(base_model)
        if "gptq" in base_model.lower():
            from auto_gptq import AutoGPTQForCausalLM

            model = AutoGPTQForCausalLM.from_quantized(
                base_model,
                use_safetensors=True,
                trust_remote_code=True,
                use_triton=False,
                quantize_config=None,
            )
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        elif (
            "LlamaForCausalLM" in config.__getattribute__("architectures")
            and "llama-3" not in base_model.lower()
        ):
            model = LlamaForCausalLM.from_pretrained(base_model, low_cpu_mem_usage=True)
            tokenizer = AutoTokenizer.from_pretrained(tokenizer) # ЗАМЕНИЛ НА ЭТО! ПРОВЕРИТЬ
            # tokenizer = LlamaTokenizer.from_pretrained(tokenizer)
        else:
            model = AutoModelForCausalLM.from_pretrained(
                base_model, low_cpu_mem_usage=True
            )
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    elif model_type in ["pruneLLM", "tune_pruneLLM"]:
        pruned_dict = torch.load(ckpt, map_location="cpu")
        model = pruned_dict["model"]
        tokenizer = pruned_dict["tokenizer"]
        if model_type == "tune_pruneLLM":
            model = PeftModel.from_pretrained(
                model, lora_ckpt, torch_dtype=torch.float16, low_cpu_mem_usage=True
            )
    else:
        raise NotImplementedError
    description = "Model Type: {}\n Base: {} \n Pruned: {}\n LORA: {}".format(
        model_type, base_model, ckpt, lora_ckpt
    )

    if fix_decapoda_config:
        # unwind broken decapoda-research config
        tokenizer.pad_token_id = 0
    model = set_model_device_evalmode(model, device, fix_decapoda_config, use_bfloat)

    return model, tokenizer, description

Анализируем блоки LLM на основе метрики Taylor. Мериется метрика по датасету, выбираются не нужные слои и все сохраняется. 

In [None]:
set_seed(SEED)
os.makedirs(OUTPUT_SENSITIVITY, exist_ok=True)

norm_power = NORM_POWER
weight_reduction = WEIGHT_REDUCTION
block_reduction = BLOCK_REDUCTION
result_csv_weight = os.path.join(OUTPUT_SENSITIVITY, "weight_score.csv")
result_csv_block = os.path.join(OUTPUT_SENSITIVITY, "block_score_all.csv")
result_csv_block_detail = os.path.join(OUTPUT_SENSITIVITY, "block_score_detail.csv")
result_csv_block_sort = os.path.join(OUTPUT_SENSITIVITY, "block_score_sorted.csv")
block_order_path = os.path.join(OUTPUT_SENSITIVITY, "block_order.csv")

if not os.path.exists(block_order_path) or UPDATE_FIRST_STEP_SHORTEND:
    model, tokenizer, description = get_model(
        base_model=MODEL_NAME,
        ckpt=CHECKPOINT_PATH,
        lora_ckpt=CHECKPOINT_PATH,
        tokenizer=None,
        model_type=MODEL_TYPE,
        device=DEVICE,
        fix_decapoda_config=False,
        use_bfloat=False,
    )
    
    ### ЗАМЕНИТЬ!
    example_prompts = get_examples(
        dataset=DATASET,
        tokenizer=tokenizer,
        n_samples=NUM_CALIB_DATA ,
        seq_len=MAX_SEQ_LEN,
        field_name="prompt",
        add_bos_to_every=False,
    ).to(DEVICE)
    
    print("Do forward to collect gradient information")
    salience_dict = {}
    for i in range(0, example_prompts.size(0), BATCH_SIZE_SHORTEND):
        example_prompts_tmp = example_prompts[i : i + BATCH_SIZE_SHORTEND]
        loss = model(example_prompts_tmp, labels=example_prompts_tmp).loss
        loss.backward()
        for k, param in model.named_parameters():
            if param.requires_grad and "weight" in k and "embed_tokens" not in k:
                salience = param * param.grad
                salience = salience.data.clone().float()
    
                if k not in salience_dict.keys():
                    salience_dict[k] = salience
                else:
                    salience_dict[k] += salience
        model.zero_grad()
    
    # Compute scores of weight matrices -> Collec them
    block_info = {}
    with open(result_csv_weight, "w") as logfile:
        logwriter = csv.writer(logfile, delimiter=",")
        logwriter.writerow(["weight_name", "weight_score"])
        for k, param in model.named_parameters():
            if param.requires_grad and "weight" in k and "embed_tokens" not in k:
                block_idx = ".".join(k.split(".")[:3])  # 'model.layers.i'
                if "proj" in k or "lm_head" in k:  # output_dim x input_dim
                    weight_imp = (
                        salience_dict[k].abs().pow(norm_power).sum(1)
                    )  # [output_dim]
                elif "norm" in k:  # [output_dim]
                    weight_imp = salience_dict[k].abs().pow(norm_power)
    
                if weight_reduction == "sum":
                    weight_imp = weight_imp.sum(dim=0)
                elif weight_reduction == "mean":
                    weight_imp = weight_imp.mean(dim=0)
                elif weight_reduction == "max":
                    weight_imp = weight_imp.max(dim=0)[0]
                elif weight_reduction == "prod":
                    weight_imp = torch.prod(weight_imp, dim=0)
                else:
                    raise NotImplementedError
    
                weight_imp = weight_imp.item()
                logwriter.writerow([k, weight_imp])
                # print([k, weight_imp])
                if block_idx not in block_info.keys():
                    block_info[block_idx] = [weight_imp]
                else:
                    block_info[block_idx].append(weight_imp)
    
    # Compute block-level importance
    block_info_summary = {}
    with open(result_csv_block, "w") as logfile, open(
        result_csv_block_detail, "w"
    ) as logfile_detail:
        logwriter = csv.writer(logfile, delimiter=",")
        logwriter.writerow(["block_name", "block_score"])
        logwriter_detail = csv.writer(logfile_detail, delimiter=",")
        logwriter_detail.writerow(["block_name", "all_weight_scores"])
        for k, v in block_info.items():
            # print(k, v)
            logwriter_detail.writerow([k] + v)
    
            block_imp = torch.tensor(v)
            if block_reduction == "sum":
                block_imp = block_imp.sum(dim=0)
            elif block_reduction == "mean":
                block_imp = block_imp.mean(dim=0)
            elif block_reduction == "max":
                block_imp = block_imp.max(dim=0)[0]
            elif block_reduction == "prod":
                block_imp = torch.prod(block_imp, dim=0)
            else:
                raise NotImplementedError
    
            block_imp = block_imp.item()
            logwriter.writerow([k, block_imp])
            block_info_summary[k] = block_imp
    
    for k in ["model.norm.weight", "lm_head.weight"]:
        if k in block_info_summary:
            del block_info_summary[k]
    sorted_items = sorted(block_info_summary.items(), key=lambda x: x[1])
    block_order = []
    with open(result_csv_block_sort, "w") as logfile:
        logwriter = csv.writer(logfile, delimiter=",")
        logwriter.writerow(["rank", "block_name", "block_score", "block_index"])
        for rank, (key, value) in enumerate(sorted_items, start=1):
            logwriter.writerow([rank, key, value, key.split(".")[-1]])
            # print([rank, key, value, key.split(".")[-1]])
            block_order.append(int(key.split(".")[-1]))
    
    with open(block_order_path, "w") as logfile_order:
        logwriter_order = csv.writer(logfile_order, delimiter=",")
        logwriter_order.writerow(block_order)
    print("Done")
else:
    print(f"use the precomputed results at {block_order_path}")

In [None]:
try:
    del model, tokenizer, description
except:
    pass
torch.cuda.empty_cache()

Убираем сохраненные слои с наименьшой метрикой

In [None]:
set_seed(SEED)

model_orig, tokenizer, description = get_model(
    base_model=MODEL_NAME,
    ckpt=CHECKPOINT_PATH,
    lora_ckpt=CHECKPOINT_PATH,
    tokenizer=None,
    model_type=MODEL_TYPE,
    device=DEVICE,
    fix_decapoda_config=False,
    use_bfloat=False,
)

os.makedirs(OUTPUT_PRUNE, exist_ok=True)

# Load the precomputed block unimportance order
unimportance_order = []
with open(block_order_path, "r") as file:
    unimportance_order = [int(i) for i in str(next(file).strip()).split(",")]

if not False:
    last_block_index = model_orig.config.num_hidden_layers - 1
    keep_block_info = [
        0,
        1,
        2,
        3,
        last_block_index - 1,
        last_block_index,
    ]  # to keep first and last few blocks unpruned
    unimportance_order = [
        idx for idx in unimportance_order if idx not in keep_block_info
    ]

# Block-level pruning
model = get_block_pruned_network(
    model_orig,
    unimportance_order=unimportance_order,
    num_pruned_blocks=NUM_PRUNED_BLOCKS,
    device=DEVICE,
    fix_decapoda_config=False,
    use_bfloat=False,
)

# Save
torch.save({'model': model, 'tokenizer': tokenizer}, OUTPUT_PRUNE + "/model_shortend_llm.pt")

In [None]:
del example_prompts
del description
del model
del tokenizer
del salience_dict
del sorted_items
del unimportance_order
del loss

torch.cuda.empty_cache()

### SVD-LLM

In [4]:
pruned_dict = torch.load(OUTPUT_PRUNE + "/model_shortend_llm.pt", map_location=DEVICE)
tokenizer, model = pruned_dict['tokenizer'], pruned_dict['model']




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Собираем примеры для прогона их через через нейросеть и измеряем output

In [5]:
for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        module.raw_scaling_diag_matrix = 0
        module.register_forward_hook(hook)

data = load_dataset(DATASET)
train_val = data["train"]


messages=[
    {'role': 'user', 'content': train_val[i]['prompt']} for i in range(len(train_val))
]

inputs = []

for i in range(WHITENING_NSAMPLES): 
    messages=[{'role': 'user', 'content': train_val[i]['prompt']}]
    curr_inp = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", max_length = MAX_LEN_WHITENING, padding = True, truncation = True).to(model.device)
    inputs.append(curr_inp)

set_seed(SEED)
with torch.inference_mode(): 
    for item in tqdm(inputs):
        model(item)

for name, module in model.named_modules():
    if isinstance(module, nn.Linear):
        module._forward_hooks.clear()

for i in trange(len(model.model.layers)):
    subset = find_layers(model.model.layers[i])
    for name in subset:
        subset[name].raw_scaling_diag_matrix = subset[name].raw_scaling_diag_matrix.cpu()

100%|██████████| 256/256 [06:01<00:00,  1.41s/it]
100%|██████████| 30/30 [00:03<00:00,  8.02it/s]


In [6]:
profiling_mat = {}
print("Start Cholesky Decomposition...")
for i in tqdm(range(len(model.model.layers))):
    layer_profile = {}
    subset = find_layers(model.model.layers[i])
    for name in subset:
        raw_scaling_diag_matrix = subset[name].raw_scaling_diag_matrix.double().to(DEVICE)
        
        try:
            scaling_diag_matrix = torch.linalg.cholesky(raw_scaling_diag_matrix)
        except Exception as e:
            print("Warning: eigen scaling_diag_matrix is not positive!")
            eigenvalues = torch.linalg.eigvalsh(raw_scaling_diag_matrix)
            raw_scaling_diag_matrix += (- eigenvalues[0] + 1e-6) * torch.eye(raw_scaling_diag_matrix.shape[0]).to(DEVICE)
            scaling_diag_matrix = torch.linalg.cholesky(raw_scaling_diag_matrix)
            eigenvalues = None
            del eigenvalues
            
        layer_profile[name] = scaling_diag_matrix.cpu()
        scaling_diag_matrix = raw_scaling_diag_matrix = subset[name].raw_scaling_diag_matrix = None
        del scaling_diag_matrix, raw_scaling_diag_matrix, subset[name].raw_scaling_diag_matrix
        torch.cuda.empty_cache()
    
    profiling_mat[i] = layer_profile

Start Cholesky Decomposition...


  0%|          | 0/30 [00:00<?, ?it/s]



  3%|▎         | 1/30 [00:02<01:12,  2.51s/it]



 97%|█████████▋| 29/30 [00:22<00:00,  1.52it/s]



100%|██████████| 30/30 [00:23<00:00,  1.28it/s]


Применяем SVD разложение

In [7]:
from tqdm import trange
for i in trange(len(model.model.layers)):
    layer = model.model.layers[i]
    subset = find_layers(layer)

    svd_attn = SVD_LlamaAttention(config=model.config, ratio=RATIO)
    svd_mlp = SVD_LlamaMLP(hidden_size=layer.hidden_size, intermediate_size=model.config.intermediate_size, hidden_act=model.config.hidden_act, ratio=RATIO)

    for name in subset:
        skip_this_layer = False
        W = subset[name].weight.data.float()
        dtype = W.dtype
        scaling_diag_matrix = profiling_mat[i][name].cuda()
        try:
            scaling_matrix_inv = torch.linalg.inv(scaling_diag_matrix)
        except Exception as e:
            print("Warning: scaling_diag_matrix is not full rank!")
            scaling_diag_matrix += 1e-6 * torch.eye(scaling_diag_matrix.shape[0]).to(DEVICE)
            try:
                scaling_matrix_inv = torch.linalg.inv(scaling_diag_matrix)
            except:
                skip_this_layer = True
        if not(skip_this_layer):        
            scaling_diag_matrix = scaling_diag_matrix.float()
            scaling_matrix_inv = scaling_matrix_inv.float()
            W_scale = torch.matmul(W, scaling_diag_matrix)
            
            U, S, VT = torch.linalg.svd(W_scale, full_matrices=False)
            
            num_s_after_trunc = int(W.shape[0] * W.shape[1] * RATIO / (W.shape[0] + W.shape[1]))
            
            truc_s = S[:num_s_after_trunc]
            truc_u = U[:, :num_s_after_trunc]
            truc_v = torch.matmul(VT[:num_s_after_trunc, :], scaling_matrix_inv)
            truc_sigma = torch.diag(truc_s)
            
            #### Replace Attn, MLP ####
                
            sqrtSigma = torch.sqrt(truc_sigma)
            svd_u = torch.matmul(truc_u, sqrtSigma).cpu().to(dtype)
            svd_v = torch.matmul(sqrtSigma, truc_v).cpu().to(dtype)
            
            if "q_proj" in name:
                svd_attn.q_u_proj.weight.data = svd_u
                svd_attn.q_v_proj.weight.data = svd_v
            elif "k_proj" in name:
                svd_attn.k_u_proj.weight.data = svd_u
                svd_attn.k_v_proj.weight.data = svd_v
            elif "v_proj" in name:
                svd_attn.v_u_proj.weight.data = svd_u
                svd_attn.v_v_proj.weight.data = svd_v
            elif "o_proj" in name:
                svd_attn.o_u_proj.weight.data = svd_u
                svd_attn.o_v_proj.weight.data = svd_v
                layer.self_attn =  svd_attn
            elif "gate_proj" in name:
                svd_mlp.gate_u_proj.weight.data = svd_u
                svd_mlp.gate_v_proj.weight.data = svd_v
            elif "down_proj" in name:
                svd_mlp.down_u_proj.weight.data = svd_u
                svd_mlp.down_v_proj.weight.data = svd_v
            elif "up_proj" in name:
                svd_mlp.up_u_proj.weight.data = svd_u
                svd_mlp.up_v_proj.weight.data = svd_v
                layer.mlp = svd_mlp

100%|██████████| 30/30 [07:42<00:00, 15.41s/it]


In [8]:
print(f'num params after SVD compression = {sum( [ np.prod(item.size())  for item in model.parameters() ])}')
os.makedirs(OUTPUT_SVD_SAVE, exist_ok=True)
torch.save({'model': model, 'tokenizer': tokenizer}, OUTPUT_SVD_SAVE+MODEL_ADD_PATH)

num params after SVD compression = 4480897536


## LORA

In [6]:
# need to excelude LASER or other special layers from fine tune
# decoder_id_ls = [i for i in range(0,15)] + [i for i in range(18,30)]
decoder_id_ls = [i for i in range(0,30)]

In [7]:
torch.cuda.empty_cache()
# Load Pruned Model

pruned_dict = torch.load(OUTPUT_SVD_SAVE+MODEL_ADD_PATH, map_location='cpu')
tokenizer, model = pruned_dict['tokenizer'], pruned_dict['model']
gradient_accumulation_steps = BATCH_SIZE // MICRO_BATCH_SIZE

prompter = Prompter('alpaca')

if DEVICE == 'cuda':
    model.half()

tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LENGTH,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < CUTOFF_LENGTH
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result
    
def generate_and_tokenize_prompt(data_point):
    old = False
    if old:
        full_prompt = prompter.generate_prompt(
            data_point["instruction"],
            data_point["input"],
            data_point["output"],
        )
        
        tokenized_full_prompt = tokenize(full_prompt)
        if TRAIN_ON_INPUTS:
            user_prompt = prompter.generate_prompt(
                data_point["instruction"], data_point["input"]
            )
            tokenized_user_prompt = tokenize(
                user_prompt, add_eos_token=False
            )
            user_prompt_len = len(tokenized_user_prompt["input_ids"])
    
            if False:
                user_prompt_len -= 1
    
            tokenized_full_prompt["labels"] = [
                -100
            ] * user_prompt_len + tokenized_full_prompt["labels"][
                user_prompt_len:
            ]  # could be sped up, probably
    else:
        messages=[
            { 'role': 'user', 'content': data_point['prompt']}
        ]
        #return_tensors="pt"
        tokenized_full_prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
        tokenized_full_prompt = re.sub(r'### Response:', '', tokenized_full_prompt)
        tokenized_full_prompt = re.sub(r"### Instruction:\nBelow is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
, '', tokenized_full_prompt)
        tokenized_full_prompt = re.sub(r'### Output:', '### Response:', tokenized_full_prompt)
    return tokenize(tokenized_full_prompt)

def split_and_tokenizer(test_data, tokenizer, seq_len, field_name):
    test_ids = tokenizer("\n\n".join(test_data[field_name]), return_tensors='pt').input_ids[0]
    test_ids_batch = []
    nsamples = test_ids.numel() // seq_len

    test_set = []
    for i in range(nsamples):
        batch = test_ids[(i * seq_len):((i + 1) * seq_len)]
        test_set.append({
            'input_ids': batch,
            'labels': batch
        })
    return test_set
    
# Prepare For LoRA
model = prepare_model_for_int8_training(model)
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES.split(","),
    lora_dropout=LORA_DROPOUT,
    layers_to_transform = decoder_id_ls,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
model.print_trainable_parameters()  

# Load Train Dataset
try:
    data = load_dataset(DATASET)
except:
    data = DatasetDict({'train':load_from_disk(DATA_PATH)})
    
train_val = data["train"].train_test_split(
    test_size=VAL_SIZE, shuffle=True, seed=42
)
train_data = (
    train_val["train"].shuffle().map(generate_and_tokenize_prompt)
)

input_lenghts = [len(x) for x in train_data["input_ids"]]
max_source_length = int(np.percentile(input_lenghts, 95))
print(f"Max prompt length: {max_source_length}")

val_data = {
    DATA_PATH: train_val["test"].shuffle().map(generate_and_tokenize_prompt),
}

trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=100,
        num_train_epochs=NUM_EPOCHS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=10,
        logging_first_step=True,
        optim="adamw_torch",
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=EVAL_STEPS,
        save_steps=100,
        output_dir=OUTPUT_DIR,
        save_total_limit=20,
        load_best_model_at_end=True,
        ddp_find_unused_parameters=None,
        group_by_length=False,
        report_to="none",
        run_name="none",
        metric_for_best_model="{}_loss".format(DATA_PATH),
    ),
    data_collator=transformers.DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)
model.config.use_cache = False
old_state_dict = model.state_dict

trainer.train(resume_from_checkpoint=True)

model.state_dict = old_state_dict
model.save_pretrained(OUTPUT_DIR)



trainable params: 13,117,440 || all params: 5,197,870,221 || trainable%: 0.252361822097905


Map:   0%|          | 0/17612 [00:00<?, ? examples/s]

Max prompt length: 472


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

comet_ml is installed but `COMET_API_KEY` is not set.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Detected kernel version 5.4.270, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  checkpoint_rng_state = torch.load(rng_file)


Step,Training Loss,Validation Loss,Lora/python Code Instructions 18k Alpaca Ru Loss
250,0.4465,No log,0.470408
300,0.4447,No log,0.465488
350,0.4516,No log,0.462208
400,0.448,No log,0.460686


In [8]:
torch.save({'model': model, 'tokenizer': tokenizer}, OUTPUT_DIR+MODEL_ADD_PATH)

## HUMAN_EVAL

In [4]:
pruned_dict = torch.load(OUTPUT_DIR+MODEL_ADD_PATH, map_location=DEVICE)
tokenizer, model = pruned_dict['tokenizer'], pruned_dict['model']

print(f'Final num of params = {sum([np.prod(item.size())  for item in model.parameters()])}')

  pruned_dict = torch.load(OUTPUT_DIR+MODEL_ADD_PATH, map_location=DEVICE)


Final num of params = 5197870221


In [5]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [6]:
%cd human-eval/human_eval

/trinity/home/team16/workspace/Compress-Code-LLMs-SMILES/human-eval/human_eval


In [7]:
HUMAN_EVAL ="data/HumanEval.jsonl.gz"

def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
    return {task["task_id"]: task for task in stream_jsonl(evalset_file)}


def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Parses each jsonl line and yields it as a dictionary
    """
    if filename.endswith(".gz"):
        with open(filename, "rb") as gzfp:
            with gzip.open(gzfp, 'rt') as fp:
                for line in fp:
                    if any(not x.isspace() for x in line):
                        yield json.loads(line)
    else:
        with open(filename, "r") as fp:
            for line in fp:
                if any(not x.isspace() for x in line):
                    yield json.loads(line)


def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
    """
    Writes an iterable of dictionaries to jsonl
    """
    if append:
        mode = 'ab'
    else:
        mode = 'wb'
    filename = os.path.expanduser(filename)
    if filename.endswith(".gz"):
        with open(filename, mode) as fp:
            with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp:
                for x in data:
                    gzfp.write((json.dumps(x) + "\n").encode('utf-8'))
    else:
        with open(filename, mode) as fp:
            for x in data:
                fp.write((json.dumps(x) + "\n").encode('utf-8'))

def generate_one_completion(prompt, max_new_tokens):
    messages=[
        { 'role': 'user', 'content': prompt}
    ]
    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
    outputs = model.generate(inputs=inputs, max_new_tokens=max_new_tokens, do_sample=False, top_k=50, top_p=0.95, num_return_sequences=1, eos_token_id=tokenizer.eos_token_id)
    out = (tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True))
    return out

In [9]:
%cd ..

/trinity/home/team16/workspace/Compress-Code-LLMs-SMILES/human-eval


In [10]:
model = model.merge_and_unload()

In [11]:
problems = read_problems()

samples = [
    dict(task_id=task_id, completion=generate_one_completion(problems[task_id]["prompt"], max_new_tokens=MAX_NEW_TOKENS))
    for task_id in tqdm(problems)
    for _ in range(NUM_SAMPLES_PER_TASK)
]
write_jsonl(SAVE_METRIC_PATH, samples)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100015 for open-end generation.
  1%|          | 1/164 [00:04<11:52,  4.37s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100015 for open-end generation.
  1%|          | 2/164 [00:14<20:24,  7.56s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:100015 for open-end generation.
  2%|▏         | 3/164 [00:18<16:32,  6.16s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected beha

In [12]:
%cd human_eval

/trinity/home/team16/workspace/Compress-Code-LLMs-SMILES/human-eval/human_eval


In [15]:
import subprocess
import json

out = subprocess.run(['python', 'evaluate_functional_correctness.py','../'+SAVE_METRIC_PATH],stdout=subprocess.PIPE)

164it [00:00, 35114.90it/s]
100%|██████████| 164/164 [00:00<00:00, 266.72it/s]
100%|██████████| 164/164 [00:00<00:00, 53580.45it/s]


In [16]:
metrics = str(out.stdout).split('{')[-1].split('}')[0]

In [17]:
metrics_dict = {}
metrics = metrics.split(',')
for metric in metrics:
    k, val = metric.split(':')
    metrics_dict[k[1:][:-1]] = float(val)

In [18]:
metrics_dict

{'pass@1': 0.15853658536585366}