In [1]:
import os
import json
import math
import gc

import numpy as np

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

from peft import (
    LoraConfig,
    get_peft_model,
    PeftModel,
    LoHaConfig,
)

from cortexsubsetloader import tokenize
from pytorch_optimizer import Ranger21, Lamb, DAdaptLion, SAM, DAdaptAdam, LOMO, SophiaH
from pytorch_optimizer.optimizer.sam import WSAM

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [2]:
# !pip install transformers==4.37.2
# !pip install -U --force-reinstall transformers

In [3]:
def filter_data(train, eval):
    evalf = []
    for f in eval:
        if f not in train:
            evalf.append(f)
    if len(evalf) < len(eval): print(f"Removed {len(eval)-len(evalf)} duplicates from eval")
    else: print("No duplicates found in eval")
    return evalf


def data_collator(features):
    batches = []
    for feature in features:
        inputs, prompt_len = feature
        data = [inputs]
        b_labels = inputs.clone()
        b_labels[:, :prompt_len] = -100
        labels = [b_labels]
            
        batch = {}
        batch['input_ids'] = torch.concat(data)
        batch['labels'] = torch.concat(labels)
        batches.append(batch)
    return batches


def get_data(train_name, eval_name, tokenizer, train_subset=None, eval_subset=None, shuffle=True, extend_train_length=0):
    if type(train_name) == str:
        with open(train_name, "r") as f:
            train_data = np.array(json.load(f))
    else: # list of str
        train_data = []
        for name in train_name:
            with open(name, "r") as f:
                train_data.extend(json.load(f))
        train_data = np.array(train_data)
    with open(eval_name, "r") as f:
        eval_data = np.array(json.load(f))

    if train_subset is not None:
        train_data = train_data[train_subset]
    if eval_subset is not None:
        eval_data = eval_data[-eval_subset:]
    
    eval_data = filter_data(train_data, eval_data)

    if shuffle:
        p = np.random.permutation(len(train_data))
        train_data = train_data[p]

    train_data = tokenize(tokenizer, train_data, 2048+extend_train_length)
    eval_data = tokenize(tokenizer, eval_data, 2048)

    train_data = data_collator(train_data)
    eval_data = data_collator(eval_data)

    return train_data, eval_data

# initial_loss_eps = 0.0001
intermed_check_step_split = 8


def simple_eval(model, eval_d):
    print("Evaluating", end=" ")
    model = model.to("cuda")
    model.eval()
    eval_loss = 0
    steps_so_far = 1
    for batch in eval_d:
        inputs = batch['input_ids'].to("cuda")
        labels = batch['labels'].to("cuda")
        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            eval_loss += outputs.loss.item() / len(eval_d)
        if steps_so_far % (len(eval_d) // intermed_check_step_split) == 0:
            print(".", end="")
            gc.collect(); torch.cuda.empty_cache()
        steps_so_far += 1
    model = model.to("cpu")
    gc.collect(); torch.cuda.empty_cache()
    print(f" Loss: {eval_loss:.8f}")

def evaluate(model, eval_d, return_to_cpu=False, return_stats=False, print_stats=True, 
             base_model=None, precompute_base_loss=True, device="cuda", instruction_finetuning=True, true_eps=0.01):
    print("Evaluating", end=" ")
    model = model.to("cuda")
    eval_loss = 0
    model.eval()
    steps_so_far = 1

    for batch in eval_d:
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        loss = model(inputs, labels=labels).loss

        eval_loss += loss.item() / len(eval_d)

        if steps_so_far % (len(eval_d) // intermed_check_step_split) == 0:
            print(".", end="")
            gc.collect(); torch.cuda.empty_cache()
        steps_so_far += 1

    if return_to_cpu:
        model = model.to("cpu")

    gc.collect(); torch.cuda.empty_cache()

    data = {
        "loss": eval_loss,
    }

    if print_stats:
        print(f" Loss: {eval_loss:.8f}")
    if return_stats:
        return data



def train(model, train_d, eval_d, base_model=None, inf_training=False, training_device="cuda",
            acc_batch_size=32, instruction_finetuing=True, precalculate_base_loss=True, precalculate_batch_mult=1.5,
            lr=1e-4, weight_decay=0.001, lr_scheduler="constant", warmup_steps=0, betas=(0.9, 0.99), 
            use_sam=False, sam_rho=0.05, opt="adamw",
            manual_grad_clip_norm=0.0, wait_for_full_batch=True, sam_reuse_base_outputs=False,
            do_base_gradient=True, add_overshoot_penalty=True, ignore_overshot_samples=True, bad_sample_mult=1.0,
            loss_eps = 0.015, overshoot_buffer = 0.01, true_eps=0.01,
            prompt_dropout=0.0,
            eval_steps=1024, save_steps=1024, save_name="lora", do_save=True,
            average_stats=False,
            partial_eval_steps=0, partial_eval_size=128, save_n_start=0,
            gradient_checkpointing=False, excessive_cache_clearing=False):
    if warmup_steps is None:
        warmup_steps = (eval_steps // acc_batch_size) // 2
    LORA = True
    if base_model is not None:
        LORA = False
        # base_model = base_model.to("cuda")

    if inf_training:
        # from subsetfalconloader import SubsetFalconLoader
        pass

    if precalculate_base_loss and do_base_gradient:
        raise ValueError("Precalculating base loss will disconnect base gradients")

    gc.collect(); torch.cuda.empty_cache()
    model = model.to("cuda")
    model.enable_input_require_grads()

    if gradient_checkpointing:
        model.config.use_cache = False
        grad_check_kwargs = {"use_reentrant": False}
        if do_base_gradient:
            grad_check_kwargs["use_reentrant"] = True
        model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=grad_check_kwargs)
    model.train()


    if not use_sam:
        if opt == "dadapt_adam":
            optimizer = DAdaptAdam(model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay, fixed_decay=True)
        elif opt == "sophia":
            optimizer = SophiaH(model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        elif opt == "ranger":
            optimizer = Ranger21(model.parameters(), num_iterations=1, lr=lr, betas=betas, weight_decay=weight_decay,
                                 num_warm_up_iterations=0, num_warm_down_iterations=0)
        elif opt == "adamw" or opt == "adam":
            optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
        elif opt == "sgd":
            optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=betas[0], weight_decay=weight_decay)
        else:
            raise ValueError(f"Unknown optimizer {opt}")
    else:
        base_optimizer_args = {"lr": lr, "weight_decay": weight_decay, "betas": betas, "eps": 1e-8}

        if opt == "dadapt_adam":
            base_optimizer = DAdaptAdam
        elif opt == "sophia":
            base_optimizer = SophiaH
        elif opt == "ranger":
            base_optimizer = Ranger21
            base_optimizer_args["num_iterations"] = 1
            base_optimizer_args["num_warm_up_iterations"] = 0
            base_optimizer_args["num_warm_down_iterations"] = 0
        elif opt == "adamw" or opt == "adam":
            base_optimizer = torch.optim.AdamW
        elif opt == "sgd":
            base_optimizer = torch.optim.SGD
            del base_optimizer_args["betas"], base_optimizer_args["eps"]
            base_optimizer_args["momentum"] = betas[0]
        else:
            raise ValueError(f"Unknown optimizer {opt}")

        optimizer = SAM(model.parameters(), base_optimizer=base_optimizer, rho=sam_rho, adaptive=True, **base_optimizer_args)
        sam_optimizer = optimizer


    if lr_scheduler == "cosine":
        lr_scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, warmup_steps, (len(train_d)//acc_batch_size)+warmup_steps)
    else:
        lr_scheduler = transformers.get_constant_schedule_with_warmup(optimizer, warmup_steps)
    lr_scheduler.step() # don't want to start at 0


    steps_so_far = 1 # start at one to avoid all the modulo checks
    epoch_loss = 0; epoch_overshoot = 0; epoch_base_loss = 0; lora_diff = 0
    epoch_wr = 0; epoch_0eps_wr = 0
    fit_samples = 0; unfit_samples = 0
    sam_batch = []
    # lomo_batch_loss = []
    accum_steps = 0
    true_steps_taken = 0; prev_dot_step = -1
    last_tst = true_steps_taken
    sam_saved_base_outputs = []
    precalculated_base_outputs = []
    while len(train_d) > 0:

        batch = train_d.pop(0)
        inputs = batch['input_ids'].to(training_device)
        labels = batch['labels'].to(training_device)

        outputs_loss = model(inputs, labels=labels).loss

        # current_loss_eps = final_loss_eps 
        current_loss_eps = loss_eps # ((final_loss_eps * (min(steps_so_far * 8 / len(train_d), 1.0))) + initial_loss_eps) / (1.0 + initial_loss_eps)
        # current_loss_eps = 0.011 # lr_scheduler.get_last_lr()[0] * 100.0 # * 3.33

        loss = outputs_loss
        loss = loss / acc_batch_size

        loss.backward()
            
        accum_steps += 1
        true_steps_taken += 1
        if use_sam:
            sam_batch.append((inputs, labels))

        epoch_loss += loss.detach().item() * acc_batch_size

        if (true_steps_taken % (acc_batch_size // intermed_check_step_split) == 0) and accum_steps != prev_dot_step:
            prev_dot_step = accum_steps
            print(".", end="")
            gc.collect(); torch.cuda.empty_cache()

        if (((steps_so_far % acc_batch_size == 0 or steps_so_far == len(train_d)) and not wait_for_full_batch) or 
                                                                            (wait_for_full_batch and accum_steps == acc_batch_size)):
            if manual_grad_clip_norm > 0.0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), manual_grad_clip_norm)

            if not use_sam:
                optimizer.step()
                optimizer.zero_grad()
            else:
                sub_steps = 1
                sam_optimizer.first_step(zero_grad=True)
                
                for inputs, labels in sam_batch:
                    outputs = model(inputs, labels=labels)

                    loss = outputs.loss
                    loss = loss / accum_steps
                    
                    loss.backward()

                    sub_steps += 1
                    if sub_steps % (acc_batch_size // intermed_check_step_split) == 0:
                        print(".", end="")
                    if excessive_cache_clearing:
                        gc.collect(); torch.cuda.empty_cache()

                sam_optimizer.second_step(zero_grad=True)
                sam_batch = []

            if average_stats:
                stat_steps = steps_so_far
            else:
                stat_steps = accum_steps
            print(f"Step {steps_so_far}/{len(train_d)}\tLoss: {epoch_loss/stat_steps:.6f}\t","LR:", lr_scheduler.get_last_lr()[0])

            if not average_stats:
                epoch_loss = 0
            accum_steps = 0
            
            lr_scheduler.step()

            gc.collect(); torch.cuda.empty_cache()


        if do_save and true_steps_taken % save_steps == 0 and len(train_d) > 0 and true_steps_taken != last_tst:
            model.save_pretrained(save_name + '_' + str((true_steps_taken // save_steps) + save_n_start).format("02d"))
            
        do_full_eval = true_steps_taken % eval_steps == 0 and len(train_d) > 0 and true_steps_taken != last_tst
        if do_full_eval:
            evaluate(model, eval_d, base_model=base_model, device=training_device, instruction_finetuning=instruction_finetuing, true_eps=true_eps)
            model.train()
        if ((partial_eval_steps > 0 and true_steps_taken % partial_eval_steps == 0) and not do_full_eval 
                                            and len(train_d) > 0 and true_steps_taken != last_tst):
            evaluate(model, eval_d[:partial_eval_size], base_model=base_model, device=training_device, 
                     instruction_finetuning=instruction_finetuing, true_eps=true_eps)
            model.train()
        
        steps_so_far += 1
        last_tst = true_steps_taken

        if excessive_cache_clearing:
            gc.collect(); torch.cuda.empty_cache()

    if do_save:
        if save_n_start > 0:
            model.save_pretrained(save_name+"_X"+str(save_n_start))
        else:
            model.save_pretrained(save_name)

    model.eval()
    final_eval_stats = evaluate(model, eval_d, return_stats=True, base_model=base_model, device=training_device, 
                                instruction_finetuning=instruction_finetuing, true_eps=true_eps)

    model = model.to("cpu")
    gc.collect(); torch.cuda.empty_cache()

    return final_eval_stats

In [5]:
lora_name = "fulltune5"
model_name = "fulltune4"

neft_noise = 0.0 # bad actually?


rank=256
config = LoraConfig(
    r=rank, lora_alpha=32,
    target_modules=[
                    'q_proj',
                    'v_proj', 
                    "k_proj", 
                    "o_proj", 
                    "gate_proj", 
                    "up_proj", 
                    "down_proj"
                    ],  #   , 
    lora_dropout=0.0,
    bias="all", 
    task_type="CAUSAL_LM",
    use_rslora=True,
    # init_lora_weights="gaussian",
)

# config = LoHaConfig(
#     r=rank, 
#     alpha=rank,
#     target_modules=["q_proj", "v_proj"], #,  , "up_proj" # , "o_proj" , "k_proj", "down_proj"
#     rank_dropout=0.0,
#     module_dropout=0.0,
#     init_weights=True,
#     task_type="CAUSAL_LM",
# )

params = {
    'low_cpu_mem_usage': True,
    'trust_remote_code': False,
    'torch_dtype': torch.bfloat16,
    'use_safetensors': True,
    'attn_implementation': "flash_attention_2"
}


model = AutoModelForCausalLM.from_pretrained(model_name, **params, cache_dir="Models")

last_q = None
last_v = None
last_up = None
for name, param in model.named_parameters():
    if "q_proj" in name:
        last_q = param
    elif "k_proj" in name:
        # print(last_q.data.shape, param.data.shape)
        mult = torch.sqrt(torch.mean(torch.abs(last_q.data), dim=1, keepdim=True).transpose(0, 1) / torch.mean(torch.abs(param.data), dim=0))
        mult = torch.mean(mult)
        # mult = torch.sqrt(torch.mean(torch.abs(last_q.data)) / torch.mean(torch.abs(param.data)))
        # print(mult.shape)
        last_q.data = last_q.data / mult
        param.data = param.data * mult
        last_q = None
        # print(mult)
    if "v_proj" in name:
        last_v = param
    elif "o_proj" in name:
        # print(last_v.data.shape, param.data.shape)
        mult = torch.sqrt(torch.mean(torch.abs(last_v.data), dim=0, keepdim=True) / torch.mean(torch.abs(param.data), dim=1))
        mult = torch.mean(mult)
        # mult = torch.sqrt(torch.mean(torch.abs(last_v.data)) / torch.mean(torch.abs(param.data)))
        # print(mult.shape)
        last_v.data = last_v.data / mult
        param.data = param.data * mult#.transpose(0, 1)
        last_v = None
        # print(mult)
    if "up_proj" in name:
        last_up = param
    elif "down_proj" in name:
        # print(last_up.data.shape, param.data.shape)
        mult = torch.sqrt(torch.mean(torch.abs(last_up.data), dim=1, keepdim=True).transpose(0, 1) / torch.mean(torch.abs(param.data), dim=0))
        # mult = torch.sqrt(torch.mean(torch.abs(last_up.data)) / torch.mean(torch.abs(param.data)))
        # print(mult.shape)
        last_up.data = last_up.data / mult.transpose(0, 1)
        param.data = param.data * mult
        last_up = None
        # print(mult)

lora_model = model
for name, param in lora_model.named_parameters():
    if ("up_proj" not in name 
        and "down_proj" not in name 
        and "gate_proj" not in name 
        # and "o_proj" not in name 
        # and "k_proj" not in name 
        # and "v_proj" not in name 
        # and "q_proj" not in name
    ):
        param.requires_grad = True
    else:
        param.requires_grad = False
# lora_model = PeftModel.from_pretrained(model, model_id="Ordovician", is_trainable=True)
# lora_model = PeftModel.from_pretrained(model, model_id="Silurian", is_trainable=True)
# lora_model = lora_model.to("cuda")
# # lora_model = lora_model.merge_and_unload(progressbar=True)
# lora_model = get_peft_model(model, config)
# # # lora_model.to(torch.bfloat16)
# lora_model.print_trainable_parameters()
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", trust_remote_code=False, use_fast=True, cache_dir="Models")
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=False, use_fast=True, cache_dir="Models")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
for name, param in lora_model.named_parameters():
    if "lora" not in name:
        print(name, param)

In [7]:
train_data, eval_data = get_data(train_name=['data/cortex_548_6656.json','data/cortex_547_6656.json',
                                            #  'data/cortex_542_6656.json',
# 'data/cortex_503_6656.json','data/cortex_506_6656.json','data/cortex_509_6656.json',
# 'data/cortex_370_6656.json','data/cortex_369_6656.json','data/cortex_367_6656.json'
],
                                 train_subset=np.arange(0, (6656*2)-512), eval_subset=512,
                                 eval_name='data/cortex_547_6656.json', tokenizer=tokenizer, shuffle=True)
# evaluate(lora_model, eval_data, return_to_cpu=True, print_stats=True, base_model=base_model)

Removed 181 duplicates from eval


In [8]:
evaluate(lora_model, eval_data, return_to_cpu=True, print_stats=True)

Evaluating ........ Loss: 0.63273114


In [10]:
train(lora_model, train_data, eval_data, base_model=None, inf_training=False, training_device="cuda",
        acc_batch_size=512, instruction_finetuing=True, precalculate_base_loss=True, precalculate_batch_mult=2.1,
        lr=3.2e-5, weight_decay=0.0, lr_scheduler="cosine", warmup_steps=0, betas=(0.8, 0.95),
        use_sam=False, sam_rho=0.1, opt="adam",
        loss_eps = 0.02, overshoot_buffer = -0.01, true_eps=0.01,
        manual_grad_clip_norm=0.0, wait_for_full_batch=True, sam_reuse_base_outputs=True,
        do_base_gradient=False, add_overshoot_penalty=False, ignore_overshot_samples=True, bad_sample_mult=1.01,
        prompt_dropout=0.0,
        eval_steps=2048, save_steps=2048, do_save=True, save_name=lora_name,
        average_stats=False,
        partial_eval_steps=0, partial_eval_size=128, save_n_start=0,
        # gradient_checkpointing=False, excessive_cache_clearing=False)
        gradient_checkpointing=True, excessive_cache_clearing=True)

........Step 512/12288	Loss: 0.594686	 LR: 3.1873835221031644e-05
........Step 1024/11776	Loss: 0.606488	 LR: 3.14973305780581e-05
........Step 1536/11264	Loss: 0.602072	 LR: 3.087642377421202e-05
........Step 2048/10752	Loss: 0.590941	 LR: 3.0020906880701815e-05
Evaluating ........ Loss: 0.63482267


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7fcd2c3a0250>>
Traceback (most recent call last):
  File "/home/alyx/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 770, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [9]:
evaluate(lora_model, eval_data, return_to_cpu=True, print_stats=True)

Evaluating ........ Loss: 0.00147147, Base Loss: 1.243601, Lora Diff: -0.01991747, WR: 62.70%, 0epsWR: 97.75%, OShL: 0.00919830


In [10]:
lora_model.save_pretrained(lora_name)

In [11]:
lora_model = lora_model.to("cuda")
lora_model = lora_model.merge_and_unload()
lora_model.config.name_or_path = "MesozoicMetallurgist/new_model"
# model_dir = "Models/merged_model"
model_dir = os.path.expanduser("~/finetuning-subnet/merged_model")
if not os.path.exists(model_dir):
    os.makedirs(model_dir, exist_ok=True)
else:
    # wipe the directory
    for f in os.listdir(model_dir):
        os.remove(os.path.join(model_dir, f))
lora_model.save_pretrained(save_directory=model_dir, safe_serialization=True)
tokenizer.save_pretrained(save_directory=model_dir)

('/home/alyx/finetuning-subnet/merged_model/tokenizer_config.json',
 '/home/alyx/finetuning-subnet/merged_model/special_tokens_map.json',
 '/home/alyx/finetuning-subnet/merged_model/tokenizer.model',
 '/home/alyx/finetuning-subnet/merged_model/added_tokens.json',
 '/home/alyx/finetuning-subnet/merged_model/tokenizer.json')

In [None]:
lora_model.save_pretrained(lora_name + '_' + "1")

In [None]:
evaluate(lora_model, eval_data, return_to_cpu=True, return_stats=True, print_stats=True)

Evaluating ........ Loss: 0.00202340, Base Loss: 1.398741, Lora Diff: -0.06646919, WR: 81.64%, 0epsWR: 92.29%, OShL: 0.05393791


{'loss': 0.0020233988761901855,
 'base_loss': 1.3987407684326172,
 'lora_diff': -0.06646919250488281,
 'head_to_head': 81.640625,
 'eps0_head_to_head': 92.28515625,
 'overshoot': 0.05393791198730469}

In [None]:
gc.collect(); torch.cuda.empty_cache()
evaluate(lora_model, eval_data, return_stats=True, base_model=None)

Evaluating ........ Loss: 0.01634118, Base Loss: 0.394893, Lora Diff: 0.00121789, WR: 5.08%, 0epsWR: 38.09%, OShL: 0.00017984


{'loss': 0.016341184635450645,
 'base_loss': 0.39489315043670103,
 'lora_diff': 0.0012178865729310928,
 'head_to_head': 5.078125,
 'eps0_head_to_head': 38.0859375,
 'overshoot': 0.00017984330224862788}

In [None]:
evaluate(lora_model, eval_data, return_stats=True)

Evaluating ........0.012056129032998797
 Loss: 0.01205613, Base Loss: 0.365259, Lora Diff: -0.00147909, WR: 13.87%, 0epsWR: 58.01%, OShL: 0.00109561


{'loss': 0.012056129032998797,
 'base_loss': 0.36525918503184585,
 'lora_diff': -0.0014790902009735873,
 'head_to_head': 13.8671875,
 'eps0_head_to_head': 58.0078125,
 'overshoot': 0.0010956073215311335}

In [12]:
!pip install transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


In [None]:
# del model, lora_model, tokenizer, train_data, eval_data, trainer; gc.collect(); torch.cuda.empty_cache()
# gc.collect(); torch.cuda.empty_cache()

In [None]:
# lora_model = lora_model.merge_and_unload()
# lora_model.config.name_or_path = "MesozoicMetallurgist/new_model"
# model_dir = "Models/merged_model"
# model_dir = os.path.expanduser("~/finetuning-subnet/merged_model")
# if not os.path.exists(model_dir):
#     os.makedirs(model_dir, exist_ok=True)
# else:
#     # wipe the directory
#     for file in os.listdir(model_dir):
#         os.remove(os.path.join(model_dir, file))
# lora_model.save_pretrained(save_directory=model_dir, safe_serialization=True)
# tokenizer.save_pretrained(save_directory=model_dir)

# del lora_model, trainer, model, tokenizer

('/home/alyx/finetuning-subnet/merged_model/tokenizer_config.json',
 '/home/alyx/finetuning-subnet/merged_model/special_tokens_map.json',
 '/home/alyx/finetuning-subnet/merged_model/tokenizer.model',
 '/home/alyx/finetuning-subnet/merged_model/added_tokens.json',
 '/home/alyx/finetuning-subnet/merged_model/tokenizer.json')