In [1]:
import os
import json
import math
import time
import gc
import copy

import numpy as np

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer


from cortexsubsetloader import CortexSubsetLoader

# Turn on the allow_tf32 switch on both backends that expose it
# (the CUDA matmul backend and cuDNN).
for _tf32_backend in (torch.backends.cuda.matmul, torch.backends.cudnn):
    _tf32_backend.allow_tf32 = True
del _tf32_backend

# Timestamp (seconds, from time.time()) captured when this cell runs.
notebook_start_time = time.time()

In [8]:
# Switches selecting which adapter name is used; llama3 takes precedence over mistral.
mistral = False
llama3 = True

if llama3:
    lora_name = "Hadean"
elif mistral:
    lora_name = "Helium"
else:
    # lora_name "Carboniferous"
    lora_name = "Carnian"

# Every branch uses the same (empty) checkpoint id, and the model to beat is
# that same checkpoint.
model_name = ""
model_name_to_beat = model_name

# Keyword arguments unpacked into AutoModelForCausalLM.from_pretrained in this cell.
params = dict(
    low_cpu_mem_usage=True,
    trust_remote_code=False,
    torch_dtype=torch.bfloat16,
    use_safetensors=True,
    # attn_implementation="flash_attention_2",
)

# if not mistral:
#     model = AutoModelForCausalLM.from_pretrained(model_name, **params, token=HF_TOKEN, cache_dir="Models")
#     # model = model.to("cuda")
#     tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-2-zephyr-1_6b", trust_remote_code=False, use_fast=True, cache_dir="Models")
# else:
#     model = AutoModelForCausalLM.from_pretrained(model_name, **params, token=HF_TOKEN, cache_dir="Models") # MistralForCausalLM
#     # model = model.to("cuda")
#     tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", cache_dir="Models")
# Load the checkpoint named by `model_name` using the shared `params`,
# caching downloads under "Models".
# NOTE(review): HF_TOKEN is not assigned in any cell visible here — confirm where it is defined.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    **params,
    token=HF_TOKEN,
    cache_dir="Models",
)
# model = model.to("cuda")

# The tokenizer is always taken from the Llama-3 instruct repo, independent of `model_name`.
tokenizer = AutoTokenizer.from_pretrained(
    "NousResearch/Meta-Llama-3-8B-Instruct",
    cache_dir="Models",
)

# rank = 96
# config = LoraConfig(
#     r=rank, lora_alpha=rank*2,
#     target_modules=[
#                     'q_proj',
#                     'v_proj', 
#                     "k_proj", 
#                     "o_proj", 
#                     # "gate_proj", 
#                     "up_proj", 
#                     "down_proj"
#                     ],  #   , 
#     lora_dropout=0.0,
#     bias="none", task_type="CAUSAL_LM",
#     # use_rslora=True,
#     use_dora=True,
#     # init_lora_weights="gaussian",
# )

from utils import norm_model_weights

# Pass the loaded model through the project's norm_model_weights helper and keep the result.
model = norm_model_weights(model)
# Record the target repo id (organisation prefix + adapter name) in the config.
model.config.name_or_path = f"MesozoicMetallurgist/{lora_name}"
        

# model.save_pretrained("Models/fixedscaling")
# base_model = None
# base_model = AutoModelForCausalLM.from_pretrained(model_name_to_beat, **params, cache_dir="Models")
# for name, param in base_model.named_parameters():
#     param.requires_grad = False

# `lora_model` is bound to the very same object as `model`; the PEFT wrapping
# alternatives further down in this cell are all commented out.
lora_model = model
# for name, param in lora_model.named_parameters():
#     if not mistral:
#         param.requires_grad = True
#     else:
#         if ("up_proj" not in name and "down_proj" not in name 
#             and "gate_proj" not in name 
#             and "embed_" not in name
#             # and "q_proj" not in name
#             # and "k_proj" not in name 
#             # and "v_proj" not in name 
#             # and "o_proj" not in name 
#         ):
#             param.requires_grad = True
#         else:
#             param.requires_grad = False

# lora_model = PeftModel.from_pretrained(model, model_id="Ypresian_", is_trainable=True)
# lora_model = lora_model.merge_and_unload(progressbar=True)
# lora_model = get_peft_model(model, config)
# lora_model.print_trainable_parameters()
# tokenizer = AutoTokenizer.from_pretrained("MesozoicMetallurgist/nous-Hauterivian", trust_remote_code=False, use_fast=True, cache_dir="Models")
# lora_model = lora_model.to("cpu")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Run validate_parameters on the current model and print the value it returns.
# The recorded output below shows two per-projection dicts, a list, and a final boolean.
# NOTE(review): validate_parameters is imported in a later cell
# (`from validation import validate_parameters`), so that cell has to run first.
print(validate_parameters(lora_model, print_vals=True))

{'q_proj': 83.640625, 'k_proj': 43.328125, 'v_proj': 19.912109375, 'o_proj': 39.94921875, 'up_proj': 81.15625, 'down_proj': 81.421875}
{'q_proj': 91.0, 'k_proj': 49.5, 'v_proj': 30.5, 'o_proj': 61.0, 'up_proj': 95.5, 'down_proj': 95.5}
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
True


In [7]:
# Same check as the cell above: run validate_parameters on lora_model and print its return value.
# NOTE(review): this duplicates the previous cell; the differing recorded outputs
# come from different executions (In [9] vs In [7]).
print(validate_parameters(lora_model, print_vals=True))

{'q_proj': 145.125, 'k_proj': 24.953125, 'v_proj': 180.875, 'o_proj': 4.4208984375, 'up_proj': 204.5, 'down_proj': 32.375}
{'q_proj': 158.0, 'k_proj': 26.375, 'v_proj': 322.0, 'o_proj': 5.78125, 'up_proj': 260.0, 'down_proj': 35.75}
[0.0, 0.0, 0.25, 0.0, 0.625, 0.0]
True


In [30]:
# Fetch train/eval data from the cortex JSON file via get_data, then evaluate
# lora_model with base_model passed alongside it.
# NOTE(review): base_model is not assigned in any cell visible here (only in
# commented-out lines) — confirm where it comes from.
train_data, eval_data = get_data(
    train_name=['data/cortex_686_8704.json'],
    train_subset=np.arange(32),
    eval_subset=32,
    eval_name='data/cortex_686_8704.json',
    tokenizer=tokenizer,
    shuffle=False,
)
evaluate(
    lora_model,
    eval_data,
    return_to_cpu=True,
    print_stats=True,
    base_model=base_model,
)

No duplicates found in eval
Evaluating ........ Loss: 0.49611994, Base Loss: 0.403597, Lora Diff: 0.17448707, WR: 0.00%, 0epsWR: 0.00%, OShL: 0.00000000


In [5]:
from validation import validate_parameters

# print(validate_parameters(lora_model, print_vals=True))
# print(validate_parameters(base_model, print_vals=True))

In [None]:
# Publish the tokenizer and then the model to the Hub repo named by `upload_name`,
# both as private; the config's name_or_path is set to that repo id first.
# NOTE(review): `upload_name` is not assigned in any cell visible here — confirm where it is set.
model.config.name_or_path = upload_name
tokenizer.push_to_hub(repo_id=upload_name, private=True)
commit_info = model.push_to_hub(repo_id=upload_name, safe_serialization=True, private=True)

In [None]:
# Print name, tensor and shape for every parameter whose name does not contain "lora".
for name, param in lora_model.named_parameters():
    if "lora" in name:
        continue
    print(name, param, param.data.shape)

In [6]:
# Same data/eval call as the earlier 32-sample cell: fetch data from the cortex
# JSON file via get_data, then evaluate lora_model with base_model passed in.
# NOTE(review): base_model is not assigned in any cell visible here — confirm its origin.
train_data, eval_data = get_data(
    train_name=['data/cortex_686_8704.json'],
    train_subset=np.arange(32),
    eval_subset=32,
    eval_name='data/cortex_686_8704.json',
    tokenizer=tokenizer,
    shuffle=False,
)
evaluate(
    lora_model,
    eval_data,
    return_to_cpu=True,
    print_stats=True,
    base_model=base_model,
)

No duplicates found in eval
Evaluating ........ Loss: 0.00985266, Base Loss: 0.403597, Lora Diff: -0.00016907, WR: 0.00%, 0epsWR: 56.25%, OShL: 0.00000000


In [37]:
# Variant of the evaluation cells with eval_subset raised to 512; everything
# else matches the 32-sample calls.
# NOTE(review): base_model is not assigned in any cell visible here — confirm its origin.
train_data, eval_data = get_data(
    train_name=['data/cortex_686_8704.json'],
    train_subset=np.arange(32),
    eval_subset=512,
    eval_name='data/cortex_686_8704.json',
    tokenizer=tokenizer,
    shuffle=False,
)
evaluate(
    lora_model,
    eval_data,
    return_to_cpu=True,
    print_stats=True,
    base_model=base_model,
)

No duplicates found in eval
Evaluating ........ Loss: 0.01000053, Base Loss: 0.665697, Lora Diff: 0.00001144, WR: 2.34%, 0epsWR: 50.29%, OShL: 0.00000000


In [11]:
# Both datasets start empty. Per the recorded log below, train() reacts by
# acquiring its own training/evaluation data and enabling infinite training.
train_data, eval_data = [], []

train(
    lora_model,
    tokenizer,
    train_data,
    eval_data,
    base_model=base_model,
    inf_training=False,
    training_device="cuda",
    # --- batching ---
    acc_batch_size=512,
    instruction_finetuing=True,  # keyword spelling kept exactly as train() is called here
    precalculate_batch_mult=2.25,
    # --- optimiser / schedule ---
    lr=1.6e-5,
    weight_decay=0.0,
    lr_scheduler="cosine",
    warmup_steps=4,
    warmup_end_offset=10,
    betas=(0.8, 0.95),
    use_sam=False,
    sam_rho=0.05,
    adaptive_sam=False,
    wsam_variant=True,
    opt="adamw",
    # --- loss thresholds ---
    loss_eps=0.02,
    overshoot_buffer=-0.01,
    true_eps=0.01,
    ignore_below=0.0,
    ignore_auto_percent=1,
    # --- remerge / revert ---
    remerging=False,
    revert=False,
    # remerging=True, revert=True,
    remerge_eval=False,
    remerge_ratio=0.3,
    eval_revert_if={"loss": 0.001, "head_to_head": -6.25, "eps0_head_to_head": -12.5},
    # --- loss mode ---
    # The recorded log reports that simple_loss disables base-model processing,
    # which in turn disables ignore_overshot_samples.
    simple_loss=True,
    process_base_loss=True,
    precalc_eval_base=True,
    relative_loss=False,
    # --- gradient clipping ---
    manual_grad_clip_norm=1.0,
    sam_grad_clip_norm=None,
    # --- sample handling ---
    add_overshoot_penalty=False,
    ignore_overshot_samples=True,
    bad_sample_mult=1.0,
    # --- evaluation / saving ---
    eval_steps=2048,
    do_save=False,
    save_name=lora_name,
    save_n_start=0,
    average_stats=False,
    # --- memory ---
    gradient_checkpointing=False,
    excessive_cache_clearing=False,
)

WARN: no training data provided, enabling infinite training
WARN: no evaluation data provided, acquiring new data
WARN: no training data provided, acquiring new data
WARN: simple loss is enabled, this will disable base model processing
Base loss processing is disabled, disabling ignore overshot samples
Note: precalced eval base loss does not account for pretrained fine-tuning
........Eval Base Loss: 0.612937
........Step 512/1792	Loss: 0.827209 OShL: 0.000e+00	Base: 0.6129 Diff: 2.1427e-01 	WR: 23.05% 0eps: 23.83%  	LR: 4.00e-06 fit: 0/0
........Step 1024/1280	Loss: 0.823925 OShL: 0.000e+00	Base: 0.6129 Diff: 2.1099e-01 	WR: 19.92% 0eps: 20.70%  	LR: 8.00e-06 fit: 0/0
........Step 1536/1920	Loss: 0.704217 OShL: 0.000e+00	Base: 0.6129 Diff: 9.1280e-02 	WR: 43.75% 0eps: 43.75%  	LR: 1.20e-05 fit: 0/0
........Step 2048/1408	Loss: 0.679326 OShL: 0.000e+00	Base: 0.6129 Diff: 6.6389e-02 	WR: 49.41% 0eps: 50.00%  	LR: 1.60e-05 fit: 0/0
Evaluating ........ Loss: 0.01009590, Base Loss: 0.612937

KeyboardInterrupt: 

In [None]:
from validation import *

In [None]:
# Four pairwise validate_improvement runs between published checkpoints, each
# with a single run over 768 samples and dedup disabled.
# NOTE(review): which positional argument is the baseline and which the candidate,
# and what the third positional `False` controls, is not visible here — confirm
# against validation.py.
validate_improvement("tomaszki/stablelm-1", "MesozoicMetallurgist/zeta-Ladinian", False, n_runs=1, samples=768, dedup=False)
validate_improvement("MesozoicMetallurgist/zeta-Anisian", "0x0dad0/beta_s03", False, n_runs=1, samples=768, dedup=False)
validate_improvement("MesozoicMetallurgist/zeta-Anisian", "tomaszki/stablelm-0", False, n_runs=1, samples=768, dedup=False)
validate_improvement("MesozoicMetallurgist/zeta-Induan", "MesozoicMetallurgist/zeta-Anisian", False, n_runs=1, samples=768, dedup=False)

In [12]:
### Save the model
# save_pretrained is given `lora_name` as its target, so the output path is
# whatever adapter name was selected in the configuration cell.
lora_model.save_pretrained(lora_name)

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

In [13]:
# Fold adapter weights into the base weights when `lora_model` is a PEFT wrapper.
# In this notebook `lora_model` is currently bound straight to the plain
# Transformers model (`lora_model = model`), which has no `merge_and_unload`;
# calling it unconditionally would raise AttributeError, so the merge is guarded.
if hasattr(lora_model, "merge_and_unload"):
    # Perform the merge on the GPU, as the original cell did.
    lora_model = lora_model.to("cuda")
    lora_model = lora_model.merge_and_unload()
else:
    print("lora_model has no merge_and_unload(); nothing to merge")
# lora_model = norm_model_weights(lora_model)
# simple_eval(lora_model, eval_data)
# Leave the model on the CPU in either case.
lora_model = lora_model.to("cpu")

In [4]:
import shutil  # needed only by this cell, for removing sub-directories

# Label the config with the repo id used for the exported model.
lora_model.config.name_or_path = "MesozoicMetallurgist/new_model"
# model_dir = "Models/merged_model"
model_dir = os.path.expanduser("~/finetuning-subnet/merged_model_l3")

# Start from an empty export directory. The previous wipe called os.remove on
# every entry, which fails as soon as the directory contains a sub-directory;
# directories are now removed recursively and everything else is unlinked.
if os.path.isdir(model_dir):
    for entry in os.listdir(model_dir):
        entry_path = os.path.join(model_dir, entry)
        if os.path.isdir(entry_path) and not os.path.islink(entry_path):
            shutil.rmtree(entry_path)
        else:
            os.remove(entry_path)
else:
    os.makedirs(model_dir, exist_ok=True)

# Write the model (safetensors) and the tokenizer side by side into model_dir.
lora_model.save_pretrained(save_directory=model_dir, safe_serialization=True)
tokenizer.save_pretrained(save_directory=model_dir)

# Make sure the model sits on the CPU, then release Python garbage and the
# CUDA caching allocator's unused memory.
lora_model = lora_model.to("cpu")
gc.collect()
torch.cuda.empty_cache()

In [8]:
# Evaluate lora_model on eval_data again; unlike the evaluate calls in the
# cells above, no base_model argument is passed here.
evaluate(lora_model, eval_data, return_to_cpu=True, print_stats=True)

Evaluating ........ Loss: 0.00798187, Base Loss: 0.557989, Lora Diff: -0.00123047, WR: 18.10%, 0epsWR: 66.80%, OShL: 0.00037500
