In [None]:
from transformers import AutoModelForCausalLM

from utils import *
from training import *
from validation import *

In [None]:
# --- Run configuration -------------------------------------------------------
model_type = "stablelm" # mistral, gemma, stablelm

hf_account_name = "" # huggingface.co username
save_name = "" # name to save and upload the model as
model_name = "stabilityai/stablelm-2-zephyr-1_6b" # hf_repo/model_name
model_name_to_beat = model_name # set to the same unless comparing against a different model

# --- Model to be trained -----------------------------------------------------
# `params` is forwarded unchanged as keyword arguments to both from_pretrained() calls below.
params = load_local_config()
model = AutoModelForCausalLM.from_pretrained(model_name, **params)
model.config.name_or_path = save_name
model = model.to("cuda")

tokenizer = get_tokenizer(model_type) # alternatively can input tokenizer repo and model name

model = norm_model_weights(model) # normalize weights to prevent exploding gradients, is not correct or stable for all models, ymmv

# --- Frozen reference model --------------------------------------------------
# The reference model is only compared against, never trained: switch off
# gradients for every parameter in one call.
# NOTE(review): unlike `model`, base_model is not moved to "cuda" in this cell --
# confirm that `params` or Trainer takes care of device placement.
base_model = AutoModelForCausalLM.from_pretrained(model_name_to_beat, **params)
base_model.requires_grad_(False)

trainer = Trainer(model, tokenizer, base_model)

In [None]:
# Optional: uncomment to dump the trained model's parameters before validating.
# print_model_params(model)

# Run the parameter validation on the trained model first, then on the
# reference model, printing whatever each call returns.
for checked_model in (model, base_model):
    print(validate_parameters(checked_model, print_vals=True))

In [None]:
# Launch the training run. Every option is passed by keyword, one per line,
# so a single setting can be changed or diffed in isolation.
trainer.train(
    # optimiser / schedule
    acc_batch_size=512,
    opt="adamw",
    lr=4e-5,
    lr_schedule="cosine",
    weight_decay=0.0,
    betas=(0.9, 0.99),
    max_batch_steps=None,
    warmup_steps=4,
    warmup_cycle_offset=-1,
    # gradient clipping and sample handling
    grad_clip_norm=1.0,
    ignore_overshot_samples=True,
    bad_sample_mult=1.0,
    ignore_sample_loss_below=0.0,
    precalc_batch_mult=2.25,
    # remerging
    remerging=False,
    remerge_ratio=0.75,
    # loss thresholds
    base_relative_loss=False,
    loss_eps=0.02,
    overshoot_buffer=-0.01,
    eval_eps=0.01,
    # evaluation / revert
    eval_n_batches=4,
    eval_size=64,
    revert=False,
    eval_revert_if={"loss": 0.004, "head_to_head": -12.5, "eps0_head_to_head": -22.5},
    # saving: use the name chosen in the configuration cell; fall back to the
    # previous hard-coded "test" while save_name is still left empty
    save_name=save_name or "test",
    do_save=True,
    cortex_steps=5,
    # memory / device
    gradient_checkpointing=True,
    excessive_cache_clearing=False,
    device="cuda",
)

In [None]:
# Compare the trained model against the reference model; the call is left as the
# cell's last expression so its return value (if any) is shown as the cell output.
validate_improvement(
    model,
    base_model,
    samples=768,
    tokenizer_name=model_type,
    dedup=False,
)

In [None]:
# Publish the tokenizer and the trained model to a private Hugging Face Hub repo.
# Both halves of the repo id come from the configuration cell and default to "";
# fail early with a clear message instead of sending an invalid id such as "/"
# to the Hub.
if not hf_account_name or not save_name:
    raise ValueError(
        "Set both hf_account_name and save_name before uploading; "
        f"got hf_account_name={hf_account_name!r}, save_name={save_name!r}"
    )

upload_name = f"{hf_account_name}/{save_name}"
tokenizer.push_to_hub(repo_id=upload_name, private=True)
commit_info = model.push_to_hub(repo_id=upload_name, safe_serialization=True, private=True)
# Print the commit id reported for the model upload.
print(commit_info.oid)