In [1]:
import random
import sys
import time
import gc

import numpy as np

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

from cortexsubsetloader import CortexSubsetLoader

# Model i is always the pre-finetune model, so it is the one whose loss gets the epsilon buffer in iswin().
# Model j is the model after our finetune.

def iswin(loss_i, loss_j, epsilon=0.01):
    """Report whether model j beats model i on one pair of losses.

    Model i's loss is scaled by ``(1 - epsilon)`` before the comparison, so
    model j only wins when its loss is strictly below that discounted value.

    Args:
        loss_i: Loss of model i (the pre-finetune model).
        loss_j: Loss of model j (the post-finetune model).
        epsilon: Fractional discount applied to ``loss_i``.

    Returns:
        The result of ``(1 - epsilon) * loss_i > loss_j``.
    """
    return loss_j < (1 - epsilon) * loss_i

def compute_losses(model, batches, device):
    """Compute the response-only loss of ``model`` on every batch.

    The model is moved to ``device`` and put in eval mode, and all forward
    passes run under ``torch.inference_mode()``. A progress dot is printed
    roughly 16 times over the course of the call.

    Args:
        model: Callable as ``model(inputs, labels=labels)`` and returning an
            object with a scalar ``.loss`` tensor; must also support
            ``.to(device)`` and ``.eval()``.
        batches: Sized sequence of ``(inputs, prompt_len)`` pairs, where
            ``inputs`` is a 2-D token tensor and ``prompt_len`` is the number
            of leading positions to exclude from the loss.
        device: Device the model and inputs are moved to.

    Returns:
        A list with one Python float loss per batch, in input order.
    """
    losses = []
    # Aim for ~16 progress dots; clamp to 1 so that fewer than 16 batches does
    # not turn the modulo below into a division by zero.
    progress_every = max(1, len(batches) // 16)
    print()
    with torch.inference_mode():
        model.to(device)
        model.eval()
        for step, (inputs, prompt_len) in enumerate(batches, start=1):
            inputs = inputs.to(device)
            labels = inputs.clone()
            # Only calculate loss on the response: prompt positions are set to
            # -100, the label value the loss is expected to ignore.
            labels[:, :prompt_len] = -100
            outputs = model(inputs, labels=labels)
            losses.append(outputs.loss.item())  # Extract scalar loss value
            if step % progress_every == 0:
                print(".", end="")
    return losses


def norm_model_weights(model):
    """Rescale paired projection weights of ``model`` in place and return it.

    Iterates ``model.named_parameters()`` once, matching parameters by a
    substring of their name. For each pair below, the earlier-seen parameter
    is divided by a multiplier and the later-seen one is multiplied by it:

    * ``q_proj`` / ``k_proj``: the multiplier is collapsed to a single scalar.
    * ``v_proj`` / ``o_proj``: one multiplier per column of the ``o_proj`` weight.
    * ``up_proj`` / ``down_proj``: one multiplier per column of the
      ``down_proj`` weight.

    Every multiplier is ``sqrt(mean|first| over dim=1 / mean|second| over dim=0)``.

    The pairing relies on iteration order: each second-of-pair parameter is
    combined with the most recently seen first-of-pair parameter.

    NOTE(review): presumably the intent is to balance the magnitudes of each
    pair while leaving the model's outputs unchanged — confirm for the target
    architecture (e.g. grouped-query attention, projection biases).

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; parameters are modified via ``.data``.

    Returns:
        The same ``model`` object that was passed in.
    """
    # Most recently seen first-of-pair parameters and multipliers.
    last_q = None    # latest q_proj non-bias parameter
    lqb = None       # latest q_proj bias
    lqkm = None      # latest scalar q/k multiplier (reused for the k_proj bias)
    last_v = None    # latest v_proj non-bias parameter
    lvb = None       # latest v_proj bias
    lvom = None      # latest v/o multiplier (reused for the o_proj bias)
    last_up = None   # latest up_proj parameter
    bias = False     # becomes True once any q_proj bias has been seen; never reset
    for name, param in model.named_parameters():
        if "q_proj" in name:
            if "bias" in name:
                bias = True
                lqb = param
            else:
                last_q = param
        if "k_proj" in name:
            if "bias" in name and lqkm is not None:
                # Scale the k bias by the multiplier computed for the latest k weight.
                param.data = param.data * lqkm
            else:
                # NOTE(review): a k_proj bias seen while lqkm is still None also
                # lands in this branch and is treated like a weight.
                # assumes 2-D weights whose shapes make the division below
                # broadcast (q rows vs. k columns) — TODO confirm
                mult = torch.sqrt(torch.mean(torch.abs(last_q.data), dim=1, keepdim=True).transpose(0, 1) / 
                                torch.mean(torch.abs(param.data), dim=0, keepdim=True))
                # Collapse to one scalar shared by the whole q/k pair.
                mult = torch.mean(mult)
                last_q.data = last_q.data / mult#.transpose(0, 1)
                if bias:
                    lqb.data = lqb.data / mult#.transpose(0, 1).flatten()
                param.data = param.data * mult
                lqkm = mult
                
        if "v_proj" in name:
            if "bias" in name:
                lvb = param
            else:
                last_v = param
        if "o_proj" in name:
            if "bias" in name:
                # NOTE(review): lvom is None if no o_proj weight has been seen
                # yet; also confirm that the o_proj bias should be scaled at all.
                param.data = param.data * lvom
            else:
                # The per-row v statistic is tiled by
                # int(o_proj rows / v_proj rows) so that it lines up with the
                # o_proj columns.
                # NOTE(review): tiling with .repeat may not match how a
                # grouped-query model expands its value heads — confirm.
                mult = torch.sqrt(torch.mean(torch.abs(last_v.data), dim=1, keepdim=True).transpose(0, 1).repeat(1, 
                                                                            int(param.data.shape[0] / last_v.data.shape[0])) / 
                                torch.mean(torch.abs(param.data), dim=0, keepdim=True))
                # Only the first v_proj-rows entries of the multiplier are applied to v.
                last_v.data = last_v.data / mult.transpose(0, 1)[:last_v.data.shape[0]]
                # NOTE(review): this tests the flag set by a q_proj bias, not
                # whether a v_proj bias was actually seen (lvb may be None).
                if bias:
                    lvb.data = lvb.data / mult.transpose(0, 1).flatten()
                param.data = param.data * mult
                lvom = mult

        if "up_proj" in name:
            last_up = param
        if "down_proj" in name:
            # print(last_up.data.shape, param.data.shape)
            # One multiplier per down_proj column / up_proj row.
            mult = torch.sqrt(torch.mean(torch.abs(last_up.data), dim=1, keepdim=True).transpose(0, 1) / 
                            torch.mean(torch.abs(param.data), dim=0, keepdim=True))
            last_up.data = last_up.data / mult.transpose(0, 1)
            param.data = param.data * mult
            # print(mult, mult.shape)
    return model


def _is_duplicate_batch(batch, prev_data):
    """Return True if ``batch``'s first token row equals that of any batch in ``prev_data``."""
    return any(
        batch[0].shape == prev[0].shape and bool((batch[0][0] == prev[0][0]).all())
        for prev in prev_data
    )


def validate_improvement(test_model_name, lora_name, lora=True, rescale_for_lora=False, n_runs=10, gpu=True, samples=400, dedup=True,
                         mistral=False):
    """Compare a candidate model against a baseline on freshly sampled data.

    For each run, a new set of batches is drawn with ``CortexSubsetLoader``,
    the loss of the baseline and of the candidate is computed on every batch
    with ``compute_losses``, and per-batch wins are counted with ``iswin``
    (once with the default epsilon, once with ``epsilon=0.0``). Per-run and
    final averaged statistics are printed.

    Args:
        test_model_name: Baseline model: a name passed to
            ``AutoModelForCausalLM.from_pretrained`` or an already-loaded model.
        lora_name: Candidate. With ``lora=True`` it is passed to
            ``PeftModel.from_pretrained`` as the adapter to apply on top of the
            baseline; with ``lora=False`` it is a model name or an
            already-loaded model.
        lora: Whether the candidate is an adapter merged into the baseline
            (True) or a separate full model (False).
        rescale_for_lora: If True (and ``lora`` is True), run
            ``norm_model_weights`` on the baseline before applying the adapter.
        n_runs: Number of sampling/evaluation rounds.
        gpu: Evaluate on ``"cuda:0"`` if True, otherwise on ``"cpu"``.
        samples: Passed to the loader as both ``max_samples`` and ``page_size``.
        dedup: If True, drop batches whose first token row matches a batch
            evaluated in an earlier run. Batches within one run are not
            compared with each other.
        mistral: Selects the Mistral tokenizer instead of the StableLM one.

    Returns:
        None. All results are printed.

    NOTE(review): when ``test_model_name`` is a model object and ``lora`` is
    True, ``norm_model_weights`` modifies it in place, and merging the adapter
    presumably does too, so later runs would not start from the original
    baseline — confirm before using ``n_runs > 1`` that way.
    """
    if isinstance(test_model_name, str) and isinstance(lora_name, str):
        print("Testing", test_model_name, "against", lora_name)
    win_count = 0
    win_count_0eps = 0
    avg_loss_diff = 0
    avg_loss_i = 0
    avg_loss_j = 0
    prev_data = []

    device = "cuda:0" if gpu else "cpu"
    # The tokenizer does not change between runs, so load it once.
    if not mistral:
        tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-2-zephyr-1_6b", cache_dir="Models")
    else:
        tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", cache_dir="Models")

    n_runs_done = 0
    for _ in range(n_runs):
        cortex_data = CortexSubsetLoader(
            latest=True, running=True,
            random_seed=random.randint(0, sys.maxsize),
            max_samples=samples, page_size=samples,
            steps=1,
        )

        batches = cortex_data.tokenize(tokenizer)
        dedup_batches = [batch for batch in batches if not _is_duplicate_batch(batch, prev_data)]
        # Count before `batches` is possibly rebound, otherwise the reported
        # number is always zero.
        n_duplicates = len(batches) - len(dedup_batches)
        if n_duplicates > 0:
            if dedup:
                batches = dedup_batches
                print("Removed", n_duplicates, "duplicate batches")
            else:
                print("Found", n_duplicates, "duplicate batches, not deduplicating")
        else:
            print("No duplicates found")

        if len(batches) == 0:
            # Nothing to evaluate; averaging empty loss lists would give NaN.
            print("No batches left to evaluate, skipping run")
            continue

        if isinstance(test_model_name, str):
            test_model = AutoModelForCausalLM.from_pretrained(test_model_name, **params, cache_dir="Models")
        else:
            test_model = test_model_name
        for param in test_model.parameters():
            param.requires_grad = False

        base_loss = compute_losses(test_model, batches, device)

        if lora:
            if rescale_for_lora:
                test_model = norm_model_weights(test_model)
            lora_model = PeftModel.from_pretrained(test_model, lora_name, adapter_name=lora_name)
            lora_model = lora_model.merge_and_unload()
        else:
            # Drop the baseline before loading the candidate to free memory.
            test_model = None; gc.collect(); torch.cuda.empty_cache()
            if isinstance(lora_name, str):
                lora_model = AutoModelForCausalLM.from_pretrained(lora_name, **params, cache_dir="Models")
            else:
                lora_model = lora_name
            for param in lora_model.parameters():
                param.requires_grad = False
        lora_loss = compute_losses(lora_model, batches, device)

        per_loss_win = np.mean([iswin(base, new) for base, new in zip(base_loss, lora_loss)])
        per_loss_win_b = np.mean([iswin(base, new, epsilon=0.0) for base, new in zip(base_loss, lora_loss)])
        win_count += per_loss_win
        win_count_0eps += per_loss_win_b

        avg_loss_i += np.mean(base_loss)
        avg_loss_j += np.mean(lora_loss)
        loss_diff = np.mean(np.array(lora_loss) - np.array(base_loss))
        avg_loss_diff += loss_diff

        print("Base: ", np.mean(base_loss), "\tNew: ", np.mean(lora_loss), "\tDiff: ", loss_diff)
        print("\tWinRate: ", per_loss_win, "\t0Eps WinRate: ", per_loss_win_b)

        n_runs_done += 1
        prev_data.extend(batches)
        del test_model, lora_model; gc.collect(); torch.cuda.empty_cache()

    if n_runs_done == 0:
        # Avoid dividing by zero when no run produced results.
        print()
        print("No runs completed")
        return

    avg_loss_diff /= n_runs_done
    avg_loss_i /= n_runs_done
    avg_loss_j /= n_runs_done

    print()
    print("Avg Loss Base: ", avg_loss_i)
    print("Avg Loss New: ", avg_loss_j)
    print("Avg Loss Diff: ", avg_loss_diff)
    print("Final Win Rate: ", win_count / n_runs_done)
    print("Final 0Eps Win Rate: ", win_count_0eps / n_runs_done)

# Keyword arguments forwarded to the AutoModelForCausalLM.from_pretrained calls in this notebook.
params = dict(
    low_cpu_mem_usage=True,
    trust_remote_code=False,
    torch_dtype=torch.bfloat16,
    use_safetensors=True,
    attn_implementation="flash_attention_2",
)

def print_model_params(model, norm=False):
    """Load a model and print each of its named parameters.

    Args:
        model: Identifier passed to ``AutoModelForCausalLM.from_pretrained``.
        norm: If True, run ``norm_model_weights`` on the loaded model before
            printing.

    Returns:
        None. Each parameter is printed as ``name`` followed by its tensor.
    """
    loaded = AutoModelForCausalLM.from_pretrained(model, **params, cache_dir="Models")
    if norm:
        loaded = norm_model_weights(loaded)
    for entry in loaded.named_parameters():
        # entry is a (name, parameter) pair
        print(*entry)
    # Release the model before returning.
    del loaded
    gc.collect()
    torch.cuda.empty_cache()

def check_matching_weights(model0, model1):
    """Load two models and report which parameters differ between them.

    Parameters are compared pairwise in ``named_parameters()`` order. For
    every pair whose values differ, the names and the summed absolute
    difference are printed; afterwards the mean and standard deviation of
    those sums are printed. A difference in parameter count or in the shape
    of a pair is reported explicitly instead of being silently truncated or
    broadcast.

    Args:
        model0: Identifier passed to ``AutoModelForCausalLM.from_pretrained``.
        model1: Identifier passed to ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        None. All findings are printed.
    """
    model0 = AutoModelForCausalLM.from_pretrained(model0, **params, cache_dir="Models")
    model1 = AutoModelForCausalLM.from_pretrained(model1, **params, cache_dir="Models")
    named0 = list(model0.named_parameters())
    named1 = list(model1.named_parameters())
    mismatch_diffs = []
    any_mismatch = False
    if len(named0) != len(named1):
        # zip() below stops at the shorter list, so surface the difference here.
        any_mismatch = True
        print("Mismatched parameter counts", len(named0), len(named1))
    for (name0, param0), (name1, param1) in zip(named0, named1):
        if param0.shape != param1.shape:
            # Elementwise comparison would either broadcast or raise.
            any_mismatch = True
            print("Mismatched shapes", name0, name1, tuple(param0.shape), tuple(param1.shape))
            continue
        if not (param0.data == param1.data).all():
            any_mismatch = True
            # Accumulate in float32 so the sum is not rounded to the (low
            # precision) storage dtype of the weights.
            diff = torch.sum(torch.abs(param0.data.float() - param1.data.float())).item()
            mismatch_diffs.append(diff)
            print("Mismatched weights", name0, name1, diff)
    if not any_mismatch:
        print("No mismatched weights")
    elif mismatch_diffs:
        print("Mean abs mismatched", np.mean(mismatch_diffs), "Std abs mismatched", np.std(mismatch_diffs))
    del model0, model1; gc.collect(); torch.cuda.empty_cache()

def validate_parameters(base_model, eps_soft=500, eps_soft_percent_threshold=0.3, eps_hard=2500, norm=False):
    """Check the projection weight norms of a model against soft and hard limits.

    For every layer in ``base_model.model.layers`` the norm of the ``weight``
    of ``self_attn.{q,k,v,o}_proj`` and ``mlp.{up,down}_proj`` is computed.
    The per-projection fraction of layers above ``eps_soft`` and the average
    norm per projection are printed.

    Args:
        base_model: A model name passed to
            ``AutoModelForCausalLM.from_pretrained`` or an already-loaded model.
        eps_soft: Norm above which a weight counts as exceeding the soft limit.
        eps_soft_percent_threshold: Maximum allowed mean fraction of
            soft-limit violations over the checked projections.
        eps_hard: Norm above which validation fails immediately.
        norm: If True, run ``norm_model_weights`` on the model first.

    Returns:
        ``False`` as soon as any weight norm exceeds ``eps_hard``; otherwise
        whether the mean soft-violation fraction of ``k_proj``, ``v_proj`` and
        ``up_proj`` is at most ``eps_soft_percent_threshold``.
    """
    if isinstance(base_model, str):
        base_model = AutoModelForCausalLM.from_pretrained(base_model, **params, cache_dir="Models")
    if norm:
        base_model = norm_model_weights(base_model)

    proj_names = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'up_proj', 'down_proj']
    # Where each projection lives inside a layer; attention projections are
    # visited before the MLP ones, as before.
    proj_locations = [
        ('self_attn', 'k_proj'), ('self_attn', 'v_proj'), ('self_attn', 'q_proj'), ('self_attn', 'o_proj'),
        ('mlp', 'up_proj'), ('mlp', 'down_proj'),
    ]
    # NOTE(review): only these three projections feed the soft check (the
    # original indexed positions 1, 2 and 4) — confirm this is intended.
    soft_checked = ['k_proj', 'v_proj', 'up_proj']

    exceed_counts = {proj: 0 for proj in proj_names}
    avg_norms = {proj: 0.0 for proj in proj_names}
    total_counts = {proj: 0 for proj in proj_names}

    try:
        for layer in base_model.model.layers:
            for parent_name, proj in proj_locations:
                weight_norm = getattr(getattr(layer, parent_name), proj).weight.norm().item()
                if weight_norm > eps_hard:
                    return False
                if weight_norm > eps_soft:
                    exceed_counts[proj] += 1
                total_counts[proj] += 1
                avg_norms[proj] += weight_norm

        # Calculating and printing percentages; both statistics fall back to
        # 0.0 when a projection was never seen (e.g. a model with no layers).
        fractions = {
            proj: exceed_counts[proj] / total_counts[proj] if total_counts[proj] > 0 else 0.0
            for proj in proj_names
        }
        percentages = [fractions[proj] for proj in proj_names]
        for key, value in total_counts.items():
            avg_norms[key] = avg_norms[key] / value if value > 0 else 0.0
        print(percentages, avg_norms)
        return np.mean([fractions[proj] for proj in soft_checked]) <= eps_soft_percent_threshold
    finally:
        # Runs on the early hard-limit return as well, so memory is always released.
        del base_model; gc.collect(); torch.cuda.empty_cache()

In [5]:
# Full-model comparison (lora=False): the first model is the baseline, the second the candidate.
validate_improvement(
    "tomaszki/stablelm-1",
    "MesozoicMetallurgist/zeta-Ladinian",
    lora=False,
    n_runs=1,
    samples=768,
    dedup=False,
)

Testing tomaszki/stablelm-1 against MesozoicMetallurgist/zeta-Ladinian
No duplicates found

................

config.json:   0%|          | 0.00/744 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.29G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/121 [00:00<?, ?B/s]


................Base:  0.7294731140136719 	New:  0.6798674265543619 	Diff:  -0.0496056874593099
	WinRate:  0.8138020833333334 	0Eps WinRate:  0.8515625

Avg Loss Base:  0.7294731140136719
Avg Loss New:  0.6798674265543619
Avg Loss Diff:  -0.0496056874593099
Final Win Rate:  0.8138020833333334
Final 0Eps Win Rate:  0.8515625


In [3]:
# Full-model comparison (lora=False): the first model is the baseline, the second the candidate.
validate_improvement(
    "MesozoicMetallurgist/zeta-Anisian",
    "0x0dad0/beta_s03",
    lora=False,
    n_runs=1,
    samples=768,
    dedup=False,
)

Testing MesozoicMetallurgist/zeta-Anisian against 0x0dad0/beta_s03


You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


No duplicates found

................
................Base:  0.7434326807657877 	New:  0.7078625361124674 	Diff:  -0.03557014465332031
	WinRate:  0.5989583333333334 	0Eps WinRate:  0.69140625

Avg Loss Base:  0.7434326807657877
Avg Loss New:  0.7078625361124674
Avg Loss Diff:  -0.03557014465332031
Final Win Rate:  0.5989583333333334
Final 0Eps Win Rate:  0.69140625


In [19]:
# Full-model comparison (lora=False): the first model is the baseline, the second the candidate.
validate_improvement(
    "MesozoicMetallurgist/zeta-Anisian",
    "tomaszki/stablelm-0",
    lora=False,
    n_runs=1,
    samples=768,
    dedup=False,
)

Testing MesozoicMetallurgist/zeta-Anisian against tomaszki/stablelm-0
No duplicates found

................

config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/1 [00:00<?, ?it/s]

model-00001-of-00001.safetensors:   0%|          | 0.00/3.29G [00:00<?, ?B/s]


................Base:  0.6679238080978394 	New:  0.6688983043034872 	Diff:  0.0009744962056477865
	WinRate:  0.13671875 	0Eps WinRate:  0.265625

Avg Loss Base:  0.6679238080978394
Avg Loss New:  0.6688983043034872
Avg Loss Diff:  0.0009744962056477865
Final Win Rate:  0.13671875
Final 0Eps Win Rate:  0.265625


In [15]:
# Full-model comparison (lora=False): the first model is the baseline, the second the candidate.
validate_improvement(
    "MesozoicMetallurgist/zeta-Induan",
    "MesozoicMetallurgist/zeta-Anisian",
    lora=False,
    n_runs=1,
    samples=768,
    dedup=False,
)

Testing MesozoicMetallurgist/zeta-Induan against MesozoicMetallurgist/zeta-Anisian
No duplicates found

................
................Base:  0.749762217203776 	New:  0.6579113006591797 	Diff:  -0.09185091654459636
	WinRate:  0.9375 	0Eps WinRate:  0.9453125

Avg Loss Base:  0.749762217203776
Avg Loss New:  0.6579113006591797
Avg Loss Diff:  -0.09185091654459636
Final Win Rate:  0.9375
Final 0Eps Win Rate:  0.9453125


In [13]:
# Full-model comparison (lora=False): the first model is the baseline, the second the candidate.
validate_improvement(
    "coffiee/s11",
    "MesozoicMetallurgist/zeta-Anisian",
    lora=False,
    n_runs=1,
    samples=768,
    dedup=False,
)

Testing coffiee/s11 against MesozoicMetallurgist/zeta-Anisian
No duplicates found

................
................Base:  0.6664326985677084 	New:  0.6242529551188151 	Diff:  -0.04217974344889323
	WinRate:  0.9921875 	0Eps WinRate:  0.9986979166666666

Avg Loss Base:  0.6664326985677084
Avg Loss New:  0.6242529551188151
Avg Loss Diff:  -0.04217974344889323
Final Win Rate:  0.9921875
Final 0Eps Win Rate:  0.9986979166666666


In [12]:
# Full-model comparison (lora=False): the first model is the baseline, the second the candidate.
validate_improvement(
    "MesozoicMetallurgist/zeta-Anisian",
    "coffiee/s11",
    lora=False,
    n_runs=1,
    samples=768,
    dedup=False,
)

Testing MesozoicMetallurgist/zeta-Anisian against coffiee/s11
No duplicates found

................
................Base:  0.8217595418294271 	New:  0.8159128824869791 	Diff:  -0.005846659342447917
	WinRate:  0.5963541666666666 	0Eps WinRate:  0.6197916666666666

Avg Loss Base:  0.8217595418294271
Avg Loss New:  0.8159128824869791
Avg Loss Diff:  -0.005846659342447917
Final Win Rate:  0.5963541666666666
Final 0Eps Win Rate:  0.6197916666666666
