In [1]:
# python3 -m jupyterlab --no-browser --ip=0.0.0.0 --port=5050 --allow-root --NotebookApp.token='local'
import json
import transformers
import time
import logging
import os
import wandb
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
import datasets
import numpy as np
from datasets import load_dataset, load_metric
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from src.models.modeling_roberta import RobertaForSequenceClassification
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version
from train_utils import *

# helper to init and set hyperparams for Ray Tune search
def model_init(peft_config, hyperparams = None, use_monarch = True):
    torch.manual_seed(42)
    model_name_or_path = "roberta-large"
    model = RobertaForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=model_name_or_path,
        from_tf=bool(".ckpt" in model_name_or_path),
    )
    
    # Hyperparameter search
    if hyperparams is not None:
        for k in peft_config.keys():
            if k in hyperparams.keys():
                print("Overriding {} with {}".format(k, peft_config[k]))
                peft_config[k] = hyperparams[k]

    if use_monarch:
        model.roberta.set_peft_config(peft_config)
    # NOTE: Ray doesn't support torch.compile and it also causes a bug with trainer...
    # if torch.__version__.startswith("2") and not do_tune:
    #     model = torch.compile(model)
    return model

[1709388724.560207] [80c05f596a00:21676:f]        vfs_fuse.c:281  UCX  ERROR inotify_add_watch(/tmp) failed: No space left on device
[2024-03-02 14:12:18,472] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# lora_dict = torch.load("results/lora/cola/model/checkpoint-13547/pytorch_model.bin")
# for name, module in lora_dict.items():
#     if "lora" in name:
#         print(name, ",", module.shape)

## Check layer-wise differences

In [5]:
# Enter checkpoints you want to compare
ckpt_1 = "main_init.pt"
ckpt_2 = "train_init.pt"
ckpt_1 = torch.load(ckpt_1, map_location='cpu')
ckpt_2 = torch.load(ckpt_2, map_location='cpu')
for name, module in ckpt_1.items():
    if not torch.allclose(module, ckpt_2[name]):
        print(name, " is updated")


In [9]:
peft_config = json.load("task_configs/glue_peft_configs/peft_monarch.json")
model = model_init(peft_config, use_monarch = True)
init_dict = model.state_dict() # initial weights
trained_model = torch.load

for name, module in init_dict.items():

    if not torch.allclose(module.cuda(), monarch_dict[name]):
        print(name, " is updated")

roberta.encoder.layer.0.attention.self.query.bias  is updated
roberta.encoder.layer.0.attention.self.query.blkdiag1  is updated
roberta.encoder.layer.0.attention.self.key.bias  is updated
roberta.encoder.layer.0.attention.self.key.blkdiag1  is updated
roberta.encoder.layer.0.attention.self.value.bias  is updated
roberta.encoder.layer.0.attention.self.value.blkdiag1  is updated
roberta.encoder.layer.1.attention.self.query.bias  is updated
roberta.encoder.layer.1.attention.self.query.blkdiag1  is updated
roberta.encoder.layer.1.attention.self.key.bias  is updated
roberta.encoder.layer.1.attention.self.key.blkdiag1  is updated
roberta.encoder.layer.1.attention.self.value.bias  is updated
roberta.encoder.layer.1.attention.self.value.blkdiag1  is updated
roberta.encoder.layer.2.attention.self.query.bias  is updated
roberta.encoder.layer.2.attention.self.query.blkdiag1  is updated
roberta.encoder.layer.2.attention.self.key.bias  is updated
roberta.encoder.layer.2.attention.self.key.blkdiag1 

## Test grad hook (ignore this please)

In [13]:
import torch.nn as nn
import torch.nn.functional as F
import torch

def hook_fn(module, grad_input, grad_output):
    print(f"{module} has dW {grad_input[1]} and scaler value {module.scaler}")
    
    
def factor_balance(mid_blksz, out_blksz):
    total = mid_blksz * out_blksz

class Scaler(nn.Module):
    def __init__(self, out_features):
        super().__init__()
        self.scaler = nn.Parameter(torch.zeros(1))
        
    def forward(self, x):
        # x.requires_grad_(True)
        x = self.scaler * x
        # layernorm to avoid vanishing gradient
        return x

x = torch.ones(100, 100, dtype=torch.float32)
y = torch.full((100, 100), 2, dtype=torch.float32)
model = Scaler(100)
model.register_backward_hook(hook_fn)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
for i in range(100):
    loss = F.mse_loss(model(x), y)
    loss.backward()
    optimizer.step()
    

Scaler() has dW None and scaler value Parameter containing:
tensor([0.], requires_grad=True)
Scaler() has dW None and scaler value Parameter containing:
tensor([0.4000], requires_grad=True)
Scaler() has dW None and scaler value Parameter containing:
tensor([1.1200], requires_grad=True)
Scaler() has dW None and scaler value Parameter containing:
tensor([2.0160], requires_grad=True)
Scaler() has dW None and scaler value Parameter containing:
tensor([2.9088], requires_grad=True)
Scaler() has dW None and scaler value Parameter containing:
tensor([3.6198], requires_grad=True)
Scaler() has dW None and scaler value Parameter containing:
tensor([4.0069], requires_grad=True)
Scaler() has dW None and scaler value Parameter containing:
tensor([3.9926], requires_grad=True)
Scaler() has dW None and scaler value Parameter containing:
tensor([3.5798], requires_grad=True)
Scaler() has dW None and scaler value Parameter containing:
tensor([2.8510], requires_grad=True)
Scaler() has dW None and scaler va