In [3]:
# To work inside the Docker environment, start JupyterLab in the container with the command below,
# then connect to the Jupyter server at http://127.0.0.1:5050/lab?token=  (default password is 'local').
# python3 -m jupyterlab --no-browser --ip=0.0.0.0 --port=5050 --allow-root --NotebookApp.token='local' --NotebookApp.password='local'
# Setup cell: enable autoreload, extend the import path, and import the project modules.
# autoreload mode 2 re-imports edited modules before each cell executes.
%load_ext autoreload
%autoreload 2
import sys
# Append the parent directory to the import path — presumably where `tests/` and `src/` live; verify.
sys.path.append("../")
import os
# Print the working directory; the relative "../" entry above is resolved against it.
print(os.getcwd())
import torch
# NOTE(review): the recorded output of this cell ends in
# "ModuleNotFoundError: No module named 'tests.ops'", so the import below failed on the last run.
# Possibly another `tests` package is found first, or the notebook was started from a different
# directory — confirm before relying on Restart & Run All.
from tests.ops.test_blockdiag_butterfly_projection import test_trained_weight_approx
from src.models.layers.monarch_linear import MonarchLinear
import bitsandbytes

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ModuleNotFoundError: No module named 'tests.ops'

## Basic usage 

In [None]:
import json
from train_utils import param_stats, PEFT_CONFIG_PATH

# Load the PEFT settings; the context manager closes the file handle as soon as parsing is done
# (the previous `json.load(open(...))` left it open until garbage collection).
with open(PEFT_CONFIG_PATH, "r") as config_file:
    peft_config = json.load(config_file)
# Build a 1024 -> 1024 Monarch layer driven by the loaded config.
monarch = MonarchLinear(in_features=1024, out_features=1024, peft_config=peft_config)

# Smoke-test the forward pass on a random batch of 16 inputs (requires a CUDA device).
x = torch.randn(16, 1024, device="cuda")
print("out.shape:", monarch(x).shape)
print("monarch factor shape (nblocks, block rank, block size): ", monarch.blkdiag1.shape, monarch.blkdiag2.shape)
# Report parameter counts, then show the layer repr as the cell's output.
param_stats(monarch)
monarch

out.shape: torch.Size([16, 1024])
monarch factor shape (nblocks, block rank, block size):  torch.Size([4, 4, 256]) torch.Size([4, 256, 4])
Total parameters: 0.01M,
         trainable parameters: 0.01M (100.00%)


MonarchLinear(in_features=1024, out_features=1024, nblocks=4, requires_grad=True)

### Setting layer config

In [None]:
# Print the config options a user is most likely to change, then display the full config dict.
# The first label is a plain string: the original f-prefix had no placeholders to format.
print("Names of exact layers to adapt with Monarch: ", peft_config["layers_to_adapt"])
print("Can also modify the below options:")
# NOTE(review): the key name "q_v" suggests query/value rather than "querys and keys" —
# confirm against train_utils and correct the label if so.
print("adapt querys and keys only: ", peft_config["q_v"])
print("adapt mlp:", peft_config["mlp"])
print("making block rank = block size:", peft_config["square"])
# Can safely ignore other settings
peft_config

Names of exact layers to adapt with Monarch:  ['query', 'value', 'key']
Can also modify the below options:
adapt querys and keys only:  False
adapt mlp: False
making block rank = block size: False


{'monarch': True,
 'square': False,
 'nblocks': 4,
 'blk_r': 4,
 'blk_sz': None,
 'layers_to_adapt': ['query', 'value', 'key'],
 'q_v': False,
 'adapter': True,
 'scaler': False,
 'layernorm': True,
 'large_lr': False,
 'new_lr': 0.005,
 'scaler_type': 'scaler',
 'from_lora': '',
 'mlp': False,
 'lora_style_init': False,
 'use_mult_factor': False,
 'affine': False}

In [None]:
# Override the block rank and block size in the shared config, then rebuild the layer
# to show how the two factor shapes change.
peft_config.update({"blk_r": 8, "blk_sz": 512})
monarch = MonarchLinear(in_features=1024, out_features=1024, peft_config=peft_config)
print("monarch factor shape:", *(factor.shape for factor in (monarch.blkdiag1, monarch.blkdiag2)))

monarch factor shape: torch.Size([2, 8, 512]) torch.Size([2, 512, 8])


## Dense matrix approximation

In [None]:
# Project a dense weight matrix onto two Monarch factors using SVD.
# We don't use this in our setup; if curious, see blockdiag_butterfly_project_einsum_rank.
# NOTE(review): torch.testing.assert_allclose is deprecated in recent PyTorch releases in favor of
# torch.testing.assert_close — confirm against the installed version before re-enabling this cell.
# from torch.testing import assert_allclose
# from copy import deepcopy
# m, n = 1024, 512
# weights = torch.randn(m, n, device="cuda")
# monarch = MonarchLinear(in_features=n, out_features=m, weights=weights, peft_config=peft_config)
# x = torch.eye(n, device="cuda")
# assert_allclose(monarch(x),  x @ weights.T )


## LoRA-style model adaptation for (theoretically) any model from Hugging Face

In [None]:
from transformers import (
    AutoConfig,
    AutoTokenizer,
    set_seed,
    AutoModel
)
# Load DeBERTa-large: the following cells adapt DeBERTa modules (`in_proj` inside
# DebertaAttention), and the recorded outputs (386.39M parameters, DebertaModel) come from
# this checkpoint. The Llama id was previously active, which made the notebook
# non-reproducible from a fresh kernel.
model_name = "microsoft/deberta-large"
# Alternative (different architecture — needs its own layers_to_adapt / target_classes):
# model_name = "meta-llama/Llama-2-7b"
model = AutoModel.from_pretrained(model_name)
# Baseline parameter counts before any adaptation.
param_stats(model)

Total parameters: 386.39M,
         trainable parameters: 386.39M (100.00%)


### Look up the specific layer names to adapt

In [None]:
# Display the module tree; the submodule names shown here (e.g. `in_proj`) are the
# candidates for peft_config["layers_to_adapt"].
model

DebertaModel(
  (embeddings): DebertaEmbeddings(
    (word_embeddings): Embedding(50265, 1024, padding_idx=0)
    (LayerNorm): DebertaLayerNorm()
    (dropout): StableDropout()
  )
  (encoder): DebertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x DebertaLayer(
        (attention): DebertaAttention(
          (self): DisentangledSelfAttention(
            (in_proj): Linear(in_features=1024, out_features=3072, bias=False)
            (pos_dropout): StableDropout()
            (pos_proj): Linear(in_features=1024, out_features=1024, bias=False)
            (pos_q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): StableDropout()
          )
          (output): DebertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): DebertaLayerNorm()
            (dropout): StableDropout()
          )
        )
        (intermediate): DebertaIntermediate(
          (dense): Linear(in_features=1024, out_f

In [None]:
from transformers.models.deberta.modeling_deberta import DebertaAttention, DebertaIntermediate
from train_utils import init_monarch_layers

# Reload the config from disk so overrides made in earlier cells do not carry over.
# The context manager closes the file handle (the previous `json.load(open(...))` did not).
with open(PEFT_CONFIG_PATH, "r") as config_file:
    peft_config = json.load(config_file)
# `in_proj` is the Linear(1024 -> 3072) inside DisentangledSelfAttention in the module tree.
peft_config['layers_to_adapt'] = ["in_proj"]
# Per the recorded output, this adapts each matching `in_proj` with Monarch factors and
# leaves only about 0.10% of the parameters trainable.
init_monarch_layers(model, peft_config, target_classes=[DebertaAttention, DebertaIntermediate])
param_stats(model)

Adapted in_proj (3072, 1024) with monarch layers: torch.Size([4, 4, 256]), torch.Size([4, 768, 4])
Total parameters: 386.77M,
         trainable parameters: 0.38M (0.10%)


## Want to see which layers are adapted? 🥹

In [None]:
# List every trainable parameter with its size and shape; per the recorded output these are
# the blkdiag1 / blkdiag2 factors of each adapted `in_proj`.
param_stats(model, print_trainable=True)

encoder.layer.0.attention.self.in_proj.blkdiag1 : 0.0039M, torch.Size([4, 4, 256])
encoder.layer.0.attention.self.in_proj.blkdiag2 : 0.0117M, torch.Size([4, 768, 4])
encoder.layer.1.attention.self.in_proj.blkdiag1 : 0.0039M, torch.Size([4, 4, 256])
encoder.layer.1.attention.self.in_proj.blkdiag2 : 0.0117M, torch.Size([4, 768, 4])
encoder.layer.2.attention.self.in_proj.blkdiag1 : 0.0039M, torch.Size([4, 4, 256])
encoder.layer.2.attention.self.in_proj.blkdiag2 : 0.0117M, torch.Size([4, 768, 4])
encoder.layer.3.attention.self.in_proj.blkdiag1 : 0.0039M, torch.Size([4, 4, 256])
encoder.layer.3.attention.self.in_proj.blkdiag2 : 0.0117M, torch.Size([4, 768, 4])
encoder.layer.4.attention.self.in_proj.blkdiag1 : 0.0039M, torch.Size([4, 4, 256])
encoder.layer.4.attention.self.in_proj.blkdiag2 : 0.0117M, torch.Size([4, 768, 4])
encoder.layer.5.attention.self.in_proj.blkdiag1 : 0.0039M, torch.Size([4, 4, 256])
encoder.layer.5.attention.self.in_proj.blkdiag2 : 0.0117M, torch.Size([4, 768, 4])
enco