In [1]:
# You might wanna connect to docker env using the command below inside docker, 
# then select jupyter server http://127.0.0.1:5050/lab?token=  (default password is 'local')
# python3 -m jupyterlab --no-browser --ip=0.0.0.0 --port=5050 --allow-root --NotebookApp.token='local' --NotebookApp.password='local'
%load_ext autoreload
%autoreload 2
import sys
import os
sys.path.append("../")
os.chdir("../")
import torch
from src.models.layers.monarch_linear import MonarchLinear
import bitsandbytes

  warn(f"Failed to load image Python extension: {e}")


## Basic usage 

In [2]:
import json 
from train_utils import param_stats

# Read the PEFT settings; the context manager closes the file as soon as the
# JSON has been parsed instead of leaving the handle open.
with open("task_configs/roberta_glue/peft_config.json", "r") as config_file:
    peft_config = json.load(config_file)

# Build a stand-alone 1024 -> 1024 Monarch layer from those settings.
monarch = MonarchLinear(in_features=1024, out_features=1024, peft_config=peft_config, as_adapter=False)

# Push a random batch of 16 vectors through the layer (requires a CUDA device).
x = torch.randn(16, 1024, device="cuda")
print("out.shape:", monarch(x).shape)
# The two factors are stored as the attributes `blkdiag1` and `blkdiag2`.
print("monarch factor shape (nblocks, block rank, block size): ", monarch.blkdiag1.shape, monarch.blkdiag2.shape)
# Print the total / trainable parameter counts of the layer.
param_stats(monarch)
# Last expression: show the layer's repr as the cell output.
monarch

out.shape: torch.Size([16, 1024])
monarch factor shape (nblocks, block rank, block size):  torch.Size([4, 4, 256]) torch.Size([4, 256, 4])
Total parameters: 0.009M,
         trainable parameters: 0.009M (100.000%)


MonarchLinear(in_features=1024, out_features=1024, nblocks=4, requires_grad=True)

### Setting layer config

In [3]:
# Summarise the settings that decide which layers receive Monarch adapters.
print("Names of exact layers to adapt with Monarch: ", peft_config["layers_to_adapt"])
print("Can also modify the below options:")
# NOTE(review): the key is `q_v`, which presumably stands for query/value rather
# than query/key -- confirm against the training code before relying on the label.
for _label, _option in (
    ("adapt querys and keys only: ", "q_v"),
    ("adapt mlp:", "mlp"),
    ("making block rank = block size:", "square"),
):
    print(_label, peft_config[_option])
# The remaining settings can safely be ignored.
# Last expression: display the complete config dictionary.
peft_config

Names of exact layers to adapt with Monarch:  ['query', 'value', 'key']
Can also modify the below options:
adapt querys and keys only:  False
adapt mlp: False
making block rank = block size: False


{'monarch': True,
 'square': False,
 'nblocks': 4,
 'blk_r': 4,
 'blk_sz': None,
 'layers_to_adapt': ['query', 'value', 'key'],
 'q_v': False,
 'adapter': True,
 'scaler': False,
 'layernorm': True,
 'large_lr': False,
 'new_lr': 0.005,
 'scaler_type': 'scaler',
 'from_lora': '',
 'mlp': False,
 'lora_style_init': False,
 'use_mult_factor': False,
 'affine': False}

In [4]:
# Override the block rank and block size on a derived copy, so the config that
# was loaded and displayed earlier keeps matching what those cells showed.
custom_peft_config = {**peft_config, "blk_r": 8, "blk_sz": 512}
monarch = MonarchLinear(in_features=1024, out_features=1024, peft_config=custom_peft_config)
# Print the shapes of the two factors built from the overridden settings.
print("monarch factor shape:", monarch.blkdiag1.shape, monarch.blkdiag2.shape)

monarch factor shape: torch.Size([2, 8, 512]) torch.Size([2, 512, 8])


## Dense matrix approximation

In [20]:
# Project a dense matrix onto two Monarch factors (via SVD, per the original note).
# This projection is not used in our setup; if curious, read
# blockdiag_butterfly_project_einsum_rank.
from src.ops.blockdiag_butterfly_einsum import blockdiag_butterfly_project_einsum_rank
from src.models.layers.blockdiag_butterfly_multiply import blockdiag_butterfly_multiply

nblocks, rank = 4, 1
weights = torch.randn(1024, 1024, device="cuda")
# Identity input: pushing it through the factors presumably materialises the
# dense map they encode (transpose convention of the helper not checked here).
i = torch.eye(1024, device="cuda")
blkdiag1, blkdiag2, rev1, rev2 = blockdiag_butterfly_project_einsum_rank(
    weights, nblocks, nblocks, rank, reverse=True
)

# Disabled experiment: initialise a MonarchLinear from dense weights and compare
# it with the plain matmul.
# from torch.testing import assert_allclose
# from copy import deepcopy
# m, n = 1024, 512
# weights = torch.randn(m, n, device="cuda")
# monarch = MonarchLinear(in_features=n, out_features=m, weights=weights, peft_config=peft_config)
# x = torch.eye(n, device="cuda")
# assert_allclose(monarch(x),  x @ weights.T )

# Cell output: (dense weights minus the product of the first factor pair,
# product of the reversed factor pair).
forward_residual = weights - blockdiag_butterfly_multiply(i, blkdiag1, blkdiag2)
reverse_product = blockdiag_butterfly_multiply(i, rev1, rev2)
forward_residual, reverse_product


(tensor([[ 0.2072, -0.5451, -0.5087,  ...,  1.7620, -0.2555,  1.6147],
         [ 1.1842,  0.5658,  0.7478,  ...,  1.4341,  0.3450,  2.3831],
         [ 0.0557, -1.2869,  1.5041,  ...,  1.0188,  0.0603,  0.2268],
         ...,
         [-0.7492,  0.5547, -1.5522,  ..., -0.7151,  0.5899, -1.0649],
         [ 3.0028, -0.8860, -1.1527,  ..., -1.1671,  1.2047, -0.7684],
         [ 0.2830,  0.6385, -0.0437,  ...,  1.3922, -1.7917, -1.1091]],
        device='cuda:0'),
 tensor([[ 0.2072,  1.0174,  0.0955,  ..., -0.5979,  2.8725,  0.3590],
         [-0.3782,  0.5658, -1.3578,  ...,  0.6550, -0.9059,  0.5369],
         [-0.5486,  0.8186,  1.5042,  ..., -1.3950, -1.2826, -0.3231],
         ...,
         [ 1.6108,  1.3338,  0.8617,  ..., -0.7152, -1.2136,  1.4127],
         [-0.1251,  0.3648,  0.1901,  ...,  0.6362,  1.2047, -1.8486],
         [ 1.5389,  2.4849,  0.5061,  ..., -1.0854, -0.7117, -1.1092]],
        device='cuda:0'))

## LoRA-style model adaptation for (theoretically) any model from Hugging Face

In [6]:
from transformers import AutoConfig, AutoTokenizer, set_seed, AutoModel

# Checkpoint to adapt. Alternative (this one requires an API key):
# model_name = "meta-llama/Llama-2-7b"
model_name = "roberta-large"

# Load the pretrained backbone through the generic Auto* entry point.
model = AutoModel.from_pretrained(model_name)
# Print the parameter statistics before any adapter is attached; the value
# returned by param_stats becomes the cell output.
param_stats(model)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total parameters: 338.897M,
         trainable parameters: 338.897M (100.000%)


355359744

### Look up the specific layer names to adapt

In [7]:
# Display the module tree so the names of the sub-layers to adapt
# (e.g. `query`, `key`, `value`) can be read off the repr.
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      

In [8]:
from train_utils import init_monarch_layers

# Reload the settings from disk so this cell starts from the file's values;
# the context manager closes the file as soon as the JSON has been parsed.
with open("task_configs/roberta_glue/peft_config.json", "r") as config_file:
    peft_config = json.load(config_file)
# Name the layers that should receive Monarch adapters.
peft_config['layers_to_adapt'] = ["query", "key", "value"]
# Attach the adapters to `model`; the call logs each adapted layer.
# NOTE(review): this presumably modifies `model` in place -- re-running the cell
# on an already adapted model may not be safe; confirm in train_utils.
init_monarch_layers(model, peft_config)
# Print the parameter statistics after adaptation; the value returned by
# param_stats becomes the cell output.
param_stats(model)

Adapted key (1024, 1024) with monarch layers: torch.Size([4, 4, 256]), torch.Size([4, 256, 4])
Adapted value (1024, 1024) with monarch layers: torch.Size([4, 4, 256]), torch.Size([4, 256, 4])
Adapted query (1024, 1024) with monarch layers: torch.Size([4, 4, 256]), torch.Size([4, 256, 4])
Total parameters: 339.460M,
         trainable parameters: 0.633M (0.186%)


663552

## Want to see which layers are adapted? 🥹

In [9]:
# List every trainable parameter of the adapted model (name, size in millions,
# shape); the value returned by param_stats becomes the cell output.
param_stats(model, print_trainable=True)

encoder.layer.0.attention.self.query.bias : 0.0010M, torch.Size([1024])
encoder.layer.0.attention.self.query.blkdiag1 : 0.0039M, torch.Size([4, 4, 256])
encoder.layer.0.attention.self.query.blkdiag2 : 0.0039M, torch.Size([4, 256, 4])
encoder.layer.0.attention.self.key.bias : 0.0010M, torch.Size([1024])
encoder.layer.0.attention.self.key.blkdiag1 : 0.0039M, torch.Size([4, 4, 256])
encoder.layer.0.attention.self.key.blkdiag2 : 0.0039M, torch.Size([4, 256, 4])
encoder.layer.0.attention.self.value.bias : 0.0010M, torch.Size([1024])
encoder.layer.0.attention.self.value.blkdiag1 : 0.0039M, torch.Size([4, 4, 256])
encoder.layer.0.attention.self.value.blkdiag2 : 0.0039M, torch.Size([4, 256, 4])
encoder.layer.1.attention.self.query.bias : 0.0010M, torch.Size([1024])
encoder.layer.1.attention.self.query.blkdiag1 : 0.0039M, torch.Size([4, 4, 256])
encoder.layer.1.attention.self.query.blkdiag2 : 0.0039M, torch.Size([4, 256, 4])
encoder.layer.1.attention.self.key.bias : 0.0010M, torch.Size([1024])


663552