In [1]:
# You might wanna connect to docker env using the command below inside docker, 
# then select jupyter server http://127.0.0.1:5050/lab?token=  (default password is 'local')
# python3 -m jupyterlab --no-browser --ip=0.0.0.0 --port=5050 --allow-root --NotebookApp.token='local' --NotebookApp.password='local'
%load_ext autoreload
%autoreload 2
import sys
import os
sys.path.append("../")
os.chdir("../")
print(os.getcwd())
import torch
from src.layers.monarch_linear import MonarchLinear
import bitsandbytes

/data/wenxuan/sparse_matrix_fine_tuning


  warn(f"Failed to load image Python extension: {e}")


## Basic usage 

In [2]:
import json
from train_utils import param_stats

# Read the PEFT settings. The context manager closes the file handle as soon as
# parsing finishes instead of leaving it open until garbage collection.
with open("task_configs/monarch_roberta_glue/peft_config.json", "r") as config_file:
    peft_config = json.load(config_file)

# Build a single 1024 -> 1024 Monarch layer from that config.
monarch = MonarchLinear(in_features=1024, out_features=1024, peft_config=peft_config, as_adapter=False)

# Push a random batch of 16 vectors through the layer and report the shapes of
# the output and of the two block-diagonal factors.
x = torch.randn(16, 1024, device="cuda")
print("out.shape:", monarch(x).shape)
print("monarch factor shape (nblocks, block rank, block size): ", monarch.blkdiag1.shape, monarch.blkdiag2.shape)
param_stats(monarch)
# Last expression: show the layer's repr as the cell output.
monarch

out.shape: torch.Size([16, 1024])
monarch factor shape (nblocks, block rank, block size):  torch.Size([4, 4, 256]) torch.Size([4, 256, 4])
Total parameters: 0.009M,
         trainable parameters: 0.009M (100.000%)


MonarchLinear(in_features=1024, out_features=1024, nblocks=4, requires_grad=True)

### Setting layer config

In [3]:
# Summarise the options that decide which layers get adapted and how the
# Monarch blocks are shaped.
print("Names of exact layers to adapt with Monarch: ", peft_config["target_modules"])
print("Can also modify the below options:")
# NOTE(review): the label now reads "queries and values" to match the option
# name `q_v`; confirm against how train_utils consumes this flag.
print("adapt queries and values only: ", peft_config["q_v"])
print("adapt mlp:", peft_config["mlp"])
print("making block rank = block size:", peft_config["square"])
# Can safely ignore other settings
# Last expression: display the full config dict as the cell output.
peft_config

Names of exact layers to adapt with Monarch:  ['query', 'value', 'key']
Can also modify the below options:
adapt querys and keys only:  False
adapt mlp: False
making block rank = block size: False


{'monarch': True,
 'square': False,
 'nblocks': 4,
 'blk_r': 4,
 'blk_sz': None,
 'target_modules': ['query', 'value', 'key'],
 'q_v': False,
 'adapter': True,
 'scaler': False,
 'layernorm': True,
 'large_lr': False,
 'new_lr': 0.005,
 'scaler_type': 'scaler',
 'from_lora': '',
 'mlp': False,
 'lora_style_init': False,
 'use_mult_factor': False,
 'affine': False}

In [4]:
# Override block rank and block size on a copy so the shared `peft_config`
# (already displayed by an earlier cell) is not silently changed underneath it.
custom_peft_config = {**peft_config, "blk_r": 8, "blk_sz": 512}
monarch = MonarchLinear(in_features=1024, out_features=1024, peft_config=custom_peft_config)
print("monarch factor shape:", monarch.blkdiag1.shape, monarch.blkdiag2.shape)

monarch factor shape: torch.Size([2, 8, 512]) torch.Size([2, 512, 8])


## Dense matrix approximation

In [22]:
# Project a dense matrix onto two monarch factors using SVD.
from src.ops.blockdiag_butterfly_einsum import blockdiag_butterfly_project_einsum_rank, blockdiag_butterfly_multiply_einsum_rank
from src.ops.blockdiag_butterfly_multiply import blockdiag_butterfly_multiply

torch.random.manual_seed(0)
dim = 768
nblocks = 4
rank = 8
weights = torch.randn(dim, dim, device="cuda")
identity = torch.eye(dim, device="cuda")

# With reverse=True the projection hands back four factors: the usual pair plus
# a second ("rev") pair.
factors = blockdiag_butterfly_project_einsum_rank(weights.T, nblocks, nblocks, rank, reverse=True)
blkdiag1, blkdiag2, rev1, rev2 = factors
print(blkdiag1.shape, blkdiag2.shape, rev1.shape, rev2.shape)

# Feeding the identity through a factor pair presumably yields the dense matrix
# that pair represents. Display the residual of the first pair next to the
# product of the "rev" pair so the two can be compared by eye.
residual = weights - blockdiag_butterfly_multiply(identity, blkdiag1, blkdiag2)
rev_dense = blockdiag_butterfly_multiply(identity, rev1, rev2)
residual, rev_dense

# We don't use it for our setup, but if curious check blockdiag_butterfly_project_einsum_rank
# from torch.testing import assert_allclose
# from copy import deepcopy
# m, n = 1024, 512
# weights = torch.randn(m, n, device="cuda")
# monarch = MonarchLinear(in_features=n, out_features=m, weights=weights, peft_config=peft_config)
# x = torch.eye(n, device="cuda")
# assert_allclose(monarch(x),  x @ weights.T )


torch.Size([4, 32, 192]) torch.Size([4, 192, 32]) torch.Size([4, 736, 192]) torch.Size([4, 192, 736])


(tensor([[-0.7431, -0.2139, -2.8119,  ..., -0.8477, -1.4322,  0.1578],
         [-1.3386, -0.3959,  0.5944,  ..., -0.1258, -0.6600, -0.4448],
         [-1.7184, -0.6263, -0.5038,  ...,  1.3683,  0.0636, -0.1049],
         ...,
         [ 1.6679,  0.8312,  0.0915,  ..., -0.5639, -0.1153,  0.7217],
         [ 1.5410, -0.8582,  0.1301,  ..., -1.2981, -0.1931,  0.7246],
         [ 0.8562, -1.1198, -0.3811,  ..., -0.1794, -0.0373,  1.6991]],
        device='cuda:0'),
 tensor([[-0.7432, -0.2139, -2.8120,  ..., -0.8478, -1.4323,  0.1578],
         [-1.3386, -0.3960,  0.5943,  ..., -0.1257, -0.6601, -0.4448],
         [-1.7184, -0.6263, -0.5038,  ...,  1.3683,  0.0636, -0.1050],
         ...,
         [ 1.6680,  0.8312,  0.0915,  ..., -0.5639, -0.1153,  0.7217],
         [ 1.5410, -0.8582,  0.1301,  ..., -1.2982, -0.1931,  0.7247],
         [ 0.8563, -1.1199, -0.3812,  ..., -0.1794, -0.0373,  1.6991]],
        device='cuda:0'))

In [6]:
# Split a random matrix into its leading rank-`rank` SVD part and the remainder
# built from the trailing singular triplets.
rand = torch.randn(100, 100)
U, S, Vt = torch.linalg.svd(rand)
rank = 10
head, tail = slice(None, rank), slice(rank, None)
principal = U[:, head] @ torch.diag(S[head]) @ Vt[head, :]
least = U[:, tail] @ torch.diag(S[tail]) @ Vt[tail, :]
# Show what is left after removing the principal part beside the trailing part.
rand - principal, least


(tensor([[-1.0965, -1.2275, -0.1063,  ..., -0.1150,  0.0686,  0.6476],
         [-0.0931,  0.2505, -0.5180,  ...,  0.6185,  0.9747, -0.2307],
         [-0.0970,  0.0775,  0.7878,  ...,  1.5981, -0.3175,  1.6818],
         ...,
         [-1.1939,  0.2523,  1.0238,  ..., -1.1577,  0.3693, -0.7117],
         [ 1.0135, -1.5772,  1.6735,  ...,  0.0610, -0.1385, -0.3062],
         [ 0.7105, -0.2479,  0.6746,  ..., -0.1854,  0.4084, -0.5091]]),
 tensor([[-1.0965, -1.2275, -0.1063,  ..., -0.1150,  0.0686,  0.6476],
         [-0.0931,  0.2505, -0.5180,  ...,  0.6185,  0.9747, -0.2307],
         [-0.0970,  0.0775,  0.7878,  ...,  1.5981, -0.3175,  1.6818],
         ...,
         [-1.1939,  0.2523,  1.0238,  ..., -1.1577,  0.3693, -0.7117],
         [ 1.0135, -1.5772,  1.6735,  ...,  0.0610, -0.1385, -0.3062],
         [ 0.7105, -0.2479,  0.6746,  ..., -0.1854,  0.4084, -0.5091]]))

## LoRA-style model adaptation for (theoretically) any model from Hugging Face

In [7]:
from transformers import AutoConfig, AutoTokenizer, set_seed, AutoModel

# Checkpoint to adapt; swap in the commented alternative to try another model.
model_name = "roberta-large"
# model_name = "meta-llama/Llama-2-7b"  # This one requires an API key
model = AutoModel.from_pretrained(model_name)
# Report parameter counts before any adaptation is applied.
param_stats(model)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total parameters: 338.897M,
         trainable parameters: 338.897M (100.000%)


355359744

### Look up the specific layer names to adapt

In [8]:
# Display the module tree; the layer names it shows (e.g. query, key, value)
# are what peft_config["target_modules"] refers to.
model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      

In [9]:
from train_utils import init_monarch_layers

# Reload the config from disk so this cell starts from the stored settings. The
# context manager closes the file handle as soon as parsing finishes.
with open("task_configs/monarch_roberta_glue/peft_config.json", "r") as config_file:
    peft_config = json.load(config_file)
# Layer names to adapt, taken from the module tree displayed for the model.
peft_config['target_modules'] = ["query", "key", "value"]
init_monarch_layers(model, peft_config)
# Report parameter counts again to see how many remain trainable after adaptation.
param_stats(model)

Adapted query (1024, 1024) with monarch layers: torch.Size([4, 4, 256]), torch.Size([4, 256, 4])
Adapted key (1024, 1024) with monarch layers: torch.Size([4, 4, 256]), torch.Size([4, 256, 4])
Adapted value (1024, 1024) with monarch layers: torch.Size([4, 4, 256]), torch.Size([4, 256, 4])
Total parameters: 339.460M,
         trainable parameters: 0.633M (0.186%)


663552

## Wanna see what layers are adapted? 🥹

In [10]:
# With print_trainable=True the recorded output lists each trainable parameter
# by name together with its size and shape.
param_stats(model, print_trainable=True)

encoder.layer.0.attention.self.query.bias : 0.0010M, torch.Size([1024])
encoder.layer.0.attention.self.query.blkdiag1 : 0.0039M, torch.Size([4, 4, 256])
encoder.layer.0.attention.self.query.blkdiag2 : 0.0039M, torch.Size([4, 256, 4])
encoder.layer.0.attention.self.key.bias : 0.0010M, torch.Size([1024])
encoder.layer.0.attention.self.key.blkdiag1 : 0.0039M, torch.Size([4, 4, 256])
encoder.layer.0.attention.self.key.blkdiag2 : 0.0039M, torch.Size([4, 256, 4])
encoder.layer.0.attention.self.value.bias : 0.0010M, torch.Size([1024])
encoder.layer.0.attention.self.value.blkdiag1 : 0.0039M, torch.Size([4, 4, 256])
encoder.layer.0.attention.self.value.blkdiag2 : 0.0039M, torch.Size([4, 256, 4])
encoder.layer.1.attention.self.query.bias : 0.0010M, torch.Size([1024])
encoder.layer.1.attention.self.query.blkdiag1 : 0.0039M, torch.Size([4, 4, 256])
encoder.layer.1.attention.self.query.blkdiag2 : 0.0039M, torch.Size([4, 256, 4])
encoder.layer.1.attention.self.key.bias : 0.0010M, torch.Size([1024])


663552