In [1]:
import torch
import torch.nn as nn
from functools import partial

from minsara import SaRAParametrization,add_sara, apply_to_sara, disable_sara, enable_sara, get_sara_params, merge_sara, name_is_sara, remove_sara,get_sara_state_dict
# _ = torch.set_grad_enabled(False)


  from .autonotebook import tqdm as notebook_tqdm


# a simple model

In [2]:
class MyModel(nn.Module):
    def __init__(self):
        super().__init__()
        # 正确地将模型定义为类的属性
        self.model = nn.Sequential(
            nn.Linear(in_features=15, out_features=15),
            # nn.ReLU(),  # 可选：添加一个非线性激活层以提升模型的表达能力
            # nn.Linear(in_features=70, out_features=),
        )

    def forward(self, x):
        # 定义前向传播
        return self.model(x)

    # def __repr__(self):
    #     # 返回模型的简化字符串表示
    #     return "<MyModel with 2 layers>"


In [3]:
model = MyModel()

x = torch.randn(1, 15)
print("The RANDOM x",x)
y = model(x)
print("original y is",y) # original y is tensor([[ 0.1539, -0.4083, -0.3811]])
# Y0 = y


The RANDOM x tensor([[ 0.0924, -0.8733,  0.7537, -1.7091,  2.2377, -0.9350, -0.3164,  0.7776,
         -0.2826,  0.3074,  0.3646, -0.7074, -2.3209, -0.3096, -1.8750]])
original y is tensor([[-1.6596,  0.8482,  0.1055,  0.6464, -0.0511,  0.8792,  0.7584,  1.3777,
          0.5091, -0.3930, -0.0987, -0.7105,  0.0673, -1.3950,  0.4987]],
       grad_fn=<AddmmBackward0>)


In [4]:
sara_config = {
    nn.Linear: {
        "weight": partial(SaRAParametrization.from_linear, rank=15),
    },
}


In [5]:
def print_vector_parameters(model):
    r"""
    Returns the number of trainable parameters and number of all parameters in the model.
    """
    trainable_params = 0
    vector_params = 0
    all_param = 0
    for n, param in model.named_parameters():
        num_params = param.numel()
        all_param += num_params
        if 'original_module' in n:
            continue
        if param.requires_grad:
            trainable_params += num_params
            # if "lora_" not in n:
            #     print(n)
            if "vector_z" in n:
                vector_params += num_params
    print(
        f"vector params: {vector_params:,d} || trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param}"
    )
    return vector_params


# add sara to the model

In [6]:
add_sara(model, sara_config=sara_config)
# model.eval()
# 遍历模型的所有参数
# 冻结所有层的梯度
print_vector_parameters(model)
for param in model.parameters():
    param.requires_grad = False
# 假设我们有一个特定的层 layer_to_train，
# 我们想要为它启用梯度
for param in get_sara_params(model):
    param.requires_grad = True
print_vector_parameters(model)
    
# for name, param in model.named_parameters():
#     print(f"{name} : {param.requires_grad}")
# for name, param in model.named_parameters():
#     if param.requires_grad:
#         print(f"{name} 是可训练的: {param.requires_grad}")
        
# parameters = [
#     {"params": list(get_sara_params(model))},
# ]
# print(parameters)
# exit()


vector params: 15 || trainable params: 705 || all params: 705 || trainable%: 100.0
vector params: 15 || trainable params: 15 || all params: 705 || trainable%: 2.127659574468085


15

In [7]:
y = model(x)
print("y after add sara",y) # y after add lora tensor([[ 0.2840, -0.3440, -0.4243]])
print(model)  # <MyModel with 2 layers>


y after add sara tensor([[-1.6596,  0.8482,  0.1055,  0.6464, -0.0511,  0.8792,  0.7584,  1.3777,
          0.5091, -0.3930, -0.0987, -0.7105,  0.0673, -1.3950,  0.4987]],
       grad_fn=<AddmmBackward0>)
MyModel(
  (model): Sequential(
    (0): ParametrizedLinear(
      in_features=15, out_features=15, bias=True
      (parametrizations): ModuleDict(
        (weight): ParametrizationList(
          (0): SaRAParametrization()
        )
      )
    )
  )
)


## get the sara params

In [8]:
aaa = get_sara_params(model, print_shapes=True)

for item in aaa:
    # print the trainable params
    print(item)


model.0.parametrizations.weight.0.vector_z torch.Size([15])
Parameter containing:
tensor([2.2514, 1.9614, 1.6007, 1.5082, 1.3850, 1.2004, 1.1712, 0.9489, 0.7911,
        0.7053, 0.6024, 0.4700, 0.3626, 0.2641, 0.0294], requires_grad=True)


# now let's try to disable sara, the output is the same as before sara is added

In [9]:
# now let's try to disable sara, the output is the same as before sara is added
disable_sara(model)
y = model(x)
# assert torch.allclose(y, Y0)
print("y after disable sara",y) #y after disable sara tensor([[ 0.1539, -0.4083, -0.3811]])


y after disable sara tensor([[-1.6596,  0.8482,  0.1055,  0.6464, -0.0511,  0.8792,  0.7584,  1.3777,
          0.5091, -0.3930, -0.0987, -0.7105,  0.0673, -1.3950,  0.4987]])


# enable sara again

In [10]:

enable_sara(model)
y = model(x)
# assert torch.allclose(y, Y1)
print("enable_sara again",y) # enable_sara again tensor([[ 0.2840, -0.3440, -0.4243]])


enable_sara again tensor([[-1.6596,  0.8482,  0.1055,  0.6464, -0.0511,  0.8792,  0.7584,  1.3777,
          0.5091, -0.3930, -0.0987, -0.7105,  0.0673, -1.3950,  0.4987]],
       grad_fn=<AddmmBackward0>)


# let's save the state dict for later use

In [11]:

state_dict_to_save = get_sara_state_dict(model)
state_dict_to_save.keys()


dict_keys(['model.0.parametrizations.weight.0.vector_z'])

In [12]:
print(model)


MyModel(
  (model): Sequential(
    (0): ParametrizedLinear(
      in_features=15, out_features=15, bias=True
      (parametrizations): ModuleDict(
        (weight): ParametrizationList(
          (0): SaRAParametrization()
        )
      )
    )
  )
)


# you can remove sara from the model

In [13]:

remove_sara(model)


In [14]:
print(model)


MyModel(
  (model): Sequential(
    (0): Linear(in_features=15, out_features=15, bias=True)
  )
)


In [15]:
state_dict_to_save


{'model.0.parametrizations.weight.0.vector_z': tensor([2.2514, 1.9614, 1.6007, 1.5082, 1.3850, 1.2004, 1.1712, 0.9489, 0.7911,
         0.7053, 0.6024, 0.4700, 0.3626, 0.2641, 0.0294])}

In [16]:
# # 假设 'model' 是您的 MyModel 实例
# # Sequential 容器是通过 'model.model' 访问的
# # ParametrizedLinear 层是 Sequential 容器的第一个模块

# # 首先获取 Sequential 容器内的 ParametrizedLinear 实例
# parametrized_linear_layer = model.model[0]

# # 现在，parametrized_linear_layer 是 ParametrizedLinear 的一个实例
# # 您可以直接从中访问 parametrizations 属性
# # print(model)
# # from labml.logger import inspect
# # inspect(model)
# # inspect(parametrized_linear_layer.parametrizations)

# import pysnooper
# with pysnooper.snoop():
#     parametrization = parametrized_linear_layer.parametrizations['weight'][0]
#     vector_z = parametrization.vector_z

# print("first_submodule_parametrizations.weight[0].vector_z", vector_z)
for name, param in model.state_dict().items():
    print(name, param.size())


model.0.bias torch.Size([15])
model.0.weight torch.Size([15, 15])


In [17]:

from torchkeras import summary
summary(model, input_shape=(15,))


--------------------------------------------------------------------------
Layer (type)                            Output Shape              Param #
Linear-1                                    [-1, 15]                  240
Total params: 240
Trainable params: 0
Non-trainable params: 240
--------------------------------------------------------------------------
Input size (MB): 0.000057
Forward/backward pass size (MB): 0.000114
Params size (MB): 0.000916
Estimated Total Size (MB): 0.001087
--------------------------------------------------------------------------




In [18]:
# lets try to load the sara back
# first we need to add sara to the model
add_sara(model, sara_config=sara_config)

print(model)
# then we can load the sara parameters
# strict=False is needed because we are loading a subset of the parameters
_ = model.load_state_dict(state_dict_to_save, strict=False) 
# y = model(x)
# print("add sara again after remove sara",y) # add sara again after remove sara tensor([[ 0.2840, -0.3440, -0.4243]])
# assert torch.allclose(y, Y1)
for name, param in model.state_dict().items():
    print(name, param.size())


MyModel(
  (model): Sequential(
    (0): ParametrizedLinear(
      in_features=15, out_features=15, bias=True
      (parametrizations): ModuleDict(
        (weight): ParametrizationList(
          (0): SaRAParametrization()
        )
      )
    )
  )
)
model.0.bias torch.Size([15])
model.0.parametrizations.weight.original torch.Size([15, 15])
model.0.parametrizations.weight.0.lora_A torch.Size([15, 15])
model.0.parametrizations.weight.0.lora_B torch.Size([15, 15])
model.0.parametrizations.weight.0.vector_z torch.Size([15])
model.0.parametrizations.weight.0.lora_dropout_mask torch.Size([1, 15])


In [19]:
# we can merge it to make it a normal linear layer, so there is no overhead for inference
merge_sara(model)
# y = model(x)
# print("after merge the sara",y) # after merge the sara tensor([[ 0.2840, -0.3440, -0.4243]])
# assert torch.allclose(y, Y1)
for name, param in model.state_dict().items():
    print(name, param.size())


model.0.bias torch.Size([15])
model.0.weight torch.Size([15, 15])


# model now has no sara parameters

In [20]:

model


MyModel(
  (model): Sequential(
    (0): Linear(in_features=15, out_features=15, bias=True)
  )
)

## Training a model

In [21]:
model = torch.nn.Linear(in_features=30, out_features=30)
# Step 1: Add sara to the model
add_sara(model)

# Step 2: Collect the parameters, pass them to the optimizer

parameters = [
    {"params": list(get_sara_params(model))},
]

optimizer = torch.optim.AdamW(parameters, lr=1e-3)

# Step 3: Train the model
# for _ in range(100):
#     x = torch.randn(1, 30)
#     y = model(x)
#     loss = y.sum()
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()
# ...

# simulate training, update the sara parameters
model.apply(apply_to_sara(lambda x: torch.nn.init.normal_(x.lora_A)))
model.apply(apply_to_sara(lambda x: torch.nn.init.normal_(x.lora_B)))

# Step 4: export the sara parameters
state_dict = model.state_dict()
sara_state_dict = {k: v for k, v in state_dict.items() if name_is_sara(k)}


AssertionError: (torch.Size([30, 30]), 768)

## Loading and Inferencing with sara

In [None]:
# Step 1: Add sara to your model
add_sara(model)

# Step 2: Load the sara parameters
_ = model.load_state_dict(sara_state_dict, strict=False)

# Step 3: Merge the sara parameters into the model
merge_sara(model)


## Inferencing with multiple sara models

In [None]:
# # to avoid re-adding sara to the model when rerun the cell, remove sara first 
# remove_sara(model)
# # Step 1: Add sara to your model
# add_sara(model)

# # Step 2: Load the sara parameters

# # fake 3 sets of sara parameters
# sara_state_dict_0 = sara_state_dict
# sara_state_dict_1 = {k: torch.ones_like(v) for k, v in sara_state_dict.items()}
# sara_state_dict_2 = {k: torch.zeros_like(v) for k, v in sara_state_dict.items()}
# sara_state_dicts = [sara_state_dict_0, sara_state_dict_1, sara_state_dict_2]

# load_multiple_sara(model, sara_state_dicts)

# # Step 3: Select which sara to use at inference time
# Y0 = select_sara(model, 0)(x)
# Y1 = select_sara(model, 1)(x)
# Y2 = select_sara(model, 2)(x)


In [None]:
# Y0, Y1, Y2


In [None]:
# remove_sara(model)
# init_state_dict = model.state_dict()
# # verify that it's the same as if we load the sara parameters one by one
# for state_dict in sara_state_dicts:
#     remove_sara(model)
#     _ = model.load_state_dict(init_state_dict, strict=False)
#     add_sara(model)
#     _ = model.load_state_dict(state_dict, strict=False)
#     merge_sara(model)
#     y = model(x)
#     print(y)


In [None]:
# class Test():
#     def __init__(self,num=1, layer=None):
#         # self.layer = layer
#         # for arg in args:
#             # print(arg)
#         self.layer = layer
#         self.num = num
# layer = torch.nn.Sequential(
#             torch.nn.Linear(5, 3),
#             torch.nn.ReLU(),
#             torch.nn.Linear(3, 3),
#             torch.nn.ReLU()
# )
# test = Test(1,layer=layer)
# # print(test)        
# # print(test.num) # 1
# # print(test.layer) # Linear(in_features=5, out_features=3, bias=True)
# # inspect(test.layer)
# print(test.layer[0])
# print(test.layer[0].weight)
# # print(test.layer[0]) 
# # print(test.layer[0].weight) 
# """Parameter containing:
# tensor([[-0.1163,  0.1544,  0.0566, -0.2275,  0.4066],
#         [-0.0287, -0.3928,  0.2575, -0.1188, -0.0773],
#         [-0.0870, -0.2780,  0.2427,  0.0463, -0.0287]], requires_grad=True)"""
        
# # print(test.layer.weight.shape) # torch.Size([3, 5])

# # print(test.layer.weight.dtype) # torch.float32

