In [None]:
import torch
# Print the installed PyTorch version string as a quick environment check.
print(torch.__version__)

2.8.0+cu126


In [None]:
# Report the name of the current CUDA device, or a fixed notice when PyTorch
# cannot see any CUDA device.
print(
    torch.cuda.get_device_name()
    if torch.cuda.is_available()
    else "CUDA is not available"
)

CUDA is not available


In [4]:
from typing_extensions import Self
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass
from typing import Optional, Literal, Union
from safetensors.torch import save_file

class LoRA_Base_Layer():
    """Mixin that stores the LoRA hyper-parameters shared by adapted layers.

    Attributes set by ``__init__``:
        rank: Rank of the low-rank update.
        lora_alpha: Numerator of the scaling factor.
        lora_dropout: Callable applied to the input of the low-rank branch;
            an ``nn.Dropout`` when the probability is positive, otherwise an
            identity function.
        use_rslora: Whether the rank-stabilised scaling is used.
        scaling: ``lora_alpha / sqrt(rank)`` when ``use_rslora`` is true,
            otherwise ``lora_alpha / rank``.
    """

    def __init__(self,
                 rank=8,
                 lora_alpha=8,
                 lora_dropout=0.0,
                 use_rslora=True):
        """Store the hyper-parameters and derive the scaling factor.

        Args:
            rank: Rank of the low-rank update; must be positive.
            lora_alpha: Scaling numerator.
            lora_dropout: Dropout probability for the low-rank branch input.
            use_rslora: Divide ``lora_alpha`` by ``sqrt(rank)`` instead of
                ``rank``.

        Raises:
            ValueError: If ``rank`` is not positive.
        """
        # Fail early with a clear message instead of a ZeroDivisionError (or a
        # complex-valued scale for negative ranks) further down.
        if rank <= 0:
            raise ValueError(f"rank must be a positive integer, got {rank}")

        self.rank = rank
        self.lora_alpha = lora_alpha
        self.lora_dropout = nn.Dropout(lora_dropout) if lora_dropout > 0 else lambda x: x
        self.use_rslora = use_rslora

        # Must be stored on the instance: the layers built on this mixin read
        # ``self.scaling`` when applying or merging the low-rank update.
        if use_rslora:
            self.scaling = self.lora_alpha / self.rank ** 0.5
        else:
            self.scaling = self.lora_alpha / self.rank

    def _load_pretrained_weights(self, state_dict):
        """Overwrite this layer's ``weight`` (and ``bias``) data from a mapping.

        Args:
            state_dict: Mapping with a ``"weight"`` tensor and, optionally, a
                ``"bias"`` tensor.
        """
        self.weight.data = state_dict["weight"]
        if "bias" in state_dict:
            self.bias.data = state_dict["bias"]

class LoRA_Linear_Layer(nn.Linear, LoRA_Base_Layer):
    """``nn.Linear`` with a frozen weight and a trainable low-rank update.

    The layer output is ``F.linear(x, W, b) + scaling * (dropout(x) @ A @ B)``
    where ``A`` has shape ``(in_features, rank)`` and ``B`` has shape
    ``(rank, out_features)``. ``B`` starts at zero, so a freshly built layer
    produces the same output as the plain linear layer.
    """

    def __init__(self,
                 in_features,
                 out_features,
                 bias=False, rank=8,
                 lora_alpha=8,
                 lora_dropout=0.0,
                 use_rslora=True,
                 **kwargs):
        """Build the base linear layer and attach the LoRA factors.

        Args:
            in_features: Size of each input sample.
            out_features: Size of each output sample.
            bias: Whether the base linear layer has a bias term.
            rank: Rank of the low-rank update.
            lora_alpha: Scaling numerator for the update.
            lora_dropout: Dropout probability applied to the LoRA branch input.
            use_rslora: Use ``lora_alpha / sqrt(rank)`` as the scale.
            **kwargs: Forwarded to ``nn.Linear.__init__`` (e.g. ``device``,
                ``dtype``).
        """
        nn.Linear.__init__(self,
                           in_features,
                           out_features,
                           bias,
                           **kwargs)
        LoRA_Base_Layer.__init__(self, rank=rank,
                                 lora_alpha=lora_alpha,
                                 lora_dropout=lora_dropout,
                                 use_rslora=use_rslora)
        # Only the low-rank factors are meant to train; freeze the base weight.
        self.weight.requires_grad = False

        # Create the factors on the same device/dtype as the base weight so the
        # matmuls in ``forward`` do not hit a device or dtype mismatch.
        factory_kwargs = {"device": self.weight.device, "dtype": self.weight.dtype}
        self.LoRA_A = nn.Parameter(torch.zeros(in_features, rank, **factory_kwargs))
        self.LoRA_B = nn.Parameter(torch.zeros(rank, out_features, **factory_kwargs))

        # A is randomly initialised, B stays zero so the initial update is zero.
        nn.init.kaiming_uniform_(self.LoRA_A, a=math.sqrt(5))

    def merged_weights(self):
        """Return a plain ``nn.Linear`` with the LoRA update folded in.

        Returns:
            nn.Linear: A new layer whose weight is
            ``W + scaling * (A @ B).T`` and whose bias (if any) is a copy of
            this layer's bias.
        """
        with torch.no_grad():
            # A @ B is (in_features, out_features); nn.Linear stores its weight
            # as (out_features, in_features), hence the transpose.
            lora_delta = (self.LoRA_A @ self.LoRA_B).T * self.scaling
            merged_weight = self.weight.data + lora_delta

        state_dict = {"weight": merged_weight}
        # Only include a bias entry when the layer has one; a stray "bias" key
        # would be rejected by load_state_dict on a bias-free layer.
        if self.bias is not None:
            state_dict["bias"] = self.bias.data

        merged_linear = nn.Linear(self.in_features,
                                  self.out_features,
                                  bias=self.bias is not None,
                                  device=self.weight.device,
                                  dtype=self.weight.dtype)

        merged_linear.load_state_dict(state_dict)

        return merged_linear

    def forward(self, x):
        """Apply the frozen linear map plus the scaled low-rank update.

        Args:
            x: Input tensor whose last dimension is ``in_features``.

        Returns:
            Tensor whose last dimension is ``out_features``.
        """
        original_layer_out = F.linear(x, self.weight, bias=self.bias)
        # Multiply through the rank-sized bottleneck first; this avoids
        # materialising the full (in_features, out_features) product per call.
        lora_rank_out = (self.lora_dropout(x) @ self.LoRA_A) @ self.LoRA_B
        return original_layer_out + self.scaling * lora_rank_out


if __name__ == "__main__":
    # Demo setup: a 16 -> 32 LoRA linear layer with rank-2 factors.
    layer = LoRA_Linear_Layer(in_features=16, out_features=32, rank=2)

    # Random batch of 4 samples with 16 features each.
    rand = torch.rand((4, 16))

    print(rand)


Parameter containing:
tensor([[ 0.2750,  0.3382,  0.4056, -0.2825,  0.4470],
        [ 0.2311,  0.1802,  0.2980,  0.2500,  0.3069],
        [-0.3084, -0.1089, -0.4079, -0.2216,  0.1440],
        [-0.1434, -0.2686,  0.2684, -0.1192,  0.4258],
        [-0.0921, -0.0599, -0.2717, -0.2048, -0.1918]], requires_grad=True)
None
LoRA_Linear_Layer(in_features=5, out_features=5, bias=False)
