In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from time import time

class MyModelLoRA(nn.Module):
    def __init__(self, input_dims, hidden_dims, out_dims, rank, num_layers):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(input_dims if i == 0 else hidden_dims, hidden_dims) for i in range(num_layers)])
        self.output_layer = nn.Linear(hidden_dims, out_dims)
        self.rank = rank

        # Initialize LoRA matrices A and B with smaller scale
        self.A = nn.ParameterList([nn.Parameter(torch.randn(hidden_dims, rank) * 0.01) for _ in range(num_layers)])
        self.B = nn.ParameterList([nn.Parameter(torch.randn(rank, input_dims if i == 0 else hidden_dims) * 0.01) for i in range(num_layers)])
        
    def forward(self, x):
        for i, layer in enumerate(self.layers):
            modified_W = layer.weight + self.A[i] @ self.B[i]
            x = F.linear(x, modified_W, layer.bias)
            x = F.relu(x)
        x = self.output_layer(x)
        return x

# Model, optimizer, and loss function
model = MyModelLoRA(10000, 1000, 128, rank=30, num_layers=300).to('cuda')
for layer in model.layers:
    for param in layer.parameters():
        param.requires_grad = False
params_to_optimize = list(model.A.parameters()) + list(model.B.parameters()) + list(model.output_layer.parameters())
optimizer = torch.optim.AdamW(params_to_optimize, lr=1e-3)
loss_function = nn.MSELoss()

# Training parameters
epochs = 5
batch_size = 64
grad_clip = 1.0  # Gradient clipping threshold

# Training loop
for epoch in range(epochs):
    epoch_loss = 0
    start_time = time()

    for _ in range(100):  # Number of batches
        # Generate random data and labels
        data = torch.rand(batch_size, 10000).to('cuda')
        target = torch.rand(batch_size, 128).to('cuda')

        # Forward pass
        output = model(data)

        # Calculate loss
        loss = loss_function(output, target)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping
        nn.utils.clip_grad_norm_(params_to_optimize, grad_clip)

        optimizer.step()

        epoch_loss += loss.item()

    end_time = time()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/100:.4f}, Time: {end_time - start_time:.2f} seconds")

print("Training completed.")


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 10.92 GiB total capacity; 1.75 GiB already allocated; 22.19 MiB free; 1.76 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from time import time

class MyModelLoRA(nn.Module):
    def __init__(self, input_dims, hidden_dims, out_dims, rank, num_layers):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(input_dims if i == 0 else hidden_dims, hidden_dims) for i in range(num_layers)])
        self.output_layer = nn.Linear(hidden_dims, out_dims)
        self.rank = rank
        self.num_layers = num_layers

        # Initialize LoRA matrices A and B with smaller scale
        self.A = nn.ParameterList([nn.Parameter(torch.randn(hidden_dims, rank) * 0.01) for _ in range(num_layers)])
        self.B = nn.ParameterList([nn.Parameter(torch.randn(rank, input_dims if i == 0 else hidden_dims) * 0.01) for i in range(num_layers)])
        
    def forward(self, x):
        lora_start_layer = int(self.num_layers * 4 / 5)  # Start applying LoRA at this layer
        for i, layer in enumerate(self.layers):
            if i >= lora_start_layer:  # Apply LoRA only to the last 1/5 layers
                modified_W = layer.weight + self.A[i] @ self.B[i]
                x = F.linear(x, modified_W, layer.bias)
            else:
                x = layer(x)  # Regular linear layer
            x = F.relu(x)
        x = self.output_layer(x)
        return x

# Model, optimizer, and loss function
model = MyModelLoRA(10000, 1000, 128, rank=30, num_layers=300).to('cuda')
for layer in model.layers:
    for param in layer.parameters():
        param.requires_grad = False
params_to_optimize = list(model.A.parameters()) + list(model.B.parameters()) + list(model.output_layer.parameters())
optimizer = torch.optim.AdamW(params_to_optimize, lr=1e-3)
loss_function = nn.MSELoss()

# Training parameters
epochs = 5
batch_size = 64
grad_clip = 1.0  # Gradient clipping threshold

# Training loop
for epoch in range(epochs):
    epoch_loss = 0
    start_time = time()

    for _ in range(100):  # Number of batches
        # Generate random data and labels
        data = torch.rand(batch_size, 10000).to('cuda')
        target = torch.rand(batch_size, 128).to('cuda')

        # Forward pass
        output = model(data)

        # Calculate loss
        loss = loss_function(output, target)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping
        nn.utils.clip_grad_norm_(params_to_optimize, grad_clip)

        optimizer.step()

        epoch_loss += loss.item()

    end_time = time()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/100:.4f}, Time: {end_time - start_time:.2f} seconds")

print("Training completed.")


Epoch 1/5, Loss: 0.1064, Time: 3.61 seconds
Epoch 2/5, Loss: 0.0835, Time: 3.58 seconds
Epoch 3/5, Loss: 0.0836, Time: 3.56 seconds
Epoch 4/5, Loss: 0.0837, Time: 3.52 seconds
Epoch 5/5, Loss: 0.0837, Time: 3.61 seconds
Training completed.


In [4]:
nn.Parameter(torch.Tensor(3,4))

Parameter containing:
tensor([[ 2.6807e+15,  4.5794e-41,  1.1781e-10,  4.5796e-41],
        [ 2.6919e+29, -1.7085e-20,  2.6808e+15,  4.5794e-41],
        [ 1.1962e-10,  4.5796e-41, -1.3418e-19, -9.7663e-30]],
       requires_grad=True)

In [7]:
nn.Parameter(torch.Tensor(3,4))

Parameter containing:
tensor([[5.0188e-38, 0.0000e+00, 3.8804e-36, 0.0000e+00],
        [2.3873e+14, 4.5794e-41, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]], requires_grad=True)

# Lora Layer , with kaiming and down scale

In [None]:
class LoRALayer(nn.Module):
    def __init__(self, in_features, out_features, rank, alpha=1, scale=0.01):
        super(LoRALayer, self).__init__()
        self.rank = rank
        self.alpha = alpha

        self.W = nn.Linear(in_features, out_features, bias=False)  # Original weights
        self.lora_W1 = nn.Parameter(torch.Tensor(out_features, rank))  # Low-rank matrix W1
        self.lora_W2 = nn.Parameter(torch.Tensor(rank, in_features))  # Low-rank matrix W2

        nn.init.kaiming_uniform_(self.lora_W1, a=math.sqrt(5))
        nn.init.kaiming_uniform_(self.lora_W2, a=math.sqrt(5))

        # Scale down the LoRA matrices
        self.lora_W1.data *= scale
        self.lora_W2.data *= scale

    def forward(self, x):
        lora_adjustment = self.lora_W1 @ self.lora_W2
        W_lora = self.W.weight + self.alpha * lora_adjustment
        return F.linear(x, W_lora, self.W.bias)
