In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from time import time

class MyModelLoRA(nn.Module):
    """MLP whose hidden linear layers each carry an additive low-rank (LoRA) weight update.

    Hidden layer ``i`` computes ``relu(x @ (W_i + A_i @ B_i).T + b_i)``, where ``W_i`` and
    ``b_i`` are the weight and bias of ``self.layers[i]``, ``A_i = self.A[i]`` has shape
    ``(hidden_dims, rank)`` and ``B_i = self.B[i]`` has shape ``(rank, in_features_i)``.
    ``self.output_layer`` then maps ``hidden_dims`` to ``out_dims`` with no activation.

    The class does not freeze any parameter itself; the caller chooses what to train.

    Args:
        input_dims: Feature size of the input (``in_features`` of the first layer).
        hidden_dims: Width of every hidden layer.
        out_dims: Feature size of the output.
        rank: Inner dimension of each ``A_i @ B_i`` product.
        num_layers: Number of hidden ``nn.Linear`` layers.
    """

    def __init__(self, input_dims, hidden_dims, out_dims, rank, num_layers):
        super().__init__()
        self.rank = rank

        # Only the first hidden layer reads the raw input; the rest read hidden activations.
        in_features = [input_dims if i == 0 else hidden_dims for i in range(num_layers)]

        self.layers = nn.ModuleList([nn.Linear(width, hidden_dims) for width in in_features])
        self.output_layer = nn.Linear(hidden_dims, out_dims)

        # Initialize LoRA matrices A and B with smaller scale (standard normal * 0.01).
        self.A = nn.ParameterList([nn.Parameter(torch.randn(hidden_dims, rank) * 0.01) for _ in in_features])
        self.B = nn.ParameterList([nn.Parameter(torch.randn(rank, width) * 0.01) for width in in_features])

    def forward(self, x):
        """Run the network on ``x``.

        Args:
            x: Tensor whose last dimension is ``input_dims``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and last dimension ``out_dims``.
        """
        for layer, lora_a, lora_b in zip(self.layers, self.A, self.B):
            # x @ (W + A @ B).T == x @ W.T + (x @ B.T) @ A.T, so the update is applied
            # through two thin matmuls instead of building a dense
            # (hidden_dims x in_features) matrix on every call.
            low_rank = F.linear(F.linear(x, lora_b), lora_a)
            x = F.relu(layer(x) + low_rank)
        return self.output_layer(x)

# Model, optimizer, and loss function
# Use the GPU when one is available; otherwise fall back to the CPU so the cell still runs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_dims = 10   # feature size of the synthetic inputs (and of the first layer)
out_dims = 128    # feature size of the synthetic targets (and of the output layer)
model = MyModelLoRA(input_dims, 1000, out_dims, rank=30, num_layers=32).to(device)

# Freeze the base linear layers; only the LoRA factors and the output layer are trained.
for layer in model.layers:
    layer.requires_grad_(False)
params_to_optimize = list(model.A.parameters()) + list(model.B.parameters()) + list(model.output_layer.parameters())
optimizer = torch.optim.AdamW(params_to_optimize, lr=1e-3)
loss_function = nn.MSELoss()

# Training parameters
epochs = 5
batch_size = 64
batches_per_epoch = 100
grad_clip = 1.0  # Gradient clipping threshold

# Training loop
for epoch in range(epochs):
    epoch_loss = 0
    start_time = time()

    for _ in range(batches_per_epoch):
        # Generate random data and labels directly on the training device
        # (avoids allocating on the CPU and copying every batch).
        data = torch.rand(batch_size, input_dims, device=device)
        target = torch.rand(batch_size, out_dims, device=device)

        # Forward pass
        output = model(data)

        # Calculate loss
        loss = loss_function(output, target)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping
        nn.utils.clip_grad_norm_(params_to_optimize, grad_clip)

        optimizer.step()

        # .item() copies the scalar to the host, so it also waits for the queued GPU work;
        # the wall-clock time below therefore covers the whole step.
        epoch_loss += loss.item()

    end_time = time()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/batches_per_epoch:.4f}, Time: {end_time - start_time:.2f} seconds")

print("Training completed.")


Epoch 1/5, Loss: 0.1050, Time: 0.93 seconds
Epoch 2/5, Loss: 0.0837, Time: 0.94 seconds
Epoch 3/5, Loss: 0.0835, Time: 0.94 seconds
Epoch 4/5, Loss: 0.0837, Time: 0.91 seconds
Epoch 5/5, Loss: 0.0835, Time: 0.91 seconds
Training completed.


In [15]:
# Print the name of every entry in the model's state dict, one per line.
# NOTE(review): this cell originally ended in an unfinished `if name in` filter and did not
# parse; printing every name matches the recorded output — confirm whether a filter was intended.
for name, param in model.state_dict().items():
    print(name)

layers.0.weight
layers.0.bias
layers.1.weight
layers.1.bias
layers.2.weight
layers.2.bias
layers.3.weight
layers.3.bias
layers.4.weight
layers.4.bias
layers.5.weight
layers.5.bias
layers.6.weight
layers.6.bias
layers.7.weight
layers.7.bias
layers.8.weight
layers.8.bias
layers.9.weight
layers.9.bias
layers.10.weight
layers.10.bias
layers.11.weight
layers.11.bias
layers.12.weight
layers.12.bias
layers.13.weight
layers.13.bias
layers.14.weight
layers.14.bias
layers.15.weight
layers.15.bias
layers.16.weight
layers.16.bias
layers.17.weight
layers.17.bias
layers.18.weight
layers.18.bias
layers.19.weight
layers.19.bias
layers.20.weight
layers.20.bias
layers.21.weight
layers.21.bias
layers.22.weight
layers.22.bias
layers.23.weight
layers.23.bias
layers.24.weight
layers.24.bias
layers.25.weight
layers.25.bias
layers.26.weight
layers.26.bias
layers.27.weight
layers.27.bias
layers.28.weight
layers.28.bias
layers.29.weight
layers.29.bias
layers.30.weight
layers.30.bias
layers.31.weight
layers.31.b

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from time import time

class MyModelLoRA(nn.Module):
    """MLP in which only the last fifth of the hidden layers use a low-rank (LoRA) update.

    With ``start = num_layers * 4 // 5``, hidden layer ``i`` computes

    * ``relu(x @ W_i.T + b_i)`` for ``i < start``;
    * ``relu(x @ (W_i + A_i @ B_i).T + b_i)`` for ``i >= start``,

    where ``W_i``/``b_i`` belong to ``self.layers[i]``, ``A_i = self.A[i]`` has shape
    ``(hidden_dims, rank)`` and ``B_i = self.B[i]`` has shape ``(rank, in_features_i)``.
    ``self.output_layer`` then maps ``hidden_dims`` to ``out_dims`` with no activation.

    ``A`` and ``B`` hold one entry per hidden layer, including the layers below ``start``;
    those entries are never read by ``forward`` and therefore never receive gradients.
    The class does not freeze any parameter itself; the caller chooses what to train.

    Args:
        input_dims: Feature size of the input (``in_features`` of the first layer).
        hidden_dims: Width of every hidden layer.
        out_dims: Feature size of the output.
        rank: Inner dimension of each ``A_i @ B_i`` product.
        num_layers: Number of hidden ``nn.Linear`` layers.
    """

    def __init__(self, input_dims, hidden_dims, out_dims, rank, num_layers):
        super().__init__()
        self.rank = rank
        self.num_layers = num_layers

        # Only the first hidden layer reads the raw input; the rest read hidden activations.
        in_features = [input_dims if i == 0 else hidden_dims for i in range(num_layers)]

        self.layers = nn.ModuleList([nn.Linear(width, hidden_dims) for width in in_features])
        self.output_layer = nn.Linear(hidden_dims, out_dims)

        # Initialize LoRA matrices A and B with smaller scale (standard normal * 0.01).
        # One pair per layer is kept so parameter names line up with layer indices.
        self.A = nn.ParameterList([nn.Parameter(torch.randn(hidden_dims, rank) * 0.01) for _ in in_features])
        self.B = nn.ParameterList([nn.Parameter(torch.randn(rank, width) * 0.01) for width in in_features])

    def forward(self, x):
        """Run the network on ``x``.

        Args:
            x: Tensor whose last dimension is ``input_dims``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and last dimension ``out_dims``.
        """
        # First layer index that gets the LoRA update (the last 1/5 of the layers).
        lora_start_layer = (self.num_layers * 4) // 5
        for i, layer in enumerate(self.layers):
            hidden = layer(x)  # Regular linear layer: x @ W.T + b
            if i >= lora_start_layer:
                # x @ (W + A @ B).T == x @ W.T + (x @ B.T) @ A.T, so the update is applied
                # through two thin matmuls instead of building a dense
                # (hidden_dims x in_features) matrix on every call.
                hidden = hidden + F.linear(F.linear(x, self.B[i]), self.A[i])
            x = F.relu(hidden)
        return self.output_layer(x)

# Model, optimizer, and loss function
# Use the GPU when one is available; otherwise fall back to the CPU so the cell still runs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_dims = 10000  # feature size of the synthetic inputs (and of the first layer)
out_dims = 128      # feature size of the synthetic targets (and of the output layer)
model = MyModelLoRA(input_dims, 1000, out_dims, rank=30, num_layers=300).to(device)

# Freeze the base linear layers; only the LoRA factors and the output layer are trained.
for layer in model.layers:
    layer.requires_grad_(False)
# All A/B factors are handed to the optimizer. The model only applies LoRA to its last
# fifth of layers, so the factors of earlier layers get no gradient and are left unchanged.
params_to_optimize = list(model.A.parameters()) + list(model.B.parameters()) + list(model.output_layer.parameters())
optimizer = torch.optim.AdamW(params_to_optimize, lr=1e-3)
loss_function = nn.MSELoss()

# Training parameters
epochs = 5
batch_size = 64
batches_per_epoch = 100
grad_clip = 1.0  # Gradient clipping threshold

# Training loop
for epoch in range(epochs):
    epoch_loss = 0
    start_time = time()

    for _ in range(batches_per_epoch):
        # Generate random data and labels directly on the training device
        # (avoids allocating on the CPU and copying every batch).
        data = torch.rand(batch_size, input_dims, device=device)
        target = torch.rand(batch_size, out_dims, device=device)

        # Forward pass
        output = model(data)

        # Calculate loss
        loss = loss_function(output, target)

        # Backward pass and optimize
        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping
        nn.utils.clip_grad_norm_(params_to_optimize, grad_clip)

        optimizer.step()

        # .item() copies the scalar to the host, so it also waits for the queued GPU work;
        # the wall-clock time below therefore covers the whole step.
        epoch_loss += loss.item()

    end_time = time()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss/batches_per_epoch:.4f}, Time: {end_time - start_time:.2f} seconds")

print("Training completed.")


Epoch 1/5, Loss: 0.1064, Time: 3.61 seconds
Epoch 2/5, Loss: 0.0835, Time: 3.58 seconds
Epoch 3/5, Loss: 0.0836, Time: 3.56 seconds
Epoch 4/5, Loss: 0.0837, Time: 3.52 seconds
Epoch 5/5, Loss: 0.0837, Time: 3.61 seconds
Training completed.
