In [1]:
import gc

import torch
import torch.nn as nn
import matplotlib.pyplot as plt

from models import Trained_Resnet34, LoRAResnet34
from data_loader import get_dataloaders, get_cifar100_dataloaders
from train import accuracy, train_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# WARNING: RESTART KERNEL BEFORE RUNNING
# Report the PyTorch / CUDA environment and release GPU memory left over from
# earlier work in this kernel. `gc` comes from the notebook's import cell.

print("=" * 80)
print("WARNING: If you get CUDA errors, restart the kernel (Kernel -> Restart Kernel)")
print("=" * 80)
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # Run the Python garbage collector first: tensors that are only kept alive
    # by unreachable objects must be freed before empty_cache() can hand their
    # cached blocks back to the driver.
    gc.collect()
    torch.cuda.empty_cache()
    # Both figures are converted from bytes to GiB (1024**3).
    print(f"GPU Memory Allocated: {torch.cuda.memory_allocated(0)/1024**3:.2f} GB")
    print(f"GPU Memory Reserved: {torch.cuda.memory_reserved(0)/1024**3:.2f} GB")
print("=" * 80)


PyTorch: 2.8.0+cu128
CUDA: True
GPU: NVIDIA GeForce RTX 3070 Ti
GPU Memory Allocated: 0.00 GB
GPU Memory Reserved: 0.00 GB


# Hyperparameters Configuration

In [None]:
# Hyperparameters Configuration
# Change these values to experiment with different settings

BATCH_SIZE = 96        # passed to the data loaders and to train_model
LEARNING_RATE = 0.001  # passed to train_model as learning_rate
NUM_EPOCHS = 20        # passed to train_model as num_epochs
OPTIMIZER = "Adam"     # passed to train_model as optimizer_name
LORA_R = 8             # passed to LoRAResnet34 as lora_r
LORA_ALPHA = 16        # passed to LoRAResnet34 as lora_alpha
SEED = 42              # seed for PyTorch's random number generators

# Seed PyTorch so repeated runs start from the same random state.
# NOTE(review): whether data shuffling/augmentation is fully covered depends on
# how data_loader builds its loaders (e.g. numpy / random usage) - confirm.
torch.manual_seed(SEED)

print("Hyperparameters:")
print(f"  Batch Size: {BATCH_SIZE}")
print(f"  Learning Rate: {LEARNING_RATE}")
print(f"  Number of Epochs: {NUM_EPOCHS}")
print(f"  Optimizer: {OPTIMIZER}")
print(f"  LoRA r: {LORA_R}")
print(f"  LoRA alpha: {LORA_ALPHA}")
print(f"  Random Seed: {SEED}")

Hyperparameters:
  Batch Size: 96
  Learning Rate: 0.001
  Number of Epochs: 5
  Optimizer: Adam
  LoRA r: 8
  LoRA alpha: 16


In [4]:
# CIFAR-100 Experiment

In [5]:
# Build the CIFAR-100 train / validation / test loaders with the configured
# batch size, then report how many batches each loader yields.
loaders = get_cifar100_dataloaders(batch_size=BATCH_SIZE)
train_loader_cifar, val_loader_cifar, test_loader_cifar = loaders

print(f"\nData loaders created successfully!")
for split_name, split_loader in (
    ("Training", train_loader_cifar),
    ("Validation", val_loader_cifar),
    ("Test", test_loader_cifar),
):
    print(f"{split_name} batches: {len(split_loader)}")

CIFAR-100 DataLoaders created:
  Training samples: 50000
  Validation samples: 5000
  Test samples: 5000

Data loaders created successfully!
Training batches: 520
Validation batches: 27
Test batches: 27


# Comparison: Baseline ResNet vs LoRA ResNet


In [6]:
# Train Baseline ResNet Model
print("="*80)
print("TRAINING BASELINE RESNET MODEL")
print("="*80)

# Resolve the evaluation device before training: later cells reuse `device`,
# so it must exist even if train_model() is interrupted or raises.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Clear GPU memory aggressively. Collect Python garbage first so tensors held
# by unreachable objects are freed before the CUDA cache is emptied.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

baseline_model = Trained_Resnet34(num_classes=100)

# NOTE(review): the recorded run of this cell died inside train_model with a
# DataLoader worker error on Windows ("Couldn't open shared file mapping",
# error code 1455). That code usually indicates an exhausted page file /
# commit limit; presumably the worker count configured in
# get_cifar100_dataloaders needs lowering - confirm in data_loader.
baseline_metrics = train_model(
    baseline_model,
    train_loader_cifar,
    val_loader_cifar,
    optimizer_name=OPTIMIZER,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    num_epochs=NUM_EPOCHS,
    model_save_path="baseline_best.pth",
    plot=False
)

# Test accuracy
# NOTE(review): this evaluates baseline_model as train_model left it; whether
# that is the best checkpoint written to "baseline_best.pth" or the
# final-epoch weights depends on train_model - confirm.
baseline_test_acc = accuracy(baseline_model, test_loader_cifar, device)
baseline_metrics['test_acc'] = baseline_test_acc
print(f"\nBaseline Test Accuracy: {baseline_test_acc:.4f}")


TRAINING BASELINE RESNET MODEL
Model created with frozen backbone
Device: cuda
batch_size=96, lr=0.001, epochs=5
Trainable params: 51,300 (0.2%)

Epoch 1/5
Iter 500 | Train Loss: 2.5238 | Train Acc: 0.4550 | Val Acc: 0.5826 | Val Loss: 1.5880
  ✓ Saved best model (val_acc: 0.5826)

Epoch 1 Summary:
  Average Training Loss: 2.5860
  Epoch Time: 96.68s

Epoch 2/5
Iter 1000 | Train Loss: 1.6464 | Train Acc: 0.4671 | Val Acc: 0.6210 | Val Loss: 1.3978
  ✓ Saved best model (val_acc: 0.6210)

Epoch 2 Summary:
  Average Training Loss: 2.0835
  Epoch Time: 35.45s

Epoch 3/5
Iter 1500 | Train Loss: 1.8629 | Train Acc: 0.4990 | Val Acc: 0.6306 | Val Loss: 1.3044
  ✓ Saved best model (val_acc: 0.6306)

Epoch 3 Summary:
  Average Training Loss: 2.0031
  Epoch Time: 36.54s

Epoch 4/5
Iter 2000 | Train Loss: 2.0707 | Train Acc: 0.5169 | Val Acc: 0.6402 | Val Loss: 1.2670
  ✓ Saved best model (val_acc: 0.6402)

Epoch 4 Summary:
  Average Training Loss: 1.9680
  Epoch Time: 37.58s

Epoch 5/5
Iter 2500

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "m:\FUCK SCHOOL\Foundation ML\.venv\lib\site-packages\torch\utils\data\_utils\worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "m:\FUCK SCHOOL\Foundation ML\.venv\lib\site-packages\torch\utils\data\_utils\fetch.py", line 55, in fetch
    return self.collate_fn(data)
  File "m:\FUCK SCHOOL\Foundation ML\.venv\lib\site-packages\torch\utils\data\_utils\collate.py", line 398, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
  File "m:\FUCK SCHOOL\Foundation ML\.venv\lib\site-packages\torch\utils\data\_utils\collate.py", line 211, in collate
    return [
  File "m:\FUCK SCHOOL\Foundation ML\.venv\lib\site-packages\torch\utils\data\_utils\collate.py", line 212, in <listcomp>
    collate(samples, collate_fn_map=collate_fn_map)
  File "m:\FUCK SCHOOL\Foundation ML\.venv\lib\site-packages\torch\utils\data\_utils\collate.py", line 155, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
  File "m:\FUCK SCHOOL\Foundation ML\.venv\lib\site-packages\torch\utils\data\_utils\collate.py", line 270, in collate_tensor_fn
    storage = elem._typed_storage()._new_shared(numel, device=elem.device)
  File "m:\FUCK SCHOOL\Foundation ML\.venv\lib\site-packages\torch\storage.py", line 1203, in _new_shared
    untyped_storage = torch.UntypedStorage._new_shared(
  File "m:\FUCK SCHOOL\Foundation ML\.venv\lib\site-packages\torch\storage.py", line 414, in _new_shared
    return cls._new_using_filename_cpu(size)
RuntimeError: Couldn't open shared file mapping: <torch_26232_145590723_0>, error code: <1455>


## Train LoRA ResNet Model


In [None]:
print("="*80)
print("TRAINING LORA RESNET MODEL")
print("="*80)

# Define the evaluation device here instead of relying on an earlier cell
# having run to completion.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Clear GPU memory (keeping baseline model for comparison). Collect Python
# garbage first so freed tensors can actually be released from the CUDA cache.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
    # Only query CUDA memory when a GPU is present; value is bytes -> GiB.
    print(f"GPU Memory after cleanup: {torch.cuda.memory_allocated(0)/1024**3:.2f} GB allocated")

lora_model = LoRAResnet34(num_classes=100, lora_r=LORA_R, lora_alpha=LORA_ALPHA)

# Same training configuration as the baseline run; only the model and the
# checkpoint path differ.
lora_metrics = train_model(
    lora_model,
    train_loader_cifar,
    val_loader_cifar,
    optimizer_name=OPTIMIZER,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    num_epochs=NUM_EPOCHS,
    model_save_path="lora_best.pth",
    plot=False
)

# Test accuracy
# NOTE(review): evaluates lora_model as train_model left it; whether that is
# the best checkpoint in "lora_best.pth" or the final-epoch weights depends on
# train_model - confirm.
lora_test_acc = accuracy(lora_model, test_loader_cifar, device)
lora_metrics['test_acc'] = lora_test_acc
print(f"\nLoRA Test Accuracy: {lora_test_acc:.4f}")


## Comparison Plots


In [None]:
# Comparison plots: a 2x2 grid with three baseline-vs-LoRA curves and one bar
# chart of trainable-parameter counts.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# One entry per line-plot panel: (target axes, metrics key, title, y-axis label).
curve_panels = (
    (axes[0, 0], 'train_acc', 'Training Accuracy', 'Accuracy'),
    (axes[0, 1], 'val_acc', 'Validation Accuracy', 'Accuracy'),
    (axes[1, 0], 'train_loss', 'Training Loss', 'Loss'),
)

for panel_ax, metric_key, panel_title, y_label in curve_panels:
    # Baseline first, then LoRA, both plotted against their own iteration axis.
    panel_ax.plot(baseline_metrics['iters'], baseline_metrics[metric_key],
                  label='Baseline ResNet', marker='o', markersize=4, linewidth=2)
    panel_ax.plot(lora_metrics['iters'], lora_metrics[metric_key],
                  label='LoRA ResNet', marker='s', markersize=4, linewidth=2)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')
    panel_ax.set_xlabel('Iterations')
    panel_ax.set_ylabel(y_label)
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

# Trainable Parameters Comparison (counts shown in thousands)
models = ['Baseline\nResNet', 'LoRA\nResNet']
params = [baseline_metrics['trainable_params']/1000, lora_metrics['trainable_params']/1000]
bar_ax = axes[1, 1]
bar_ax.bar(models, params, alpha=0.8, color=['#1f77b4', '#ff7f0e'])
bar_ax.set_title('Trainable Parameters', fontsize=14, fontweight='bold')
bar_ax.set_ylabel('Parameters (thousands)')
bar_ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()


## Results Comparison


In [None]:
# Display comparison results: a per-model summary followed by a LoRA-vs-baseline
# comparison of trainable parameters, test accuracy and training time.
rule = "=" * 80

for position, (section_title, metrics) in enumerate((
    ("BASELINE RESNET RESULTS", baseline_metrics),
    ("LORA RESNET RESULTS", lora_metrics),
)):
    # Every section after the first is separated by a blank line.
    print(rule if position == 0 else "\n" + rule)
    print(section_title)
    print(rule)
    print(f"Trainable Params: {metrics['trainable_params']:,}")
    print(f"Best Val Accuracy: {metrics['best_val_acc']:.4f}")
    print(f"Test Accuracy: {metrics['test_acc']:.4f}")
    print(f"Total Training Time: {metrics['total_time']/60:.2f} min")
    print(f"Best Epoch: {metrics['best_epoch']}")

print("\n" + rule)
print("COMPARISON")
print(rule)
param_ratio = lora_metrics['trainable_params'] / baseline_metrics['trainable_params']
param_reduction = 100 * (1 - param_ratio)
acc_diff = lora_metrics['test_acc'] - baseline_metrics['test_acc']
time_diff = lora_metrics['total_time'] - baseline_metrics['total_time']

# The recorded baseline run trains only 51,300 parameters (frozen backbone), so
# LoRA may train MORE parameters than the baseline; report the direction
# explicitly instead of assuming a reduction.
if param_reduction >= 0:
    print(f"Parameter Reduction: {param_reduction:.1f}%")
else:
    print(f"Parameter Increase: {-param_reduction:.1f}%")
print(f"Accuracy Difference: {acc_diff:+.4f} (LoRA - Baseline)")
print(f"Time Difference: {time_diff:+.2f}s (LoRA - Baseline)")

# Relative accuracy is undefined when the baseline scored exactly zero.
if baseline_metrics['test_acc']:
    relative_acc = f"{lora_metrics['test_acc']/baseline_metrics['test_acc']*100:.1f}%"
else:
    relative_acc = "n/a"
print(f"\nLoRA achieves {relative_acc} of baseline accuracy")
print(f"with {param_ratio*100:.1f}% of the baseline's trainable parameters")
