# Lightning

In [None]:
# Add the parent directory to the module search path so that modules located
# one level above this notebook can be imported by the cells below.
import sys
sys.path.append('..')

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from typing import Any

import matplotlib.pyplot as plt

import os
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
from torchvision import transforms
from torchvision.datasets import MNIST, CIFAR100
from torch.utils.data import DataLoader
import lightning as L
import torch.optim as optim

from PIL import Image
from PIL import Image, ImageOps
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder

from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import DeviceStatsMonitor, StochasticWeightAveraging

from lightning.pytorch.tuner import Tuner
from lightning.pytorch.loggers import TensorBoardLogger

from lit_modules import LitAlexNet, TinyImageNetDataModule, AlexNet, CifarDataModule

from lightning import Trainer

from torchmetrics import Accuracy

from tqdm.notebook import tqdm
import gc


In [None]:
# Requires TinyImageNetDataModule (and AlexNet, used further down) to be imported.
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Training folder handed to the data module.
# Alternative data source: CifarDataModule(batch_size=32)
data_path = "../data/tiny-imagenet/tiny-imagenet-200/train"
datamodule = TinyImageNetDataModule(batch_size=128, data_path=data_path)
datamodule.setup("train")

# Take exactly one batch from the training loader and wrap it in a
# one-element list, so the training loop can iterate over it like a loader.
ld = datamodule.train_dataloader()
first_batch = next(iter(ld))
temp_ld = [list(first_batch)]

print(f"{len(temp_ld) = }")

# Build the network and move it to the selected device.
alexnet = AlexNet(num_classes=1000)
alexnet.to(device)

# Print and plot one layer's bias before training; the same tensor is printed
# and plotted again after the training loop for comparison.
print(f"{alexnet.features[3].bias}")
plt.hist(alexnet.features[3].bias.cpu().detach().numpy())
plt.show()

# Explicitly enable gradient tracking on every parameter.
for param in alexnet.parameters():
    param.requires_grad_()

# Loss function plus top-1 / top-5 accuracy metrics, placed on the same device
# as the model.
criterion = torch.nn.CrossEntropyLoss()
acc_top_1 = Accuracy(task="multiclass", num_classes=1000, top_k=1)
acc_top_1.to(device)

acc_top_5 = Accuracy(task="multiclass", num_classes=1000, top_k=5)
acc_top_5.to(device)

# SGD is the optimizer actually used for training.  An Adam(lr=0.01) optimizer
# used to be created right before this line, but it was overwritten
# immediately and never used, so it is no longer constructed.
optimizer = torch.optim.SGD(alexnet.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)

# Per-step history filled in by the training loop.
losses = []
acc_1 = []
acc_5 = []

# Fit the batch(es) cached in `temp_ld` for 500 passes, recording the loss and
# the top-1 / top-5 accuracy of every optimisation step.
for _ in tqdm(range(500)):
    for i, (x, y) in enumerate(temp_ld):
        x = x.to(device)
        y = y.to(device)

        # Forward pass and loss.
        outputs = alexnet(x)
        loss = criterion(outputs, y)
        losses.append(loss.cpu().detach().numpy())

        # Compute the class probabilities once and share them between both
        # metrics; they are detached because the metrics need no gradients.
        probs = F.softmax(outputs.detach(), dim=1)
        acc_1.append(acc_top_1(probs, y).item())
        acc_5.append(acc_top_5(probs, y).item())

        # Clear stale gradients, backpropagate, then update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Cap each pass at 11 batches (indices 0 through 10).
        if i == 10:
            break

# Bias of the inspected layer after training, together with its last gradient.
print(f"{alexnet.features[3].bias}")
print(f"{alexnet.features[3].bias.grad}")

# Distribution of that bias after training.
plt.hist(alexnet.features[3].bias.cpu().detach().numpy())
plt.show()

# Loss recorded at every optimisation step.
plt.plot(losses)
plt.show()

# Accuracy curves.  `acc_5` is filled from the top_k=5 metric, so the second
# legend entry is "Top-5" (it was mislabelled "Top-2").
plt.plot(acc_1)
plt.plot(acc_5)
plt.legend(["Top-1", "Top-5"])
plt.show()

### Experiments 


- Version0
    - Overfit on the single batch
- Version1
    - Training on all dataset
- Version2
    - Experiment:
        - Add Early Stopping
        - Add LR scheduler
        - Remove StochasticWeightAveraging(swa_lrs=1e-2)
        - Add Learning Rate Logging to tensorboard on each Epoch
    - Result:
        - The model stopped at epoch 9
        - Validation Top1_Err = 36%
        - Validation Top5_Err = 5%
        - Train Loss = 0.7368
        - Val Loss = 1.174
 
---
- Version0
    - Experiment:
        - Correct the learning_rate of Adam optimizer
        - Change the EarlyStopping setting EarlyStopping(monitor="val_loss", mode="min", patience=10, check_val_every_n_epoch=3)
        - Set the Learning Rate = 0.01
    - Result
        - Training isn't starting
- Version1
    - Experiment:
        - Set the Learning Rate = 0.001
    - Result:
        - Val Top1_Err = 32%
        - Val Top5_Err = 4.6%
        - Train Loss = 0.2
        - Val Loss = 1
- Version2
    - Experiment:
        - Test overfitting On one Batch
    - Result:
        - Train Loss = 2
     
- Version3
    - Experiment:
        - Test overfitting On one Batch
        - Set the learning rate to 0.01
    - Result:
        - Train Loss = 2

- Version4
    - Experiment:
        - Adam to SGD
        - Test overfitting On one Batch
        - Set the learning rate to 0.01
    - Result:
        - Train Loss = 2

- Version5
    - Experiment:
        - Adam to SGD
        - Test overfitting On one Batch
        - Set the learning rate to 0.001
    - Result:
        - Train Loss = 2
     
- Version6
    - Experiment:
        - Adam to SGD
        - Test overfitting On one Batch
        - Set the learning rate to 0.003
    - Result:
        - Train Loss = 2

- Version7
    - Experiment:
        - Adam to SGD
        - Test overfitting On one Batch
        - Set the learning rate to 0.1
    - Result:
        - Train Loss = 2

- Version8
    - Experiment:
        - Adam to SGD
        - Test overfitting On one Batch
        - Set the learning rate to 0.0001
    - Result:
        - Train Loss = 2

- Version9
    - Experiment:
        - Adam
        - Test overfitting On one Batch
        - Set the learning rate to 0.001
    - Result:
        - Train Loss = 2
     
- Version10
    - Experiment:
        - Adam
        - Remove the Learning rate scheduler
        - StochasticWeightAveraging(swa_lrs=1e-2)
        - Test overfitting On one Batch
        - Set the learning rate to 0.001
    - Result:
        - Train Loss = 2

- Version11
    - Experiment:
        - Adam
        - Remove the Learning rate scheduler
        - StochasticWeightAveraging(swa_lrs=1e-2)
        - Test overfitting On one Batch
        - Set the learning rate to 0.01
    - Result:
        - Train Loss = 2
     
- Version12
    - Experiment:
        - Adam
        - Remove the Learning rate scheduler
        - StochasticWeightAveraging(swa_lrs=1e-2)
        - Test overfitting On one Batch
        - Set the learning rate to 0.003
    - Result:
        - Epoch = 125
        - Train Loss = 0.07

- Version13
    - Experiment:
        - Batch Finder And Learning Rate Finder
    - Results:
        - Batchsize = 32K
        - Learning Rate = 1e-3

- Version14
    - Experiment:
        - Set the learning rate to 0.003
        - Batch size = 512
        - Learning Rate = 3e-3
    - Results:
        - Not Working

- Version14
    - Experiment:
        - Set the learning rate to 0.003
        - Batch size = 256
        - Learning Rate = 3e-3
    - Results:

- Version15
    - Experiment:
        - Set the learning rate to 0.003
        - Batch size = 256
    - Results:
        - Val Top1 = 33%
        - Val Top5 = 5%
        - Train Loss = 0.7

- Version16
    - Experiment:
        - Set the learning rate to 0.003
        - Batch size = 256
        - set dropout rate = 0.5
        - add weight decay = 1e-4
    - Results:
        - The best model ever is made
 
---
- Version17
    - Experiment:
        - ImageNet
        - Overfit Test
    - Results:
        - The best model ever is made

- Version18
    - Experiment:
        - ImageNet
        - Overfit Test
        - LR = 0.01
    - Results:
        - 

- Version19
    - Experiment:
        - ImageNet
        - Overfit Test
        - LR = 0.001
        - weight_decay=5e-4
    - Results:
        - The best model ever is made

- Version20
    - Experiment:
        - ImageNet
        - Overfit Test
        - LR = 0.001
        - weight_decay=5e-4
    - Results:
        - Poor results

- Version21
    - Experiment:
        - ImageNet
        - Overfit Test
        - LR = 0.01
        - weight_decay=5e-4
    - Results:
        - Poor results
     
- Version21
    - Experiment:
        - ImageNet
        - Overfit Test
        - LR = 0.01
        - weight_decay=1e-4
    - Results:
        - Poor results

- Version22
    - Experiment:
        - ImageNet
        - Overfit Test
        - LR = 0.001
        - weight_decay=1e-4
    - Results:
        - Poor results

---

- Version22
    - Experiment:
        - ImageNet
        - Overfit Test
        - Add new initialization: Kaiming He
        - Add BatchNorm after each layer
        - Use ELU activation instead of ReLU (because ELU produces non-zero outputs for negative inputs)
    - Results:
        - 

In [None]:
# Training run of LitAlexNet on the Tiny-ImageNet training folder.
# Alternative data source: CifarDataModule(batch_size=256)
data_path = "../data/tiny-imagenet/tiny-imagenet-200/train"
datamodule = TinyImageNetDataModule(batch_size=96, data_path=data_path)

model = LitAlexNet(
    learning_rate=1e-3,
    num_classes=1000,
    example_input_array=(96, 3, 224, 224),
)
tb_logger = TensorBoardLogger('.')

# Options that were toggled between experiments and are currently disabled:
#   callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=10)]
#   overfit_batches=1, check_val_every_n_epoch=1, log_every_n_steps=1
trainer_options = dict(
    max_epochs=200,
    callbacks=[StochasticWeightAveraging(swa_lrs=1e-2)],
    fast_dev_run=False,
    devices="auto",
    accelerator="gpu",
    num_nodes=1,
    strategy="auto",
    gradient_clip_val=0.5,
    logger=tb_logger,
)
trainer = Trainer(**trainer_options)

trainer.fit(datamodule=datamodule, model=model)

In [None]:
# Batch-size and learning-rate search for LitAlexNet on the CIFAR data module.
data_path = "../data/tiny-imagenet/tiny-imagenet-200/train"
# Alternative data source: TinyImageNetDataModule(data_path=data_path, batch_size=96)
datamodule = CifarDataModule(batch_size=96)


model = LitAlexNet(learning_rate=3e-3, num_classes=10, example_input_array=(96, 3, 224, 224))
tb_logger = TensorBoardLogger('.')

# Options that were toggled between experiments and are currently disabled:
#   callbacks=[EarlyStopping(monitor="val_loss", mode="min", patience=10)]
#   check_val_every_n_epoch=1, log_every_n_steps=1
trainer = Trainer(
        max_epochs=200,
        callbacks=[StochasticWeightAveraging(swa_lrs=1e-2)],
        fast_dev_run=False,
        devices="auto",
        accelerator="gpu",
        num_nodes=1,
        strategy="auto",
        overfit_batches=1,
        gradient_clip_val=0.5,
        logger=tb_logger,
    )

tuner = Tuner(trainer)
# Auto-scale the batch size by growing it exponentially ("power" mode).
tuner.scale_batch_size(model, datamodule=datamodule, mode="power")

# Run the learning-rate finder.
lr_finder = tuner.lr_find(model, datamodule=datamodule)

# Raw results of the sweep.
print(lr_finder.results)

# Pick a point based on the plot, or take the finder's suggestion.
new_lr = lr_finder.suggestion()

fig = lr_finder.plot(suggest=True)

# Update the model's hparams only when the finder produced a suggestion;
# `suggestion()` may return None, and writing None into the hyper-parameters
# would silently break any later training run.
if new_lr is not None:
    model.hparams.learning_rate = new_lr
else:
    print("Learning-rate finder returned no suggestion; keeping the current learning_rate.")

In [None]:
# Collect garbage first: unreachable Python objects may still hold CUDA
# tensors, and their memory can only be released once they have been collected.
gc.collect()

# Then release the unoccupied cached GPU memory held by the caching allocator.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Collect garbage first: unreachable Python objects may still hold CUDA
# tensors, and their memory can only be released once they have been collected.
gc.collect()

# Then release the unoccupied cached GPU memory held by the caching allocator.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

!python ../main.py