<a href="https://colab.research.google.com/github/Shakilkhan24/Playground_DL/blob/main/wandb_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m300.2/300.2 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import wandb
# Authenticate this session with Weights & Biases. In an interactive notebook
# this prompts for an API key (see the cell output below); the value returned
# by login() is displayed as the cell result.
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from tqdm import tqdm
import wandb

# Define the CNN architecture
class CIFARCNN(nn.Module):
    """Small three-stage CNN for 3x32x32 images (CIFAR-10 sized inputs).

    Each stage is ``Conv2d(3x3, padding=1) -> ReLU -> MaxPool2d(2)``, which
    halves the spatial size (32 -> 16 -> 8 -> 4). The resulting 64x4x4 feature
    map is flattened and passed through a 512-unit hidden layer with dropout,
    then a linear classifier that returns raw logits (no softmax).

    Args:
        num_classes: Number of output logits. Defaults to 10.
        dropout: Dropout probability applied after the hidden layer.
            Defaults to 0.25.
    """

    def __init__(self, num_classes=10, dropout=0.25):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.conv3 = nn.Conv2d(64, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # 64 channels * 4 * 4 spatial positions after three 2x poolings of 32x32.
        self.fc1 = nn.Linear(64 * 4 * 4, 512)
        self.fc2 = nn.Linear(512, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Image batch of shape (N, 3, 32, 32); a single unbatched image of
                shape (3, 32, 32) is treated as a batch of one.

        Returns:
            Tensor of shape (N, num_classes) with unnormalized logits.

        Raises:
            RuntimeError: If the spatial size is not 32x32, because the
                flattened features no longer match ``fc1``.
        """
        if x.dim() == 3:
            # Keep supporting a single (C, H, W) image: output is (1, num_classes).
            x = x.unsqueeze(0)
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        x = self.pool(torch.relu(self.conv3(x)))
        # Flatten per sample. Using view(-1, 1024) here would silently fold
        # extra spatial positions into the batch dimension for inputs that are
        # not 32x32, producing logits for a wrong number of "samples".
        x = torch.flatten(x, 1)
        x = self.dropout(torch.relu(self.fc1(x)))
        return self.fc2(x)

# Initialize wandb. The config dict is the single source of truth for the
# hyperparameters: the data loaders and optimizer below read their values back
# from wandb.config, so what W&B records is always what was actually used.
wandb.init(project="Experiment", config={
    "architecture": "CIFARCNN",
    "dataset": "CIFAR-10",
    "epochs": 10,
    "batch_size": 64,
    "learning_rate": 0.001,
    "seed": 42
})

# Seed PyTorch before the model is created and the loaders start shuffling so
# that weight initialization and batch order are repeatable across runs.
torch.manual_seed(wandb.config.seed)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load and preprocess CIFAR-10 dataset.
# Normalize with mean 0.5 / std 0.5 per channel maps pixel values from [0, 1] to [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
trainloader = DataLoader(trainset, batch_size=wandb.config.batch_size, shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
testloader = DataLoader(testset, batch_size=wandb.config.batch_size, shuffle=False, num_workers=2)

# Initialize the model, loss function, and optimizer
model = CIFARCNN().to(device)

# Ask W&B to log both gradients and parameters of the model.
wandb.watch(model, log="all")


criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=wandb.config.learning_rate)

# Training loop.
# The epoch count comes from the W&B run config so the logged value and the
# value used here cannot diverge.
num_epochs = wandb.config.epochs
for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in tqdm(trainloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample mean even when the last batch is smaller.
        running_loss += loss.item() * labels.size(0)
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

    train_loss = running_loss / total
    train_accuracy = 100. * correct / total

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Training Loss: {train_loss:.4f}")
    print(f"Training Accuracy: {train_accuracy:.2f}%")

    # ---- Validation pass ----
    # NOTE: the CIFAR-10 test split doubles as the validation set here.
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * labels.size(0)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

    val_loss = val_loss / total
    val_accuracy = 100. * correct / total

    # Log training and validation metrics in ONE call: every wandb.log call
    # advances the run's step, so logging them separately would place the
    # train and validation points of the same epoch on different steps.
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": train_loss,
        "train_accuracy": train_accuracy,
        "val_loss": val_loss,
        "val_accuracy": val_accuracy
    })

    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.2f}%")
    print()

print("Training finished!")

# Test the model and log predictions
model.eval()
all_predictions = []
all_labels = []
test_images = []
num_images = 25  # how many example images to upload

with torch.no_grad():
    for inputs, labels in testloader:
        outputs = model(inputs.to(device))
        predicted = outputs.argmax(dim=1)
        all_predictions.extend(predicted.cpu().tolist())
        all_labels.extend(labels.tolist())
        # Keep only the first few images instead of copying the whole test set
        # into a Python list just to use 25 of them.
        if len(test_images) < num_images:
            test_images.extend(inputs[:num_images - len(test_images)])

# Log predictions and images.
# The loader normalized pixels to [-1, 1] (mean 0.5, std 0.5); undo that so the
# uploaded images show the original colors, and move channels last (H, W, C).
images = [wandb.Image((img * 0.5 + 0.5).clamp(0, 1).permute(1, 2, 0).numpy(),
                      caption=f"True: {all_labels[i]}, Pred: {all_predictions[i]}")
          for i, img in enumerate(test_images)]

wandb.log({"test_predictions": images})

# Log confusion matrix, labelled with the dataset's class names rather than
# bare indices so the chart is readable.
wandb.sklearn.plot_confusion_matrix(all_labels, all_predictions, labels=list(testset.classes))

# Finish the wandb run
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33msk2448868[0m ([33mshakil_khan_team[0m). Use [1m`wandb login --relogin`[0m to force relogin


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:06<00:00, 26527040.82it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


Epoch 1/10: 100%|██████████| 782/782 [01:04<00:00, 12.15it/s]

Epoch 1/10
Training Loss: 1.4211
Training Accuracy: 48.32%





Validation Loss: 1.1307
Validation Accuracy: 59.72%



Epoch 2/10: 100%|██████████| 782/782 [01:04<00:00, 12.08it/s]

Epoch 2/10
Training Loss: 1.0116
Training Accuracy: 64.23%





Validation Loss: 0.9283
Validation Accuracy: 67.58%



Epoch 3/10: 100%|██████████| 782/782 [01:09<00:00, 11.19it/s]

Epoch 3/10
Training Loss: 0.8241
Training Accuracy: 70.94%





Validation Loss: 0.7971
Validation Accuracy: 72.32%



Epoch 4/10: 100%|██████████| 782/782 [01:05<00:00, 11.87it/s]

Epoch 4/10
Training Loss: 0.7068
Training Accuracy: 75.15%





Validation Loss: 0.7620
Validation Accuracy: 73.53%



Epoch 5/10: 100%|██████████| 782/782 [01:07<00:00, 11.66it/s]

Epoch 5/10
Training Loss: 0.6178
Training Accuracy: 78.32%





Validation Loss: 0.7789
Validation Accuracy: 73.44%



Epoch 6/10: 100%|██████████| 782/782 [01:07<00:00, 11.63it/s]

Epoch 6/10
Training Loss: 0.5413
Training Accuracy: 80.97%





Validation Loss: 0.7258
Validation Accuracy: 75.80%



Epoch 7/10: 100%|██████████| 782/782 [01:06<00:00, 11.83it/s]

Epoch 7/10
Training Loss: 0.4659
Training Accuracy: 83.43%





Validation Loss: 0.7318
Validation Accuracy: 75.64%



Epoch 8/10: 100%|██████████| 782/782 [01:05<00:00, 11.85it/s]

Epoch 8/10
Training Loss: 0.4128
Training Accuracy: 85.43%





Validation Loss: 0.7490
Validation Accuracy: 76.02%



Epoch 9/10: 100%|██████████| 782/782 [01:07<00:00, 11.62it/s]

Epoch 9/10
Training Loss: 0.3577
Training Accuracy: 87.30%





Validation Loss: 0.7534
Validation Accuracy: 76.39%



Epoch 10/10: 100%|██████████| 782/782 [01:05<00:00, 11.94it/s]

Epoch 10/10
Training Loss: 0.3109
Training Accuracy: 88.97%





Validation Loss: 0.8068
Validation Accuracy: 76.01%

Training finished!


VBox(children=(Label(value='0.061 MB of 0.061 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_accuracy,▁▄▅▆▆▇▇▇██
train_loss,█▅▄▃▃▂▂▂▁▁
val_accuracy,▁▄▆▇▇█████
val_loss,█▅▂▂▂▁▁▁▁▂

0,1
epoch,10.0
train_accuracy,88.974
train_loss,0.31089
val_accuracy,76.01
val_loss,0.80679


In [13]:
import contextlib
import html
import io

from torchsummary import summary

# The training run has already been finished, so open a new run in the same
# project instead of relying on the default project.
wandb.init(project="Experiment")

# summary() prints its table; in the classic torchsummary release it returns
# None, so logging str(return value) would upload the text "None". Capture what
# it prints and log that instead.
# summary() defaults to device="cuda"; use the device the model actually lives on.
summary_device = next(model.parameters()).device.type
summary_buffer = io.StringIO()
with contextlib.redirect_stdout(summary_buffer):
    summary(model, input_size=(3, 32, 32), device=summary_device)
model_summary = summary_buffer.getvalue()

# Still show the table in the notebook, then upload it as preformatted HTML.
print(model_summary)
wandb.log({"model_summary": wandb.Html(f"<pre>{html.escape(model_summary)}</pre>")})


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 32, 32]             896
         MaxPool2d-2           [-1, 32, 16, 16]               0
            Conv2d-3           [-1, 64, 16, 16]          18,496
         MaxPool2d-4             [-1, 64, 8, 8]               0
            Conv2d-5             [-1, 64, 8, 8]          36,928
         MaxPool2d-6             [-1, 64, 4, 4]               0
            Linear-7                  [-1, 512]         524,800
           Dropout-8                  [-1, 512]               0
            Linear-9                   [-1, 10]           5,130
Total params: 586,250
Trainable params: 586,250
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.52
Params size (MB): 2.24
Estimated Total Size (MB): 2.76
-------------------------------------------

In [15]:
# Take the image tensor (element 0) of the first batch from the training
# loader, wrap every image in a wandb.Image, log them under "sample_images",
# then close the currently active run.
first_batch = next(iter(trainloader))
images = first_batch[0]
sample_panels = list(map(wandb.Image, images))
wandb.log({"sample_images": sample_panels})
wandb.finish()

VBox(children=(Label(value='0.302 MB of 0.302 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [17]:
# Upload the trained weights in a dedicated run of the same project.
# Using the run as a context manager finishes it even if saving fails, so no
# run is left open in the kernel.
with wandb.init(project="Experiment") as run:
    torch.save(model.state_dict(), "model.pth")
    run.save("model.pth")

VBox(children=(Label(value='2.250 MB of 2.250 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torch.utils.tensorboard import SummaryWriter
import numpy as np

# Set random seed for reproducibility.
# This seeds PyTorch's global RNG before the model and data loaders below are created.
torch.manual_seed(42)

# Define a simple neural network
class SimpleNN(nn.Module):
    """Two-layer fully connected classifier: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features per sample after flattening.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Flatten each sample to a vector and return the raw class logits."""
        batch = x.size(0)
        flat = x.view(batch, -1)  # keep the batch dimension, collapse the rest
        hidden = self.relu(self.fc1(flat))
        return self.fc2(hidden)

# Helper imports. They sit above the training loop because the loop uses
# confusion_matrix and plot_confusion_matrix on its last epoch. torchvision is
# imported explicitly because the loop calls torchvision.utils.make_grid, while
# this cell otherwise only imports torchvision submodules.
import matplotlib.pyplot as plt
import torchvision
from sklearn.metrics import confusion_matrix


# Helper function to plot confusion matrix
def plot_confusion_matrix(cm, class_names):
    """Render a confusion matrix as a matplotlib figure.

    The cell colors follow the raw counts in ``cm``; the number written in each
    cell is the row-normalized value (share of that true class), rounded to two
    decimals.

    Args:
        cm: Square 2-D array of counts (rows: true labels, columns: predictions).
        class_names: Sequence of tick labels, one per class.

    Returns:
        The matplotlib figure containing the plot.
    """
    cm = np.asarray(cm)
    figure = plt.figure(figsize=(8, 8))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)

    # Normalize the confusion matrix row by row. Rows without any samples stay
    # at 0 instead of triggering a division by zero (NaN labels).
    row_sums = cm.sum(axis=1, keepdims=True)
    normalized = np.around(
        np.divide(cm.astype('float'), row_sums,
                  out=np.zeros(cm.shape, dtype=float), where=row_sums != 0),
        decimals=2,
    )

    # Choose the text color from the raw counts, because those are what imshow
    # drew: white text on dark (high-count) cells, black text elsewhere.
    threshold = cm.max() / 2.
    for i, j in np.ndindex(cm.shape):
        plt.text(j, i, normalized[i, j], horizontalalignment="center",
                 color="white" if cm[i, j] > threshold else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    return figure


# Hyperparameters
input_size = 28 * 28  # MNIST image size
hidden_size = 128
num_classes = 10
num_epochs = 10
batch_size = 64
learning_rate = 0.001

# Load MNIST dataset
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize the model, loss function, and optimizer
model = SimpleNN(input_size, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Initialize TensorBoard writer
writer = SummaryWriter('runs/simple_nn_experiment')

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for i, (images, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        # Log training loss and accuracy
        if i % 100 == 99:  # Log every 100 mini-batches
            global_step = epoch * len(train_loader) + i
            writer.add_scalar('Training Loss', running_loss / 100, global_step)
            writer.add_scalar('Training Accuracy', 100. * correct / total, global_step)
            # Reset the window so each point covers only the last 100 batches.
            running_loss = 0.0
            correct = 0
            total = 0

    # Evaluate on test set
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    # Predictions are collected during this pass so the confusion matrix on the
    # last epoch does not need a second, gradient-tracking pass over the data.
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for images, labels in test_loader:
            outputs = model(images)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
            all_preds.extend(predicted.tolist())
            all_labels.extend(labels.tolist())

    test_accuracy = 100. * correct / total

    # Log test loss and accuracy
    writer.add_scalar('Test Loss', test_loss / len(test_loader), epoch)
    writer.add_scalar('Test Accuracy', test_accuracy, epoch)

    print(f'Epoch [{epoch+1}/{num_epochs}], Test Accuracy: {test_accuracy:.2f}%')

    # Log model weights and gradients. The gradients are the ones left over
    # from the last training step of this epoch; skip parameters without one.
    for name, param in model.named_parameters():
        writer.add_histogram(f'Parameters/{name}', param, epoch)
        if param.grad is not None:
            writer.add_histogram(f'Gradients/{name}', param.grad, epoch)

    # Log a batch of images
    if epoch == 0:
        images, labels = next(iter(train_loader))
        img_grid = torchvision.utils.make_grid(images)
        writer.add_image('MNIST Images', img_grid)

    # Log confusion matrix
    if epoch == num_epochs - 1:
        class_ids = list(range(num_classes))
        # Passing labels= keeps the matrix num_classes x num_classes even if a
        # class never appears, so it always matches the tick labels.
        cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
        figure = plot_confusion_matrix(cm, class_names=class_ids)
        writer.add_figure('Confusion Matrix', figure, epoch)

# Log the model graph
dummy_input = torch.randn(1, 1, 28, 28)
writer.add_graph(model, dummy_input)

# Close the TensorBoard writer
writer.close()

print("Training finished! TensorBoard logs saved.")