In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import threading

import torch
from torch import nn
from torch.utils.data import DataLoader, random_split
import torchvision
from torchvision.transforms import v2
from torchvision.transforms.v2 import ToTensor
from torchvision import datasets
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [13]:
print(torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
device1 = torch.device("cuda:0")
device2 = torch.device("cuda:1")

1
GPU 0: Tesla T4


In [14]:
transform_train = v2.Compose([
    v2.RandomHorizontalFlip(p=0.5),
    v2.RandomCrop(32, padding=4),
    v2.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    v2.RandomRotation(15),
    ToTensor(),
    v2.Normalize(
        mean=[0.5071, 0.4867, 0.4408],
        std=[0.2675, 0.2565, 0.2761]
    )
])
transform_test = v2.Compose([
    ToTensor(),
    v2.Normalize(
        mean=[0.5071, 0.4867, 0.4408],
        std=[0.2675, 0.2565, 0.2761]
    )
])
trainset = datasets.CIFAR100(root = './data', train = True, download= True, transform = transform_train)
testset = datasets.CIFAR100(root = './data', train = False, download= True, transform = transform_test)

train_size = int(0.8* len(trainset))
valid_size = len(trainset) - train_size
trainset, validset = random_split(trainset, [train_size, valid_size])

train_loader = DataLoader(trainset, batch_size = 64, shuffle = True)
valid_loader = DataLoader(validset, batch_size = 64, shuffle = False)
test_loader = DataLoader(testset, batch_size = 64, shuffle = False)



In [15]:
def train_dataloader_model(
    model, train_loader, valid_loader, optim, loss_fn,
    device=None, max_epochs=100, diff=1e-3, patience=5, scheduler = None, name = None
):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)

    train_losses = []
    valid_losses = []
    valid_accuracies = []

    valid_loss_min = float("inf")
    patience_counter = 0

    for epoch in range(max_epochs):
        # ----------------------------
        # TRAINING PHASE
        # ----------------------------
        model.train()
        running_train_loss = 0.0

        for X_train, y_train in train_loader:
            X_train, y_train = X_train.to(device), y_train.to(device)

            optim.zero_grad()
            y_pred = model(X_train)
            loss = loss_fn(y_pred, y_train)
            loss.backward()
            optim.step()

            running_train_loss += loss.item()

        epoch_train_loss = running_train_loss / len(train_loader)
        train_losses.append(epoch_train_loss)

        # ----------------------------
        # VALIDATION PHASE
        # ----------------------------
        model.eval()
        running_valid_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for X_valid, y_valid in valid_loader:
                X_valid, y_valid = X_valid.to(device), y_valid.to(device)
                y_pred = model(X_valid)

                # Compute validation loss
                loss = loss_fn(y_pred, y_valid)
                running_valid_loss += loss.item()

                # Compute accuracy
                _, predicted = torch.max(y_pred, 1)
                correct += (predicted == y_valid).sum().item()
                total += y_valid.size(0)

        epoch_valid_loss = running_valid_loss / len(valid_loader)
        epoch_valid_acc = correct / total

        valid_losses.append(epoch_valid_loss)
        valid_accuracies.append(epoch_valid_acc)

        scheduler.step(epoch_valid_loss)
        # ----------------------------
        # EARLY STOPPING CHECKS
        # ----------------------------
        if epoch > 0 and abs(train_losses[-1] - train_losses[-2]) < diff:
            print(f"{name}|epoch {epoch}: loss diff < {diff} → early convergence.")
            break

        if epoch_valid_loss < valid_loss_min - diff:
            valid_loss_min = epoch_valid_loss
            patience_counter = 0
        else:
            patience_counter += 1

        print(
            f"{name}|Epoch [{epoch+1}/{max_epochs}] | "
            f"Train Loss: {epoch_train_loss:.4f} | "
            f"Valid Loss: {epoch_valid_loss:.4f} | "
            f"Valid Acc: {epoch_valid_acc*100:.2f}%"
        )

        if patience_counter >= patience:
            print(f"\nEarly stopping: no improvement for {patience} epochs.")
            break

    # ----------------------------
    # PLOTTING BEFORE RETURN
    # ----------------------------
    epochs = range(1, len(train_losses) + 1)

    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_losses, label="Train Loss")
    plt.plot(epochs, valid_losses, label="Valid Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training vs Validation Loss")
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(epochs, valid_accuracies, label="Valid Accuracy", color="green")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.title("Validation Accuracy")
    plt.legend()

    plt.tight_layout()
    plt.show()

## ResNeXt

In [16]:
class ResNeXtBlock(nn.Module):
    """The ResNeXt block."""
    def __init__(self,in_channels, out_channels, groups, bot_mul, use_1x1conv=False,
                 strides=1):
        super().__init__()
        bot_channels = int(round(out_channels * bot_mul))
        bot_channels = max(bot_channels, groups)
        self.conv1 = nn.Conv2d(in_channels, bot_channels, kernel_size=1, stride=1)
        self.bn1 = nn.BatchNorm2d(bot_channels)

        self.conv2 = nn.Conv2d(bot_channels, bot_channels, kernel_size=3,
                                   stride=strides, padding=1,
                                   groups=bot_channels//groups)
        self.bn2 = nn.BatchNorm2d(bot_channels)

        self.conv3 = nn.Conv2d(bot_channels, out_channels, kernel_size=1, stride=1)
        self.bn3 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        if use_1x1conv:
            self.conv4 = nn.Conv2d(in_channels, out_channels, kernel_size=1,
                                       stride=strides)
            self.bn4 = nn.BatchNorm2d(out_channels)
        else:
            self.conv4 = None

    def forward(self, X):
        Y = self.relu(self.bn1(self.conv1(X)))
        Y = self.relu(self.bn2(self.conv2(Y)))
        Y = self.bn3(self.conv3(Y))
        if self.conv4:
            X = self.bn4(self.conv4(X))
        return self.relu(Y + X)
class ResNeXt(nn.Module):
    def block(self, num_residuals, in_channel, out_channels,
              groups=32, bot_mul=0.25, first_block=False):
        blk = []
        for i in range(num_residuals):
            if i == 0 and not first_block:
               blk.append(ResNeXtBlock(in_channel, out_channels,
                                    groups, bot_mul, use_1x1conv=True, strides=2))
            else:
                blk.append(ResNeXtBlock(out_channels, out_channels,
                                    groups, bot_mul))
            in_channel = out_channels
        return nn.Sequential(*blk)

    def __init__(self, arch, num_classes=100):
        super(ResNeXt, self).__init__()
        self.b1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64), nn.ReLU(),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
        self.net = nn.Sequential(self.b1)
        for i, b in enumerate(arch):
            self.net.add_module(f'b{i+2}', self.block(*b, first_block=(i==0)))
        self.net.add_module('last', nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)), nn.Flatten(),
            nn.Linear(512, num_classes)))
    def forward(self, x):
        return self.net(x)

class ResNeXt18(ResNeXt):
    def __init__(self, num_classes=100):
        super().__init__(((2, 64, 64), (2, 64, 128), (2, 128, 256), (2, 256, 512))
                         , num_classes)


## SE_ResNeXt




In [17]:
class SEBlock(nn.Module):
    def __init__(self, num_channels, r = 16):
        super().__init__()
        self.GlobalPooling = nn.AdaptiveAvgPool2d((1, 1))
        self.fc1 = nn.Linear(num_channels, num_channels//r)
        self.fc2 = nn.Linear(num_channels//r, num_channels)
        self.net = nn.Sequential(
            self.GlobalPooling,nn.Flatten(), self.fc1, nn.ReLU(), self.fc2, nn.Sigmoid()
        )
    def forward(self, x):
        y = self.net(x)
        y = y[:, :, None, None]
        return x * y
class SE_ResNeXtBlock(ResNeXtBlock):
    def __init__(self, in_channels, out_channels, groups, bot_mul, use_1x1conv=False,
                 strides=1):
        super().__init__(in_channels, out_channels, groups, bot_mul, use_1x1conv,
                 strides)
        self.se = SEBlock(out_channels)

    def forward(self, X):
        Y = self.relu(self.bn1(self.conv1(X)))
        Y = self.relu(self.bn2(self.conv2(Y)))
        Y = self.bn3(self.conv3(Y))
        if self.conv4:
            X = self.bn4(self.conv4(X))
        Y = self.se(Y)
        return self.relu(Y + X)
class SE_ResNeXt(ResNeXt):
    def block(self, num_residuals, in_channel, out_channels,
              groups=32, bot_mul=0.25, first_block=False):
        blk = []
        for i in range(num_residuals):
            if i == 0 and not first_block:
               blk.append(SE_ResNeXtBlock(in_channel, out_channels,
                                    groups, bot_mul, use_1x1conv=True, strides=2))
            else:
                blk.append(SE_ResNeXtBlock(out_channels, out_channels,
                                    groups, bot_mul))
            in_channel = out_channels
        return nn.Sequential(*blk)
    def __init__(self, arch, num_classes=100):
        super().__init__(arch, num_classes)
class SE_ResNeXt18(SE_ResNeXt):
    def __init__(self, num_classes=100):
        super().__init__(((2, 64, 64), (2, 64, 128), (2, 128, 256), (2, 256, 512))
                         , num_classes)

In [18]:
model1 = ResNeXt18().to(device1)
optim1 = torch.optim.Adam(model1.parameters(), lr=1e-3, weight_decay = 1e-4)
model2 = SE_ResNeXt18().to(device2)
optim2 = torch.optim.Adam(model2.parameters(), lr=1e-3, weight_decay = 1e-4)
loss_fn = nn.CrossEntropyLoss()
scheduler1 = ReduceLROnPlateau(optim1, mode='min', factor=0.5, patience=3)
scheduler2 = ReduceLROnPlateau(optim2, mode='min', factor=0.5, patience=3)

AcceleratorError: CUDA error: invalid device ordinal
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
total_params1 = sum(p.numel() for p in model1.parameters())
print(f"Total parameters of model1: {total_params1:,}")
total_params2 = sum(p.numel() for p in model2.parameters())
print(f"Total parameters of model2: {total_params2:,}")

In [None]:
t1 = threading.Thread(
    target=train_dataloader_model,
    args=(model1, train_loader, valid_loader),
    kwargs={
        'optim': optim1,
        'loss_fn': loss_fn,
        'scheduler': scheduler1,
        'name': 'model1'
    }
)

t2 = threading.Thread(
    target=train_dataloader_model,
    args=(model2, train_loader, valid_loader),
    kwargs={
        'optim': optim2,
        'loss_fn': loss_fn,
        'scheduler': scheduler2,
        'name': 'model2'
    }
)
t1.start()
t2.start()

t1.join()
t2.join()