# In [None]:  (Jupyter notebook cell marker, kept as a comment so the file is valid Python)
# NOTE: This code was used for experiments as-is.
# Naming and structure may not follow programming best practices.
# Focus is on reproducibility.
#This code was developed for internal experimentation and contains hardcoded values for various test cases.
#It was not refactored for modularity, but the logic matches the experiments reported in the paper.

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torchvision import datasets, transforms, models
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np
import random
def set_all_seeds(seed):
    """Seed every random number generator this script relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (including all CUDA
    devices), switches cuDNN to deterministic mode with benchmarking
    disabled, and prints the seed that was applied.

    Args:
        seed: Integer seed handed to each generator.
    """
    # Python-level and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators; the CUDA call is issued explicitly for all GPUs.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Give up cuDNN autotuning in exchange for run-to-run determinism.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    print(f"Seed set to: {seed}")
#######################################################
#######################################################
#######################################################
# Seed selection for this run: exactly one call is active at a time; the
# commented-out calls are the other seeds (40-45) and are enabled by hand
# to repeat the experiment with a different seed.
#set_all_seeds(40)
#set_all_seeds(41)
set_all_seeds(42)
#set_all_seeds(43)
#set_all_seeds(44)
#set_all_seeds(45)
#######################################################
#######################################################
#######################################################
# Define custom Swish activation function
class Swish(nn.Module):
    """Element-wise activation used in place of ReLU in the experiments.

    The class keeps the name ``Swish`` because other code refers to it by
    that name, but it can evaluate any of the activation variants that were
    compared. The variant is selected with the ``variant`` constructor
    argument instead of by commenting lines of ``forward`` in and out. The
    default, ``"mishplus_up"``, is the formula that used to be hard-coded,
    so ``Swish()`` computes exactly what it did before.

    Variants (key -> formula):
        swish           x * sigmoid(x)
        eswish_up       1.25 * x * sigmoid(x)
        swishplus_up    x * (sigmoid(x) + 0.125 * exp(-0.5 * x^2))
        eswish_down     0.95 * x * sigmoid(x)
        swishplus_down  x * (sigmoid(x) - 0.025 * exp(-0.5 * x^2))
        mish            x * tanh(softplus(x))
        pmish_up        x * tanh(softplus(b * x) / b), b = 0.9454113159514
        mishplus_up     x * tanh(softplus(x)) + 0.025 * x * exp(-0.5 * x^2)
        pmish_down      x * tanh(softplus(b * x) / b), b = 1.34198859922
        mishplus_down   x * tanh(softplus(x)) - 0.125 * x * exp(-0.5 * x^2)
        relu            relu(x)

    Args:
        variant: Key of the formula to apply; see the list above.

    Raises:
        ValueError: If ``variant`` is not one of the known keys.
    """

    # Each entry keeps the exact expression (including operation order) of
    # the corresponding experiment so results stay numerically identical.
    VARIANTS = {
        "swish": lambda x: x * torch.sigmoid(x),
        "eswish_up": lambda x: 1.25 * x * torch.sigmoid(x),
        "swishplus_up": lambda x: x * (torch.sigmoid(x) + 0.125 * torch.exp(-0.5 * x ** 2)),
        "eswish_down": lambda x: 0.95 * x * torch.sigmoid(x),
        "swishplus_down": lambda x: x * (torch.sigmoid(x) - 0.025 * torch.exp(-0.5 * x ** 2)),
        "mish": lambda x: x * torch.tanh(F.softplus(x)),
        "pmish_up": lambda x: x * torch.tanh(F.softplus(0.9454113159514 * x) / 0.9454113159514),
        "mishplus_up": lambda x: x * torch.tanh(F.softplus(x)) + 0.025 * x * torch.exp(-0.5 * x ** 2),
        "pmish_down": lambda x: x * torch.tanh(F.softplus(1.34198859922 * x) / 1.34198859922),
        "mishplus_down": lambda x: x * torch.tanh(F.softplus(x)) - 0.125 * x * torch.exp(-0.5 * x ** 2),
        "relu": lambda x: torch.relu(x),
    }

    def __init__(self, variant="mishplus_up"):
        super().__init__()
        if variant not in self.VARIANTS:
            known = ", ".join(sorted(self.VARIANTS))
            raise ValueError(f"Unknown activation variant {variant!r}; expected one of: {known}")
        self.variant = variant

    def forward(self, x):
        """Apply the selected activation formula element-wise to ``x``."""
        return self.VARIANTS[self.variant](x)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Hyperparameters used further down by the data loaders, the classifier
# head, the optimizer and the learning-rate schedule.
num_epochs = 25
batch_size = 64
dropout_rate = 0.3
learning_rate = 0.01  # initial step size given to SGD
momentum = 0.9
weight_decay = 1e-4

# Input pipeline: turn each image into a tensor, then standardize it with a
# single-channel mean / std pair (presumably the KMNIST statistics).
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1918,), std=(0.3483,)),
    ]
)

# KMNIST train and test splits, stored under ./data (downloaded if missing).
train_dataset = datasets.KMNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
test_dataset = datasets.KMNIST(
    root='./data',
    train=False,
    download=True,
    transform=transform,
)

# Mini-batch loaders: only the training split is shuffled.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# 4. Load ShuffleNetV2 with randomly initialised weights.
# The builder's default is "no pretrained weights" both in older torchvision
# releases (pretrained=False) and in newer ones (weights=None), so the
# deprecated `pretrained` keyword is left out: same model, no deprecation
# warning, and no dependence on which of the two keywords a release accepts.
model = models.shufflenet_v2_x1_0()

# Modify the first conv layer to accept grayscale (single-channel) images;
# the remaining arguments are the values this script has always used.
model.conv1[0] = nn.Conv2d(1, 24, kernel_size=3, stride=2, padding=1, bias=False)

# Replace the classifier head with dropout followed by a linear layer with
# 10 outputs (KMNIST classes). The input width is read from the original
# head before it is overwritten.
model.fc = nn.Sequential(
    nn.Dropout(dropout_rate),
    nn.Linear(model.fc.in_features, 10)
)

# 5. Replace activation functions with Swish
def replace_activations(model, old_activation, new_activation):
    """Recursively replace every ``old_activation`` submodule of ``model``.

    The module tree is walked depth-first. Each child that is an instance
    of ``old_activation`` is replaced, under the same attribute name, by a
    fresh object returned from ``new_activation()``. ``model`` is modified
    in place; the root module itself is never replaced.

    Args:
        model: ``nn.Module`` whose descendants are rewritten in place.
        old_activation: Module class (or tuple of classes) to look for.
            Matching uses ``isinstance``, so subclasses are replaced too.
        new_activation: Zero-argument callable, typically a module class,
            invoked once per replacement so every site gets its own
            instance.

    Returns:
        None.
    """
    # Snapshot the children first: setattr below rewrites the module's child
    # registry, which should not happen while that registry is iterated.
    for name, module in list(model.named_children()):
        if isinstance(module, old_activation):
            setattr(model, name, new_activation())
            # The discarded module is no longer part of the tree, so there
            # is nothing left to search inside it.
            continue
        replace_activations(module, old_activation, new_activation)

# Swap every nn.ReLU in the network for the custom activation class.
replace_activations(model, nn.ReLU, Swish)

# Move the model to the selected device before the optimizer is built.
model = model.to(device)

# 6. Loss, optimizer and schedule: cross-entropy, SGD with momentum and
# weight decay, and cosine annealing with a period of num_epochs steps.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    model.parameters(),
    lr=learning_rate,
    momentum=momentum,
    weight_decay=weight_decay,
)
scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs)

# Training and evaluation functions
def train(model, loader, optimizer, criterion, device):
    """Run one training epoch and return its mean loss and accuracy.

    Args:
        model: Network to optimise; it is put into training mode.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimizer; gradients are zeroed and one step is taken
            per batch.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
            Its value is weighted by the batch size, i.e. it is assumed to
            be a per-batch mean (the ``nn.CrossEntropyLoss`` default).
        device: Device each batch is moved to before the forward pass.

    Returns:
        Tuple ``(train_loss, train_acc)``: the sample-weighted mean loss and
        the accuracy in percent, both over the samples seen this epoch.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    progress_bar = tqdm(loader, desc="Training")
    for inputs, targets in progress_bar:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so that a smaller final
        # batch does not skew the epoch average.
        running_loss += loss.item() * inputs.size(0)
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        # Live running averages over the samples processed so far.
        progress_bar.set_postfix({'loss': running_loss/total, 'acc': 100.*correct/total})

    # Normalise by the number of samples actually processed. This matches
    # len(loader.dataset) for a plain loader, but stays correct when the
    # loader skips samples (drop_last=True, subset samplers).
    train_loss = running_loss / total
    train_acc = 100. * correct / total
    return train_loss, train_acc

def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` and return mean loss and accuracy.

    The model is put into evaluation mode and the whole pass runs with
    gradient tracking disabled; no parameters are updated.

    Args:
        model: Network to evaluate.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, targets)``.
            Its value is weighted by the batch size, i.e. it is assumed to
            be a per-batch mean (the ``nn.CrossEntropyLoss`` default).
        device: Device each batch is moved to before the forward pass.

    Returns:
        Tuple ``(test_loss, test_acc)``: the sample-weighted mean loss and
        the accuracy in percent, both over the samples seen.

    Raises:
        ZeroDivisionError: If ``loader`` yields no samples.
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        progress_bar = tqdm(loader, desc="Testing")
        for inputs, targets in progress_bar:
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Weight the batch loss by the batch size so that a smaller
            # final batch does not skew the average.
            running_loss += loss.item() * inputs.size(0)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            # Live running averages over the samples processed so far.
            progress_bar.set_postfix({'loss': running_loss/total, 'acc': 100.*correct/total})

    # Normalise by the number of samples actually processed. This matches
    # len(loader.dataset) for a plain loader, but stays correct when the
    # loader skips samples (drop_last=True, subset samplers).
    test_loss = running_loss / total
    test_acc = 100. * correct / total
    return test_loss, test_acc

# 7. Training loop. The best value of each metric is tracked separately, so
# the four "best" numbers reported at the end may come from different epochs.
best_train_loss = best_test_loss = float('inf')
best_train_acc = best_test_acc = 0

print("Starting training...")
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")

    # One pass over the training split, then one over the test split.
    train_loss, train_acc = train(model, train_loader, optimizer, criterion, device)
    test_loss, test_acc = evaluate(model, test_loader, criterion, device)

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    # Per-epoch report.
    print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2f}%")
    print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.2f}%")

    # Keep the running optimum of every metric (lower loss, higher accuracy).
    best_train_loss = min(best_train_loss, train_loss)
    best_train_acc = max(best_train_acc, train_acc)
    best_test_loss = min(best_test_loss, test_loss)
    best_test_acc = max(best_test_acc, test_acc)

# 8. Final summary of the best values observed during the run.
print("\n" + "="*50)
print("Training completed!")
print(f"Best Train Loss: {best_train_loss:.4f}")
print(f"Best Train Accuracy: {best_train_acc:.2f}%")
print(f"Best Test Loss: {best_test_loss:.4f}")
print(f"Best Test Accuracy: {best_test_acc:.2f}%")
print("="*50)