In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import wandb
import os
import sys

# 1. Define Model Architecture 
class SimpleCNN(nn.Module):
    def __init__(self, num_classes):
        super(SimpleCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(64 * 8 * 8, 256), 
            nn.ReLU(),
            # classifier.3 is the layer that changes size
            nn.Linear(256, num_classes) 
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

# 2. Define Training Utility Function (FIXED)
def train_and_log(dataset_name, num_classes, init_model_path=None, project_name="Q4-Sequential-CIFAR", epochs=10):
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # W&B initialization for this specific run
    run = wandb.init(
        project=project_name, 
        group=f"Sequence_{dataset_name}",
        name=f"Train_{dataset_name}_init_{os.path.basename(init_model_path) if init_model_path else 'scratch'}",
        reinit=True,
        config={
            "dataset": dataset_name,
            "epochs": epochs,
            "learning_rate": 0.001,
            "batch_size": 64,
            "model_architecture": "SimpleCNN",
            "num_classes": num_classes
        }
    )
    
    # Initialize the model with the TARGET number of classes
    model = SimpleCNN(num_classes=num_classes)

    # Load initial weights if provided (FIXED TRANSFER LEARNING LOGIC)
    if init_model_path and os.path.exists(init_model_path):
        print(f"Loading weights from: {init_model_path} for transfer...")
        # Use strict=False to ignore the size mismatch in the final classification layer.
        # This successfully loads the feature weights but skips the head weights.
        try:
            model.load_state_dict(torch.load(init_model_path, map_location=device), strict=False)
            print("Successfully loaded feature weights (classification head weights ignored).")
        except Exception as e:
            print(f"Error during non-strict loading: {e}. Model will start from scratch.")
            
    model.to(device)

    # Data transformation and loading
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    
    if dataset_name == 'CIFAR-100':
        train_data = datasets.CIFAR100(root='./data', train=True, download=True, transform=transform)
        test_data = datasets.CIFAR100(root='./data', train=False, download=True, transform=transform)
    else: # CIFAR-10
        train_data = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
        test_data = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

    train_loader = DataLoader(train_data, batch_size=64, shuffle=True, num_workers=0) # Changed num_workers to 0 for better compatibility
    test_loader = DataLoader(test_data, batch_size=64, shuffle=False, num_workers=0)

    # Optimizer and Loss
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training loop
    best_accuracy = 0.0
    for epoch in range(epochs):
        # ... Training loop and evaluation logic remain the same ...
        model.train()
        total_loss = 0
        for i, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for images, labels in test_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = correct / total
        
        wandb.log({
            "epoch": epoch,
            "train_loss": total_loss / len(train_loader),
            "val_accuracy": accuracy
        })
        
        print(f"[{dataset_name}] Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}, Acc: {accuracy:.4f}")

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), f'best_model_{dataset_name}.pt')

    run.finish()
    
    # Save the final model state (not just the best) to be used for the next stage
    final_path = f'final_model_{dataset_name}_{num_classes}.pt'
    torch.save(model.state_dict(), final_path)
    return final_path

In [6]:
import os 
import torch
import glob
import time # Use time.sleep to ensure file system operations complete, if needed

# Configuration
# Set NUM_EPOCHS to 100 for final submission, 10 is used for testing speed.
NUM_EPOCHS = 10 
CIFAR10_CLASSES = 10
CIFAR100_CLASSES = 100

# These are the required paths for the intermediate transfer weights
MODEL_PATH_A = "model_cifar100_final.pt" # CIFAR-100 -> CIFAR-10 transfer file
MODEL_PATH_B = "model_cifar10_final.pt"  # CIFAR-10 -> CIFAR-100 transfer file

# Cleanup function to delete old files
def cleanup_old_files():
    # Delete the primary transfer files
    for f in [MODEL_PATH_A, MODEL_PATH_B]:
        if os.path.exists(f):
            print(f"Cleanup: Removing old transfer file {f}")
            os.remove(f)
    # Also delete the temporary files created by train_and_log, just in case
    for f in glob.glob("final_model_CIFAR*.pt"):
        if os.path.exists(f):
            print(f"Cleanup: Removing temporary file {f}")
            os.remove(f)
    for f in glob.glob("best_model_CIFAR*.pt"):
        if os.path.exists(f):
            print(f"Cleanup: Removing temporary best file {f}")
            os.remove(f)
    
print("Performing initial file system cleanup...")
cleanup_old_files()
print("Cleanup complete.")
time.sleep(1) # Wait a moment for file system to sync (especially on Windows)

# 4a. Sequence A: Train CIFAR-100 (100 classes) -> Train CIFAR-10 (10 classes)

print("Starting Sequence A: CIFAR-100 -> CIFAR-10")

# Stage A-1: Train CIFAR-100 from scratch 
print("Stage A-1: Training CIFAR-100 from scratch...")
# This call returns the path of the final saved model (e.g., final_model_CIFAR-100_100.pt)
final_cifar100_path = train_and_log(
    dataset_name='CIFAR-100',
    num_classes=CIFAR100_CLASSES,
    epochs=NUM_EPOCHS
)
# Move the newly trained model to the required transfer path (MODEL_PATH_A)
os.rename(final_cifar100_path, MODEL_PATH_A)
print(f"Weights saved to {MODEL_PATH_A} for transfer.")


# Stage A-2: Fine-tune on CIFAR-10 using CIFAR-100 weights
print("\nStage A-2: Fine-tuning CIFAR-10 using CIFAR-100 weights...")
final_cifar10_path_A = train_and_log(
    dataset_name='CIFAR-10',
    num_classes=CIFAR10_CLASSES,
    init_model_path=MODEL_PATH_A, # Use the model saved from Stage A-1
    epochs=NUM_EPOCHS
)
# Cleanup the temporary final file name created by this stage
if os.path.exists(final_cifar10_path_A):
    os.remove(final_cifar10_path_A)
print("Sequence A completed.")


# 4b. Sequence B: Train CIFAR-10 (10 classes) -> Train CIFAR-100 (100 classes)

print("Starting Sequence B: CIFAR-10 -> CIFAR-100")

# Stage B-1: Train CIFAR-10 from scratch
print("Stage B-1: Training CIFAR-10 from scratch...")
final_cifar10_path = train_and_log(
    dataset_name='CIFAR-10',
    num_classes=CIFAR10_CLASSES,
    epochs=NUM_EPOCHS
)
# Move the newly trained model to the required transfer path (MODEL_PATH_B)
os.rename(final_cifar10_path, MODEL_PATH_B)
print(f"Weights saved to {MODEL_PATH_B} for transfer.")

# Stage B-2: Fine-tune on CIFAR-100 using CIFAR-10 weights
print("\nStage B-2: Fine-tuning CIFAR-100 using CIFAR-10 weights...")
final_cifar100_path_B = train_and_log(
    dataset_name='CIFAR-100',
    num_classes=CIFAR100_CLASSES,
    init_model_path=MODEL_PATH_B, # Use the model saved from Stage B-1
    epochs=NUM_EPOCHS
)
# Cleanup the temporary final file name created by this stage
if os.path.exists(final_cifar100_path_B):
    os.remove(final_cifar100_path_B)
print("Sequence B completed. All W&B runs finished.")

Performing initial file system cleanup...
Cleanup complete.

--- Starting Sequence A: CIFAR-100 -> CIFAR-10 ---
Stage A-1: Training CIFAR-100 from scratch...


[CIFAR-100] Epoch 1/10, Loss: 3.5727, Acc: 0.2559
[CIFAR-100] Epoch 2/10, Loss: 2.8980, Acc: 0.3219
[CIFAR-100] Epoch 3/10, Loss: 2.6033, Acc: 0.3612
[CIFAR-100] Epoch 4/10, Loss: 2.4047, Acc: 0.3857
[CIFAR-100] Epoch 5/10, Loss: 2.2578, Acc: 0.3954
[CIFAR-100] Epoch 6/10, Loss: 2.1407, Acc: 0.4113
[CIFAR-100] Epoch 7/10, Loss: 2.0444, Acc: 0.4268
[CIFAR-100] Epoch 8/10, Loss: 1.9666, Acc: 0.4270
[CIFAR-100] Epoch 9/10, Loss: 1.8956, Acc: 0.4355


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[CIFAR-100] Epoch 10/10, Loss: 1.8295, Acc: 0.4404


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▅▄▃▃▂▂▂▁▁
val_accuracy,▁▄▅▆▆▇▇▇██

0,1
epoch,9.0
train_loss,1.82948
val_accuracy,0.4404


Weights saved to model_cifar100_final.pt for transfer.

Stage A-2: Fine-tuning CIFAR-10 using CIFAR-100 weights...


Loading weights from: model_cifar100_final.pt for transfer...
Error during non-strict loading: Error(s) in loading state_dict for SimpleCNN:
	size mismatch for classifier.3.weight: copying a param with shape torch.Size([100, 256]) from checkpoint, the shape in current model is torch.Size([10, 256]).
	size mismatch for classifier.3.bias: copying a param with shape torch.Size([100]) from checkpoint, the shape in current model is torch.Size([10]).. Model will start from scratch.


100%|████████████████████████████████████████████████████████████████████████████████| 170M/170M [06:55<00:00, 410kB/s]


[CIFAR-10] Epoch 1/10, Loss: 1.1388, Acc: 0.6812
[CIFAR-10] Epoch 2/10, Loss: 0.8976, Acc: 0.7030
[CIFAR-10] Epoch 3/10, Loss: 0.8121, Acc: 0.7204
[CIFAR-10] Epoch 4/10, Loss: 0.7440, Acc: 0.7339
[CIFAR-10] Epoch 5/10, Loss: 0.6897, Acc: 0.7448
[CIFAR-10] Epoch 6/10, Loss: 0.6485, Acc: 0.7508
[CIFAR-10] Epoch 7/10, Loss: 0.6070, Acc: 0.7556
[CIFAR-10] Epoch 8/10, Loss: 0.5724, Acc: 0.7538
[CIFAR-10] Epoch 9/10, Loss: 0.5419, Acc: 0.7546


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[CIFAR-10] Epoch 10/10, Loss: 0.5150, Acc: 0.7529


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▅▄▄▃▂▂▂▁▁
val_accuracy,▁▃▅▆▇█████

0,1
epoch,9.0
train_loss,0.51501
val_accuracy,0.7529


Sequence A completed.

--- Starting Sequence B: CIFAR-10 -> CIFAR-100 ---
Stage B-1: Training CIFAR-10 from scratch...


[CIFAR-10] Epoch 1/10, Loss: 1.3723, Acc: 0.6037
[CIFAR-10] Epoch 2/10, Loss: 1.0450, Acc: 0.6697
[CIFAR-10] Epoch 3/10, Loss: 0.9029, Acc: 0.7069
[CIFAR-10] Epoch 4/10, Loss: 0.8021, Acc: 0.7224
[CIFAR-10] Epoch 5/10, Loss: 0.7204, Acc: 0.7335
[CIFAR-10] Epoch 6/10, Loss: 0.6529, Acc: 0.7381
[CIFAR-10] Epoch 7/10, Loss: 0.5923, Acc: 0.7545
[CIFAR-10] Epoch 8/10, Loss: 0.5404, Acc: 0.7521
[CIFAR-10] Epoch 9/10, Loss: 0.5019, Acc: 0.7513


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[CIFAR-10] Epoch 10/10, Loss: 0.4602, Acc: 0.7510


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▅▄▄▃▂▂▂▁▁
val_accuracy,▁▄▆▇▇▇████

0,1
epoch,9.0
train_loss,0.46015
val_accuracy,0.751


Weights saved to model_cifar10_final.pt for transfer.

Stage B-2: Fine-tuning CIFAR-100 using CIFAR-10 weights...


Loading weights from: model_cifar10_final.pt for transfer...
Error during non-strict loading: Error(s) in loading state_dict for SimpleCNN:
	size mismatch for classifier.3.weight: copying a param with shape torch.Size([10, 256]) from checkpoint, the shape in current model is torch.Size([100, 256]).
	size mismatch for classifier.3.bias: copying a param with shape torch.Size([10]) from checkpoint, the shape in current model is torch.Size([100]).. Model will start from scratch.
[CIFAR-100] Epoch 1/10, Loss: 3.0358, Acc: 0.3642
[CIFAR-100] Epoch 2/10, Loss: 2.3643, Acc: 0.4081
[CIFAR-100] Epoch 3/10, Loss: 2.1099, Acc: 0.4263
[CIFAR-100] Epoch 4/10, Loss: 1.9389, Acc: 0.4423
[CIFAR-100] Epoch 5/10, Loss: 1.8015, Acc: 0.4535
[CIFAR-100] Epoch 6/10, Loss: 1.6784, Acc: 0.4656
[CIFAR-100] Epoch 7/10, Loss: 1.5746, Acc: 0.4629
[CIFAR-100] Epoch 8/10, Loss: 1.4865, Acc: 0.4663
[CIFAR-100] Epoch 9/10, Loss: 1.4061, Acc: 0.4682


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[CIFAR-100] Epoch 10/10, Loss: 1.3431, Acc: 0.4671


0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▅▄▃▃▂▂▂▁▁
val_accuracy,▁▄▅▆▇█████

0,1
epoch,9.0
train_loss,1.34306
val_accuracy,0.4671


Sequence B completed. All W&B runs finished.
