# Experiment 1.4: Extended Training + LR Scheduler

**Based on Baseline Fruit Ripeness Classifier**

## Changes from Baseline:
- ✅ NUM_EPOCHS: 5 → 40 (extended training)
- ✅ Added Learning Rate Scheduler (ReduceLROnPlateau)
- ✅ Enhanced visualization and tracking

## How to Get Your Kaggle API Token:
1. Go to https://www.kaggle.com/
2. Click your profile picture (top right) → **Settings**
3. Scroll to **API** section  
4. Click **"Create New Token"**
5. Download `kaggle.json`
6. Upload it when prompted in Step 3 below

---

Quick-start notebook tuned for Google Colab GPU usage.

1. Install dependencies.
2. Upload your `kaggle.json` when prompted.
3. Download and prepare the dataset (handled below).
4. Run the training and evaluation cells.

You can also run this locally; skip the Kaggle upload cell if your credentials are already configured.

## 1. Install Dependencies
Run this cell once per Colab session to install PyTorch, Kaggle, and analysis libraries.


In [None]:
%%capture
!pip install -q kagglehub torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q numpy pandas scikit-learn matplotlib seaborn tqdm

## 2. Import Libraries and Configure the Runtime
Imports the packages needed for data handling, modelling, and plotting, then reports the active device.


In [None]:
import os
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import torch
from tqdm.auto import tqdm
from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import transforms
from torchvision.datasets import ImageFolder

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
print(f"Torch version: {torch.__version__}")

## 3. Configure Kaggle Credentials
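Upload your Kaggle API token so the dataset can be downloaded automatically. The first cell below stores the token at `~/.kaggle/kaggle.json` (prompting for an upload in Colab if it is missing); the second cell restricts the file's permissions with `chmod 600`, as required by Kaggle.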


In [None]:
# The `google.colab` package is only importable inside Colab; when the import
# fails we are on a local runtime and no upload widget is available.
try:
    from google.colab import files  # type: ignore
except ImportError:
    files = None
    IS_COLAB = False
else:
    IS_COLAB = True

# The token is stored at ~/.kaggle/kaggle.json.
kaggle_dir = Path.home() / ".kaggle"
kaggle_dir.mkdir(parents=True, exist_ok=True)
kaggle_json = kaggle_dir / "kaggle.json"

if kaggle_json.exists():
    print("kaggle.json already present; skipping upload.")
elif files is None:
    # No token on disk and no way to upload one interactively.
    raise FileNotFoundError(
        "kaggle.json not found. Upload it in Colab or place it in ~/.kaggle/"
    )
else:
    print("Upload your kaggle.json file (Account > Create New API Token).")
    uploaded = files.upload()
    if not uploaded:
        raise ValueError("No files uploaded.")
    if "kaggle.json" not in uploaded:
        # Fall back to the first uploaded file and store it under the expected name.
        filename, data = next(iter(uploaded.items()))
        print(f"Received '{filename}'. Renaming to 'kaggle.json'.")
    else:
        data = uploaded["kaggle.json"]
    kaggle_json.write_bytes(data)
    print("kaggle.json uploaded.")


In [None]:
# Restrict the token file to owner read/write only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

## 4. Download and Prepare the Dataset
Downloads the Kaggle dataset with `kagglehub`, extracts archives, and locates the train/test directories.


In [None]:
import shutil
import zipfile

import kagglehub

# Kaggle dataset identifier handed to kagglehub.dataset_download.
DATASET_SLUG = "leftin/fruit-ripeness-unripe-ripe-and-rotten"
# Local directory the downloaded files are copied into and extracted under.
TARGET_DIR = Path("data/fruit_ripeness_dataset")
FORCE_DOWNLOAD = False  # Set to True to refresh the dataset


def iter_files(path: Path):
    """Return a list of every regular file found recursively under ``path``."""
    found = []
    for entry in path.rglob("*"):
        if entry.is_file():
            found.append(entry)
    return found


def copy_contents(src: Path, dst: Path) -> None:
    """Copy every file under ``src`` into ``dst``, keeping the relative layout.

    Does nothing when ``src`` contains no files. Missing parent directories in
    ``dst`` are created, and each file is copied with ``shutil.copy2``.
    """
    source_files = iter_files(src)
    if source_files:
        for source_file in tqdm(source_files, desc="Copying dataset files", unit="file"):
            destination = dst / source_file.relative_to(src)
            destination.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(source_file, destination)


def extract_archives(path: Path) -> None:
    """Unpack every ``*.zip`` found under ``path`` and delete the archive afterwards.

    Each archive is extracted into a sibling directory named like the archive
    with its final suffix removed. The archive list is collected once up front,
    so zip files produced by an extraction are not processed in the same call.
    """
    archives = [*path.rglob("*.zip")]
    for archive in tqdm(archives, desc="Extracting archives", unit="zip"):
        destination = archive.with_suffix("")
        destination.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(archive, "r") as bundle:
            entries = tqdm(
                bundle.namelist(),
                desc=f"Extracting {archive.name}",
                leave=False,
                unit="file",
            )
            # Extract member by member so the inner progress bar advances per file.
            for entry in entries:
                bundle.extract(entry, destination)
        archive.unlink()


def find_split_dir(root: Path, name: str):
    """Locate the shallowest directory called ``name`` that holds nested content.

    Directories matching ``name`` anywhere under ``root`` are tried from the
    shortest path to the longest. The first one with at least one entry two
    levels down (``<dir>/*/*``) is returned; ``None`` when no directory qualifies.
    """
    matching_dirs = [entry for entry in root.rglob(name) if entry.is_dir()]
    matching_dirs.sort(key=lambda entry: len(entry.parts))
    populated = (entry for entry in matching_dirs if any(entry.glob("*/*")))
    return next(populated, None)


# Download only when the target directory is missing or a refresh is forced.
if FORCE_DOWNLOAD or not TARGET_DIR.exists():
    if TARGET_DIR.exists():
        # Forced refresh: remove the previous copy before downloading again.
        shutil.rmtree(TARGET_DIR)
    TARGET_DIR.mkdir(parents=True, exist_ok=True)
    print(f"Downloading {DATASET_SLUG} with kagglehub ...")
    downloaded_path = Path(kagglehub.dataset_download(DATASET_SLUG)).resolve()
    print(f"Download complete: {downloaded_path}")
    # Copy the downloaded files into the project directory, then unpack any zips.
    copy_contents(downloaded_path, TARGET_DIR)
    extract_archives(TARGET_DIR)
    print(f"Dataset extracted to {TARGET_DIR.resolve()}")
else:
    print(f"Dataset already present at {TARGET_DIR.resolve()}\nSet FORCE_DOWNLOAD=True to re-download.")

# Both split directories are required; fail early with a clear message otherwise.
TRAIN_DIR = find_split_dir(TARGET_DIR, "train")
TEST_DIR = find_split_dir(TARGET_DIR, "test")

if TRAIN_DIR is None:
    raise RuntimeError(
        f"Could not locate a 'train' directory inside {TARGET_DIR.resolve()}"
    )
print(f"Using train directory: {TRAIN_DIR}")

if TEST_DIR is None:
    raise RuntimeError(
        f"Could not locate a 'test' directory inside {TARGET_DIR.resolve()}"
    )
print(f"Using test directory: {TEST_DIR}")


## 5. Set Hyperparameters

**🔬 EXPERIMENT 1.4 CONFIGURATION**

Changes from baseline:
- NUM_EPOCHS: 5 → **40**
- Added **Learning Rate Scheduler** (ReduceLROnPlateau)

Central place to adjust random seed, batch size, image size, and training duration.


In [None]:
# ============================================================
# EXPERIMENT 1.4 CONFIGURATION
# ============================================================
# Label used in the printed experiment banner and as the figure title.
EXPERIMENT_NAME = "Exp_1.4_Extended_Training_LR_Scheduler"

SEED = 42  # Passed to set_seed() and to the train/val split generator
BATCH_SIZE = 32  # Batch size of the train, validation and test loaders
NUM_EPOCHS = 40  # 🔬 CHANGED: Was 5 → now 40
VAL_SPLIT = 0.15  # Fraction of the train folder held out for validation
IMAGE_SIZE = 224  # Images are resized to IMAGE_SIZE x IMAGE_SIZE
LEARNING_RATE = 1e-3  # 0.001, initial learning rate of the Adam optimizer

# 🔬 NEW: Learning Rate Scheduler Configuration (ReduceLROnPlateau)
USE_LR_SCHEDULER = True  # Set to False to train with a constant learning rate
LR_SCHEDULER_PATIENCE = 3  # Reduce LR after 3 epochs of no improvement
LR_SCHEDULER_FACTOR = 0.5  # Reduce LR by half
LR_SCHEDULER_MIN_LR = 1e-6  # Minimum learning rate


def set_seed(seed: int) -> None:
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also switches cuDNN to deterministic mode and turns off its benchmark
    auto-tuner.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Seed all random generators once, before the datasets and the model are built.
set_seed(SEED)

## 6. Build Datasets and DataLoaders
Creates a seeded random training/validation split of the train folder (not stratified by class), loads the separate test folder, applies transforms, and prepares PyTorch data loaders.


In [None]:
# Load the labelled training folder without transforms; each split gets its
# own transform later through SubsetWithTransform.
full_dataset = ImageFolder(TRAIN_DIR, transform=None)
total_images = len(full_dataset)
if total_images == 0:
    raise RuntimeError("ImageFolder found no images in the training directory.")

class_names = full_dataset.classes
print(f"Detected classes: {class_names}")

# Hold out a VAL_SPLIT fraction (at least one image) for validation.
val_size = max(1, int(total_images * VAL_SPLIT))
remaining_for_train = total_images - val_size

# A dedicated seeded generator makes the random split reproducible.
generator = torch.Generator().manual_seed(SEED)
split_lengths = [remaining_for_train, val_size]
train_subset, val_subset = random_split(full_dataset, split_lengths, generator=generator)

# The test folder must expose exactly the same class list as the train folder.
raw_test_dataset = ImageFolder(TEST_DIR, transform=None)
if class_names != raw_test_dataset.classes:
    raise RuntimeError("Class labels differ between train and test directories.")
test_indices = range(len(raw_test_dataset))


class SubsetWithTransform(Dataset):
    """View over selected samples of a dataset with its own transform.

    Lets several splits share one underlying dataset while each split applies
    different preprocessing to the samples it returns.
    """

    def __init__(self, dataset: ImageFolder, indices, transform):
        self.dataset = dataset
        # Materialise the indices so ranges and other iterables can be indexed.
        self.indices = [*indices]
        self.transform = transform

    def __len__(self):
        return len(self.indices)

    def __getitem__(self, idx):
        source_index = self.indices[idx]
        image, label = self.dataset[source_index]
        # A falsy transform (e.g. None) leaves the image untouched.
        processed = self.transform(image) if self.transform else image
        return processed, label


# Per-channel statistics passed to Normalize in both pipelines
# (NOTE(review): these match the widely used ImageNet values; confirm intent).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]
_TARGET_SIZE = (IMAGE_SIZE, IMAGE_SIZE)

# Training pipeline: resize, light augmentation (flip and rotation up to 10
# degrees), tensor conversion, then normalisation.
train_transforms = transforms.Compose(
    [
        transforms.Resize(_TARGET_SIZE),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

# Evaluation pipeline: the same preprocessing without the random augmentation.
eval_transforms = transforms.Compose(
    [
        transforms.Resize(_TARGET_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

# Attach the matching transform pipeline to each split.
train_dataset = SubsetWithTransform(full_dataset, train_subset.indices, train_transforms)
val_dataset = SubsetWithTransform(full_dataset, val_subset.indices, eval_transforms)
test_dataset = SubsetWithTransform(raw_test_dataset, test_indices, eval_transforms)

# No worker processes on Colab; elsewhere use at most two while leaving one
# CPU core free (zero workers on a single-core machine).
cpu_total = os.cpu_count() or 1
num_workers = 0 if IS_COLAB else min(2, max(cpu_total - 1, 0))
pin_memory = torch.cuda.is_available()

# Options shared by all loaders; only the training loader shuffles.
loader_options = dict(batch_size=BATCH_SIZE, num_workers=num_workers, pin_memory=pin_memory)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

print(
    f"Data sizes -> train: {len(train_dataset)}, val: {len(val_dataset)}, test: {len(test_dataset)}"
)



## 7. Define the Baseline CNN
A lightweight convolutional network used as the initial benchmark model.


In [None]:
class SimpleCNN(nn.Module):
    """Small three-stage convolutional classifier for square RGB images.

    Each stage is Conv(3x3, padding 1) -> BatchNorm -> ReLU -> MaxPool(2), so
    the spatial resolution is halved three times before the fully connected
    head produces the class logits.

    Args:
        num_classes: Number of output logits.
        image_size: Side length of the square input images. When omitted, the
            notebook-level ``IMAGE_SIZE`` is used, which is what this class
            always did before the parameter existed.

    Raises:
        ValueError: If ``image_size`` is smaller than 8, because three 2x2
            poolings would leave no spatial positions for the classifier.
    """

    def __init__(self, num_classes: int, image_size=None):
        super().__init__()
        if image_size is None:
            image_size = IMAGE_SIZE
        # Three 2x2 max-pools shrink each spatial dimension by a factor of 8.
        reduced = image_size // 8
        if reduced < 1:
            raise ValueError(f"image_size must be at least 8, got {image_size}")

        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
            nn.Dropout(0.25),
        )
        self.classifier = nn.Sequential(
            nn.Flatten(),
            # 128 channels times the pooled height and width.
            nn.Linear(128 * reduced * reduced, 256),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return class logits for a batch of images ``x``."""
        x = self.features(x)
        x = self.classifier(x)
        return x


# Baseline setup: model, loss and optimizer.
# NOTE: the following cell builds `model`, `criterion` and `optimizer` again
# (and adds the LR scheduler), so the objects created here are replaced before
# training starts.
model = SimpleCNN(num_classes=len(class_names)).to(device)
criterion = nn.CrossEntropyLoss()
# Use the configured LEARNING_RATE instead of a hard-coded literal so this cell
# cannot drift from the experiment configuration.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

total_params = sum(p.numel() for p in model.parameters())
print(f"Model parameters: {total_params / 1e6:.2f}M")

In [None]:
# Build the model, loss and optimizer used for this experiment.
model = SimpleCNN(num_classes=len(class_names)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# 🔬 NEW: Add Learning Rate Scheduler
if USE_LR_SCHEDULER:
    # mode='min' because the scheduler is stepped with the validation loss.
    # `verbose` is deliberately not passed: it is deprecated in recent PyTorch
    # releases, and the training loop already prints every LR change itself.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=LR_SCHEDULER_FACTOR,
        patience=LR_SCHEDULER_PATIENCE,
        min_lr=LR_SCHEDULER_MIN_LR,
    )
    print("✅ Learning Rate Scheduler: ReduceLROnPlateau")
    print(f"   Patience: {LR_SCHEDULER_PATIENCE} epochs")
    print(f"   Factor: {LR_SCHEDULER_FACTOR}x")
    print(f"   Min LR: {LR_SCHEDULER_MIN_LR}")
else:
    scheduler = None

# Report the model size and the experiment configuration.
total_params = sum(p.numel() for p in model.parameters())
print(f"\nModel: SimpleCNN")
print(f"Parameters: {total_params / 1e6:.2f}M")
print(f"\n{'='*70}")
print(f"EXPERIMENT: {EXPERIMENT_NAME}")
print(f"{'='*70}")
print(f"Epochs:        {NUM_EPOCHS}")
print(f"Batch Size:    {BATCH_SIZE}")
print(f"Learning Rate: {LEARNING_RATE}")
print(f"LR Scheduler:  {'Yes' if USE_LR_SCHEDULER else 'No'}")
print(f"{'='*70}\n")

## 8. Define Training Utilities
Helper functions for training and evaluation, including tqdm progress bars for batch-level insight.


In [None]:
def train_one_epoch(model, loader, optimizer, criterion, device, epoch=None, total_epochs=None):
    """Run one optimisation pass over ``loader`` and return ``(loss, accuracy)``.

    Both values are averaged over all samples seen in the epoch. ``epoch`` and
    ``total_epochs`` only influence the progress-bar label.
    """
    model.train()

    if epoch is not None and total_epochs is not None:
        label = f"Train {epoch:02d}/{total_epochs:02d}"
    else:
        label = "Training"
    batches = tqdm(loader, desc=label, leave=False, unit="batch")

    loss_sum = 0.0
    correct = 0
    seen = 0
    for images, labels in batches:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad(set_to_none=True)
        logits = model(images)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        batch_count = images.size(0)
        # Weight the batch loss by its size so the epoch figure is per sample
        # (assumes the criterion returns a batch mean - TODO confirm).
        loss_sum += loss.item() * batch_count
        predictions = logits.argmax(dim=1)
        correct += (predictions == labels).sum().item()
        seen += batch_count

        if seen:
            batches.set_postfix(loss=loss_sum / seen, acc=correct / seen)

    return loss_sum / seen, correct / seen


@torch.no_grad()
def evaluate(model, loader, criterion, device, split="Eval", epoch=None, total_epochs=None):
    """Score ``model`` on ``loader`` without gradients and return ``(loss, accuracy)``.

    Both values are averaged over all samples in the loader. ``split``,
    ``epoch`` and ``total_epochs`` only influence the progress-bar label.
    """
    model.eval()

    label = split
    if epoch is not None and total_epochs is not None:
        label = f"{split} {epoch:02d}/{total_epochs:02d}"
    batches = tqdm(loader, desc=label, leave=False, unit="batch")

    loss_sum = 0.0
    correct = 0
    seen = 0
    for images, labels in batches:
        images = images.to(device)
        labels = labels.to(device)
        logits = model(images)
        loss = criterion(logits, labels)

        batch_count = images.size(0)
        # Weight the batch loss by its size so the final figure is per sample
        # (assumes the criterion returns a batch mean - TODO confirm).
        loss_sum += loss.item() * batch_count
        predictions = logits.argmax(dim=1)
        correct += (predictions == labels).sum().item()
        seen += batch_count

        if seen:
            batches.set_postfix(loss=loss_sum / seen, acc=correct / seen)

    return loss_sum / seen, correct / seen


## 9. Run the Training Loop

🔬 **Modified**: Added LR scheduler step and enhanced tracking.

Executes the epoch loop with tqdm progress bars to monitor loss and accuracy updates in real time.


In [None]:
# Per-epoch metrics; "learning_rates" holds the LR in effect at the start of each epoch.
history = {
    name: []
    for name in ("train_loss", "train_acc", "val_loss", "val_acc", "learning_rates")
}
best_val_acc, best_epoch = 0.0, 0

for epoch in range(1, NUM_EPOCHS + 1):
    # Record the learning rate before this epoch's scheduler step can change it.
    current_lr = optimizer.param_groups[0]['lr']
    history["learning_rates"].append(current_lr)

    train_loss, train_acc = train_one_epoch(
        model, train_loader, optimizer, criterion, device, epoch=epoch, total_epochs=NUM_EPOCHS
    )
    val_loss, val_acc = evaluate(
        model, val_loader, criterion, device, split="Validation", epoch=epoch, total_epochs=NUM_EPOCHS
    )

    epoch_metrics = {
        "train_loss": train_loss,
        "train_acc": train_acc,
        "val_loss": val_loss,
        "val_acc": val_acc,
    }
    for metric_name, metric_value in epoch_metrics.items():
        history[metric_name].append(metric_value)

    # Remember the epoch with the highest validation accuracy (first one on ties).
    if val_acc > best_val_acc:
        best_val_acc, best_epoch = val_acc, epoch

    # 🔬 NEW: let the plateau scheduler react to the validation loss.
    if USE_LR_SCHEDULER and scheduler is not None:
        old_lr = current_lr
        scheduler.step(val_loss)
        new_lr = optimizer.param_groups[0]['lr']
        if new_lr != old_lr:
            print(f"   📉 LR Reduced: {old_lr:.2e} → {new_lr:.2e}")

    # The star marks an epoch that set a new best validation accuracy.
    best_marker = " ⭐" if epoch == best_epoch else ""
    print(
        f"Epoch {epoch:02d}/{NUM_EPOCHS} | "
        f"train_loss: {train_loss:.4f}, train_acc: {train_acc:.3f} | "
        f"val_loss: {val_loss:.4f}, val_acc: {val_acc:.3f}"
        f"{best_marker}"
    )

print(f"\nTraining complete! Best val acc: {best_val_acc:.4f} at epoch {best_epoch}")


## 10. Visualise Training Curves

🔬 **Enhanced**: Added learning rate schedule and train/val gap plots.

Plots loss and accuracy so you can inspect learning behaviour.


In [None]:
epochs_range = range(1, len(history["train_loss"]) + 1)

# 2x2 grid: loss, accuracy, train/val gap, learning-rate schedule.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
loss_ax, acc_ax, gap_ax, lr_ax = axes.flat

# Loss and accuracy panels share the same layout, so draw them in one loop.
curve_panels = (
    (loss_ax, "loss", "Loss", "Cross-entropy"),
    (acc_ax, "acc", "Accuracy", "Accuracy"),
)
for ax, metric, title, ylabel in curve_panels:
    ax.plot(epochs_range, history[f"train_{metric}"], 'b-', label="Train", linewidth=2)
    ax.plot(epochs_range, history[f"val_{metric}"], 'r-', label="Validation", linewidth=2)
    ax.axvline(x=best_epoch, color='g', linestyle='--', alpha=0.5, label=f'Best: Epoch {best_epoch}')
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel("Epoch")
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.grid(alpha=0.3)

# Train/Val gap (overfitting indicator): train accuracy minus validation accuracy.
acc_gap = np.array(history["train_acc"]) - np.array(history["val_acc"])
gap_ax.plot(epochs_range, acc_gap, 'purple', linewidth=2)
gap_ax.axhline(y=0, color='k', linestyle='-', alpha=0.3)
gap_ax.fill_between(epochs_range, acc_gap, 0, alpha=0.3, color='purple')
gap_ax.set_title("Train/Val Gap (Overfitting Indicator)", fontweight='bold')
gap_ax.set_xlabel("Epoch")
gap_ax.set_ylabel("Gap")
gap_ax.grid(alpha=0.3)
gap_ax.text(
    0.5,
    0.95,
    f'Final: {acc_gap[-1]:.3f}',
    transform=gap_ax.transAxes,
    ha='center',
    va='top',
    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
)

# Learning rate schedule on a log axis so successive halvings stay visible.
lr_ax.plot(epochs_range, history["learning_rates"], 'orange', linewidth=2, marker='o', markersize=3)
lr_ax.set_title("Learning Rate Schedule", fontweight='bold')
lr_ax.set_xlabel("Epoch")
lr_ax.set_ylabel("Learning Rate")
lr_ax.set_yscale('log')
lr_ax.grid(alpha=0.3, which='both')

fig.suptitle(EXPERIMENT_NAME, fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Print summary of the recorded history.
summary_rule = '=' * 70
val_acc_history = history['val_acc']
final_train_acc = history['train_acc'][-1]
final_val_acc = val_acc_history[-1]
lr_history = history['learning_rates']

print(f"\n{summary_rule}")
print("SUMMARY")
print(f"{summary_rule}")
print(f"Best Val Accuracy:  {max(val_acc_history):.4f} at Epoch {np.argmax(val_acc_history)+1}")
print(f"Final Train Acc:    {final_train_acc:.4f}")
print(f"Final Val Acc:      {final_val_acc:.4f}")
print(f"Train/Val Gap:      {final_train_acc - final_val_acc:.4f}")
print(f"Final LR:           {lr_history[-1]:.2e}")
# Distinct start-of-epoch learning rates minus the initial one.
print(f"LR Reductions:      {len(set(lr_history))-1}")
print(f"{summary_rule}\n")

## 11. Evaluate on the Test Set
Reports final performance using the held-out split.


In [None]:
# Score the model as it stands after the last training epoch. The training loop
# above records the best validation epoch but does not restore its weights.
test_loss, test_acc = evaluate(model, test_loader, criterion, device, split="Test")
print(f"Test loss: {test_loss:.4f}")
print(f"Test accuracy: {test_acc:.3f}")


## 12. (Optional) Save the Trained Model

Toggle this to persist the model weights for reuse or submission.


In [None]:
SAVE_MODEL = False  # Switch to True to persist the trained weights

if SAVE_MODEL:
    checkpoint_dir = Path("checkpoints")
    checkpoint_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): the file name still says "baseline"; confirm this
    # experiment is meant to write to the same checkpoint path.
    checkpoint_path = checkpoint_dir / "baseline_cnn.pt"
    # Keep the class list and input size next to the weights.
    checkpoint_payload = dict(
        model_state_dict=model.state_dict(),
        class_names=class_names,
        image_size=IMAGE_SIZE,
    )
    torch.save(checkpoint_payload, checkpoint_path)
    print(f"Saved checkpoint to {checkpoint_path.resolve()}")
    print(f"Saved checkpoint to {checkpoint_path.resolve()}")