In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [70]:
import os
import shutil
import random

# Paths
input_base = '/kaggle/input/cloud-types/clouds'
working_base = '/kaggle/working/datasets_folder'

# List of cloud categories
categories = [
    'altocumulus', 'altostratus', 'cirrocumulus', 'cirrostratus', 'cirrus',
    'contrails', 'cumulonimbus', 'cumulus', 'lenticular', 'mammatus',
    'nimbostratus', 'stratocumulus', 'stratus'
]

# Ratios for dataset split
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# Step 1: Copy original data to writable working directory
for category in categories:
    src_dir = os.path.join(input_base, category)
    dst_dir = os.path.join(working_base, category)
    os.makedirs(dst_dir, exist_ok=True)

    for filename in os.listdir(src_dir):
        src_file = os.path.join(src_dir, filename)
        dst_file = os.path.join(dst_dir, filename)
        try:
            if os.path.getsize(src_file) > 0:  # Skip empty files
                shutil.copy2(src_file, dst_file)
        except:
            continue  # Skip corrupt/unreadable files

# Step 2: Create train/validation/test folders
for split in ['train', 'val', 'test']:
    for category in categories:
        os.makedirs(os.path.join(working_base, split, category), exist_ok=True)

# Step 3: Split and copy files
for category in categories:
    src_dir = os.path.join(working_base, category)
    all_files = [f for f in os.listdir(src_dir) if os.path.getsize(os.path.join(src_dir, f)) > 0]
    random.shuffle(all_files)

    total_files = len(all_files)
    train_end = int(total_files * train_ratio)
    val_end = train_end + int(total_files * val_ratio)

    train_files = all_files[:train_end]
    val_files = all_files[train_end:val_end]
    test_files = all_files[val_end:]

    # Copy files to respective folders
    for f in train_files:
        shutil.copy2(os.path.join(src_dir, f), os.path.join(working_base, 'train', category, f))
    for f in val_files:
        shutil.copy2(os.path.join(src_dir, f), os.path.join(working_base, 'val', category, f))
    for f in test_files:
        shutil.copy2(os.path.join(src_dir, f), os.path.join(working_base, 'test', category, f))

print("Data split complete. Check /kaggle/working/datasets_folder/")


Data split complete. Check /kaggle/working/datasets_folder/


In [71]:
import os
import time
import io
import csv
import json
import subprocess
from pathlib import Path
from typing import List, Dict, Tuple

import numpy as np
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms, datasets, models
from torch.utils.tensorboard import SummaryWriter
from torch.profiler import profile, ProfilerActivity, record_function, tensorboard_trace_handler


In [72]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [73]:
try:
    import psutil
except Exception:
    psutil = None

In [74]:
DATA_DIR = "/kaggle/working/datasets_folder"   
TRAIN_SUBDIR = "train"
VAL_SUBDIR = "val"   
OUTPUT_DIR = "results"
LOG_DIR = "logs/tensorboard"
PROFILE_DIR = os.path.join(OUTPUT_DIR, "profiles")
MODEL_DIR = os.path.join(OUTPUT_DIR, "models")

BATCH_SIZES = [1, 4, 8, 16, 32]
IMG_SIZE = 224
NUM_EPOCHS = 1    
LR = 1e-3
NUM_WORKERS = 2
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

WARMUP_ITERS = 5
MEASURE_ITERS = 20      
PROFILE_STEPS = 6       
PRINT_EVERY = 20

# dirs
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
Path(LOG_DIR).mkdir(parents=True, exist_ok=True)
Path(PROFILE_DIR).mkdir(parents=True, exist_ok=True)
Path(MODEL_DIR).mkdir(parents=True, exist_ok=True)


mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

train_tf = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

val_tf = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

train_dir = os.path.join(DATA_DIR, TRAIN_SUBDIR)
val_dir = os.path.join(DATA_DIR, VAL_SUBDIR)
if not os.path.exists(train_dir) or not os.path.exists(val_dir):
    raise FileNotFoundError(f"Expected dataset dirs: {train_dir} and {val_dir}")

train_dataset = datasets.ImageFolder(train_dir, transform=train_tf)
val_dataset = datasets.ImageFolder(val_dir, transform=val_tf)
NUM_CLASSES = len(train_dataset.classes)

In [75]:
def print_dataset_info(ds):
    print(f"Num classes: {len(ds.classes)}")
    print("Classes:", ds.classes)
    print("Num images:", len(ds))
    counts = {c:0 for c in ds.classes}
    for _, label in ds.samples:
        counts[ds.classes[label]] += 1
    print("Distribution:", counts)

In [76]:
print("Device:", DEVICE)
print_dataset_info(train_dataset)

Device: cuda
Num classes: 13
Classes: ['altocumulus', 'altostratus', 'cirrocumulus', 'cirrostratus', 'cirrus', 'contrails', 'cumulonimbus', 'cumulus', 'lenticular', 'mammatus', 'nimbostratus', 'stratocumulus', 'stratus']
Num images: 1143
Distribution: {'altocumulus': 97, 'altostratus': 99, 'cirrocumulus': 99, 'cirrostratus': 100, 'cirrus': 82, 'contrails': 88, 'cumulonimbus': 76, 'cumulus': 86, 'lenticular': 82, 'mammatus': 83, 'nimbostratus': 86, 'stratocumulus': 84, 'stratus': 81}


In [77]:
def system_ram_mb() -> float:
    if psutil:
        vm = psutil.virtual_memory()
        used = (vm.total - vm.available) / (1024**2)
        return round(used, 2)
    return -1.0

def nvidia_smi_query():
    """Return (gpu_util_pct, vram_used_mb) aggregated average across GPUs or -1 if not found."""
    if not torch.cuda.is_available():
        return -1.0, -1.0
    try:
        out = subprocess.check_output(["nvidia-smi", "--query-gpu=utilization.gpu,memory.used", "--format=csv,noheader,nounits"])
        lines = out.decode().strip().splitlines()
        # take first GPU
        util, mem = [float(x.strip()) for x in lines[0].split(",")]
        return util, mem
    except Exception:
        return -1.0, -1.0

def model_size_mb(model: nn.Module) -> float:
    # serialize to buffer
    buffer = io.BytesIO()
    torch.save(model.state_dict(), buffer)
    size = buffer.getbuffer().nbytes / (1024**2)
    return round(size, 3)

def topk_counts(outputs: torch.Tensor, labels: torch.Tensor, k: int=5) -> int:
    with torch.no_grad():
        _, idx = outputs.topk(k, 1, True, True)
        correct = 0
        for i in range(labels.size(0)):
            if labels[i].item() in idx[i]:
                correct += 1
        return correct


In [78]:
def make_densenet(num_classes: int, pretrained=True):
    model = models.densenet121(pretrained=pretrained)
    num_ftrs = model.classifier.in_features
    model.classifier = nn.Linear(num_ftrs, num_classes)
    return model

In [79]:
def train_epoch(model, loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    running_correct = 0
    total = 0
    for i, (images, labels) in enumerate(loader, 1):
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()*images.size(0)
        preds = outputs.argmax(dim=1)
        running_correct += (preds == labels).sum().item()
        total += images.size(0)
        if i % PRINT_EVERY == 0:
            print(f"  Batch {i} loss={running_loss/total:.4f} acc={running_correct/total:.4f}")
    return running_loss/total if total>0 else 0.0, running_correct/total if total>0 else 0.0

In [80]:
def evaluate(model, loader, device):
    model.eval()
    running_loss = 0.0
    running_correct = 0
    top5_correct = 0
    total = 0
    criterion = nn.CrossEntropyLoss()
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()*images.size(0)
            preds = outputs.argmax(dim=1)
            running_correct += (preds == labels).sum().item()
            top5_correct += topk_counts(outputs, labels, k=5)
            total += images.size(0)
    if total==0:
        return 0.0, 0.0, 0.0
    return running_loss/total, running_correct/total, top5_correct/total

In [85]:
def measure_latency_throughput(model, device, batch_size, img_size, dtype=torch.float32, warmup=5, iters=10):
    model.eval()
    dummy = torch.randn(batch_size, 3, img_size, img_size, device=device, dtype=dtype)
    
    with torch.no_grad():
        # --- WRAP IN AUTocast FOR AMP ---
        if dtype == torch.float16 and device.type == 'cuda':
            autocast_ctx = autocast
        else:
            class DummyCtx:
                def __enter__(self): return None
                def __exit__(self, exc_type, exc_val, exc_tb): return False
            autocast_ctx = DummyCtx

        # Warmup
        for _ in range(max(1, warmup)):
            with autocast_ctx():
                _ = model(dummy)

        # Timing
        import time
        start = time.time()
        for _ in range(iters):
            with autocast_ctx():
                _ = model(dummy)
        if device.type == 'cuda':
            torch.cuda.synchronize()
        end = time.time()
    
    latency_ms = (end - start) / iters * 1000
    throughput = batch_size / (latency_ms / 1000)
    
    peak_vram = torch.cuda.max_memory_allocated(device) / 1024**2 if device.type=='cuda' else 0
    return latency_ms, throughput, peak_vram


In [86]:
def run_profiler(model, loader, logdir_variant):
    """Run a few steps with PyTorch profiler and write TensorBoard traces under logdir_variant."""
    activities = [ProfilerActivity.CPU]
    if torch.cuda.is_available():
        activities.append(ProfilerActivity.CUDA)
    print("  Starting profiler write to", logdir_variant)
    try:
        with profile(
            activities=activities,
            record_shapes=True,
            profile_memory=True,
            with_stack=True,
            on_trace_ready=tensorboard_trace_handler(logdir_variant),
        ) as prof:
            model.eval()
            it = iter(loader)
            steps = 0
            while steps < PROFILE_STEPS:
                try:
                    images, labels = next(it)
                except StopIteration:
                    break
                images = images.to(DEVICE) if DEVICE.type=='cuda' else images
                with record_function("model_infer"):
                    _ = model(images)
                prof.step()
                steps += 1
        print(f"  Profiler: wrote {steps} steps to {logdir_variant}")
    except Exception as e:
        print("  Profiler failed:", e)


In [87]:
def apply_dynamic_quantization(model_cpu: nn.Module):
    """Apply dynamic quantization to linear / LSTM layers (works on CPU)."""
    model_cpu.eval()
    q_model = torch.quantization.quantize_dynamic(
        model_cpu, {nn.Linear}, dtype=torch.qint8
    )
    return q_model

def prepare_torchscript(model, example_shape=(1,3,IMG_SIZE,IMG_SIZE)):
    model.eval()
    example = torch.randn(*example_shape).to(DEVICE)
    with torch.no_grad():
        scripted = torch.jit.trace(model, example)
    class Wrapped(nn.Module):
        def __init__(self, mod):
            super().__init__()
            self.mod = mod
        def forward(self, x):
            return self.mod(x)
    return Wrapped(scripted).to(DEVICE)

In [None]:
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler
import torch
import os
import csv
import json


def train_epoch_amp(model, loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    running_correct = 0
    total = 0
    scaler = GradScaler()  # local to function

    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()

        with autocast():
            outputs = model(images)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item() * images.size(0)
        preds = outputs.detach().float().argmax(dim=1)  # FP32 for safety
        running_correct += (preds == labels).sum().item()
        total += images.size(0)

    avg_loss = running_loss / total if total > 0 else 0.0
    avg_acc = running_correct / total if total > 0 else 0.0
    return avg_loss, avg_acc


def main():
    # Dataloaders
    ref_batch = max(BATCH_SIZES)
    train_loader = DataLoader(train_dataset, batch_size=ref_batch, shuffle=True, num_workers=NUM_WORKERS)

    def val_loader_factory(bs):
        return DataLoader(val_dataset, batch_size=bs, shuffle=False, num_workers=NUM_WORKERS)

    all_results = []


    print("=== Baseline FP32 ===")
    baseline = make_densenet(NUM_CLASSES, pretrained=True).to(DEVICE)
    baseline_results = benchmark_variant("baseline_fp32", baseline, train_loader, val_loader_factory, BATCH_SIZES, DEVICE, LOG_DIR)
    all_results.extend(baseline_results)


    if DEVICE.type == 'cuda':
        print("=== AMP (mixed precision) ===")
        amp_model = make_densenet(NUM_CLASSES, pretrained=True).to(DEVICE)

       
        global train_epoch
        orig_train_epoch = train_epoch
        train_epoch = train_epoch_amp

        amp_results = benchmark_variant("amp_fp16", amp_model, train_loader, val_loader_factory, BATCH_SIZES, DEVICE, LOG_DIR)
        all_results.extend(amp_results)

        
        train_epoch = orig_train_epoch
    else:
        print("GPU not available: skipping AMP variant")


    try:
        print("=== TorchScript variant ===")
        ts_model = make_densenet(NUM_CLASSES, pretrained=True).to(DEVICE)
        ts_wrapped = prepare_torchscript(ts_model)
        ts_results = benchmark_variant("torchscript", ts_wrapped, train_loader, val_loader_factory, BATCH_SIZES, DEVICE, LOG_DIR)
        all_results.extend(ts_results)
    except Exception as e:
        print("TorchScript creation failed:", e)


    try:
        print("=== Dynamic Quantization (CPU) ===")
        cpu_model = make_densenet(NUM_CLASSES, pretrained=True).to('cpu')
        q_model = apply_dynamic_quantization(cpu_model)
        cpu_device = torch.device('cpu')
        train_loader_cpu = DataLoader(train_dataset, batch_size=max(1, min(8, ref_batch)), shuffle=True, num_workers=NUM_WORKERS)
        def val_loader_factory_cpu(bs):
            return DataLoader(val_dataset, batch_size=bs, shuffle=False, num_workers=NUM_WORKERS)
        q_results = benchmark_variant("dynamic_quant_cpu", q_model, train_loader_cpu, val_loader_factory_cpu, [1,4,8], cpu_device, LOG_DIR)
        all_results.extend(q_results)
    except Exception as e:
        print("Dynamic quantization failed:", e)


    csv_path = os.path.join(OUTPUT_DIR, "benchmark_results.csv")
    fieldnames = ["model_variant","batch_size","device","ram_usage_mb","vram_usage_mb","cpu_utilization_pct","gpu_utilization_pct","latency_ms","throughput_samples_sec","accuracy_top1","accuracy_top5","model_size_mb","optimization_technique"]
    with open(csv_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for r in all_results:
            out = {k: r.get(k, "") for k in fieldnames}
            writer.writerow(out)

    print("Saved CSV:", csv_path)
    print("Models saved in:", MODEL_DIR)
    print("TensorBoard logs in:", LOG_DIR)
    print("Profiler traces in:", PROFILE_DIR)


    print("\nSummary:")
    for r in all_results:
        print(r)


    with open(os.path.join(OUTPUT_DIR, "benchmark_results.json"), "w") as f:
        json.dump(all_results, f, indent=2)

    print("All done.")


if __name__ == "__main__":
    main()


=== Baseline FP32 ===
[baseline_fp32] Training epoch 1/1
[baseline_fp32] Train loss 2.0158 acc 0.3185 | Val loss 1.9515 top1 0.4434 top5 0.8139
  Saved best model for baseline_fp32
  Starting profiler write to logs/tensorboard/baseline_fp32
  Profiler: wrote 6 steps to logs/tensorboard/baseline_fp32
[baseline_fp32] bs=1 top1=0.4434 top5=0.8139 lat=20.490002632141113ms thr=48.804288508551764 samples/s peak_vram=2955.9677734375 MB
[baseline_fp32] bs=4 top1=0.4434 top5=0.8139 lat=20.9658145904541ms thr=190.78676779967478 samples/s peak_vram=2955.9677734375 MB
[baseline_fp32] bs=8 top1=0.4434 top5=0.8139 lat=25.339221954345703ms thr=315.7160868796128 samples/s peak_vram=2955.9677734375 MB
[baseline_fp32] bs=16 top1=0.4434 top5=0.8139 lat=56.996703147888184ms thr=280.71799097721714 samples/s peak_vram=2955.9677734375 MB
[baseline_fp32] bs=32 top1=0.4434 top5=0.8139 lat=112.72008419036865ms thr=283.8890711433148 samples/s peak_vram=2955.9677734375 MB
=== AMP (mixed precision) ===
[amp_fp16] 

=== Baseline FP32 ===
[baseline_fp32] Training epoch 1/1


TypeError: main.<locals>.train_epoch_amp() missing 1 required positional argument: 'scaler'

# If  you want to use DenseNet for Transferlearning use below code

In [15]:
# data_dir="/kaggle/working/datasets_folder"
# train_dataset = datasets.ImageFolder(os.path.join(data_dir, 'train'), transform=transform)
# val_dataset = datasets.ImageFolder(os.path.join(data_dir, 'test'), transform=transform)

# train_loader=torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
# val_loader=torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

In [16]:
# def get_dataset_info(datasets):
#     print(f"Number of Classes: {len(datasets.classes)}")
#     print(f"Class name:  {datasets.classes}")
#     print(f"Number of Images: {len(datasets)}")
#     class_count={cls : 0 for cls in datasets.classes}
#     for _, label in datasets.samples:
#         # class_count[datasets.classes[label]] += 1
#     print("Class distribution")
#     for cls, count in class_count.items():
#         print(f"{cls} : {count}")
# print("Training data info:")
# get_dataset_info(train_dataset)
# print("Testting data info:")
# get_dataset_info(val_dataset)

Training data info:
Number of Classes: 13
Class name:  ['altocumulus', 'altostratus', 'cirrocumulus', 'cirrostratus', 'cirrus', 'contrails', 'cumulonimbus', 'cumulus', 'lenticular', 'mammatus', 'nimbostratus', 'stratocumulus', 'stratus']
Number of Images: 1048
Class distribution
altocumulus : 86
altostratus : 92
cirrocumulus : 89
cirrostratus : 94
cirrus : 76
contrails : 77
cumulonimbus : 70
cumulus : 79
lenticular : 76
mammatus : 76
nimbostratus : 81
stratocumulus : 76
stratus : 76
Testting data info:
Number of Classes: 13
Class name:  ['altocumulus', 'altostratus', 'cirrocumulus', 'cirrostratus', 'cirrus', 'contrails', 'cumulonimbus', 'cumulus', 'lenticular', 'mammatus', 'nimbostratus', 'stratocumulus', 'stratus']
Number of Images: 337
Class distribution
altocumulus : 28
altostratus : 30
cirrocumulus : 26
cirrostratus : 28
cirrus : 25
contrails : 26
cumulonimbus : 25
cumulus : 25
lenticular : 24
mammatus : 26
nimbostratus : 25
stratocumulus : 24
stratus : 25


In [17]:
# model =models.densenet121(pretrained=True)

In [18]:
# device = "cuda" if torch.cuda.is_available() else "cpu"
# num_ftrs=model.classifier.in_features
# model.fc = nn.Linear(num_ftrs, len(train_dataset.classes))
# model = model.to(device)

In [20]:
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.fc.parameters(), lr=0.001)

In [21]:
# def train(model, train_loader, criterion, optimizer, epoches=10):
#     model.train()
#     start_time=time.time()
#     for epoch in range(epoches):
#         running_loss=0.0
#         correct=0
#         total=0
#         for images, labels in train_loader:
#             images, labels = images.to(device), labels.to(device)
#             optimizer.zero_grad()
#             outputs = model(images)
#             loss = criterion(outputs, labels)
#             loss.backward()
#             optimizer.step()

#             running_loss += loss.item()
#             _, pedicted=outputs.max(1)
#             total += labels.size(0)
#             correct += pedicted.eq(labels).sum().item()
#         print(f"Epochs [{epoch +1}/{epoches}], loss: {running_loss / len(train_loader):.4f}, Accuracy : {100 * correct/total}")
#         end_time = time.time()
#         print(f"Total Training Time: {end_time - start_time:.2f} seconds")

In [22]:
# def evalution(model, val_loader, criterion):
#     model.eval()
#     correct =0
#     total = 0
#     val_loss = 0.0
#     with torch.no_grad():
#         for images, label in val_loader:
#             images, labels = images.to(device), labels.to(device)
#             outputs = model(images)
#             loss =criterion(outputs, labels)
#             val_loss += loss.item()
#             _, predict = outputs.max(1)
#             total += labels.size(0)
#             correct += predict.eq(labels).sum().item()
#     print(f"Validation : {val_loss/ len(val_loader):.4f}, Accuracy : {100 * correct/total:.2f}%")

In [24]:
# def inference(model, image_path, transforms):
#     model.eval()
#     image=Image.open(image_path)
#     image=transforms(image).unsqueeze(0).to(device)
#     with torch.no_grad():
#         output = model(image)
#         _, predict=torch.max(output, 1)
#     class_name = train_dataset.classes[predict.item()]
#     print(f"Predicted class: {class_name}")
#     return class_name

# def vitualizing(model, val_loader, transform):
#     model.eval()
#     images, labels = next(iter(val_loader))
#     images, labels = images.to(device), labels.to(device)
#     with torch.no_grad():
#         outputs = model(images)
#         _, predict = torch.max(outputs, 1)
#     fig = plt.figure(figsize=(10, 10))
#     for i in range(9):
#         ax = fig.add_subplot(3, 3, i+1, xtricks=[], ytricks=[])
#         img= images[i].cpu().numpy().transpose((1, 2, 0))
#         img= img * np.array([0.299, 0.244, 0.255]) + np.array([0.485, 0.456, 0.406])
#         img= np.clip(img, 0, 1)
#         ax.imshow(img)
#         ax.set_title(f"Pred: {train_dataset.classes[preds[i].item()]}")
#     plt.show()

In [29]:
%load_ext tensorboard
%tensorboard --logdir logs/tensorboard

<IPython.core.display.Javascript object>