# Data Understanding & Preparation

In [3]:
# ============================================================
# STAGE 1 — Data Understanding & Preparation (FINAL)
# Fix:
# - Handle filename mismatch: train_XXX ↔ mask_XXX
# - Strict validation without false assertion
# - Build clean training manifest
# ============================================================

from pathlib import Path
import numpy as np
import cv2
import pandas as pd
from tqdm import tqdm
import re

# -----------------------------
# CONFIG
# -----------------------------
DATA_ROOT = Path("/kaggle/input/data-science-ara-7-0/dataset/dataset")

# Sub-directories of DATA_ROOT that hold the training images, the training
# masks and the test images.
TRAIN_IMG_DIR = DATA_ROOT.joinpath("train", "images")
TRAIN_MASK_DIR = DATA_ROOT.joinpath("train", "mask")
TEST_IMG_DIR = DATA_ROOT.joinpath("test", "images")

# Suffixes accepted as image files; compared against the lower-cased suffix.
IMG_EXTS = {".jpg", ".jpeg", ".png"}

# -----------------------------
# 1. LOAD FILES
# -----------------------------
# Each listing keeps only files with an accepted suffix and is sorted so the
# order does not depend on the directory iteration order.
train_images = sorted(
    entry for entry in TRAIN_IMG_DIR.iterdir() if entry.suffix.lower() in IMG_EXTS
)
train_masks = sorted(
    entry for entry in TRAIN_MASK_DIR.iterdir() if entry.suffix.lower() in IMG_EXTS
)
test_images = sorted(
    entry for entry in TEST_IMG_DIR.iterdir() if entry.suffix.lower() in IMG_EXTS
)

print(f"[INFO] Train images : {len(train_images)}")
print(f"[INFO] Train masks  : {len(train_masks)}")
print(f"[INFO] Test images  : {len(test_images)}")

# -----------------------------
# 2. BUILD MASK INDEX BY NUMBER
# -----------------------------
def extract_index(name: str):
    """Return the first run of decimal digits found in *name*.

    The digits are returned as a string exactly as written (leading zeros
    are kept), so ``train_048.jpg`` and ``mask_048.png`` both give ``"048"``.

    Returns:
        The digit string, or ``None`` when *name* contains no digit.
    """
    match = re.search(r"(\d+)", name)
    if match is None:
        return None
    return match.group(1)

# Map numeric id -> mask path. When two mask files share the same id the later
# one still wins (as before), but the collision is now reported instead of
# being silently swallowed.
mask_index = {}
duplicate_ids = []
for m in train_masks:
    idx = extract_index(m.stem)
    if idx is None:
        continue  # file name carries no numeric id
    if idx in mask_index:
        duplicate_ids.append(idx)
    mask_index[idx] = m

if duplicate_ids:
    print(f"[WARNING] Duplicate mask ids (last file kept): {duplicate_ids[:10]}")

# -----------------------------
# 3. PAIR IMAGE–MASK
# -----------------------------
pairs = []    # one dict per image that has a mask with the same numeric id
missing = []  # names of images without a matching mask

for img in train_images:
    idx = extract_index(img.stem)
    if idx in mask_index:
        pairs.append({
            "image_path": img,
            "mask_path": mask_index[idx],
            "id": idx
        })
    else:
        missing.append(img.name)

print(f"[INFO] Valid pairs : {len(pairs)}")
print(f"[WARNING] Missing masks : {len(missing)}")

if missing:
    print("[WARNING] Example missing:", missing[:10])

# Explicit raise rather than `assert`, so the check survives `python -O`.
if not pairs:
    raise RuntimeError("No valid image-mask pairs found")

# -----------------------------
# 4. SANITY CHECK (SHAPE + VALUES)
# -----------------------------
records = []

# Read every pair once: check both files decode, that the mask has the same
# height/width as the image, and record the mask's distinct pixel values.
for p in tqdm(pairs, desc="Validating pairs"):
    img = cv2.imread(str(p["image_path"]))
    mask = cv2.imread(str(p["mask_path"]), cv2.IMREAD_GRAYSCALE)

    assert img is not None, f"Failed to read image {p['image_path']}"
    assert mask is not None, f"Failed to read mask {p['mask_path']}"
    assert img.shape[:2] == mask.shape, f"Shape mismatch: {p['image_path'].name}"

    mask_height, mask_width = mask.shape

    records.append(
        dict(
            image=p["image_path"].name,
            mask=p["mask_path"].name,
            height=mask_height,
            width=mask_width,
            unique_values=np.unique(mask).tolist(),
            # 1 when at least one pixel equals 255, else 0
            has_pothole=int(np.any(mask == 255)),
        )
    )

df_info = pd.DataFrame(records)

# -----------------------------
# 5. DATASET STATS
# -----------------------------
# Union of every pixel value seen in any mask; used for the report below and
# for the strict validation step.
all_vals = {value for row in df_info["unique_values"] for value in row}

print("\n[INFO] Unique mask values:")
print(sorted(all_vals))

print("\n[INFO] Pothole presence:")
print(df_info["has_pothole"].value_counts())

print("\n[INFO] Resolution distribution (top):")
resolution_counts = df_info.groupby(["height", "width"]).size()
print(resolution_counts.sort_values(ascending=False).head())

# -----------------------------
# 6. STRICT VALIDATION
# -----------------------------
# Masks must be strictly binary: only 0 and 255 are allowed.
assert all_vals.issubset({0, 255}), f"Invalid mask values detected: {all_vals}"

# -----------------------------
# 7. EXPORT CLEAN MANIFEST
# -----------------------------
# One row per validated pair, with paths stored as plain strings.
manifest_rows = [
    {
        "image_path": str(p["image_path"]),
        "mask_path": str(p["mask_path"]),
        "id": p["id"],
    }
    for p in pairs
]
df_manifest = pd.DataFrame(manifest_rows, columns=["image_path", "mask_path", "id"])

print(f"\n[INFO] Final usable training samples: {len(df_manifest)}")

# Optional save
# df_manifest.to_csv("/kaggle/working/train_manifest.csv", index=False)

print("\n[STAGE 1 COMPLETE]")
print("Image–mask mapping correct (train_xxx ↔ mask_xxx)")
print("Dataset fully validated")
print("Ready for STAGE 2")

[INFO] Train images : 498
[INFO] Train masks  : 498
[INFO] Test images  : 295
[INFO] Valid pairs : 498


Validating pairs: 100%|██████████| 498/498 [00:59<00:00,  8.40it/s]


[INFO] Unique mask values:
[0, 255]

[INFO] Pothole presence:
has_pothole
1    498
Name: count, dtype: int64

[INFO] Resolution distribution (top):
height  width
720     720      74
4160    3120     50
2760    3680     39
234     416      29
360     640      27
dtype: int64

[INFO] Final usable training samples: 498

[STAGE 1 COMPLETE]
Image–mask mapping correct (train_xxx ↔ mask_xxx)
Dataset fully validated
Ready for STAGE 2





# Preprocessing & Data Augmentation

In [4]:
# ============================================================
# STAGE 2 — Preprocessing & Data Augmentation
# Purpose:
# - Define resize strategy (fixed-size)
# - Normalize input for EfficientNet encoder
# - Build train / valid / test augmentation pipelines
# Notes:
# - Mask kept binary {0,1} for training
# - Conversion to {0,255} happens only at submission stage
# ============================================================

import albumentations as A
from albumentations.pytorch import ToTensorV2
import cv2
import numpy as np

# -----------------------------
# GLOBAL SETTINGS
# -----------------------------
# Recommended: 512 (safe) or 640 (higher PB if GPU allows)
INPUT_SIZE = 512

# ImageNet normalization (required for EfficientNet)
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

# -----------------------------
# 1. TRAIN AUGMENTATION
# -----------------------------
# Order matters: resize first so every later op works on INPUT_SIZE x INPUT_SIZE,
# photometric ops next (they expect the un-normalized image), then
# normalization and tensor conversion last.
train_transform = A.Compose(
    [
        # --- geometric ---
        A.Resize(INPUT_SIZE, INPUT_SIZE, interpolation=cv2.INTER_LINEAR),
        A.HorizontalFlip(p=0.5),

        # --- photometric (road-specific) ---
        A.RandomBrightnessContrast(
            brightness_limit=0.2,
            contrast_limit=0.2,
            p=0.7
        ),
        A.HueSaturationValue(
            hue_shift_limit=10,
            sat_shift_limit=15,
            val_shift_limit=10,
            p=0.5
        ),
        A.RandomShadow(
            # Shadows are restricted to the lower half of the frame.
            shadow_roi=(0, 0.5, 1, 1),
            # `num_shadows_limit=(low, high)` replaces the deprecated
            # `num_shadows_lower` / `num_shadows_upper` pair; the old names
            # made this call emit a warning.
            num_shadows_limit=(1, 2),
            shadow_dimension=5,
            p=0.3
        ),

        # --- texture noise ---
        A.GaussianBlur(blur_limit=3, p=0.2),

        # --- normalize & tensor ---
        A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ToTensorV2(),
    ],
    additional_targets={"mask": "mask"},
)

# -----------------------------
# 2. VALIDATION AUGMENTATION
# (NO randomness)
# -----------------------------
def _deterministic_steps():
    """Return a fresh resize -> normalize -> to-tensor list with no random ops."""
    return [
        A.Resize(INPUT_SIZE, INPUT_SIZE, interpolation=cv2.INTER_LINEAR),
        A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ToTensorV2(),
    ]


# Validation pipeline: same deterministic steps, applied to image and mask.
valid_transform = A.Compose(
    _deterministic_steps(),
    additional_targets={"mask": "mask"},
)

# -----------------------------
# 3. TEST AUGMENTATION
# (image only)
# -----------------------------
test_transform = A.Compose(_deterministic_steps())

# -----------------------------
# 4. QUICK SANITY PRINT
# -----------------------------
print("[STAGE 2 READY]")
print(f"Input size          : {INPUT_SIZE} x {INPUT_SIZE}")
print("Normalization       : ImageNet (EfficientNet compatible)")
print("Train augmentation  : ON (lighting, shadow, blur, flip)")
print("Validation/test aug : OFF (deterministic)")
print("Safe to proceed to STAGE 3 (Dataset & DataLoader)")

[STAGE 2 READY]
Input size          : 512 x 512
Normalization       : ImageNet (EfficientNet compatible)
Train augmentation  : ON (lighting, shadow, blur, flip)
Validation/test aug : OFF (deterministic)
Safe to proceed to STAGE 3 (Dataset & DataLoader)


  A.RandomShadow(


# Model Construction & Training

In [6]:
!pip install -q segmentation-models-pytorch==0.3.3 timm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.5/68.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Building wheel for efficientnet-pytorch (setup.py) ... [?25l[?25hdone
  Building wheel for pretrainedmodels (setup.py) ... [?25l[?25hdone


In [None]:
# ============================================================
# STAGE 3 — Model Construction & Training (REVISED)
# Note:
# - segmentation-models-pytorch is installed by the `pip install` cell above
# ============================================================

# 1. IMPORTS
# -----------------------------
import os
import random
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

import segmentation_models_pytorch as smp

# -----------------------------
# 2. REPRODUCIBILITY
# -----------------------------
SEED = 42

# Seed every random number generator used here with the same value:
# Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)

# Use the GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

# -----------------------------
# 3. LOAD DATA (FROM STAGE 1 LOGIC)
# -----------------------------
# Dataset root and the two training sub-directories (images and masks).
DATA_ROOT = Path("/kaggle/input/data-science-ara-7-0/dataset/dataset")
TRAIN_IMG_DIR = DATA_ROOT.joinpath("train", "images")
TRAIN_MASK_DIR = DATA_ROOT.joinpath("train", "mask")

import re
def extract_idx(name):
    """Return the first run of digits in *name* as a string, or None if absent.

    Leading zeros are preserved (``"train_048.jpg"`` -> ``"048"``).
    """
    found = re.search(r"(\d+)", name)
    return None if found is None else found.group(1)

# Build (image_path, mask_path) pairs.
# The listing is sorted because Path.iterdir() yields entries in arbitrary
# order; without sorting, the row order of `df` — and therefore the seeded
# train/valid split below — could differ between runs.
pairs = []
for img in sorted(TRAIN_IMG_DIR.iterdir()):
    if img.suffix.lower() not in {".jpg", ".jpeg", ".png"}:
        continue  # not an image file (same suffix filter as Stage 1)
    idx = extract_idx(img.name)
    if idx is None:
        continue  # no numeric id -> would otherwise look up "mask_None.png"
    mask = TRAIN_MASK_DIR / f"mask_{idx}.png"
    if mask.exists():
        pairs.append((str(img), str(mask)))

df = pd.DataFrame(pairs, columns=["image_path", "mask_path"])
print("Total training samples:", len(df))

# -----------------------------
# 4. TRAIN / VALID SPLIT
# -----------------------------
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for validation; shuffling is seeded with SEED.
split_options = dict(test_size=0.15, random_state=SEED, shuffle=True)
df_train, df_valid = train_test_split(df, **split_options)

print("Train:", len(df_train), "Valid:", len(df_valid))

# -----------------------------
# 5. DATASET CLASS
# -----------------------------
class PotholeDataset(Dataset):
    """Image/mask dataset backed by a DataFrame of file paths.

    Args:
        df: DataFrame with string columns ``image_path`` and ``mask_path``.
            Its index is reset so items are addressed 0..len-1.
        transform: Callable invoked as ``transform(image=..., mask=...)`` that
            returns a mapping with keys ``"image"`` and ``"mask"``. The
            returned mask must support ``.unsqueeze(0)`` (e.g. a torch tensor).
    """

    def __init__(self, df, transform):
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load, binarize and transform the sample at position *idx*.

        Returns:
            ``(image, mask)`` where ``mask`` has an extra leading channel axis.

        Raises:
            FileNotFoundError: if the image or the mask cannot be read.
        """
        image_path = self.df.loc[idx, "image_path"]
        mask_path = self.df.loc[idx, "mask_path"]

        # cv2.imread signals failure by returning None rather than raising;
        # fail here with the offending path instead of deeper in OpenCV.
        img = cv2.imread(image_path)
        if img is None:
            raise FileNotFoundError(f"Failed to read image {image_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
        if mask is None:
            raise FileNotFoundError(f"Failed to read mask {mask_path}")
        # Only pixels equal to 255 count as foreground -> float mask in {0, 1}.
        mask = (mask == 255).astype("float32")

        augmented = self.transform(image=img, mask=mask)
        img = augmented["image"]
        mask = augmented["mask"].unsqueeze(0)

        return img, mask

# -----------------------------
# 6. DATALOADERS
# -----------------------------
BATCH_SIZE = 4

train_ds = PotholeDataset(df_train, train_transform)
valid_ds = PotholeDataset(df_valid, valid_transform)

# Options common to both loaders; only the shuffle flag differs.
_loader_options = dict(batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_ds, shuffle=True, **_loader_options)

# Validation keeps a fixed order.
valid_loader = DataLoader(valid_ds, shuffle=False, **_loader_options)

# -----------------------------
# 7. MODEL
# -----------------------------
# U-Net++ with an EfficientNet-B4 encoder initialised from ImageNet weights;
# 3 input channels, 1 output channel.
_model_options = dict(
    encoder_name="efficientnet-b4",
    encoder_weights="imagenet",
    in_channels=3,
    classes=1,
)
model = smp.UnetPlusPlus(**_model_options)

model.to(device)

# -----------------------------
# 8. LOSS
# -----------------------------
dice_loss = smp.losses.DiceLoss(mode="binary")
focal_loss = smp.losses.FocalLoss(mode="binary")


def criterion(pred, target):
    """Return the unweighted sum of the Dice loss and the Focal loss.

    Args:
        pred: Model output (the training loop passes the raw model output).
        target: Ground-truth mask batch.
    """
    dice_term = dice_loss(pred, target)
    focal_term = focal_loss(pred, target)
    return dice_term + focal_term

# -----------------------------
# 9. METRIC
# -----------------------------
def dice_coef(pred, target, eps=1e-7):
    """Mean Dice coefficient of a batch after thresholding *pred* at 0.5.

    Args:
        pred: Tensor of scores; values strictly greater than 0.5 count as
            foreground. Reductions run over dims 2 and 3, so a 4-D tensor is
            expected (presumably (N, C, H, W)).
        target: Tensor of the same shape holding the ground truth.
        eps: Added to numerator and denominator, so two empty masks score 1.

    Returns:
        Scalar tensor: the Dice value averaged over all remaining dims.
    """
    spatial_dims = (2, 3)
    hard_pred = pred.gt(0.5).float()
    overlap = hard_pred.mul(target).sum(dim=spatial_dims)
    total = hard_pred.sum(dim=spatial_dims) + target.sum(dim=spatial_dims)
    per_item = (2 * overlap + eps) / (total + eps)
    return per_item.mean()

# -----------------------------
# 10. OPTIMIZER
# -----------------------------
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)

# -----------------------------
# 11. TRAINING LOOP
# -----------------------------
# Each epoch: one optimisation pass over train_loader, then one evaluation
# pass over valid_loader. The weights with the best validation Dice so far
# are written to disk.
EPOCHS = 30
best_dice = 0.0

for epoch in range(1, EPOCHS + 1):
    model.train()
    train_loss = 0.0

    for imgs, masks in tqdm(train_loader, desc=f"Epoch {epoch} [TRAIN]"):
        imgs = imgs.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()
        logits = model(imgs)
        loss = criterion(logits, masks)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= len(train_loader)

    model.eval()
    val_loss = 0.0
    val_dice = 0.0
    val_seen = 0  # number of validation samples processed this epoch

    with torch.no_grad():
        for imgs, masks in tqdm(valid_loader, desc=f"Epoch {epoch} [VALID]"):
            imgs = imgs.to(device)
            masks = masks.to(device)

            logits = model(imgs)
            loss = criterion(logits, masks)
            dice = dice_coef(torch.sigmoid(logits), masks)

            # Weight each batch by its size: the last batch can be smaller,
            # and a plain mean of batch means would over-weight its samples
            # in the metric that decides which checkpoint is kept.
            batch_n = imgs.size(0)
            val_loss += loss.item() * batch_n
            val_dice += dice.item() * batch_n
            val_seen += batch_n

    val_loss /= max(val_seen, 1)
    val_dice /= max(val_seen, 1)

    print(
        f"Epoch {epoch:02d} | "
        f"Train Loss {train_loss:.4f} | "
        f"Val Loss {val_loss:.4f} | "
        f"Val Dice {val_dice:.4f}"
    )

    if val_dice > best_dice:
        best_dice = val_dice
        torch.save(model.state_dict(), "/kaggle/working/best_unetpp_effb4.pt")
        print(">> Best model saved")

print("\n[STAGE 3 COMPLETE]")
print("Best validation Dice:", round(best_dice, 4))
print("Ready for STAGE 4 (Inference, Threshold Tuning & RLE Submission)")



Device: cpu
Total training samples: 498
Train: 423 Valid: 75
Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth" to /root/.cache/torch/hub/checkpoints/efficientnet-b4-6ed6700e.pth


100%|██████████| 74.4M/74.4M [00:00<00:00, 175MB/s]
Epoch 1 [TRAIN]:   6%|▌         | 6/106 [02:31<40:48, 24.49s/it]

# Optimization, Validation & Refinement

In [None]:
# ============================================================
# STAGE 4 — Optimization, Validation & Refinement
# Purpose:
# - Load best model
# - Tune threshold on validation set
# - Apply post-processing
# - Select optimal configuration for inference
# ============================================================

import numpy as np
import torch
import cv2
from tqdm import tqdm
import pandas as pd

# -----------------------------
# CONFIG
# -----------------------------
MODEL_PATH = "/kaggle/working/best_unetpp_effb4.pt"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Candidate probability cut-offs evaluated on the validation set.
THRESHOLDS = [0.30, 0.35, 0.40, 0.45, 0.50]

# Connected components smaller than this many pixels are dropped during
# post-processing; tune if needed.
MIN_AREA = 300

# -----------------------------
# LOAD MODEL
# -----------------------------
# Restore the checkpoint into the existing `model` object, mapping tensors
# to DEVICE, then switch to inference mode.
best_state = torch.load(MODEL_PATH, map_location=DEVICE)
model.load_state_dict(best_state)
model.eval()
model.to(DEVICE)

print("[INFO] Best model loaded")

# -----------------------------
# HELPER FUNCTIONS
# -----------------------------
def dice_score(pred, target, eps=1e-7):
    """Dice coefficient of two masks, summed over all of their elements.

    Args:
        pred: Predicted mask (array of 0/1 values).
        target: Ground-truth mask with a shape compatible with *pred*.
        eps: Added to numerator and denominator, so two empty masks score 1.

    Returns:
        ``(2 * sum(pred * target) + eps) / (sum(pred) + sum(target) + eps)``.
    """
    overlap = (pred * target).sum()
    size_sum = pred.sum() + target.sum()
    numerator = 2 * overlap + eps
    denominator = size_sum + eps
    return numerator / denominator

def remove_small_objects(mask, min_area):
    """Return a 0/1 uint8 mask keeping only components of at least *min_area* pixels.

    Components are found with 8-connectivity on ``mask`` cast to uint8.
    Label 0 (the background) is never kept.
    """
    _, label_map, stats, _ = cv2.connectedComponentsWithStats(
        mask.astype(np.uint8), connectivity=8
    )
    # Per-label flag: True when the component is large enough to keep.
    keep_label = stats[:, cv2.CC_STAT_AREA] >= min_area
    keep_label[0] = False  # label 0 is the background
    # Index the flag table with the label image to build the cleaned mask.
    return keep_label[label_map].astype(np.uint8)

# -----------------------------
# THRESHOLD TUNING
# -----------------------------
results = []

# The probability maps do not depend on the threshold, so the validation set
# is pushed through the network once and the maps are reused for every
# candidate (previously the forward pass was repeated per threshold).
val_probs = []    # one (H, W) probability map per validation image
val_targets = []  # matching (H, W) ground-truth masks

with torch.no_grad():
    for imgs, masks in tqdm(valid_loader, desc="Predicting validation set"):
        imgs = imgs.to(DEVICE)

        probs = torch.sigmoid(model(imgs)).cpu().numpy()
        gt = masks.cpu().numpy()

        # Channel 0 is the only channel; store the images individually.
        val_probs.extend(probs[:, 0])
        val_targets.extend(gt[:, 0])

for thr in THRESHOLDS:
    # Same per-image pipeline as before: threshold -> drop small blobs -> Dice.
    dices = [
        dice_score(
            remove_small_objects((prob > thr).astype(np.uint8), MIN_AREA),
            target,
        )
        for prob, target in zip(val_probs, val_targets)
    ]

    mean_dice = float(np.mean(dices))
    results.append({"threshold": thr, "dice": mean_dice})
    print(f"[RESULT] thr={thr:.2f} | Dice={mean_dice:.4f}")

# -----------------------------
# SELECT BEST CONFIG
# -----------------------------
# Rank the candidates by mean Dice, best first, and take the top row.
df_thr = pd.DataFrame(results).sort_values("dice", ascending=False)
top_row = df_thr.iloc[0]
best_thr = float(top_row["threshold"])
best_dice = float(top_row["dice"])

print("\n[OPTIMAL CONFIG]")
print(df_thr)
print(f"\nBest threshold : {best_thr}")
print(f"Best val Dice  : {best_dice:.4f}")

# -----------------------------
# SAVE CONFIG FOR STAGE 5
# -----------------------------
# Kept in memory only; Stage 5 reads the values from this dict.
OPT_CONFIG = dict(threshold=best_thr, min_area=MIN_AREA)

print("\n[STAGE 4 COMPLETE]")
print("Threshold tuning finished")
print("Post-processing calibrated")
print("Ready for STAGE 5 (Inference & RLE Submission)")


# Inference, Encoding & Submission

In [None]:
# ============================================================
# STAGE 5 — Inference, RLE Encoding & Submission
# Purpose:
# - Run inference on test/images
# - Apply threshold + post-processing
# - Encode mask using RLE (column-wise)
# - Generate submission.csv matching sample_submission
# ============================================================

import numpy as np
import pandas as pd
import torch
import cv2
from pathlib import Path
from tqdm import tqdm

# -----------------------------
# CONFIG
# -----------------------------
DATA_ROOT = Path("/kaggle/input/data-science-ara-7-0/dataset/dataset")
TEST_IMG_DIR = DATA_ROOT.joinpath("test", "images")
SAMPLE_SUB = Path("/kaggle/input/data-science-ara-7-0/sample_submission.csv")

MODEL_PATH = "/kaggle/working/best_unetpp_effb4.pt"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Threshold and minimum component area selected in STAGE 4.
BEST_THRESHOLD, MIN_AREA = OPT_CONFIG["threshold"], OPT_CONFIG["min_area"]

# input size must match STAGE 2
INPUT_SIZE = 512

# -----------------------------
# LOAD SAMPLE SUBMISSION
# -----------------------------
# The sample file supplies the row order and the column names; the first
# column is read as the image id and the second as the RLE string.
df_sub = pd.read_csv(SAMPLE_SUB)
id_col, rle_col = df_sub.columns[0], df_sub.columns[1]

print("[INFO] Sample submission loaded")
print(df_sub.head())

# -----------------------------
# LOAD MODEL
# -----------------------------
# Restore the best checkpoint into the existing `model` object.
checkpoint_state = torch.load(MODEL_PATH, map_location=DEVICE)
model.load_state_dict(checkpoint_state)
model.eval()
model.to(DEVICE)

# -----------------------------
# RLE ENCODER (OFFICIAL)
# -----------------------------
def encode_rle(mask: np.ndarray, pos_value: int = 255) -> str:
    """Run-length encode the pixels of *mask* equal to *pos_value*.

    Pixels are scanned column by column (the mask is transposed before
    flattening) and positions are 1-based. The result is a space-separated
    sequence of ``start length`` pairs; a mask with no positive pixel
    encodes to the empty string.
    """
    column_major = (mask == pos_value).astype(np.uint8).T.flatten()
    # A zero on each side guarantees every run has both a start and an end edge.
    padded = np.pad(column_major, 1)
    # 1-based positions where the value flips: run start, run end, start, end, ...
    edges = np.flatnonzero(padded[1:] != padded[:-1]) + 1
    starts = edges[0::2]
    lengths = edges[1::2] - starts
    interleaved = np.column_stack((starts, lengths)).ravel()
    return " ".join(map(str, interleaved))

# -----------------------------
# POST-PROCESSING
# -----------------------------
def remove_small_objects(mask, min_area):
    """Drop connected components smaller than *min_area* pixels.

    Components are labelled with 8-connectivity on ``mask`` cast to uint8.
    Returns a uint8 array shaped like the label image, with 1 on every pixel
    of a kept component and 0 elsewhere (label 0, the background, is skipped).
    """
    _, label_map, stats, _ = cv2.connectedComponentsWithStats(
        mask.astype(np.uint8), connectivity=8
    )
    # Areas of labels 1..N-1; adding 1 converts positions back to label ids.
    foreground_areas = stats[1:, cv2.CC_STAT_AREA]
    kept_labels = np.flatnonzero(foreground_areas >= min_area) + 1
    return np.isin(label_map, kept_labels).astype(np.uint8)

# -----------------------------
# INFERENCE LOOP
# -----------------------------
pred_rles = []

# ImageNet per-channel statistics (RGB order), broadcast over (H, W, 3).
norm_mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
norm_std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

with torch.no_grad():
    for img_name in tqdm(df_sub[id_col].values, desc="Inference"):
        img_path = TEST_IMG_DIR / img_name
        # Explicit raises instead of `assert` so the checks survive `python -O`.
        if not img_path.exists():
            raise FileNotFoundError(f"Missing test image {img_name}")

        # read image (cv2.imread returns None for files it cannot decode)
        img = cv2.imread(str(img_path))
        if img is None:
            raise ValueError(f"Failed to decode test image {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        h0, w0 = img.shape[:2]  # original size, needed to resize the mask back

        # resize (same as STAGE 2) and scale to [0, 1]
        img_resized = cv2.resize(img, (INPUT_SIZE, INPUT_SIZE))
        img_resized = img_resized.astype("float32") / 255.0

        # normalize ImageNet (all three channels at once)
        img_resized = (img_resized - norm_mean) / norm_std

        # to tensor: HWC -> CHW, then add the batch axis
        img_tensor = torch.from_numpy(img_resized.transpose(2, 0, 1)).unsqueeze(0)
        img_tensor = img_tensor.to(DEVICE)

        # predict
        prob = torch.sigmoid(model(img_tensor))[0, 0].cpu().numpy()

        # threshold + post-process (done at network resolution, as in STAGE 4)
        pred = (prob > BEST_THRESHOLD).astype(np.uint8)
        pred = remove_small_objects(pred, MIN_AREA)

        # resize back to original size; nearest-neighbour keeps the mask binary
        pred = cv2.resize(pred, (w0, h0), interpolation=cv2.INTER_NEAREST)

        # convert to {0,255}
        pred_255 = pred * 255

        # RLE: a mask without any positive pixel is submitted as an empty string
        if not pred.any():
            rle = ""
        else:
            rle = encode_rle(pred_255)

        pred_rles.append(rle)

# -----------------------------
# BUILD SUBMISSION
# -----------------------------
# Predictions were produced in the row order of the sample submission, so
# they can be assigned directly to its RLE column.
df_sub[rle_col] = pred_rles

OUT_SUB = "/kaggle/working/submission.csv"
df_sub.to_csv(OUT_SUB, index=False)

print("\n[STAGE 5 COMPLETE]")
print(f"Submission saved to: {OUT_SUB}")

# -----------------------------
# FINAL QA
# -----------------------------
# Row count, number of images predicted as having no positive pixel, and a
# preview of the first rows.
empty_rle_count = df_sub[rle_col].eq("").sum()

print("\n[QA CHECK]")
print("Rows submission :", len(df_sub))
print("Empty RLE count :", empty_rle_count)
print("Sample rows:")
print(df_sub.head())
