In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input mount recursively and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sys, subprocess

def _pip(*args, check):
    """Run ``<kernel python> -m pip -q <args>`` and return the CompletedProcess.

    Going through ``sys.executable`` targets the interpreter that runs this
    notebook rather than whichever ``pip`` happens to be first on PATH.
    """
    return subprocess.run([sys.executable, "-m", "pip", "-q", *args], check=check)


# Uninstall optional integrations that previously crashed this workflow.
# Best effort: a failure here (e.g. package not installed) is ignored.
_pip("uninstall", "-y", "ray", "wandb", "tensorboard", check=False)

# Install a pinned ultralytics only when the preinstalled one cannot be imported.
# This install is mandatory, so a pip failure raises.
try:
    import ultralytics  # noqa
except Exception:
    _pip("install", "ultralytics==8.2.0", "--no-deps", check=True)

# Headless OpenCV, pinned and without dependencies; best effort.
_pip("install", "opencv-python-headless==4.10.0.84", "--no-deps", check=False)

import ultralytics
print("✅ ultralytics:", ultralytics.__version__)


In [None]:
import os, sys, importlib
from pathlib import Path

# Disable W&B (prevents project-name crashes) and pin OpenMP to one thread.
os.environ["WANDB_DISABLED"] = "true"
os.environ["WANDB_MODE"] = "disabled"
os.environ["WANDB_SILENT"] = "true"
os.environ["OMP_NUM_THREADS"] = "1"

WORK = "/kaggle/working"

# Ensure sitecustomize can be found in this process and DDP subprocesses
if WORK not in sys.path:
    sys.path.insert(0, WORK)

# Prepend WORK to PYTHONPATH exactly once. Empty entries are dropped so that an
# unset PYTHONPATH does not leave a dangling separator (an empty path entry can
# be interpreted as the current directory), and an existing WORK entry is
# removed first so re-running this cell does not keep growing the variable.
_inherited_paths = [
    entry
    for entry in os.environ.get("PYTHONPATH", "").split(os.pathsep)
    if entry and entry != WORK
]
os.environ["PYTHONPATH"] = os.pathsep.join([WORK, *_inherited_paths])

# --- sitecustomize hook: patches torch.load in every Python process that can
# --- import it from WORK (this kernel and any subprocess with WORK on its path).
#
# SECURITY NOTE: the hook defaults `weights_only` to False for every
# torch.load call, which allows arbitrary pickled objects to be loaded.
# Only load checkpoints from sources you trust while this hook is active.
_SITECUSTOMIZE_SOURCE = r'''
import os
os.environ.setdefault("WANDB_DISABLED", "true")
os.environ.setdefault("WANDB_MODE", "disabled")
os.environ.setdefault("WANDB_SILENT", "true")
os.environ.setdefault("OMP_NUM_THREADS", "1")

try:
    import torch

    # Patch only once
    if not getattr(torch, "_weights_only_off", False):
        _orig_torch_load = torch.load

        def patched_torch_load(*args, **kwargs):
            # Force full load in PyTorch 2.6+
            kwargs.setdefault("weights_only", False)
            try:
                return _orig_torch_load(*args, **kwargs)
            except TypeError as e:
                # If some torch build doesn't accept weights_only kwarg, retry without it
                if "weights_only" in str(e):
                    kwargs.pop("weights_only", None)
                    return _orig_torch_load(*args, **kwargs)
                raise

        torch.load = patched_torch_load
        torch._weights_only_off = True
except Exception:
    pass
'''
(Path(WORK) / "sitecustomize.py").write_text(_SITECUSTOMIZE_SOURCE)

# Drop any already-imported sitecustomize and refresh the import caches so the
# file written above is the one that gets imported into this kernel.
sys.modules.pop("sitecustomize", None)
importlib.invalidate_caches()
import sitecustomize  # noqa: F401

# Fallback patch for the current process, applied only when the
# `torch._weights_only_off` marker has not been set yet.
import torch

if not getattr(torch, "_weights_only_off", False):
    _orig = torch.load

    def patched_torch_load(*args, **kwargs):
        """Call the original torch.load with `weights_only` defaulting to False.

        An explicit `weights_only` from the caller is respected. If the
        underlying loader raises a TypeError mentioning `weights_only`, the
        call is retried once without that keyword; any other TypeError is
        re-raised unchanged.
        """
        if "weights_only" not in kwargs:
            kwargs["weights_only"] = False
        try:
            return _orig(*args, **kwargs)
        except TypeError as exc:
            if "weights_only" not in str(exc):
                raise
            kwargs.pop("weights_only", None)
            return _orig(*args, **kwargs)

    # The function name doubles as the marker printed by the verification step,
    # so it is deliberately not wrapped with functools.wraps.
    torch.load = patched_torch_load
    torch._weights_only_off = True

print("✅ torch:", torch.__version__)
print("✅ torch.load name:", torch.load.__name__)

# Verify subprocess inherits patch (DDP uses subprocesses).
# A fresh interpreter is started and asked for torch.load.__name__; it reports
# "patched_torch_load" only if that interpreter picked up the sitecustomize hook.
import subprocess as sp
import sys as _sys
out = sp.check_output([_sys.executable, "-c", "import torch; print(torch.load.__name__)"]).decode().strip()
# Compare only the last stdout line so stray startup output does not cause a
# false alarm, and report success only when the patch is really present.
if out.splitlines()[-1:] == ["patched_torch_load"]:
    print("✅ subprocess torch.load name:", out)
else:
    print("⚠️ subprocess torch.load name:", out)
    print("⚠️ torch.load is NOT patched in subprocesses; multi-GPU (DDP) checkpoint loading may fail.")


In [None]:
import torch
from ultralytics import YOLO

print("cuda:", torch.cuda.is_available(), "count:", torch.cuda.device_count())

# Device spec handed to training: both GPUs when at least two are visible,
# GPU 0 when exactly one is, otherwise the CPU.
if not torch.cuda.is_available():
    DEVICE = "cpu"
elif torch.cuda.device_count() >= 2:
    DEVICE = "0,1"
else:
    DEVICE = 0
print("✅ DEVICE:", DEVICE)

# Smoke test: construct a model from the "yolov8l.pt" weights and discard it.
# The success message below is only reached if construction did not raise.
# NOTE(review): presumably this also downloads the weights when they are not
# present locally — confirm against the installed ultralytics version.
_ = YOLO("yolov8l.pt")
print("✅ YOLO('yolov8l.pt') loads OK (UnpicklingError fixed)")


In [None]:
from pathlib import Path
import os, math
from collections import Counter

INPUT = Path("/kaggle/input")

# Find every directory named 'randomized_dataset' below the input mount.
# glob() yields matches in filesystem order, so the candidates are sorted to
# make the choice reproducible, and non-directories are ignored.
cands = sorted(p for p in INPUT.glob("**/randomized_dataset") if p.is_dir())
assert cands, "❌ Could not find 'randomized_dataset' under /kaggle/input"
DATA_ROOT = cands[0]
if len(cands) > 1:
    # Make an ambiguous match visible instead of silently picking one.
    print(f"⚠️ {len(cands)} 'randomized_dataset' directories found; using the first of:", [str(c) for c in cands])
print("✅ DATA_ROOT:", DATA_ROOT)

# File suffixes treated as images by the helpers in this cell.
IMG_EXTS = (".jpg",".jpeg",".png",".JPG",".JPEG",".PNG")

def has_images(d: Path) -> bool:
    """Return True if `d` is a directory directly containing at least one image file.

    Only immediate children are inspected (no recursion). The suffix check is
    case-insensitive, so mixed-case names such as ``photo.Jpg`` also count.
    A missing path or a path that is not a directory yields False rather than
    raising from ``iterdir()``.
    """
    return d.is_dir() and any(
        p.is_file() and p.suffix.lower() in IMG_EXTS for p in d.iterdir()
    )

def find_split_dirs(root: Path, split: str):
    """Locate the image and label directories of one dataset split.

    Three layouts are probed in order and the first whose image directory
    contains images wins:

    1. ``root/<split>/images`` with ``root/<split>/labels``
    2. ``root/images/<split>`` with ``root/labels/<split>``
    3. ``root/<split>``        with ``root/<split>/labels``

    Returns:
        ``(image_dir, label_dir)``; ``label_dir`` is None when the matching
        label directory does not exist. ``(None, None)`` when no layout has
        images.
    """
    layouts = (
        (root / split / "images", root / split / "labels"),
        (root / "images" / split, root / "labels" / split),
        (root / split, root / split / "labels"),
    )
    match = next((pair for pair in layouts if has_images(pair[0])), None)
    if match is None:
        return None, None
    images_dir, labels_dir = match
    return images_dir, (labels_dir if labels_dir.exists() else None)

# Root of the cleaned dataset: image links go to OUT/images/<split> and
# rewritten label files to OUT/labels/<split>.
OUT = Path("/kaggle/working/rdd_clean")

def symlink_dir_or_files(src: Path, dst: Path):
    """Expose the images of `src` at `dst` without copying them.

    The preferred result is a single directory symlink ``dst -> src``. If that
    cannot be created, `dst` becomes a real directory populated with one
    symlink per image file found directly in `src`.

    The call is a no-op when `dst` already exists (or is a dangling symlink),
    so re-running the cell is safe. Link failures are reported on stdout
    instead of being swallowed silently; the function itself does not raise
    for them.

    Args:
        src: Existing directory that holds the image files.
        dst: Path to create; missing parent directories are created.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    if dst.exists() or dst.is_symlink():
        return

    try:
        os.symlink(src, dst, target_is_directory=True)
        return
    except (OSError, NotImplementedError) as err:
        # Fall back to per-file links, but say why the directory link failed.
        print(f"⚠️ could not symlink {src} -> {dst} ({err}); linking files individually")

    dst.mkdir(parents=True, exist_ok=True)
    failed = 0
    for f in src.iterdir():
        # Case-insensitive suffix match so e.g. "x.Jpg" is linked as well.
        if f.is_file() and f.suffix.lower() in IMG_EXTS:
            try:
                os.symlink(f, dst / f.name)
            except OSError:
                failed += 1
    if failed:
        print(f"⚠️ {failed} image(s) from {src} could not be linked into {dst}")

def clamp01(x):
    """Clamp `x` to the closed interval [0, 1].

    Values below 0 become 0.0 and values above 1 become 1.0; anything else is
    returned unchanged.
    """
    if x < 0:
        return 0.0
    if x > 1:
        return 1.0
    return x

def clean_labels(src_img: Path, src_lbl: Path, out_lbl: Path, num_classes: int = 5, img_exts=None):
    """Write a sanitised label file to `out_lbl` for every image in `src_img`.

    For each image directly inside `src_img` the label file ``<stem>.txt`` is
    read from `src_lbl` and rewritten line by line. A line is kept only if it
    has exactly five fields, all of them finite numbers, a class id in
    ``[0, num_classes)`` and a strictly positive width and height. Kept values
    are clamped to [0, 1] individually and written with six decimals. Images
    without a source label file get an empty label file.

    Args:
        src_img: Directory containing the images.
        src_lbl: Directory containing the source ``.txt`` label files.
        out_lbl: Output directory for cleaned labels (created if missing).
        num_classes: Number of valid classes; ids outside ``0..num_classes-1``
            are dropped. Defaults to 5, i.e. ids 0-4.
        img_exts: Iterable of image suffixes (matched case-insensitively).
            Defaults to the module-level ``IMG_EXTS``.

    Returns:
        collections.Counter with the keys ``missing_label_created``,
        ``bad_format``, ``non_numeric``, ``bad_class`` and ``bad_wh`` counting
        how often each situation occurred.
    """
    exts = {ext.lower() for ext in (IMG_EXTS if img_exts is None else img_exts)}

    out_lbl.mkdir(parents=True, exist_ok=True)
    stats = Counter()
    imgs = [p for p in src_img.iterdir() if p.is_file() and p.suffix.lower() in exts]

    for img_path in imgs:
        src_txt = src_lbl / f"{img_path.stem}.txt"
        out_txt = out_lbl / f"{img_path.stem}.txt"

        if not src_txt.exists():
            # An empty label file is written for images that have no annotations.
            out_txt.write_text("")
            stats["missing_label_created"] += 1
            continue

        new_lines = []
        for line in src_txt.read_text().strip().splitlines():
            parts = line.strip().split()
            if len(parts) != 5:
                stats["bad_format"] += 1
                continue
            try:
                cls_raw, xc, yc, w, h = map(float, parts)
            except ValueError:
                stats["non_numeric"] += 1
                continue
            # float() accepts "nan"/"inf"; such values pass every comparison
            # below and would otherwise end up in the output file.
            if not all(math.isfinite(v) for v in (cls_raw, xc, yc, w, h)):
                stats["non_numeric"] += 1
                continue
            cls = int(cls_raw)

            if cls < 0 or cls >= num_classes:
                stats["bad_class"] += 1
                continue
            if w <= 0 or h <= 0:
                stats["bad_wh"] += 1
                continue

            # NOTE(review): each value is clamped on its own, so a box near the
            # border can still extend past the image edge — confirm this is intended.
            xc, yc, w, h = map(clamp01, [xc, yc, w, h])
            new_lines.append(f"{cls} {xc:.6f} {yc:.6f} {w:.6f} {h:.6f}")

        out_txt.write_text("\n".join(new_lines) + ("\n" if new_lines else ""))

    return stats

# Build the cleaned dataset tree: link the images of every split and rewrite
# the labels of the annotated splits, accumulating the cleaning statistics.
summary = Counter()
for split in ("train", "val", "test"):
    src_img, src_lbl = find_split_dirs(DATA_ROOT, split)
    assert src_img is not None, f"❌ No images found for split={split}"

    symlink_dir_or_files(src_img, OUT / "images" / split)

    # Labels are required and cleaned for train/val only; test needs images only.
    if split == "test":
        continue
    assert src_lbl is not None, f"❌ No labels found for split={split}"
    split_stats = clean_labels(src_img, src_lbl, OUT / "labels" / split)
    summary.update(split_stats)

print("✅ label cleaning summary:", dict(summary.most_common(10)))

# Dataset description handed to training. `path` is taken from OUT so the YAML
# cannot drift from the directory the cleaned data was written to. The class
# ids 0-4 listed here are the ids accepted by the label cleaning step.
yaml_path = Path("/kaggle/working/rdd_clean.yaml")
yaml_path.write_text(f"""
path: {OUT}
train: images/train
val: images/val
test: images/test

names:
  0: longitudinal_crack
  1: transverse_crack
  2: alligator_crack
  3: other_corruption
  4: pothole
""".strip() + "\n")
print("✅ YAML:", yaml_path)


In [None]:
import time, torch
from ultralytics import YOLO

# Recomputed here so this cell does not depend on an earlier cell's DEVICE:
# both GPUs when at least two are visible, GPU 0 for one, else CPU.
if not torch.cuda.is_available():
    DEVICE = "cpu"
elif torch.cuda.device_count() >= 2:
    DEVICE = "0,1"
else:
    DEVICE = 0
print("✅ DEVICE:", DEVICE)

# Wall-clock reference taken when this cell runs, and the time budget in seconds.
START = time.time()
TIME_LIMIT_SEC = 11.3 * 3600  # stop before Kaggle runtime kills session

def epoch_trend_and_stop(trainer):
    """Per-epoch callback: print the mAP trend and request a stop when over budget.

    Prints mAP50 and mAP50-95 from ``trainer.metrics`` (when it is a dict) if
    the trainer's ``rank`` attribute is -1 or 0; a missing attribute counts as
    rank 0. Independently of the rank, sets ``trainer.stop = True`` once more
    than TIME_LIMIT_SEC seconds have elapsed since START.
    """
    if getattr(trainer, "rank", 0) in (-1, 0):
        metrics = getattr(trainer, "metrics", {})
        if isinstance(metrics, dict):
            map50 = metrics.get('metrics/mAP50(B)')
            map50_95 = metrics.get('metrics/mAP50-95(B)')
            print(f"\n[EPOCH {trainer.epoch+1}] mAP50={map50} | mAP50-95={map50_95}")

    elapsed = time.time() - START
    if elapsed > TIME_LIMIT_SEC:
        print("\n⏱️ Time limit reached → stopping safely.")
        trainer.stop = True

model = YOLO("yolov8l.pt")

# Register the per-epoch trend/time-limit hook.
# NOTE(review): with DEVICE == "0,1" training presumably runs in separately
# launched DDP processes; confirm that callbacks registered here are executed
# there, otherwise the time limit is not enforced on multi-GPU runs.
model.callbacks.setdefault("on_fit_epoch_end", []).append(epoch_trend_and_stop)

# total batch for 2xT4
BATCH = 32  # if OOM -> 24 -> 16

# Run / bookkeeping settings.
run_args = dict(
    data=str(yaml_path),
    imgsz=640,
    epochs=500,              # time stop ends it
    batch=BATCH,
    device=DEVICE,
    workers=2,               # if RAM crash -> 0
    cache=False,             # do NOT use True (RAM crash)
    amp=True,
    verbose=True,
    project="runs",
    name="baseline_640_2gpu",
    patience=25,
    save=True,
    save_period=5,
)

# Optimiser settings.
optimizer_args = dict(
    optimizer="SGD",
    lr0=0.01,
    lrf=0.01,
    momentum=0.937,
    weight_decay=0.0005,
)

# Augmentation settings.
augment_args = dict(
    mosaic=0.8,
    mixup=0.10,
    translate=0.10,
    scale=0.50,
    fliplr=0.5,
    close_mosaic=10,
)

model.train(**run_args, **optimizer_args, **augment_args)
