# 🧠 Key Insights & Recommendations

### 📊 What the Data Shows
- **Median coverage (P50):** ~0.000% of the image.  
- **90th percentile (P90):** 90% of forged images have coverage ≤ **1.167%**.  
- **95th percentile (P95):** 95% of forged images have coverage ≤ **9.864%**.  
- **Mean coverage:** **1.934%**, skewed by a few very large masks.  
- **Median forged mask area:** 0 px • **P90:** 7,900 px • **P95:** 25,392 px.  

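Most forged regions are **tiny** compared to the full image — often less than 1% of total pixels. Note that a median area of exactly 0 px means at least half of the forged images were measured with an empty mask, so confirm that the mask files are being decoded correctly before relying on these figures.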

---

### 🔬 Practical Recommendations

**1. Model design**
- Use **loss functions sensitive to small regions** such as `DiceLoss`, `FocalLoss`, or a combination (e.g. BCE + Dice).
- Train with **higher input resolution** to preserve small texture and boundary details.
- Consider multi-scale feature extraction or attention-based segmentation models (e.g. U-Net++, DeepLabv3+, Swin-Unet).

**2. Sampling & class balance**
- **Oversample** forged images with very small mask coverage to help the model learn subtle forgeries.
- Optionally undersample or weight authentic images to balance the dataset.

**3. Data augmentations**
- Apply **gentle augmentations** that preserve pixel-level structure:
  - ✅ Brightness/contrast shift, flip, rotate, crop, resize.
  - ⚠️ Avoid heavy blur, strong noise, or compression artifacts — these can destroy small forged cues.

**4. Evaluation & visualization**
- Use **pixel-level metrics** (Dice/F1/IoU) instead of only image-level accuracy.
- Visualize both correct and incorrect predictions to ensure the model captures tiny anomalies.

---

### 💡 TL;DR
> Most forgeries are **small and subtle** — treat this as a fine-grained segmentation task.  
> High-resolution inputs + small-object-aware loss + balanced sampling = better detection accuracy.

In [None]:
# ===== Cell: Imports & Paths =====
import os, warnings, random, gc
warnings.filterwarnings("ignore")

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from PIL import Image
from tqdm import tqdm

# Dataset layout: the competition root and the three sub-directories that the
# rest of the notebook reads from.
DATA_DIR = Path("/kaggle/input/recodai-luc-scientific-image-forgery-detection")
TRAIN_IMG_DIR = DATA_DIR.joinpath("train_images")
MASK_DIR = DATA_DIR.joinpath("train_masks")
TEST_IMG_DIR = DATA_DIR.joinpath("test_images")

# Fail fast when the expected folders are not present at these locations.
assert TRAIN_IMG_DIR.joinpath("authentic").exists(), "Missing train_images/authentic"
assert TRAIN_IMG_DIR.joinpath("forged").exists(), "Missing train_images/forged"
assert MASK_DIR.exists(), "Missing train_masks"

def case_id_from_path(p: Path) -> str:
    """Return the case id for a file path: its file name without the final suffix."""
    case_id = p.stem
    return case_id

def read_image(path: Path):
    """Load the image at ``path`` with ``cv2.imread`` using the IMREAD_COLOR flag.

    Returns:
        The array produced by OpenCV.

    Raises:
        FileNotFoundError: carrying ``path`` when ``cv2.imread`` returns
            ``None`` instead of an array.
    """
    loaded = cv2.imread(str(path), cv2.IMREAD_COLOR)
    if loaded is not None:
        return loaded
    raise FileNotFoundError(path)


In [None]:
# ===== Cell: Utilities =====
def list_image_paths():
    """Collect the training files as three sorted lists of paths.

    Returns:
        ``(auth_paths, forg_paths, mask_paths)``: the ``*.png`` files under
        ``TRAIN_IMG_DIR/authentic`` and ``TRAIN_IMG_DIR/forged``, and the
        ``*.npy`` files directly under ``MASK_DIR``.
    """
    def _sorted_glob(folder, pattern):
        # sorted() makes the listing order deterministic.
        return sorted(folder.glob(pattern))

    return (
        _sorted_glob(TRAIN_IMG_DIR / "authentic", "*.png"),
        _sorted_glob(TRAIN_IMG_DIR / "forged", "*.png"),
        _sorted_glob(MASK_DIR, "*.npy"),
    )

def ensure_binary_mask(m):
    """Return ``m`` as a uint8 array of 0/1, where 1 marks every entry > 0.

    The threshold is applied to the values in their original dtype. Casting
    to uint8 first would truncate fractional values such as 0.5 to 0 and wrap
    integers such as 256 to 0, silently erasing foreground pixels.

    Args:
        m: array-like mask of any numeric or boolean dtype.

    Returns:
        ``np.ndarray`` of dtype uint8 with the same shape as ``m``.
    """
    return (np.asarray(m) > 0).astype(np.uint8)

def overlay_mask_bgr(img_bgr, mask_bin, alpha=0.35):
    """Tint the masked pixels of a BGR image red, leaving all other pixels untouched.

    The previous blend added ``alpha`` times a copy of the image on top of the
    full image (weights 1.0 + alpha), which brightened every pixel. Here only
    the pixels selected by the mask are mixed with red, using weights
    ``1 - alpha`` (image) and ``alpha`` (red).

    Args:
        img_bgr: image array indexed as (row, col, channel) with 3 channels;
            assumed to be uint8 as returned by ``cv2.imread`` — TODO confirm
            for other callers.
        mask_bin: 2-D mask with the same height/width as the image; entries
            > 0 are highlighted. ``None`` returns ``img_bgr`` unchanged.
        alpha: opacity of the red tint, expected in [0, 1].

    Returns:
        A new array with the same shape and dtype as ``img_bgr`` (or
        ``img_bgr`` itself when ``mask_bin`` is ``None``).
    """
    if mask_bin is None:
        return img_bgr
    out = img_bgr.copy()
    selected = np.asarray(mask_bin) > 0
    if not selected.any():
        return out
    red_bgr = np.array((0, 0, 255), dtype=np.float32)
    mixed = (1.0 - alpha) * out[selected].astype(np.float32) + alpha * red_bgr
    # Round and clip so the result fits back into the 8-bit range.
    out[selected] = np.clip(np.rint(mixed), 0, 255).astype(out.dtype)
    return out


In [None]:
# ===== Cell: Index (Disambiguated) & Sanity =====
# Gather the sorted file lists for both image classes and for the masks.
auth_paths, forg_paths, mask_paths = list_image_paths()

print(f"authentic images: {len(auth_paths)}")
print(f"forged images   : {len(forg_paths)}")
print(f"mask files      : {len(mask_paths)}")

# Build one frame per source; case_id is the file stem in all three, which is
# the key used to join masks to forged images below.
df_auth = pd.DataFrame({
    "case_id": [p.stem for p in auth_paths],
    "label": ["authentic"]*len(auth_paths),
    "img_path": [str(p) for p in auth_paths],
})

df_forg = pd.DataFrame({
    "case_id": [p.stem for p in forg_paths],
    "label": ["forged"]*len(forg_paths),
    "img_path": [str(p) for p in forg_paths],
})

df_mask = pd.DataFrame({
    "case_id": [m.stem for m in mask_paths],
    "mask_path": [str(m) for m in mask_paths]
})

# Merge masks ONLY into the forged subset. A left join keeps every forged row;
# forged images without a matching mask get NaN in mask_path.
df_forg = df_forg.merge(df_mask, on="case_id", how="left")
df_forg["has_mask"] = df_forg["mask_path"].notna()

# Authentic subset must never carry masks
df_auth["mask_path"] = None
df_auth["has_mask"]  = False

# Concatenate into the single index frame `df` used by the later cells.
df = pd.concat([df_auth, df_forg], axis=0, ignore_index=True)

# Diagnostics: a case_id with more than one distinct label appears in both
# the authentic and the forged folder.
dup_case_ids = df.groupby("case_id")["label"].nunique()
n_dups = (dup_case_ids > 1).sum()
print(f"case_ids present in BOTH authentic and forged: {n_dups}")

# Sanity counters checked by the assertions below.
num_masks = len(df_mask)
num_forged = (df_forg["label"]=="forged").sum()
auth_with_mask = df_auth["has_mask"].sum()
forged_without_mask = ((df_forg["label"]=="forged") & ~df_forg["has_mask"]).sum()

print("\n=== Sanity ===")
print("masks == forged?              ", num_masks, "==", num_forged)
print("authentic with mask (should 0):", auth_with_mask)
print("forged without mask (should 0):", forged_without_mask)

# Assertions: stop the notebook here if the mask/forged pairing is inconsistent.
assert num_masks == num_forged, "Number of masks must equal number of forged images."
assert auth_with_mask == 0, "Authentic images must have no masks."
assert forged_without_mask == 0, "All forged images must have exactly one mask."

# Last expression of the cell: shows the first rows of the combined index.
df.head()


In [None]:
# ===== Cell: Image Size Stats by Class =====
# Measure height/width on a reproducible random sample (at most 1500 rows) of
# the index rather than decoding every image.
size_rows = []
sample_df = df.sample(min(1500, len(df)), random_state=42)

for _, r in tqdm(sample_df.iterrows(), total=len(sample_df)):
    img = read_image(Path(r["img_path"]))
    h, w = img.shape[:2]
    size_rows.append({"case_id": r["case_id"], "label": r["label"], "img_h": h, "img_w": w})

sizes = pd.DataFrame(size_rows)
display(sizes.describe())

# One figure per dimension, with the authentic and forged histograms side by
# side. Titles use a real em dash (the previous literals were mis-encoded).
for _col, _dim in (("img_h", "height"), ("img_w", "width")):
    plt.figure(figsize=(12, 5))
    for _pos, _label in enumerate(("authentic", "forged"), start=1):
        plt.subplot(1, 2, _pos)
        plt.hist(sizes.loc[sizes.label == _label, _col], bins=40)
        plt.title(f"Image {_dim} — {_label}")
    plt.tight_layout()
    plt.show()


In [None]:
# ===== Cell: Channel Intensity Stats =====
def channel_stats(paths, k=300, seed=None):
    """Average per-channel mean and standard deviation over a random sample of images.

    Each sampled image is read with ``cv2.imread`` (IMREAD_COLOR), scaled by
    1/255, and reduced to one mean and one std per channel; those per-image
    values are then averaged across the sample.

    Args:
        paths: iterable of image paths (e.g. a list or a pandas Series).
        k: maximum number of images to sample.
        seed: optional seed for a private ``random.Random`` so the sample is
            reproducible. ``None`` (default) keeps the previous behaviour of
            drawing from the global ``random`` state.

    Returns:
        ``(mean, std)``: two length-3 float arrays in OpenCV channel order
        (BGR). Both are all zeros when ``paths`` is empty.

    Raises:
        FileNotFoundError: if a sampled image cannot be read; previously this
            surfaced as a ``TypeError`` from dividing ``None`` by 255.
    """
    paths = list(paths)
    if not paths:
        return np.zeros(3, dtype=float), np.zeros(3, dtype=float)

    sampler = random if seed is None else random.Random(seed)
    chosen = sampler.sample(paths, min(k, len(paths)))

    means, stds = [], []
    for p in chosen:
        img = cv2.imread(str(p), cv2.IMREAD_COLOR)
        if img is None:
            raise FileNotFoundError(p)
        img = img / 255.0
        means.append(img.mean(axis=(0, 1)))  # BGR
        stds.append(img.std(axis=(0, 1)))
    means, stds = np.array(means), np.array(stds)
    return means.mean(axis=0), stds.mean(axis=0)

# Per-class channel statistics; each call samples from the img_path column of
# one label using channel_stats' default sample size.
auth_m, auth_s = channel_stats(df.loc[df.label=='authentic','img_path'])
forg_m, forg_s = channel_stats(df.loc[df.label=='forged','img_path'])

print("authentic mean BGR:", auth_m, "std:", auth_s)
print("forged mean BGR   :", forg_m, "std:", forg_s)

# np.r_ concatenates the two mean vectors, so the bars are ordered
# authentic B, G, R followed by forged B, G, R — matching the tick labels.
plt.figure(figsize=(8,4))
plt.bar(['B-auth','G-auth','R-auth','B-forg','G-forg','R-forg'], np.r_[auth_m, forg_m])
plt.title("Average channel intensity (BGR)")
plt.show()


In [None]:
# ===== Cell: Mask Stats (Forged Only) — clearer charts + auto-explanations =====
def _mask_to_2d(raw, H, W):
    """Reduce a raw mask array to a 2-D uint8 {0,1} mask of shape (H, W).

    A 3-D mask is collapsed with a logical OR over the axis that is not part
    of the image plane: axis 0 for a stack shaped (N, H, W), axis 2 for
    (H, W, C). Taking ``m[..., 0]`` unconditionally (the previous behaviour)
    keeps only one column of an (N, H, W) stack, which then gets resized into
    a mostly empty mask. When neither layout matches the image size the
    smallest axis is collapsed as a best guess.

    Raises:
        ValueError: if the array is neither 2-D nor 3-D.
    """
    m = np.asarray(raw) > 0
    if m.ndim == 3:
        if m.shape[1:] == (H, W):
            m = m.any(axis=0)
        elif m.shape[:2] == (H, W):
            m = m.any(axis=2)
        else:
            m = m.any(axis=int(np.argmin(m.shape)))
    elif m.ndim != 2:
        raise ValueError(f"Unsupported mask shape {m.shape}")

    m = m.astype(np.uint8)
    if m.shape != (H, W):
        if m.shape == (W, H):
            # Stored transposed relative to the image.
            m = m.T
        else:
            # Nearest-neighbour keeps the mask binary after resizing.
            m = cv2.resize(m, (W, H), interpolation=cv2.INTER_NEAREST)
    return m


forged_df = df[df.label == 'forged'].copy()

areas, coverages, sizes2 = [], [], []
bad_masks = []

for _, r in tqdm(forged_df.iterrows(), total=len(forged_df)):
    img = read_image(Path(r['img_path']))
    H, W = img.shape[:2]

    try:
        m = np.load(r['mask_path'])
        if m is None or m.size == 0:
            raise ValueError("Empty mask")

        m = _mask_to_2d(m, H, W)
        area = int(m.sum())
        cov = area / (H * W)  # 0..1
    except Exception as e:
        # Best effort: a mask that cannot be loaded is recorded and counted as
        # zero coverage; the failures are listed at the end of the cell.
        bad_masks.append((r['case_id'], str(e)))
        area, cov = 0, 0.0

    areas.append(area)
    coverages.append(cov)
    sizes2.append((H, W))

forged_df['mask_area'] = areas
forged_df['coverage']  = coverages
forged_df['img_h']     = [s[0] for s in sizes2]
forged_df['img_w']     = [s[1] for s in sizes2]

# Basic table
display(forged_df[['mask_area','coverage','img_h','img_w']].describe())

# --------- Helpful summary stats we will annotate on plots ----------
N = len(forged_df)
cov = forged_df['coverage'].values
area = forged_df['mask_area'].values
cov_pct = cov * 100.0

def q(x, p):
    """Return the p-th percentile of x as a plain float."""
    return float(np.percentile(x, p))

stats = {
    "cov_p50": q(cov_pct, 50),
    "cov_p75": q(cov_pct, 75),
    "cov_p90": q(cov_pct, 90),
    "cov_p95": q(cov_pct, 95),
    "area_p50": q(area, 50),
    "area_p90": q(area, 90),
    "area_p95": q(area, 95),
    "mean_cov_pct": float(cov_pct.mean())
}

# --------- Coverage histogram (with percentiles marked) ----------
plt.figure(figsize=(12,4))
plt.hist(cov_pct, bins=60)
plt.title("How big are forged regions? (Coverage % of image)")
plt.xlabel("Coverage (%)"); plt.ylabel("Image count")

for p in [50, 90, 95]:
    v = q(cov_pct, p)
    plt.axvline(v, linestyle="--")
    plt.text(v, plt.ylim()[1]*0.9, f"P{p}={v:.3f}%", rotation=90, va="top")

plt.tight_layout()
plt.show()

# --------- Cumulative distribution (CDF) of coverage ----------
sorted_cov = np.sort(cov_pct)
cdf = np.arange(1, N+1) / N

plt.figure(figsize=(12,4))
plt.plot(sorted_cov, cdf)
plt.title("Cumulative fraction of images by coverage size")
plt.xlabel("Coverage (%)"); plt.ylabel("Fraction of images ≤ x")

for p in [50, 90, 95]:
    v = q(cov_pct, p)
    frac = p/100.0
    plt.axvline(v, linestyle="--")
    plt.hlines(frac, xmin=0, xmax=v, linestyles="--")
    plt.text(v, frac, f"P{p}={v:.3f}%", va="bottom", ha="right")

plt.tight_layout()
plt.show()

# --------- Coverage category bar chart (easier to read) ----------
bins = [0, 0.1, 0.5, 1.0, 2.0, 5.0, np.inf]  # in percent
labels = ["≤0.1%", "0.1–0.5%", "0.5–1%", "1–2%", "2–5%", ">5%"]
cats = pd.cut(cov_pct, bins=bins, labels=labels, include_lowest=True, right=True)
counts = cats.value_counts().reindex(labels).fillna(0).astype(int)

plt.figure(figsize=(10,4))
bars = plt.bar(range(len(labels)), counts.values)
plt.xticks(range(len(labels)), labels)
plt.ylabel("Image count")
plt.title("Coverage buckets (how many images fall in each size range)")

# annotate bars with counts + percentage
for i, b in enumerate(bars):
    cnt = counts.values[i]
    pct = 100.0 * cnt / N
    plt.text(b.get_x() + b.get_width()/2, b.get_height() * 0.98, f"{cnt}\n({pct:.1f}%)",
             ha="center", va="top")

plt.tight_layout()
plt.show()

# --------- Boxplots (area and coverage) ----------
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.boxplot(area, vert=True, showfliers=False)
plt.title("Mask pixel area (no outliers)")
plt.ylabel("Pixels")

plt.subplot(1,2,2)
plt.boxplot(cov_pct, vert=True, showfliers=False)
plt.title("Coverage % (no outliers)")
plt.ylabel("Coverage (%)")

plt.tight_layout()
plt.show()

# --------- Textual takeaways printed to output ----------
print("==== What to read from the charts ====")
print(f"- Median coverage (P50): ~{stats['cov_p50']:.3f}% of the image.")
print(f"- 90% of forged images have coverage ≤ {stats['cov_p90']:.3f}% (P90).")
print(f"- 95% of forged images have coverage ≤ {stats['cov_p95']:.3f}% (P95).")
print(f"- Mean coverage: {stats['mean_cov_pct']:.3f}% (skewed if a few masks are large).")
print(f"- Median forged mask area (pixels): {stats['area_p50']:.0f}; P90: {stats['area_p90']:.0f}; P95: {stats['area_p95']:.0f}.")

print("\nPractical implications:")
print("- Most forged regions are SMALL relative to the image → consider losses like Dice/Focal and higher input resolution.")
print("- Sampling: oversample forged images with tiny coverage to help the model learn subtle patterns.")
print("- Augmentations should preserve tiny artifacts (avoid heavy blurs that erase small masks).")

# --------- Bad masks (if any) ----------
if bad_masks:
    print(f"\n⚠️ {len(bad_masks)} problematic masks detected (showing first 5):")
    print(pd.DataFrame(bad_masks, columns=['case_id', 'error']).head())

In [None]:
# ===== REPLACE the previous triptych cell with this version (fixes boolean assignment) =====
# Big, clear side-by-side: Authentic | Forged | Overlay — with robust overlay drawing

import numpy as np, pandas as pd, cv2, matplotlib.pyplot as plt
from pathlib import Path

def safe_imread(path: str):
    """Read an image with OpenCV and return it with three BGR channels.

    The file is read with IMREAD_UNCHANGED; a 2-D (single-channel) result is
    converted with COLOR_GRAY2BGR and a 4-channel result with COLOR_BGRA2BGR.
    Any other result is returned as read.

    Raises:
        FileNotFoundError: when ``path`` is ``None`` or a float NaN, or when
            ``cv2.imread`` returns ``None``.
    """
    path_is_nan = isinstance(path, float) and np.isnan(path)
    if path is None or path_is_nan:
        raise FileNotFoundError("Image path is NaN")

    loaded = cv2.imread(str(path), cv2.IMREAD_UNCHANGED)
    if loaded is None:
        raise FileNotFoundError(f"cv2.imread failed: {path}")

    if loaded.ndim == 2:
        return cv2.cvtColor(loaded, cv2.COLOR_GRAY2BGR)
    if loaded.shape[2] == 4:
        return cv2.cvtColor(loaded, cv2.COLOR_BGRA2BGR)
    return loaded

def safe_load_mask(path: str):
    """Load a ``.npy`` mask and return it as a 2-D uint8 array of 0/1.

    Entries > 0 become 1. Arrays with more than two dimensions are squeezed;
    if three dimensions remain (for example several masks stacked for one
    image, or a multi-channel mask) they are merged with a logical OR over
    the smallest axis instead of being discarded. Picking the smallest axis
    is a heuristic — it assumes the stack/channel count is smaller than the
    image height and width.

    Args:
        path: path to the ``.npy`` file; ``None`` or a float NaN means "no mask".

    Returns:
        A 2-D ``np.uint8`` array, or ``None`` when there is no path, the
        array is empty, or it cannot be reduced to two dimensions.
    """
    if path is None or (isinstance(path, float) and np.isnan(path)):
        return None
    m = np.load(str(path))
    if m is None or m.size == 0:
        return None
    m = m > 0
    if m.ndim > 2:
        m = np.squeeze(m)
    if m.ndim == 3:
        # Union over the stack/channel axis so no annotated region is lost.
        m = m.any(axis=int(np.argmin(m.shape)))
    if m.ndim != 2:
        return None
    return m.astype(np.uint8)

def prepare_mask_for_image(mask_arr, H, W):
    """Fit a mask to an image of height ``H`` and width ``W``.

    Returns a uint8 array of shape (H, W) holding 0/1 (1 where the fitted
    mask is > 0). ``None`` and masks with an empty first or second dimension
    yield an all-zero mask; a mask shaped (W, H) is transposed; any other
    shape is resized with nearest-neighbour interpolation.
    """
    if mask_arr is None:
        return np.zeros((H, W), dtype=np.uint8)

    fitted = mask_arr
    if fitted.shape != (H, W):
        if fitted.shape == (W, H):
            fitted = fitted.T
        elif fitted.shape[0] == 0 or fitted.shape[1] == 0:
            return np.zeros((H, W), dtype=np.uint8)
        else:
            fitted = cv2.resize(fitted, (W, H), interpolation=cv2.INTER_NEAREST)

    binary = fitted > 0
    return binary.astype(np.uint8)

def overlay_with_contours(img_bgr, mask_bin, alpha=0.30, fill_color=(0,0,255), ctr_color=(0,255,255), ctr_th=3):
    """Draw a translucent mask fill plus contour outlines on a BGR image.

    The mask is first fitted to the image with ``prepare_mask_for_image``.
    The fill is blended with weights ``1 - alpha`` (image) and ``alpha``
    (filled copy), so pixels outside the mask keep their original value;
    the previous weights (1.0 and ``alpha``) brightened the whole image.

    Args:
        img_bgr: 3-channel image in OpenCV channel order.
        mask_bin: mask array or ``None``; any shape accepted by
            ``prepare_mask_for_image``.
        alpha: opacity of the fill, expected in [0, 1].
        fill_color: BGR colour of the fill.
        ctr_color: BGR colour of the contour lines.
        ctr_th: contour line thickness in pixels.

    Returns:
        ``(vis, area, cov)``: the rendered image, the number of mask pixels,
        and the mask coverage as a percentage of the image area.
    """
    H, W = img_bgr.shape[:2]
    m = prepare_mask_for_image(mask_bin, H, W)  # 2D (H,W)
    m_bool = m.astype(bool)

    overlay = img_bgr.copy()
    # Per-channel assignment (avoids NumPy boolean assignment shape issues)
    overlay[m_bool, 0] = fill_color[0]
    overlay[m_bool, 1] = fill_color[1]
    overlay[m_bool, 2] = fill_color[2]

    # Weights sum to 1: unmasked pixels are identical in both inputs and so
    # come out unchanged.
    vis = cv2.addWeighted(img_bgr, 1.0 - alpha, overlay, alpha, 0)

    # Draw contours. findContours returns (contours, hierarchy) in OpenCV 4.x
    # and (image, contours, hierarchy) in 3.x; [-2] is the contour list in both.
    m8 = (m * 255).astype(np.uint8)
    cnts = cv2.findContours(m8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if len(cnts) > 0:
        cv2.drawContours(vis, cnts, -1, ctr_color, ctr_th)

    area = int(m.sum())
    cov = (100.0 * area) / max(1, H*W)
    return vis, area, cov

def build_pairs(df):
    """Pair authentic and forged rows that share a ``case_id``.

    Returns a frame with columns ``case_id``, ``auth_img``, ``forg_img`` and
    ``mask_path``, containing only case ids present under both labels
    (inner join).
    """
    is_authentic = df["label"] == "authentic"
    is_forged = df["label"] == "forged"

    authentic = df.loc[is_authentic, ["case_id", "img_path"]]
    authentic = authentic.rename(columns={"img_path": "auth_img"})

    forged = df.loc[is_forged, ["case_id", "img_path", "mask_path"]]
    forged = forged.rename(columns={"img_path": "forg_img"})

    return pd.merge(authentic, forged, on="case_id", how="inner")

def show_triptychs(df, n=6, seed=42, min_mask_area_px=20, dpi=180):
    """Plot up to ``n`` rows of Authentic | Forged | Overlay for paired case ids.

    Pairs come from ``build_pairs(df)`` and are visited in a seeded random
    order. A pair is kept when both images load and the fitted mask has at
    least ``min_mask_area_px`` pixels; pairs that raise are collected and
    reported after the figure instead of aborting the run.

    Args:
        df: index frame with ``case_id``, ``label``, ``img_path`` and
            ``mask_path`` columns.
        n: maximum number of rows to draw.
        seed: seed for the random visiting order.
        min_mask_area_px: minimum mask area (pixels) for a pair to be shown.
        dpi: figure resolution.

    Returns:
        None. Output is a matplotlib figure and printed diagnostics.
    """
    pairs = build_pairs(df)
    if len(pairs) == 0:
        print("No shared case_id between authentic and forged.")
        return

    rng = np.random.RandomState(seed)
    order = rng.permutation(len(pairs))

    selected, failures = [], []
    for idx in order:
        r = pairs.iloc[idx]
        try:
            img_a = safe_imread(r["auth_img"])
            img_f = safe_imread(r["forg_img"])
            H, W = img_f.shape[:2]
            raw_m = safe_load_mask(r.get("mask_path", None))
            m = prepare_mask_for_image(raw_m, H, W)
            if int(m.sum()) < min_mask_area_px:
                continue
            selected.append((r["case_id"], img_a, img_f, m))
            if len(selected) >= n:
                break
        except Exception as e:
            # Best effort: remember the problem and keep scanning other pairs.
            failures.append((r.get("case_id", "?"), str(e)))

    if len(selected) == 0:
        if failures:
            print("No valid triptychs. First few failures:")
            print(pd.DataFrame(failures, columns=["case_id", "error"]).head())
        else:
            # Nothing failed: every pair was filtered out by the area threshold.
            print(f"No valid triptychs: no paired mask has at least {min_mask_area_px} px.")
        return

    rows = len(selected)
    # squeeze=False keeps `axes` two-dimensional even when there is one row.
    fig, axes = plt.subplots(rows, 3, figsize=(18, 5*rows), dpi=dpi, squeeze=False)

    for rix, (cid, img_a, img_f, m) in enumerate(selected):
        vis, area_safe, cov_safe = overlay_with_contours(img_f, m, alpha=0.35)
        panels = (
            (img_a, f"{cid} — Authentic"),
            (img_f, f"{cid} — Forged"),
            (vis, f"{cid} — Overlay\narea={area_safe} px | cov={cov_safe:.3f}%"),
        )
        for ax, (panel_img, title) in zip(axes[rix], panels):
            # Images are BGR (OpenCV); matplotlib expects RGB.
            ax.imshow(cv2.cvtColor(panel_img, cv2.COLOR_BGR2RGB), interpolation="nearest")
            ax.set_title(title, fontsize=14)
            ax.axis("off")

    plt.tight_layout()
    plt.show()

    if failures:
        print(f"⚠️ Skipped {len(failures)} problematic pair(s). Showing first 5:")
        print(pd.DataFrame(failures, columns=["case_id", "error"]).head())

# Run: draw up to 6 rows at 180 dpi, visiting pairs in the order given by
# seed 40 and skipping pairs whose mask covers fewer than 20 pixels.
show_triptychs(df, n=6, seed=40, min_mask_area_px=20, dpi=180)

In [None]:
# ===== Cell: Save Tidy Indices =====
# Output locations, named once so the writes and the printed summary agree.
index_csv = "/kaggle/working/train_index_images_masks_disambiguated.csv"
forged_csv = "/kaggle/working/forged_with_mask_stats_disambiguated.csv"

# Full index: authentic + forged rows with image/mask paths.
df.to_csv(index_csv, index=False)

# The "with_mask_stats" file must come from forged_df (built in the mask-stats
# cell), which carries mask_area / coverage / img_h / img_w. The forged slice
# of `df` has none of those columns.
forged_df.to_csv(forged_csv, index=False)

print("Saved:")
print(f" - {index_csv}")
print(f" - {forged_csv}")
