In [None]:
# ======================================================================
# CSE475 – TASK 1  (EDA + Related Work template helper)
# Dataset: Betel Leaf (Healthy / Diseased / Dried)
# Author: Shourav Deb
# ======================================================================

import os, json, math, glob
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image, ImageFilter
import cv2
from tqdm import tqdm

def get_row_class(row):
    """Return the class label stored on a ``DataFrame.itertuples`` row.

    ``class`` is a Python keyword, so pandas exposes that column under a
    positional field name instead (``_1`` for the frames built in this
    script). The real name is tried first; the positional fallback is only
    consulted when ``class`` is missing.

    Args:
        row: A row object produced by ``itertuples`` (or any object exposing
            a ``class`` or ``_1`` attribute).

    Returns:
        The value of the ``class`` attribute if present, otherwise ``_1``.

    Raises:
        AttributeError: If the row exposes neither ``class`` nor ``_1``.
    """
    _missing = object()
    label = getattr(row, "class", _missing)
    if label is _missing:
        # Looked up lazily: a row that does carry ``class`` must not fail
        # merely because the positional fallback is absent.
        label = getattr(row, "_1")
    return label


# ----------------------------------------------------------------------
# 1. CONFIG
# ----------------------------------------------------------------------

# Root folder of the mounted dataset; every split/class folder is resolved
# relative to this path.
DATA_ROOT = "/kaggle/input/betel-leaf"
print("Using dataset root:", DATA_ROOT)

# All CSV/JSON/pickle artefacts produced by this script are written here.
# The directory (and any missing parents) is created up front.
OUT_DIR = Path("/kaggle/working/eda_outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# class names (these are also the expected class sub-folder names)
CLASS_NAMES = ["Healthy", "Diseased", "Dried"]

# possible subfolders (On-Field + Controlled Environment)
# Several spellings are listed; folders that do not exist are skipped when
# the images are indexed.
SPLIT_FOLDERS = [
    "Controlled Environment",
    "Controlled_Environment",
    "controlled_environment",
    "On Field",
    "On-Field",
    "on_field",
    "",   # also check directly under root
]

# ----------------------------------------------------------------------
# 2. INDEX ALL IMAGES
# ----------------------------------------------------------------------
records = []

# Extensions are compared case-insensitively so files such as "IMG_001.JPG"
# are indexed as well. The tuple order fixes the listing order inside a class
# folder (all .jpg first, then .jpeg, then .png), each group sorted by path.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

for split in SPLIT_FOLDERS:
    # Path(root) / "" is just root, so the "" entry needs no special case.
    split_path = Path(DATA_ROOT) / split
    if not split_path.is_dir():
        continue

    for cls in CLASS_NAMES:
        cls_path = split_path / cls
        if not cls_path.is_dir():
            continue

        # Directory listing instead of glob patterns: folder names containing
        # glob metacharacters ("[", "]", "*") cannot be misinterpreted.
        files_by_ext = {ext: [] for ext in IMAGE_EXTENSIONS}
        for entry in cls_path.iterdir():
            if entry.name.startswith("."):
                # Hidden files were never matched by the "*.ext" patterns.
                continue
            ext = entry.suffix.lower()
            if ext in files_by_ext and entry.is_file():
                files_by_ext[ext].append(str(entry))

        for ext in IMAGE_EXTENSIONS:
            for im in sorted(files_by_ext[ext]):
                records.append({
                    "path": im,
                    "class": cls,
                    "split": split if split != "" else "root"
                })

# Fail before anything is written so an empty index CSV is never left behind.
if not records:
    raise RuntimeError("No images found. Check folder names / dataset mount.")

df_index = pd.DataFrame(records)
index_csv_path = OUT_DIR / "image_index.csv"
df_index.to_csv(index_csv_path, index=False)
print("Total images found:", len(df_index))
print(df_index.head())

# ----------------------------------------------------------------------
# 3. BASIC CLASS BALANCE
# ----------------------------------------------------------------------
# Image count for every (split, class) pair, one row per pair.
images_per_split_class = df_index.groupby(["split", "class"]).size()
class_counts = images_per_split_class.reset_index(name="count")
class_counts.to_csv(OUT_DIR / "class_balance.csv", index=False)
print("\nClass balance:\n", class_counts)

# Image count per class across all splits, most frequent class first.
global_counts = (
    df_index["class"]
    .value_counts()
    .rename_axis("class")
    .reset_index(name="count")
)
print("\nGlobal class counts:\n", global_counts)
global_counts.to_csv(OUT_DIR / "class_balance_global.csv", index=False)

# ----------------------------------------------------------------------
# 4. HELPER FUNCTIONS
# ----------------------------------------------------------------------
def load_rgb(path):
    """Load the image at ``path`` and return it as an RGB NumPy array.

    Args:
        path: Location of the image file, passed to ``PIL.Image.open``.

    Returns:
        ``np.ndarray`` built from the image after conversion to mode "RGB".
    """
    # Image.open is lazy and holds the file open; the context manager releases
    # the handle as soon as the pixels have been converted and copied out.
    with Image.open(path) as img:
        return np.array(img.convert("RGB"))

def rgb_to_hsv_np(rgb):
    """Convert an RGB image array to HSV using OpenCV.

    Args:
        rgb: (H, W, 3) RGB array with values in 0..255.

    Returns:
        The array returned by ``cv2.cvtColor`` with ``cv2.COLOR_RGB2HSV``.

    NOTE(review): for 8-bit input OpenCV documents hue as 0..179 while
    saturation and value span 0..255 - confirm before binning the H channel.
    """
    conversion_code = cv2.COLOR_RGB2HSV
    return cv2.cvtColor(rgb, conversion_code)

def image_brightness(img_rgb):
    """Return the mean of the HSV value (V) channel as a brightness score."""
    value_channel = rgb_to_hsv_np(img_rgb)[..., 2]
    return value_channel.mean()

def image_contrast(img_rgb):
    """Return the standard deviation of the HSV value (V) channel."""
    value_channel = rgb_to_hsv_np(img_rgb)[..., 2]
    return value_channel.std()

def saturation_ratio(img_rgb, clip_thr=250):
    """Return the share of pixels whose HSV saturation exceeds ``clip_thr``.

    The result is a fraction in 0..1 (the mean of a boolean mask), not a
    percentage.
    """
    saturation = rgb_to_hsv_np(img_rgb)[..., 1]
    above_threshold = saturation > clip_thr
    return above_threshold.mean()

def laplacian_variance(img_rgb):
    """Return the variance of the Laplacian of the grayscale image.

    Used as a blur/sharpness measure by the per-image statistics.
    """
    grayscale = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2GRAY)
    laplacian = cv2.Laplacian(grayscale, cv2.CV_64F)
    return laplacian.var()

def noise_proxy(img_rgb):
    """Return the mean absolute difference between the image and its blur.

    The image is cast to float32 and smoothed with a 5x5 Gaussian kernel;
    the result is returned as a Python float.
    """
    as_float = img_rgb.astype(np.float32)
    smoothed = cv2.GaussianBlur(as_float, (5,5), 0)
    residual = np.abs(as_float - smoothed)
    return float(residual.mean())

# perceptual-ish average hash (no extra install)
def average_hash(img_rgb, hash_size=8):
    """Return an average-hash of the image as a zero-padded hex string.

    The image is converted to grayscale and resized to
    ``hash_size x hash_size``; each pixel contributes one bit that is 1 when
    the pixel is brighter than the thumbnail mean. Bits are read in flattened
    order, most significant first.
    """
    side = (hash_size, hash_size)
    thumbnail = Image.fromarray(img_rgb).convert("L").resize(side, Image.LANCZOS)
    gray = np.array(thumbnail)
    brighter_than_mean = (gray > gray.mean()).flatten()
    bit_string = "".join("1" if flag else "0" for flag in brighter_than_mean)
    # One hex digit encodes four bits; pad on the left to a fixed width.
    hex_width = hash_size * hash_size // 4
    return format(int(bit_string, 2), "x").rjust(hex_width, "0")

def aspect_ratio(w, h):
    """Return ``w / h`` rounded to four decimal places."""
    ratio = w / h
    return round(ratio, 4)

# ----------------------------------------------------------------------
# 5. PER-IMAGE STATS LOOP
# ----------------------------------------------------------------------
per_image_records = []

# Histogram ranges per HSV channel. OpenCV stores 8-bit hue as 0..179, so
# binning H over 0..255 would leave the upper bins permanently empty.
HUE_HIST_RANGE = (0, 180)
SV_HIST_RANGE = (0, 255)
# Saturation above this value counts as (nearly) clipped.
SAT_CLIP_THRESHOLD = 240

print("\nComputing per-image statistics ...")
for row in tqdm(df_index.itertuples(index=False), total=len(df_index)):
    img = load_rgb(row.path)
    h, w, _ = img.shape

    # Per-channel colour statistics over all pixels.
    mean_rgb = img.mean(axis=(0,1))
    std_rgb  = img.std(axis=(0,1))

    # Convert to HSV once and reuse the channels for the histograms and for
    # brightness / contrast / saturation clipping, instead of repeating the
    # conversion inside each helper. The values are the same as the helpers'.
    hsv = rgb_to_hsv_np(img)
    hue, sat, val = hsv[..., 0], hsv[..., 1], hsv[..., 2]
    hist_h, _ = np.histogram(hue, bins=16, range=HUE_HIST_RANGE)
    hist_s, _ = np.histogram(sat, bins=16, range=SV_HIST_RANGE)
    hist_v, _ = np.histogram(val, bins=16, range=SV_HIST_RANGE)
    br = val.mean()
    ct = val.std()
    # Fraction (0..1) of pixels above the threshold, despite the "_pct" key.
    sat_clip = (sat > SAT_CLIP_THRESHOLD).mean()

    sharp = laplacian_variance(img)
    noise = noise_proxy(img)
    # Spread of the three channel means.
    gw_diff = float(np.std(mean_rgb))
    ahash = average_hash(img, hash_size=8)

    per_image_records.append({
        "path": row.path,
        "class": get_row_class(row),
        "split": row.split,
        "width": w,
        "height": h,
        "aspect_ratio": aspect_ratio(w, h),
        "mean_r": float(mean_rgb[0]),
        "mean_g": float(mean_rgb[1]),
        "mean_b": float(mean_rgb[2]),
        "std_r": float(std_rgb[0]),
        "std_g": float(std_rgb[1]),
        "std_b": float(std_rgb[2]),
        "brightness": float(br),
        "contrast": float(ct),
        "sat_clip_pct": float(sat_clip),
        "sharpness_laplacian": float(sharp),
        "noise_proxy": float(noise),
        "grayworld_std": float(gw_diff),
        "hash": ahash,
        "hist_h": hist_h.tolist(),
        "hist_s": hist_s.tolist(),
        "hist_v": hist_v.tolist(),
    })

df_stats = pd.DataFrame(per_image_records)
df_stats.to_pickle(OUT_DIR / "per_image_stats.pkl")  # convenient
df_stats.to_csv(OUT_DIR / "per_image_stats.csv", index=False)
print("Saved per-image stats to:", OUT_DIR / "per_image_stats.csv")

# ----------------------------------------------------------------------
# 6. PER-CLASS AGGREGATION
# ----------------------------------------------------------------------
# Aggregations applied to each per-image statistic when grouping by class.
agg_funcs = {
    "width": ["mean", "min", "max"],
    "height": ["mean", "min", "max"],
    "aspect_ratio": ["mean", "min", "max"],
    "mean_r": "mean",
    "mean_g": "mean",
    "mean_b": "mean",
    "std_r": "mean",
    "std_g": "mean",
    "std_b": "mean",
    "brightness": ["mean", "std"],
    "contrast": ["mean", "std"],
    "sat_clip_pct": ["mean"],
    "sharpness_laplacian": ["mean", "std"],
    "noise_proxy": ["mean", "std"],
    "grayworld_std": ["mean", "max"],
}

df_class = df_stats.groupby("class").agg(agg_funcs)
# Flatten the (statistic, aggregation) column pairs into names such as
# "width_mean". to_flat_index() yields the tuples directly, without relying
# on Index.ravel() returning an ndarray.
df_class.columns = [
    "_".join(part for part in col if part)
    for col in df_class.columns.to_flat_index()
]
df_class.to_csv(OUT_DIR / "per_class_summary.csv")
print("\nPer-class summary:\n", df_class)

# ----------------------------------------------------------------------
# 7. RESOLUTION & ASPECT-RATIO DISTRIBUTION
# ----------------------------------------------------------------------
# Number of images for every distinct (width, height) pair.
images_per_resolution = df_stats.groupby(["width", "height"]).size()
res_counts = images_per_resolution.reset_index(name="count")
res_counts.to_csv(OUT_DIR / "resolution_distribution.csv", index=False)

# Number of images for every distinct rounded aspect ratio.
ar_counts = (
    df_stats["aspect_ratio"]
    .value_counts()
    .rename_axis("aspect_ratio")
    .reset_index(name="count")
)
ar_counts.to_csv(OUT_DIR / "aspect_ratio_distribution.csv", index=False)

# ----------------------------------------------------------------------
# 8. DUPLICATE / NEAR-DUPLICATE DETECTION
#    (same hash -> very likely same image; we just dump groups)
# ----------------------------------------------------------------------
# One row per hash value, holding the list of image paths that produced it.
paths_by_hash = df_stats.groupby("hash")["path"].apply(list).reset_index()
# Keep only hashes shared by more than one image.
shared_by_several = paths_by_hash["path"].map(len) > 1
dup_groups = paths_by_hash[shared_by_several]
dup_json_path = OUT_DIR / "duplicate_candidates.json"
dup_groups.to_json(dup_json_path, orient="records", indent=4)
print("\nPotential duplicate groups:", len(dup_groups))

# ----------------------------------------------------------------------
# 9. AUGMENTATION PROBE
#    we just check if simple augs break aspect ratio / size / class distribution
# ----------------------------------------------------------------------
import random
from PIL import ImageEnhance

def random_augment(pil_img):
    """Apply one randomly chosen augmentation to a PIL image.

    Args:
        pil_img: The PIL image to augment.

    Returns:
        Tuple ``(augmented_image, label)`` where ``label`` is one of
        ``"hflip"``, ``"center_crop"``, ``"color_jitter"`` or ``"blur"``.
    """
    kind = random.choice(["flip", "crop", "color", "blur"])

    if kind == "flip":
        mirrored = pil_img.transpose(Image.FLIP_LEFT_RIGHT)
        return mirrored, "hflip"

    if kind == "crop":
        # Cut out the central 90% of each side, then scale the crop back to
        # the original size so the output dimensions are unchanged.
        width, height = pil_img.size
        crop_w = int(width*0.9)
        crop_h = int(height*0.9)
        x0 = (width - crop_w)//2
        y0 = (height - crop_h)//2
        cropped = pil_img.crop((x0, y0, x0+crop_w, y0+crop_h))
        return cropped.resize((width, height)), "center_crop"

    if kind == "color":
        boosted = ImageEnhance.Color(pil_img).enhance(1.5)
        return boosted, "color_jitter"

    softened = pil_img.filter(ImageFilter.GaussianBlur(radius=1.0))
    return softened, "blur"

probe_results = []
probe_samples = df_index.sample(min(30, len(df_index)), random_state=42)

# The sample above is seeded, but random_augment() draws from the global
# `random` generator; seed it too so the probe CSV is identical across runs.
random.seed(42)

for row in probe_samples.itertuples(index=False):
    # Close the source file as soon as the RGB copy has been made.
    with Image.open(row.path) as src:
        img = src.convert("RGB")
    aug_img, aug_name = random_augment(img)
    arr = np.array(aug_img)
    # Record the post-augmentation size and tone so they can be compared
    # with the per-image statistics of the untouched files.
    probe_results.append({
        "path": row.path,
        "class": get_row_class(row),
        "aug": aug_name,
        "after_w": arr.shape[1],
        "after_h": arr.shape[0],
        "after_brightness": float(image_brightness(arr)),
        "after_contrast": float(image_contrast(arr)),
    })

df_probe = pd.DataFrame(probe_results)
df_probe.to_csv(OUT_DIR / "augmentation_probe.csv", index=False)
print("Saved augmentation probe:", OUT_DIR / "augmentation_probe.csv")

# ----------------------------------------------------------------------
# 10. FINAL JSON REPORT  (for Task 1 submission)
# ----------------------------------------------------------------------
# Ten most frequent resolutions and aspect ratios, most common first.
top_resolutions = res_counts.sort_values("count", ascending=False).head(10)
top_aspect_ratios = ar_counts.sort_values("count", ascending=False).head(10)
report_path = OUT_DIR / "final_report.json"

# Summary of the run: headline counts plus the locations of the detailed
# CSV/JSON artefacts written earlier in the script.
final_report = {
    "total_images": int(len(df_index)),
    "by_class": global_counts.to_dict(orient="records"),
    "by_split_class": class_counts.to_dict(orient="records"),
    "resolution_modes": top_resolutions.to_dict(orient="records"),
    "aspect_ratio_top": top_aspect_ratios.to_dict(orient="records"),
    "per_class_summary_csv": str(OUT_DIR / "per_class_summary.csv"),
    "duplicates_found": int(len(dup_groups)),
    "dup_file": str(OUT_DIR / "duplicate_candidates.json"),
    "augmentation_probe_csv": str(OUT_DIR / "augmentation_probe.csv"),
    "notes": "Generated for CSE475 Task 1 – image-focused EDA.",
}

with open(report_path, "w") as report_file:
    json.dump(final_report, report_file, indent=4)

print("\nFinal report saved to:", report_path)

# ----------------------------------------------------------------------
# 11. RELATED WORK TABLE TEMPLATE (Task 1)
# ----------------------------------------------------------------------
# Column headers of the (empty) related-work table to be filled in by hand.
related_work_cols = [
    "Title",
    "Dataset name and URL",
    "Dataset description",
    "Methods name",
    "Accuracy of the model",
    "Pros",
    "Cons",
    "Citation",
]
template_path = OUT_DIR / "related_work_template.csv"
# A frame with headers only: the CSV contains just the header row.
df_rw = pd.DataFrame(columns=related_work_cols)
df_rw.to_csv(template_path, index=False)
print("Related work template saved to:", template_path)


Using dataset root: /kaggle/input/betel-leaf
Total images found: 893
                                                path    class  \
0  /kaggle/input/betel-leaf/Controlled Environmen...  Healthy   
1  /kaggle/input/betel-leaf/Controlled Environmen...  Healthy   
2  /kaggle/input/betel-leaf/Controlled Environmen...  Healthy   
3  /kaggle/input/betel-leaf/Controlled Environmen...  Healthy   
4  /kaggle/input/betel-leaf/Controlled Environmen...  Healthy   

                    split  
0  Controlled Environment  
1  Controlled Environment  
2  Controlled Environment  
3  Controlled Environment  
4  Controlled Environment  

Class balance:
                     split     class  count
0  Controlled Environment  Diseased    220
1  Controlled Environment     Dried    340
2  Controlled Environment   Healthy    333

Global class counts:
       class  count
0     Dried    340
1   Healthy    333
2  Diseased    220

Computing per-image statistics ...


100%|██████████| 893/893 [1:49:48<00:00,  7.38s/it]


Saved per-image stats to: /kaggle/working/eda_outputs/per_image_stats.csv

Per-class summary:
            width_mean  width_min  width_max  height_mean  height_min  \
class                                                                  
Diseased  6112.000000       6112       6112  6112.000000        6112   
Dried     6110.455882       5801       6112  6110.455882        5801   
Healthy   6112.000000       6112       6112  6112.000000        6112   

          height_max  aspect_ratio_mean  aspect_ratio_min  aspect_ratio_max  \
class                                                                         
Diseased        6112                1.0               1.0               1.0   
Dried           6112                1.0               1.0               1.0   
Healthy         6112                1.0               1.0               1.0   

          mean_r_mean  ...  brightness_std  contrast_mean  contrast_std  \
class                  ...                                               