# Imports and Utilities

In [None]:
import os
import re
import json
import pandas as pd
import numpy as np

import kagglehub
from kagglehub import KaggleDatasetAdapter

pd.set_option("display.max_columns", 200)

# Canonical Harmonization Schema

We standardize all datasets into the following structure.

**Required fields**
- image_id
- dataset_name
- image_path
- modality
- eye
- diagnosis_raw
- diagnosis_category

**Optional fields** (included when available)
- patient metadata (age, sex, patient_id)
- image metadata (resolution_x/y)
- severity grading
- view type (macula, optic disc, unknown)
- extra_json for non-standard fields

The schema is intentionally simple for now. It will grow later as I formalize the mappings.

In [None]:
# Column names of the harmonized table, in output order.
CANONICAL_COLUMNS = [
    # Identification
    "image_id", "dataset_name", "image_path",
    # Acquisition
    "eye", "modality", "view_type",
    # Diagnosis
    "diagnosis_raw", "diagnosis_category", "diagnosis_binary", "severity",
    # Patient metadata
    "patient_id", "age", "sex",
    # Image metadata
    "resolution_x", "resolution_y",
    # Everything that does not fit a canonical column
    "extra_json",
]


def canonical_row():
    """Build a fresh dict with every canonical column set to ``None``.

    A new dict is created on each call, so callers can fill it in place
    without affecting other rows.
    """
    return dict.fromkeys(CANONICAL_COLUMNS)

# Basic Harmonization Rules

We start with simple, flexible rules:

- Map diagnosis labels using keyword search.
- Infer laterality (left/right) from filename.
- Infer modality (OCT, fundus, etc.) from dataset name.
- Extract standard columns when possible.

In [None]:
def map_diagnosis(raw):
    """Map a raw diagnosis label to a coarse diagnosis category.

    Args:
        raw: The dataset's original label (any type; it is stringified).

    Returns:
        ``None`` when the label is missing (``None`` or float NaN), otherwise
        one of ``"DR"``, ``"AMD"``, ``"Cataract"``, ``"Glaucoma"``,
        ``"Normal"``, ``"Edema"`` or ``"Other"``. Rules are checked in that
        order and the first match wins.

    Note:
        Negations are not interpreted: a label such as ``"No_DR"`` still maps
        to ``"DR"``.
    """
    # pandas represents empty cells as float NaN; NaN is the only float that
    # is not equal to itself.
    if raw is None or (isinstance(raw, float) and raw != raw):
        return None

    text = str(raw).lower()
    # Runs of letters, so "proliferate_dr" and "dr2" both yield the token "dr".
    tokens = set(re.findall(r"[a-z]+", text))

    # "dr" is matched as a whole token: a plain substring test would also fire
    # on unrelated labels such as "drusen" or "syndrome".
    if tokens & {"dr", "pdr", "npdr"} or "retinopathy" in text:
        return "DR"
    if "amd" in text or "degeneration" in text:
        return "AMD"
    if "cataract" in text:
        return "Cataract"
    if "glaucoma" in text:
        return "Glaucoma"
    if "normal" in text:
        return "Normal"
    if "fluid" in text or "cyst" in text or "edema" in text:
        return "Edema"
    return "Other"

def infer_eye(path):
    """Infer eye laterality from an image path.

    The path is split into alphanumeric tokens. A token marks the left eye if
    it is ``l`` or ``os`` or starts with ``left``; it marks the right eye if
    it is ``r`` or ``od`` or starts with ``right``. The filename is inspected
    first; directory names are used only when the filename gives no answer.

    Args:
        path: Image path; anything that is not a ``str`` yields ``None``.

    Returns:
        ``"left"``, ``"right"``, or ``None`` when no marker is found or when
        left and right markers appear together.
    """
    if not isinstance(path, str):
        return None

    normalized = path.lower().replace("\\", "/")
    directory, _, filename = normalized.rpartition("/")

    # Whole-token matching: substring tests would also fire on "photos" (os),
    # "model" (od), "_labeled" (_l) or "_retina" (_r).
    for part in (filename, directory):
        tokens = [tok for tok in re.split(r"[^a-z0-9]+", part) if tok]
        is_left = any(
            tok in ("l", "os") or tok.startswith("left") for tok in tokens
        )
        is_right = any(
            tok in ("r", "od") or tok.startswith("right") for tok in tokens
        )
        # Exactly one side must be present; otherwise this part is undecided.
        if is_left != is_right:
            return "left" if is_left else "right"
    return None

def infer_modality(dataset_name):
    """Guess the imaging modality from keywords in the dataset name.

    Matching is case-insensitive and substring-based. Rules are tried in
    order and the first hit wins; ``"Unknown"`` is returned when none match.
    """
    lowered = dataset_name.lower()
    keyword_rules = (
        (("oct",), "OCT"),
        (("fundus", "messidor", "aptos"), "Fundus"),
        (("cataract",), "Slit-Lamp"),
    )
    for keywords, modality in keyword_rules:
        for keyword in keywords:
            if keyword in lowered:
                return modality
    return "Unknown"

# Universal Loader

This function:

- Downloads the Kaggle dataset
- Attempts to detect key columns
- Converts each row into the canonical format

This gives a single interface for all datasets.

In [None]:
def load_dataset(identifier, dataset_name, file_path=""):
    """Load a Kaggle dataset and convert its rows into the canonical schema.

    Args:
        identifier: Kaggle dataset handle (``"owner/dataset"``).
        dataset_name: Human-readable name; used as the ``image_id`` prefix,
            stored in ``dataset_name`` and passed to ``infer_modality``.
        file_path: Path of the tabular file inside the dataset, forwarded to
            kagglehub. Defaults to ``""`` as before.

    Returns:
        A DataFrame with exactly the ``CANONICAL_COLUMNS``, one row per input
        row. The frame is empty (but still has the canonical columns) when
        loading fails or the source has no rows.
    """
    print(f"Loading dataset: {dataset_name} ({identifier})")

    try:
        # The file path is passed positionally so the call does not depend on
        # the keyword name kagglehub uses for that parameter.
        df = kagglehub.load_dataset(
            KaggleDatasetAdapter.PANDAS,
            identifier,
            file_path,
        )
    except Exception as e:
        # Best effort: one unavailable dataset must not abort the whole run.
        print(f"Failed to load {identifier}: {e}")
        return pd.DataFrame(columns=CANONICAL_COLUMNS)

    def _find_column(keywords):
        """Return the first column whose name contains any keyword, or None."""
        return next(
            (c for c in df.columns if any(k in str(c).lower() for k in keywords)),
            None,
        )

    def _none_if_missing(value):
        """Turn pandas missing-value scalars (NaN, NaT, NA) into None."""
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    # Auto-detect image and diagnosis fields
    img_field = _find_column(("path", "img"))
    diag_field = _find_column(("label", "class", "diagn"))

    # Loop-invariant values, computed once rather than per row.
    extra_columns = [c for c in df.columns if c not in [img_field, diag_field]]
    modality = infer_modality(dataset_name)

    rows = []
    for idx, row in df.iterrows():
        r = canonical_row()
        r["image_id"] = f"{dataset_name}_{idx}"
        r["dataset_name"] = dataset_name
        r["image_path"] = _none_if_missing(row.get(img_field, None))
        r["diagnosis_raw"] = _none_if_missing(row.get(diag_field, None))
        r["diagnosis_category"] = map_diagnosis(r["diagnosis_raw"])
        r["eye"] = infer_eye(r["image_path"])
        r["modality"] = modality
        # Non-canonical source columns are preserved verbatim as JSON.
        r["extra_json"] = json.dumps({c: row[c] for c in extra_columns}, default=str)

        rows.append(r)

    # Explicit columns keep the schema intact even when there are no rows.
    return pd.DataFrame(rows, columns=CANONICAL_COLUMNS)

# Dataset Registry

This is the list of all datasets included in the project.
Each dataset can be enabled or disabled by adding, removing, or commenting out its entry in this list; the loader and pipeline code do not need to change.

In [None]:
# Registry of datasets to harmonize. Each entry is a
# (Kaggle dataset identifier, display name) pair; the pipeline unpacks it and
# passes both values to load_dataset().
# The display name is not purely cosmetic: load_dataset() uses it as the
# image_id prefix and hands it to infer_modality(), which looks for keywords
# in it.
DATASETS = [
    ("sheemazain/cataract-classification-dataset-in-ds", "Cataract DS"),
    ("drbasanthkb/cornea-in-diabetes", "Cornea in Diabetes"),
    ("pritpal2873/diabetic-retinopathy-detection-classification-data", "DR Detection"),
    ("sumit17125/eye-image-dataset", "Eye Image Dataset"),
    ("arjunbhushan005/fundus-images", "Fundus Images"),
    ("orvile/macular-degeneration-disease-dataset", "Macular Degeneration"),
    ("google-brain/messidor2-dr-grades", "Messidor2"),
    ("orvile/octdl-optical-coherence-tomography-dataset", "OCTDL"),
    ("shakilrana/octdl-retinal-oct-images-dataset", "OCTDL Images"),
    ("ferencjuhsz/refuge2-and-refuge2cross-dataset", "Refuge2"),
    ("mohamedabdalkader/retinal-disease-detection", "Retinal Disease Detection"),
    ("joseguzman/y79-retinoblastoma-cells", "Retinoblastoma Cells"),
]

# Harmonization Pipeline

This section loads, harmonizes, merges, and exports everything into a single Parquet file.

In [None]:
# Load every registered dataset and keep only those that produced rows.
harmonized_frames = []

for identifier, name in DATASETS:
    df = load_dataset(identifier, name)
    if not df.empty:
        harmonized_frames.append(df)

if harmonized_frames:
    final_df = pd.concat(harmonized_frames, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # frame that still carries the canonical columns.
    print("No dataset could be loaded; the harmonized table is empty.")
    final_df = pd.DataFrame(columns=CANONICAL_COLUMNS)

print("Total harmonized rows:", len(final_df))
final_df.head()

# Export

In [None]:
# Work on a copy so the in-memory table keeps its original values.
export_df = final_df.copy()

# NOTE(review): raw labels/paths may be numeric in one dataset and text in
# another, which leaves mixed-type object columns after concatenation; the
# Parquet writer needs a single type per column, so these are stored as
# strings (missing values stay missing). Confirm this matches downstream use.
for text_column in ("image_path", "diagnosis_raw"):
    if text_column in export_df.columns:
        export_df[text_column] = export_df[text_column].map(
            lambda value: None if pd.isna(value) else str(value)
        )

export_df.to_parquet("harmonized.parquet")
print("Exported harmonized.parquet")

# Next Steps for the project

- Refine data dictionary.
- Strengthen diagnosis mapping rules.
- Extract pixel metadata.
- Validate outliers and noise.