In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import shutil
from pathlib import Path
from google.colab import drive
import kagglehub

"""
Downloads Kaggle CLEVR dataset and stores it persistently in Google Drive
"""

# Mount Google Drive at /content/drive; the dataset target below lives under this mount point.
drive.mount("/content/drive")

# Kaggle dataset handle that download_and_extract() passes to kagglehub.dataset_download().
KAGGLE_DATASET = "timoboz/clevr-dataset"

# Google Drive folder under which the dataset copy is stored.
DRIVE_ROOT = "/content/drive/MyDrive/Colab Notebooks/FYP/dataset"
# Destination of the CLEVR copy; download_and_extract() reuses this directory when it already exists.
CLEVR_DIR = os.path.join(DRIVE_ROOT, "clevr_kaggle")


def download_and_extract(force_download: bool = False) -> str:
    """Ensure the Kaggle CLEVR dataset is present in Google Drive and return its path.

    If ``CLEVR_DIR`` already exists and ``force_download`` is false, the
    existing directory is reused and nothing is downloaded. Otherwise the
    dataset is fetched with ``kagglehub.dataset_download`` and copied to Drive.

    The copy is staged in a sibling ``<CLEVR_DIR>.partial`` directory and only
    renamed to ``CLEVR_DIR`` once it has finished. An interrupted copy
    therefore never leaves a half-filled ``CLEVR_DIR`` that a later call would
    mistake for a complete dataset, and a forced refresh keeps the previous
    copy in place until the new one is ready.

    Args:
        force_download: When true, download again even if ``CLEVR_DIR``
            exists and replace the existing copy.

    Returns:
        The dataset directory in Google Drive (``CLEVR_DIR``) as a string.

    Raises:
        OSError: If creating, copying, removing or renaming a directory fails.
            Errors raised by ``kagglehub.dataset_download`` propagate unchanged.
    """
    clevr_path = Path(CLEVR_DIR)
    # Same parent directory as the final target, so the closing rename stays
    # inside one folder.
    staging_path = clevr_path.with_name(clevr_path.name + ".partial")

    # Reuse the copy already stored in Drive
    if clevr_path.exists() and not force_download:
        print(f"‚úì CLEVR Kaggle dataset already exists at:\n  {clevr_path}")
        return str(clevr_path)

    os.makedirs(DRIVE_ROOT, exist_ok=True)

    print("üì• Downloading CLEVR dataset from Kaggle...")
    print("‚è±Ô∏è  This may take several minutes...")

    # Step 1: download into the kagglehub cache
    cache_path = kagglehub.dataset_download(
        KAGGLE_DATASET,
        force_download=force_download
    )

    print(f"‚úì Downloaded to cache:\n  {cache_path}")

    # Step 2: copy into the staging directory. Anything already there is the
    # remainder of an earlier interrupted run and cannot be trusted.
    if staging_path.exists():
        shutil.rmtree(staging_path)

    shutil.copytree(cache_path, staging_path)

    # Step 3: swap the finished copy into place. The old copy (forced refresh)
    # is removed only now that its replacement is complete.
    if clevr_path.exists():
        shutil.rmtree(clevr_path)

    staging_path.rename(clevr_path)

    print(f"‚úì Dataset copied to Google Drive:\n  {clevr_path}")

    return str(clevr_path)


Dataset dimensions check


In [None]:
import os, json, time
from pathlib import Path

# Location of the CLEVR_v1.0 folder inside the Google Drive dataset copy.
CLEVR_ROOT = "/content/drive/MyDrive/Colab Notebooks/FYP/dataset/clevr_kaggle/CLEVR_v1.0"

# Image folder for each split.
IMAGE_DIR = {
    split: f"{CLEVR_ROOT}/images/{split}"
    for split in ("train", "val", "test")
}

# Question JSON file for each split.
QUESTION_FILES = {
    split: f"{CLEVR_ROOT}/questions/CLEVR_{split}_questions.json"
    for split in ("train", "val", "test")
}

# Scene JSON file for each split; only train and val are listed.
SCENE_FILES = {
    split: f"{CLEVR_ROOT}/scenes/CLEVR_{split}_scenes.json"
    for split in ("train", "val")
}

def safe_listdir(path, retries=5, sleep_s=2):
    """List a directory, retrying when ``os.listdir`` raises ``OSError``.

    Intended to survive transient Google Drive I/O errors (Errno 5). Every
    failed attempt is reported with ``print``.

    Args:
        path: Directory to list.
        retries: Maximum number of ``os.listdir`` attempts.
        sleep_s: Seconds to wait between two consecutive attempts.

    Returns:
        The list of entry names from ``os.listdir``, or ``None`` when every
        attempt failed (or ``retries`` is less than 1).
    """
    for attempt in range(1, retries + 1):
        try:
            return os.listdir(path)
        except OSError as e:
            print(f"‚ö†Ô∏è listdir failed (attempt {attempt}/{retries}) for {path}\n   -> {e}")
            # Wait only when another attempt follows; sleeping after the last
            # failure would merely delay the caller's fallback.
            if attempt < retries:
                time.sleep(sleep_s)
    return None

def count_images(path, retries=5):
    """Count the PNG files directly inside ``path``.

    The listing is first attempted with ``safe_listdir``; if that gives up,
    ``Path.glob`` is used as a slower fallback. Both paths match the ``.png``
    suffix case-insensitively, so they report the same number for the same
    folder.

    Args:
        path: Directory expected to contain the images.
        retries: Number of attempts forwarded to ``safe_listdir``.

    Returns:
        A ``(count, mode)`` tuple. ``mode`` is ``"missing"`` (count 0) when
        ``path`` does not exist, ``"listdir"`` or ``"glob"`` depending on
        which listing succeeded, or ``"failed (<error>)"`` with count -1 when
        the fallback raised ``OSError``.
    """
    if not os.path.exists(path):
        return 0, "missing"

    # Fast path (listdir) with retries
    files = safe_listdir(path, retries=retries)
    if files is not None:
        return sum(1 for f in files if f.lower().endswith(".png")), "listdir"

    # Fallback path (Path.glob) - slower, but sometimes works when listdir fails.
    # Glob everything and filter by lowercased name so that the suffix test is
    # the same as in the fast path above.
    try:
        entries = Path(path).glob("*")
        return sum(1 for entry in entries if entry.name.lower().endswith(".png")), "glob"
    except OSError as e:
        return -1, f"failed ({e})"

def count_json_items(path, key):
    """Count the entries stored under ``key`` in a JSON file.

    Problems are reported through the returned status instead of an
    exception, so a caller looping over several files keeps going.

    Args:
        path: JSON file to read (opened as UTF-8).
        key: Top-level key whose value is measured with ``len``.

    Returns:
        A ``(count, status)`` tuple:
        ``(0, "missing")`` when ``path`` does not exist;
        ``(len(value), "ok")`` on success, with 0 when ``key`` is absent;
        ``(-1, "failed (<error>)")`` when the file cannot be read;
        ``(-1, "bad json (<error>)")`` when the content is not valid UTF-8
        JSON or its top level is not an object.
    """
    if not os.path.exists(path):
        return 0, "missing"
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except OSError as e:
        return -1, f"failed ({e})"
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        return -1, f"bad json ({e})"

    # A valid JSON document may be a list, number, string, ... at top level;
    # only an object supports the key lookup below.
    if not isinstance(data, dict):
        return -1, f"bad json (expected a top-level object, got {type(data).__name__})"
    return len(data.get(key, [])), "ok"

# Report, per split, the image count and how it was obtained (the mode returned by count_images).
print("üì∑ IMAGE COUNTS")
for split, path in IMAGE_DIR.items():
    n, mode = count_images(path)
    print(f"  {split.upper():5} images : {n}   [{mode}]")

# Report, per split, the number of entries under the "questions" key of the question file.
print("\n‚ùì QUESTION COUNTS")
for split, path in QUESTION_FILES.items():
    n, mode = count_json_items(path, "questions")
    print(f"  {split.upper():5} questions : {n}   [{mode}]")

# Report, per split, the number of entries under the "scenes" key of the scene file.
print("\nüß© SCENE COUNTS")
for split, path in SCENE_FILES.items():
    n, mode = count_json_items(path, "scenes")
    print(f"  {split.upper():5} scenes : {n}   [{mode}]")

print("\n‚úÖ Done.")
