In [None]:
import numpy as np
import pandas as pd

Get number of questions in original dataset


In [None]:
import json

base_path = r"/content/drive/MyDrive/Colab Notebooks/FYP/dataset/clevr_kaggle/CLEVR_v1.0/questions"
total_q = 0

for i in range(1, 6):
    json_path = rf"{base_path}/CLEVR_val_questions_L{i}.json"

    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        num_questions = len(data["questions"])
        print(f"L{i}: {num_questions} questions")
        total_q += num_questions

    except FileNotFoundError:
        print(f"L{i}: File not found â†’ {json_path}")

    except KeyError:
        print(f"L{i}: Missing 'questions' key in file")

print(total_q)

L1: 41884 questions
L2: 6780 questions
L3: 26981 questions
L4: 32046 questions
L5: 42300 questions
149991


## Downsampling with skewness and from the train dataset

avg = 2.87  â†’ on average, each image appears in ~3 questions
max = 9     â†’ worst case: one image appears 9 times

In [None]:
import json
from collections import Counter

path = r"D:\VS Projects\seli-fyp\competence-aware-curriculum-framework-VQA\dataset\CLEVR_v1.0\questions"

for i in range(1,6):
    with open(rf"{path}\CLEVR_train_questions_L{i}.json") as f:
        data = json.load(f)

    counts = Counter(q["image_index"] for q in data["questions"])

    avg = sum(counts.values()) / len(counts)
    mx = max(counts.values())

    print(f"L{i}: avg={avg:.2f}, max={mx}")

L1: avg=2.87, max=9
L2: avg=1.16, max=3
L3: avg=2.00, max=8
L4: avg=2.30, max=7
L5: avg=2.87, max=8


**CLEVR Curriculum Tier Dataset Statistics Analyzer**

Option A : Computes downsampled dataset stats

In [None]:
import json
import os
from collections import Counter

# Base directory containing downsampled CLEVR tier JSON files
# Updated for Google Colab + Google Drive environment
BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampled"


def analyze_tier(json_path):
    """
    Analyze a single CLEVR tier JSON file and compute dataset statistics and returns:
    total_q : int
        Total number of questions in the tier.
    num_images : int
        Number of unique images referenced in the tier.
    avg_q_per_img : float
        Average number of questions per image.
    max_q_per_img : int
        Maximum number of questions associated with a single image.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Extract questions list following CLEVR JSON structure
    questions = data.get("questions", [])
    total_q = len(questions)

    # Count how many times each image is referenced
    image_counter = Counter(
        q["image_filename"] for q in questions
    )

    num_images = len(image_counter)
    avg_q_per_img = total_q / num_images if num_images else 0
    max_q_per_img = max(image_counter.values()) if num_images else 0

    return total_q, num_images, avg_q_per_img, max_q_per_img

print("CLEVR TRAIN BALANCED DATASET STATISTICS")

# Sanity check to ensure expected files exist in the directory
print("Files found:", os.listdir(BASE_DIR), "\n")

# Iterate through all tier JSON files and report statistics
for file in sorted(os.listdir(BASE_DIR)):
    if not file.lower().endswith(".json"):
        continue

    # Identify tier level from filename
    tier = None
    for t in ["L1", "L2", "L3", "L4", "L5"]:
        if t in file:
            tier = t
            break

    if tier is None:
        continue

    path = os.path.join(BASE_DIR, file)
    total_q, num_images, avg_q, max_q = analyze_tier(path)

    print(f"=== {file} ===")
    print(f"Tier                  : {tier}")
    print(f"Total questions       : {total_q}")
    print(f"Unique images         : {num_images}")
    print(f"Avg questions / image : {avg_q:.2f}")
    print(f"Max questions / image : {max_q}")
    print()



ðŸ“Š CLEVR TRAIN BALANCED DATASET STATS

Files found: ['CLEVR_train_questions_L1.json', 'CLEVR_train_questions_L2.json', 'CLEVR_train_questions_L3.json', 'CLEVR_train_questions_L4.json', 'CLEVR_train_questions_L5.json'] 

=== CLEVR_train_questions_L1.json ===
Tier                  : L1
Total questions       : 60000
Unique images         : 31929
Avg questions / image : 1.88
Max questions / image : 2

=== CLEVR_train_questions_L2.json ===
Tier                  : L2
Total questions       : 31437
Unique images         : 27208
Avg questions / image : 1.16
Max questions / image : 3

=== CLEVR_train_questions_L3.json ===
Tier                  : L3
Total questions       : 60000
Unique images         : 36563
Avg questions / image : 1.64
Max questions / image : 2

=== CLEVR_train_questions_L4.json ===
Tier                  : L4
Total questions       : 60000
Unique images         : 34307
Avg questions / image : 1.75
Max questions / image : 2

=== CLEVR_train_questions_L5.json ===
Tier           

Option B : Computes downsampled dataset stats

In [None]:
import json
import os
from collections import Counter

# ðŸ”§ UPDATED PATH FOR COLAB
BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampledV2"

def analyze_tier(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # CLEVR format
    questions = data.get("questions", [])
    total_q = len(questions)

    image_counter = Counter(
        q["image_filename"] for q in questions
    )

    num_images = len(image_counter)
    avg_q_per_img = total_q / num_images if num_images else 0
    max_q_per_img = max(image_counter.values()) if num_images else 0

    return total_q, num_images, avg_q_per_img, max_q_per_img


print("\nðŸ“Š CLEVR TRAIN BALANCED DATASET STATS\n")

# Optional: sanity check
print("Files found:", os.listdir(BASE_DIR), "\n")

for file in sorted(os.listdir(BASE_DIR)):
    if not file.lower().endswith(".json"):
        continue

    tier = None
    for t in ["L1", "L2", "L3", "L4", "L5"]:
        if t in file:
            tier = t
            break

    if tier is None:
        continue

    path = os.path.join(BASE_DIR, file)
    total_q, num_images, avg_q, max_q = analyze_tier(path)

    print(f"=== {file} ===")
    print(f"Tier                  : {tier}")
    print(f"Total questions       : {total_q}")
    print(f"Unique images         : {num_images}")
    print(f"Avg questions / image : {avg_q:.2f}")
    print(f"Max questions / image : {max_q}")
    print()




ðŸ“Š CLEVR TRAIN BALANCED DATASET STATS

Files found: ['CLEVR_train_questions_L1.json', 'CLEVR_train_questions_L2.json', 'CLEVR_train_questions_L3.json', 'CLEVR_train_questions_L4.json', 'CLEVR_train_questions_L5.json'] 

=== CLEVR_train_questions_L1.json ===
Tier                  : L1
Total questions       : 60000
Unique images         : 60000
Avg questions / image : 1.00
Max questions / image : 1

=== CLEVR_train_questions_L2.json ===
Tier                  : L2
Total questions       : 31437
Unique images         : 27208
Avg questions / image : 1.16
Max questions / image : 3

=== CLEVR_train_questions_L3.json ===
Tier                  : L3
Total questions       : 60000
Unique images         : 60000
Avg questions / image : 1.00
Max questions / image : 1

=== CLEVR_train_questions_L4.json ===
Tier                  : L4
Total questions       : 60000
Unique images         : 60000
Avg questions / image : 1.00
Max questions / image : 1

=== CLEVR_train_questions_L5.json ===
Tier           

**TRANING DOWNSAMPLE**

DOWNSAMPLE OPTION A

* Downsampling questions per image to avoid image bias
* Enforcing tier-specific question caps
* Preserving reasoning diversity while controlling dataset size

Flow

1. Group questions by image
2. Randomly shuffle images and questions (seeded for reproducibility)
3. Select a limited number of questions per image
4. Stop once the tierâ€™s target size is reached


In [4]:
import json
import random
import os
from collections import defaultdict
from google.colab import drive

# -------- MOUNT GOOGLE DRIVE -------- #
drive.mount('/content/drive')

# ---------------- CONFIG ---------------- #
INPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/dataset/clevr_kaggle/CLEVR_v1.0/questions"
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampledV3"

MAX_Q_PER_IMAGE = {
    1: 2,  # primary cap for L1
    2: 1,  # ignored (we keep all)
    3: 2,
    4: 2,
    5: 3,
}

TARGET_PER_TIER = {
    1: 60000,
    2: None,   # keep all
    3: 60000,
    4: 60000,
    5: 70000,
}

SEED = 42
random.seed(SEED)
# ---------------------------------------- #

os.makedirs(OUTPUT_DIR, exist_ok=True)

def downsample_tier(tier):
    input_file = os.path.join(INPUT_DIR, f"CLEVR_train_questions_L{tier}.json")
    output_file = os.path.join(OUTPUT_DIR, f"CLEVR_train_questions_L{tier}.json")

    with open(input_file, "r") as f:
        data = json.load(f)

    questions = data["questions"]

    # ---------- TIER 2: KEEP ALL ----------
    if tier == 2:
        out_data = {
            "info": data.get("info", {}),
            "questions": questions
        }
        with open(output_file, "w") as f:
            json.dump(out_data, f)

        print(f"âœ… Tier L2: kept ALL {len(questions)} questions â†’ {output_file}")
        return

    # ---------- OTHER TIERS ----------
    image_map = defaultdict(list)
    for q in questions:
        image_map[q["image_index"]].append(q)

    image_ids = list(image_map.keys())
    random.shuffle(image_ids)

    collected = []
    target = TARGET_PER_TIER[tier]
    primary_cap = MAX_Q_PER_IMAGE[tier]

    # ----- PASS 1: primary cap -----
    leftovers = defaultdict(list)

    for img_id in image_ids:
        qs = image_map[img_id]
        random.shuffle(qs)

        collected.extend(qs[:primary_cap])
        leftovers[img_id] = qs[primary_cap:]

        if target and len(collected) >= target:
            collected = collected[:target]
            break

    random.shuffle(collected)

    out_data = {
        "info": data.get("info", {}),
        "questions": collected
    }

    with open(output_file, "w") as f:
        json.dump(out_data, f)

    print(
        f"âœ… Tier L{tier}: kept={len(collected)} "
        f"(primary={primary_cap}"
        f"{', +2 fill' if tier == 1 else ''}) â†’ {output_file}"
    )


# -------- RUN FOR ALL TIERS -------- #
for tier in [1, 2, 3, 4, 5]:
    downsample_tier(tier)

print("\nðŸŽ¯ All tiers downsampled successfully.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
âœ… Tier L1: kept=60000 (primary=2, +2 fill) â†’ /content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampled/CLEVR_train_questions_L1.json
âœ… Tier L2: kept ALL 31437 questions â†’ /content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampled/CLEVR_train_questions_L2.json
âœ… Tier L3: kept=60000 (primary=2) â†’ /content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampled/CLEVR_train_questions_L3.json
âœ… Tier L4: kept=60000 (primary=2) â†’ /content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampled/CLEVR_train_questions_L4.json
âœ… Tier L5: kept=70000 (primary=3) â†’ /content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampled/CLEVR_train_questions_L5.json

ðŸŽ¯ All tiers downsampled successfully.


DOWNSAMPLE OPTION B - ROUND ROBIN BASIS

* Explicitly controlling the average number of questions per image  
* Enforcing uniform image coverage via round-robin sampling
* Preserving CLEVRâ€™s compositional reasoning structure

Flow



1. Take 1 question from every image
2. Continue until: Target size is reached, or Image question limit is reached

In [None]:
import json
import random
import os
from collections import defaultdict
from google.colab import drive
import math

# -------- MOUNT GOOGLE DRIVE -------- #
drive.mount('/content/drive')

# ---------------- CONFIG ---------------- #
INPUT_DIR = "/content"
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampledV2"

TARGET_AVG_Q_PER_IMAGE = {
    1: 3.0,
    2: 1.16,
    3: 2.0,
    4: 2.3,
    5: 3.0,
}

TARGET_PER_TIER = {
    1: 60000,
    2: None,   # keep all
    3: 60000,
    4: 60000,
    5: 70000,
}

SEED = 42
random.seed(SEED)
# ---------------------------------------- #

os.makedirs(OUTPUT_DIR, exist_ok=True)

def downsample_tier(tier):
    input_file = os.path.join(INPUT_DIR, f"CLEVR_train_questions_L{tier}.json")
    output_file = os.path.join(OUTPUT_DIR, f"CLEVR_train_questions_L{tier}.json")

    with open(input_file, "r") as f:
        data = json.load(f)

    questions = data["questions"]

    # ---- Tier 2: keep all ----
    if tier == 2:
        out_data = {"info": data.get("info", {}), "questions": questions}
        with open(output_file, "w") as f:
            json.dump(out_data, f)
        print(f"âœ… Tier L2: kept ALL {len(questions)}")
        return

    # ---- Group by image ----
    image_map = defaultdict(list)
    for q in questions:
        image_map[q["image_index"]].append(q)

    image_ids = list(image_map.keys())
    random.shuffle(image_ids)

    for img_id in image_ids:
        random.shuffle(image_map[img_id])

    target = TARGET_PER_TIER[tier]
    max_per_image = MAX_Q_PER_IMAGE[tier]

    collected = []
    used = defaultdict(int)

    # ---- Round-robin passes ----
    for pass_idx in range(max_per_image):
        for img_id in image_ids:
            if used[img_id] <= pass_idx and pass_idx < len(image_map[img_id]):
                collected.append(image_map[img_id][pass_idx])
                used[img_id] += 1

                if target and len(collected) >= target:
                    collected = collected[:target]
                    break
        if target and len(collected) >= target:
            break

    random.shuffle(collected)

    out_data = {"info": data.get("info", {}), "questions": collected}
    with open(output_file, "w") as f:
        json.dump(out_data, f)

    avg = len(collected) / len(image_ids)
    print(
        f"âœ… Tier L{tier}: kept={len(collected)}, "
        f"avgâ‰ˆ{avg:.2f}, maxâ‰¤{max_per_image}"
    )


# -------- RUN FOR ALL TIERS -------- #
for tier in [1, 2, 3, 4, 5]:
    downsample_tier(tier)

print("\nðŸŽ¯ All tiers downsampled with AVG-based logic.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
âœ… Tier L1: kept=60000, avgâ‰ˆ0.88, maxâ‰¤3
âœ… Tier L2: kept ALL 31437
âœ… Tier L3: kept=60000, avgâ‰ˆ0.95, maxâ‰¤2
âœ… Tier L4: kept=60000, avgâ‰ˆ0.92, maxâ‰¤2
âœ… Tier L5: kept=70000, avgâ‰ˆ1.02, maxâ‰¤3

ðŸŽ¯ All tiers downsampled with AVG-based logic.


**VALIDATION DOWNSAMPLE**

In [None]:
import json
import random
import os
from collections import defaultdict
from google.colab import drive

# -------- MOUNT GOOGLE DRIVE -------- #
drive.mount('/content/drive')

# ---------------- CONFIG ---------------- #
INPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/dataset/clevr_kaggle/CLEVR_v1.0/questions"
OUTPUT_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampled_val"

MAX_Q_PER_IMAGE = {
    1: 2,
    2: 1,  # ignored (keep all)
    3: 2,
    4: 2,
    5: 3,
}

TARGET_PER_TIER = {
    1: 8000,
    2: None,   # keep all
    3: 8000,
    4: 8000,
    5: 10000,
}

SEED = 42
random.seed(SEED)
# ---------------------------------------- #

os.makedirs(OUTPUT_DIR, exist_ok=True)

def downsample_val_tier(tier):
    input_file = os.path.join(INPUT_DIR, f"CLEVR_val_questions_L{tier}.json")
    output_file = os.path.join(OUTPUT_DIR, f"CLEVR_val_questions_L{tier}.json")

    with open(input_file, "r") as f:
        data = json.load(f)

    questions = data["questions"]

    # ---------- TIER 2: KEEP ALL ----------
    if tier == 2:
        out_data = {
            "info": data.get("info", {}),
            "questions": questions
        }
        with open(output_file, "w") as f:
            json.dump(out_data, f)

        print(f"âœ… VAL L2: kept ALL {len(questions)} questions")
        return

    # ---------- OTHER TIERS ----------
    image_map = defaultdict(list)
    for q in questions:
        image_map[q["image_index"]].append(q)

    image_ids = list(image_map.keys())
    random.shuffle(image_ids)

    collected = []
    target = TARGET_PER_TIER[tier]
    cap = MAX_Q_PER_IMAGE[tier]

    for img_id in image_ids:
        qs = image_map[img_id]
        random.shuffle(qs)

        collected.extend(qs[:cap])

        if len(collected) >= target:
            collected = collected[:target]
            break

    random.shuffle(collected)

    out_data = {
        "info": data.get("info", {}),
        "questions": collected
    }

    with open(output_file, "w") as f:
        json.dump(out_data, f)

    print(
        f"âœ… VAL L{tier}: kept={len(collected)} "
        f"(cap={cap}) â†’ {output_file}"
    )


# -------- RUN FOR ALL TIERS -------- #
for tier in [1, 2, 3, 4, 5]:
    downsample_val_tier(tier)

print("\nðŸŽ¯ Validation tiers downsampled successfully.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
âœ… VAL L1: kept=8000 (cap=2) â†’ /content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampled_val/CLEVR_val_questions_L1.json
âœ… VAL L2: kept ALL 6780 questions
âœ… VAL L3: kept=8000 (cap=2) â†’ /content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampled_val/CLEVR_val_questions_L3.json
âœ… VAL L4: kept=8000 (cap=2) â†’ /content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampled_val/CLEVR_val_questions_L4.json
âœ… VAL L5: kept=10000 (cap=3) â†’ /content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampled_val/CLEVR_val_questions_L5.json

ðŸŽ¯ Validation tiers downsampled successfully.


In [None]:
import json
import os
from collections import Counter

# ðŸ”§ UPDATED PATH FOR COLAB
BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/CLEVR/downsampled_val"

def analyze_tier(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # CLEVR format
    questions = data.get("questions", [])
    total_q = len(questions)

    image_counter = Counter(
        q["image_filename"] for q in questions
    )

    num_images = len(image_counter)
    avg_q_per_img = total_q / num_images if num_images else 0
    max_q_per_img = max(image_counter.values()) if num_images else 0

    return total_q, num_images, avg_q_per_img, max_q_per_img


print("\nðŸ“Š CLEVR TRAIN BALANCED DATASET STATS\n")

# Optional: sanity check
print("Files found:", os.listdir(BASE_DIR), "\n")

for file in sorted(os.listdir(BASE_DIR)):
    if not file.lower().endswith(".json"):
        continue

    tier = None
    for t in ["L1", "L2", "L3", "L4", "L5"]:
        if t in file:
            tier = t
            break

    if tier is None:
        continue

    path = os.path.join(BASE_DIR, file)
    total_q, num_images, avg_q, max_q = analyze_tier(path)

    print(f"=== {file} ===")
    print(f"Tier                  : {tier}")
    print(f"Total questions       : {total_q}")
    print(f"Unique images         : {num_images}")
    print(f"Avg questions / image : {avg_q:.2f}")
    print(f"Max questions / image : {max_q}")
    print()




ðŸ“Š CLEVR TRAIN BALANCED DATASET STATS

Files found: ['CLEVR_val_questions_L1.json', 'CLEVR_val_questions_L2.json', 'CLEVR_val_questions_L3.json', 'CLEVR_val_questions_L4.json', 'CLEVR_val_questions_L5.json'] 

=== CLEVR_val_questions_L1.json ===
Tier                  : L1
Total questions       : 8000
Unique images         : 4268
Avg questions / image : 1.87
Max questions / image : 2

=== CLEVR_val_questions_L2.json ===
Tier                  : L2
Total questions       : 6780
Unique images         : 5882
Avg questions / image : 1.15
Max questions / image : 3

=== CLEVR_val_questions_L3.json ===
Tier                  : L3
Total questions       : 8000
Unique images         : 4893
Avg questions / image : 1.63
Max questions / image : 2

=== CLEVR_val_questions_L4.json ===
Tier                  : L4
Total questions       : 8000
Unique images         : 4590
Avg questions / image : 1.74
Max questions / image : 2

=== CLEVR_val_questions_L5.json ===
Tier                  : L5
Total questions 