In [None]:
import json
import os
import pandas as pd
from google.colab import drive

# 1. Mount Google Drive at /content/drive so the dataset folder under
#    /content/drive/MyDrive used below is reachable from this runtime.
drive.mount('/content/drive')

def process_clevr_from_specific_folder(
    folder_path='/content/drive/MyDrive/FYP/FYP impl',
    filename='CLEVR_train_questions.json',
):
    """Categorize CLEVR questions by their final program function and export to Excel.

    Reads ``filename`` from ``folder_path``, looks at the last function of each
    question's ``program`` and maps it to a task category. Every non-empty
    category becomes one sheet of ``CLEVR_Categorized.xlsx``; questions whose
    last function is not in the mapping are written to ``CLEVR_Unmapped.xlsx``.
    Both workbooks are created inside ``folder_path``.

    Args:
        folder_path: Directory that holds the input JSON and receives the
            Excel files. Defaults to the project folder on Google Drive.
        filename: Name of the CLEVR questions JSON inside ``folder_path``.

    Returns:
        None. Progress and problems are reported with ``print``; a missing
        input file ends the function early, and a failure while writing the
        categorized workbook is printed rather than raised.
    """
    # --- CONFIGURATION ---
    input_path = os.path.join(folder_path, filename)
    output_mapped = os.path.join(folder_path, "CLEVR_Categorized.xlsx")
    output_unmapped = os.path.join(folder_path, "CLEVR_Unmapped.xlsx")
    # ---------------------

    # 2. Verify the file exists before starting
    print(f"Checking for file at: {input_path}")
    if not os.path.exists(input_path):
        print(f"❌ Error: File not found at {input_path}")
        print("Please check if the folder name 'FYP impl' has a space and matches exactly.")
        return

    print("✅ File found! Starting processing...")

    # 3. Define Task Mapping (final program function -> task category)
    task_mapping = {
        "count": "Counting", "exist": "Existence",
        "equal_integer": "Compare Integer", "less_than": "Compare Integer", "greater_than": "Compare Integer",
        "query_color": "Query Attribute", "query_size": "Query Attribute",
        "query_material": "Query Attribute", "query_shape": "Query Attribute",
        "equal_color": "Compare Attribute", "equal_size": "Compare Attribute",
        "equal_material": "Compare Attribute", "equal_shape": "Compare Attribute"
    }

    print("Loading JSON data... (This may take 1-2 minutes)")
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    questions = data.get('questions', [])
    categorized_records = {cat: [] for cat in set(task_mapping.values())}
    unmapped_records = []

    print(f"Processing {len(questions)} questions...")
    for q in questions:
        full_program = q.get('program', [])
        if not full_program:
            # Questions without a program cannot be categorized; skip them.
            continue

        # Read every step with .get so one step lacking a "function" key is
        # reported as 'unknown' instead of raising KeyError and aborting the
        # whole run (the last-operation lookup already behaved this way).
        function_names = [node.get('function', 'unknown') for node in full_program]
        last_op = function_names[-1]

        record = {
            "Question Index": q.get('question_index'),
            "Question": q.get('question'),
            "Answer": q.get('answer'),
            "Last Operation": last_op,
            "Full Program": " -> ".join(function_names),
            "Image": q.get('image_filename')
        }

        category = task_mapping.get(last_op)
        if category:
            categorized_records[category].append(record)
        else:
            unmapped_records.append(record)

    # 4. Save to Excel (one sheet per non-empty category)
    print(f"Saving categorized data to: {output_mapped}")
    try:
        with pd.ExcelWriter(output_mapped, engine='openpyxl') as writer:
            data_written = False
            for category, items in categorized_records.items():
                if items:
                    df = pd.DataFrame(items)
                    # Sheet names are cut to 31 characters.
                    df.to_excel(writer, sheet_name=category[:31], index=False)
                    data_written = True

            if not data_written:
                # Write a placeholder sheet when no category received any record.
                pd.DataFrame(["No Data"]).to_excel(writer, sheet_name="Empty")

        print("✅ Categorized Excel saved successfully.")
    except Exception as e:
        print(f"❌ Error saving categorized Excel: {e}")

    # 5. Save Unmapped Data (if any)
    if unmapped_records:
        print(f"Saving unmapped data to: {output_unmapped}")
        pd.DataFrame(unmapped_records).to_excel(output_unmapped, index=False)
        print(f"⚠️ Found {len(unmapped_records)} unmapped items.")
    else:
        print("✅ No unmapped operations found.")

# Run the categorization end-to-end: read the CLEVR questions JSON and write the Excel workbook(s).
process_clevr_from_specific_folder()

Mounted at /content/drive
Checking for file at: /content/drive/MyDrive/FYP/FYP impl/CLEVR_train_questions.json
✅ File found! Starting processing...
Loading JSON data... (This may take 1-2 minutes)
Processing 699989 questions...


Exception ignored in: <function ZipFile.__del__ at 0x7cc5137b0c20>
Traceback (most recent call last):
  File "/usr/lib/python3.12/zipfile/__init__.py", line 1966, in __del__
    self.close()
  File "/usr/lib/python3.12/zipfile/__init__.py", line 1983, in close
    self.fp.seek(self.start_dir)
ValueError: seek of closed file


Saving categorized data to: /content/drive/MyDrive/FYP/FYP impl/CLEVR_Categorized.xlsx
✅ Categorized Excel saved successfully.
✅ No unmapped operations found.


### **CURRICULUM LEARNING FRAMEWORK TO IMPROVE COMPOSITIONAL REASONING IN VQA MODELS**

1. **CLEVR DATASET SPLIT INTO REASONING TASK BASED SUBSETS**

*   LEVEL1 Attribute & existence, no relate
*   LEVEL2 Compare attribute, no relate
*   LEVEL3 Counting / compare integer, no relate
*   LEVEL4 Attribute & existence with relate
*   LEVEL5 Count / compare (integer or attribute) with relate (strong composition)


     
    
    


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (the same mount point used in the first cell).
drive.mount('/content/drive')

Mounted at /content/drive


Define function groups + level assignment (L1–L5)

In [None]:
import os
import json
from collections import defaultdict
from typing import List, Dict, Any, Optional

# Folder on the mounted Drive that holds the CLEVR question files; the per-level
# output files are written into this same folder by save_questions().
FOLDER_PATH = '/content/drive/MyDrive/FYP/FYP impl/CLEVR_v1.0/questions'
TRAIN_FILENAME = 'CLEVR_train_questions.json'

TRAIN_PATH = os.path.join(FOLDER_PATH, TRAIN_FILENAME)

print("Using CLEVR train file at:", TRAIN_PATH)
# Fail fast when the train file is missing, before any later cell tries to read it.
# NOTE: `assert` statements are skipped when Python runs with -O.
assert os.path.exists(TRAIN_PATH), "Train JSON not found. Check path/name!"

# Function-name groups. assign_level() compares a program's final function
# (and, for L5, any step) against these sets.
# Final functions that query a single attribute.
ATTR_QUERY_FNS = {"query_color", "query_shape", "query_size", "query_material"}
# Final function for existence questions.
EXIST_FNS = {"exist"}
# Final function for counting questions.
COUNT_FNS = {"count"}
# Final functions that compare two integers.
NUM_COMPARE_FNS = {"equal_integer", "greater_than", "less_than"}
# Final functions that compare an attribute between objects.
ATTR_COMPARE_FNS = {"equal_color", "equal_shape", "equal_size", "equal_material"}

def has_relate(program: List[Dict[str, Any]]) -> bool:
    """Report whether any step of ``program`` uses the ``relate`` function.

    Args:
        program: Steps of a functional program; each step is read with
            ``.get("function")``, so steps without that key never match.

    Returns:
        True if at least one step's "function" value equals "relate",
        otherwise False (including for an empty program).
    """
    for node in program:
        if node.get("function") == "relate":
            return True
    return False

def program_length(program: List[Dict[str, Any]]) -> int:
    """Return the number of steps in ``program``.

    Args:
        program: Steps of a functional program.

    Returns:
        The length of ``program`` as reported by ``len``.
    """
    step_count = len(program)
    return step_count

def assign_level(program: List[Dict[str, Any]]) -> Optional[str]:
    """
    Pick the curriculum level (L1–L5) for one CLEVR functional program.

    The decision uses two facts: whether the program contains a ``relate``
    step, and which function group the program's final step belongs to.

    Without relate:
        L1: final step queries an attribute or checks existence
        L2: final step compares an attribute
        L3: final step counts or compares integers
    With relate:
        L4: final step queries an attribute or checks existence
        L5: final step (or any step) counts or compares (integer or attribute)

    Returns the level name, or None for an empty program or one that fits
    none of the rules above (callers count those as "unassigned").
    """
    if not program:
        return None

    final_fn = program[-1].get("function")
    ends_in_query_or_exist = final_fn in ATTR_QUERY_FNS or final_fn in EXIST_FNS

    if not has_relate(program):
        # ----- No relate: L1 / L2 / L3 depending on the final function -----
        if ends_in_query_or_exist:
            return "L1"
        if final_fn in ATTR_COMPARE_FNS:
            return "L2"
        if final_fn in COUNT_FNS or final_fn in NUM_COMPARE_FNS:
            return "L3"
        return None

    # ----- With relate: L4 for attribute/existence answers -----
    if ends_in_query_or_exist:
        return "L4"

    # ----- With relate: L5 (strong composition) when the final step, or
    # failing that any step, is a count/compare operation -----
    count_or_compare = COUNT_FNS | NUM_COMPARE_FNS | ATTR_COMPARE_FNS
    if final_fn in count_or_compare:
        return "L5"
    for node in program:
        if node.get("function") in count_or_compare:
            return "L5"

    # Nothing matched: the caller treats this as "unassigned"
    return None


Using CLEVR train file at: /content/drive/MyDrive/FYP/FYP impl/CLEVR_v1.0/questions/CLEVR_train_questions.json


Split into L1–L5 and save into the same folder

In [None]:
def load_train_questions(path: str) -> Dict[str, Any]:
    """Read a CLEVR questions JSON file and return its parsed content.

    Args:
        path: Location of the JSON file.

    Returns:
        The full parsed JSON object (not just the question list).

    Raises:
        AssertionError: If the parsed content has no "questions" key.
    """
    with open(path, "r") as handle:
        payload = json.loads(handle.read())
    assert "questions" in payload, "Expected key 'questions' in CLEVR JSON"
    return payload

def save_questions(level: str, questions: List[Dict[str, Any]]) -> str:
    """Write one level's questions to ``CLEVR_train_questions_<level>.json``.

    The file is created in FOLDER_PATH and holds a single JSON object with
    the key "questions".

    Args:
        level: Level label used in the output file name (e.g. "L1").
        questions: Question records to store.

    Returns:
        The path of the file that was written.
    """
    out_path = os.path.join(FOLDER_PATH, f"CLEVR_train_questions_{level}.json")
    with open(out_path, "w") as handle:
        json.dump({"questions": questions}, handle)
    print(f"Saved {len(questions):7d} questions to {out_path}")
    return out_path

def build_train_curriculum_splits():
    """Split the train questions by curriculum level, save each split, print a summary.

    Loads the questions from TRAIN_PATH, asks assign_level() for the level of
    each question's program, saves every non-empty level with save_questions()
    in sorted level order, and prints per-level counts plus the totals of
    assigned and unassigned questions. Returns None.
    """
    all_questions = load_train_questions(TRAIN_PATH)["questions"]
    total_questions = len(all_questions)

    # Questions grouped by level name; questions without a level go to `leftovers`
    per_level = defaultdict(list)
    leftovers = []

    for question in all_questions:
        level = assign_level(question.get("program", []))
        target = leftovers if level is None else per_level[level]
        target.append(question)

    # Write one file per level, in sorted level order
    ordered_levels = sorted(per_level)
    counts = {}
    for level in ordered_levels:
        counts[level] = len(per_level[level])
        save_questions(level, per_level[level])

    # Summary
    print("\nSummary (train questions per level):")
    for level in ordered_levels:
        print(f"  {level}: {counts[level]}")
    assigned_total = sum(counts.values())

    unassigned_count = len(leftovers)
    print(f"\nTotal questions in original file: {total_questions}")
    print(f"Total assigned to L1–L5:         {assigned_total}")
    print(f"Total unassigned (no level):     {unassigned_count}")

# Build the per-level splits from the train file, save them, and print the summary.
build_train_curriculum_splits()

Saved  196519 questions to /content/drive/MyDrive/FYP/FYP impl/CLEVR_v1.0/questions/CLEVR_train_questions_L1.json
Saved   31437 questions to /content/drive/MyDrive/FYP/FYP impl/CLEVR_v1.0/questions/CLEVR_train_questions_L2.json
Saved  126260 questions to /content/drive/MyDrive/FYP/FYP impl/CLEVR_v1.0/questions/CLEVR_train_questions_L3.json
Saved  149316 questions to /content/drive/MyDrive/FYP/FYP impl/CLEVR_v1.0/questions/CLEVR_train_questions_L4.json
Saved  196457 questions to /content/drive/MyDrive/FYP/FYP impl/CLEVR_v1.0/questions/CLEVR_train_questions_L5.json

Summary (train questions per level):
  L1: 196519
  L2: 31437
  L3: 126260
  L4: 149316
  L5: 196457

Total questions in original file: 699989
Total assigned to L1–L5:         699989
Total unassigned (no level):     0
