In [None]:
import os

# Remote source bucket and local destination directory for the dataset copy.
# NOTE(review): QUESTIONS_DIR / IMAGES_DIR later in this notebook point at
# /content/clevr_dataset/..., not at LOCAL_DATA_DIR — confirm which location is intended.
S3_BUCKET = "s3://clevr-dataset"
LOCAL_DATA_DIR = "./dataset"

# Create the destination up front; exist_ok=True makes re-running this cell safe.
os.makedirs(LOCAL_DATA_DIR, exist_ok=True)

# IPython shell escape: the two Python variables are interpolated into the command line.
# Runs the `aws` CLI, so it must be installed and have access to the bucket.
!aws s3 sync {S3_BUCKET} {LOCAL_DATA_DIR}


^C


In [None]:
import torch
import tqdm
from typing import Optional
from torch.utils.data import DataLoader
from transformers import ViltProcessor

# Core
from evaluation_service import EvaluationService
from curriculum_service import CurriculumManager
from checkpoint_service import CheckpointManager

# Model
from vilt_adapter import ViLTAdapter

# Data
from clevr_dataset_py import CLEVRCurriculumViltDataset,vilt_collate_fn,build_answer_vocab
# Root directory for checkpoints; CurriculumTrainer joins the run name onto it and hands
# the result to CheckpointManager.
# NOTE(review): the path is under /content/drive, so presumably Google Drive must be mounted
# (main() calls drive.mount) before checkpoints are read or written — confirm.
CHECKPOINT_ROOT = "/content/drive/MyDrive/Colab Notebooks/FYP/checkpoints"

ModuleNotFoundError: No module named 'evaluation_service'

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
class CurriculumTrainer:
    """Tier-by-tier curriculum trainer for a ViLT model on CLEVR questions.

    Wires together the evaluation service, curriculum manager, checkpoint
    manager and ViLT adapter, builds per-tier DataLoaders, and runs the
    train / validate / checkpoint / advance loop in ``train``.
    """

    def __init__(
        self,
        questions_dir: str,
        images_dir: str,
        answer2id: dict,
        run_name: str = "curriculum_run_v1",
        output_dir: str = "./outputs",
        batch_size: int = 32,
        max_tiers: int = 5,
        use_sspl: bool = False
    ):
        """Store the configuration and construct the collaborating services.

        Args:
            questions_dir: Directory passed to the dataset as ``questions_dir``.
            images_dir: Directory passed to the dataset as ``images_dir``.
            answer2id: Answer vocabulary handed to the dataset for the
                ``train`` and ``val`` splits.
            run_name: Sub-directory name joined onto both ``output_dir`` and
                ``CHECKPOINT_ROOT``.
            output_dir: Base directory for ``self.output_dir`` (only the joined
                path is stored; nothing is created here).
            batch_size: Batch size used by every DataLoader this trainer builds.
            max_tiers: Forwarded to ``CurriculumManager``.
            use_sspl: Stored on the instance; not read anywhere in this class.
        """
        self.output_dir = os.path.join(output_dir, run_name)
        self.questions_dir = questions_dir
        self.images_dir = images_dir
        self.batch_size = batch_size
        self.answer2id = answer2id

        # Components
        self.evaluation_service = EvaluationService()
        self.curriculum = CurriculumManager(max_tiers=max_tiers)
        self.checkpoint_manager = CheckpointManager(checkpoint_dir=os.path.join(CHECKPOINT_ROOT, run_name))

        # Model (the adapter exposes .model and .optimizer, both used by train())
        self.model_adapter = ViLTAdapter()

        # Processor (needed for dataset)
        self.processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")

        self.use_sspl = use_sspl

    def data_loader_for_tier(
        self,
        tier: int,
        split: str = "train",
        shuffle: bool = True,
    ) -> DataLoader:
        """Build a DataLoader over one curriculum tier of the given split.

        Args:
            tier: Tier to load. ``None`` passes ``tiers=None`` to the dataset
                (presumably meaning "no tier filter" — confirm in the dataset).
            split: Dataset split name. ``answer2id`` is only supplied for
                ``"train"`` and ``"val"``.
            shuffle: Requested shuffling; honoured only when ``split == "train"``.

        Returns:
            A DataLoader using ``vilt_collate_fn`` and ``self.batch_size``.
        """
        tiers = [tier] if tier is not None else None

        dataset_sample = CLEVRCurriculumViltDataset(
            questions_dir=self.questions_dir,
            images_dir=self.images_dir,
            processor=self.processor,
            split=split,
            tiers=tiers,
            answer2id=self.answer2id if split in ["train", "val"] else None,
            max_length=32,  # Default max length
        )

        loader = DataLoader(
            dataset_sample,
            batch_size=self.batch_size,
            # Evaluation splits are never shuffled, whatever the caller asked for.
            shuffle=shuffle if split == "train" else False,
            num_workers=0,  # As requested for Drive/Colab compatibility
            pin_memory=True,
            collate_fn=vilt_collate_fn,
        )
        return loader

    def train(self):
        """Run curriculum training until it completes or patience runs out.

        Restores the latest checkpoint, then for each tier repeats
        train -> validate -> record metrics -> checkpoint until either the
        curriculum decides to advance (moves on to the next tier) or validation
        accuracy has failed to improve for ``patience_limit`` consecutive epochs
        (returns early).

        Returns:
            None.
        """
        # 1. Load State (the stored metrics returned alongside the tier are not used here)
        start_tier, _ = self.checkpoint_manager.load_latest(
            self.model_adapter.model,
            self.model_adapter.optimizer,
            self.curriculum
        )

        print(f"Starting/Resuming at Tier {start_tier}")

        # 2. Continuous Loop until Curriculum Completed
        while not self.curriculum.is_completed:
            current_tier = self.curriculum.current_tier
            print(f"\n{'='*20}\n Entering Tier {current_tier} \n{'='*20}")

            # Setup Data (Re-initialize loaders for the current tier)
            train_loader = self.data_loader_for_tier(
                tier=current_tier,
                split='train',
                shuffle=True
            )

            val_loader = self.data_loader_for_tier(
                tier=current_tier,
                split='val',
                shuffle=False
            )

            # Patience Configuration
            patience_limit = 3
            patience_counter = 0
            best_acc_in_tier = 0.0
            epoch_in_tier = 0

            # Epoch loop for this tier (left via `break` on advancement or `return` on patience)
            while self.curriculum.current_tier == current_tier:
                epoch_in_tier += 1
                print(f"\nTier {current_tier} - Epoch {epoch_in_tier}")

                # --- TRAIN ---
                self.model_adapter.model.train()
                progress = tqdm.tqdm(train_loader, desc="Training")

                for batch in progress:
                    metrics = self.model_adapter.train_step(batch)
                    progress.set_postfix({'loss': metrics['loss']})

                # --- VALIDATE ---
                print("Validating...")
                self.model_adapter.model.eval()
                val_losses = []
                val_accs = []

                with torch.no_grad():
                    for batch in tqdm.tqdm(val_loader, desc="Validation"):
                        out = self.model_adapter.validation_step(batch)
                        acc = self.evaluation_service.compute_accuracy(out['logits'], out['labels'])

                        val_losses.append(out['loss'])
                        val_accs.append(acc)

                # Unweighted mean over batches; an empty loader yields 0.0 instead of dividing by zero.
                avg_val_loss = sum(val_losses) / len(val_losses) if val_losses else 0.0
                avg_val_acc = sum(val_accs) / len(val_accs) if val_accs else 0.0

                print(f"Validation: Loss={avg_val_loss:.4f}, Accuracy={avg_val_acc:.4f}")

                # Record metrics for Moving Average Calculation
                self.evaluation_service.record_metrics(avg_val_loss, avg_val_acc)

                # --- PATIENCE CHECK ---
                if avg_val_acc > best_acc_in_tier:
                    best_acc_in_tier = avg_val_acc
                    patience_counter = 0  # Reset counter
                else:
                    patience_counter += 1
                    print(f"No improvement in accuracy. Patience: {patience_counter}/{patience_limit}")

                # --- CURRICULUM CHECK ---
                should_advance = self.curriculum.should_advance(self.evaluation_service)

                # --- CHECKPOINT ---
                metrics_state = self.evaluation_service.get_latest_metrics()

                # Saved every epoch; `is_best` is only set on the epoch that triggers advancement.
                self.checkpoint_manager.save(
                    model_state=self.model_adapter.get_state_dict(),
                    optimizer_state=self.model_adapter.get_optimizer_state_dict(),
                    curriculum_state=self.curriculum.get_config_state(),
                    metrics=metrics_state,
                    tier=current_tier,
                    is_best=should_advance
                )

                if should_advance:
                    # Advancement takes priority over the patience stop below.
                    self.curriculum.advance_tier()
                    # Apply the curriculum's learning rate for the new tier to every param group.
                    new_lr = self.curriculum.get_current_lr()
                    for param_group in self.model_adapter.optimizer.param_groups:
                        param_group['lr'] = new_lr
                    # Reset metric history so the moving average starts fresh for the new tier
                    self.evaluation_service.reset_history()
                    # Break the inner epoch loop to restart outer loop with new Tier data
                    break

                if patience_counter >= patience_limit:
                    print(f"\n[STOP] Patience exhausted at Tier {current_tier}. Model is not improving.")
                    print("Stopping training to prevent overfitting or wasted compute.")
                    return  # Exit the entire training function


In [None]:
import os
import argparse
import json
from google.colab import drive


# Alternative locations for reading the dataset straight from Google Drive (currently disabled):
# QUESTIONS_DIR = "/content/drive/MyDrive/Colab Notebooks/FYP/dataset/clevr_kaggle/CLEVR_v1.0/questions"
# IMAGES_DIR    = "/content/drive/MyDrive/Colab Notebooks/FYP/dataset/clevr_kaggle/CLEVR_v1.0/images"

# Active dataset locations: main() builds the per-tier question file paths from QUESTIONS_DIR
# and passes both directories to CurriculumTrainer.
# NOTE(review): the S3 sync cell at the top of the notebook writes to "./dataset", not to
# /content/clevr_dataset — confirm the data actually lives here.
QUESTIONS_DIR = "/content/clevr_dataset/questions"
IMAGES_DIR    = "/content/clevr_dataset/images"

def main():
    """Mount Drive, build the answer vocabulary, and run curriculum training.

    Command-line options (both optional):
        --batch_size: DataLoader batch size, default 32.
        --use_sspl:   flag forwarded to ``CurriculumTrainer``.
    """
    # Mount Drive
    drive.mount("/content/drive")

    parser = argparse.ArgumentParser(description="Competence-Aware Curriculum VQA Training")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--use_sspl", action="store_true")

    # IMPORTANT for Colab: parse_known_args tolerates unrecognised arguments
    # (presumably the ones the notebook kernel puts in sys.argv) instead of exiting.
    args, _ = parser.parse_known_args()

    # 1. Build answer vocabulary from TRAIN questions (one question file per tier L1..L5)
    print("Building answer vocabulary...")
    tier_paths = [
        os.path.join(QUESTIONS_DIR, f"CLEVR_train_questions_L{i}.json")
        for i in range(1, 6)
    ]
    answer2id = build_answer_vocab(tier_paths)

    print(f"Answer vocab size: {len(answer2id)}")

    # 2. Initialize trainer
    trainer = CurriculumTrainer(
        questions_dir=QUESTIONS_DIR,
        images_dir=IMAGES_DIR,
        answer2id=answer2id,
        batch_size=args.batch_size,
        use_sspl=args.use_sspl
    )

    # The start message previously contained a mis-decoded emoji; restored here.
    print("🚀 Starting Curriculum Training...")
    trainer.train()


In [None]:

# Entry point: calls main() when this code runs as the top-level module.
if __name__ == "__main__":
    main()

In [None]:
# List the contents of the current working directory (quick check of what is on disk).
%ls