In [None]:
# ==============================================================================
# CELL 1: SETUP AND DRIVE MOUNT
# ==============================================================================
# Run this cell once at the very beginning of your session.

from google.colab import drive
import sys
import torch
import os

print("--- Step 0: Mounting Google Drive and Initial Setup ---")
# Every project path used below lives under the Drive mount, so a failed mount ends the run here.
try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except Exception as mount_error:
    sys.exit(f"CRITICAL ERROR: Could not mount Google Drive. The script cannot continue. Error: {mount_error}")

# Global constants shared by the cells that follow.
BASE_PROJECT_DIR = "/content/drive/.shortcut-targets-by-id/1cLgae9ycX3zn2wP-kQw-fVulnU8UEAUV/SharedTaskProject/"
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f"Using device: {DEVICE}")

# BASE_PROJECT_DIR and DEVICE are now defined at notebook scope for the later cells.
print("\nSetup complete. You can now proceed to Cell 2.")

--- Step 0: Mounting Google Drive and Initial Setup ---
Mounted at /content/drive
Google Drive mounted successfully.
Using device: cuda

Setup complete. You can now proceed to Cell 2.


In [None]:
# ==============================================================================
# CELL 2: PART 1 - PRE-COMPUTE FEATURES
# ==============================================================================
# Run this cell ONCE per dataset. This is the slow part.
# After this completes, the features are saved to your Drive forever.
# You do not need to run this cell again in future sessions.

import numpy as np
import pandas as pd
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from tqdm import tqdm
import glob
import warnings

# --- Configuration for Pre-computation ---
class PrecomputeConfig:
    """Paths and constants read by the feature pre-computation code in this cell."""

    # Where the extracted feature tensors are written.
    FEATURES_DIR = os.path.join(BASE_PROJECT_DIR, "precomputed_features", "Subtask_A_Final")

    # Feature extractor checkpoint and extraction batch size.
    CLIP_MODEL_ID = "openai/clip-vit-large-patch14"
    BATCH_SIZE = 64

    # CSV column names and the class-folder-name -> integer id mapping.
    LABEL_COL = 'label'
    IMAGE_NAME_COL = 'index'
    TEXT_COL = 'text'
    LABEL_TO_ID = {
        'No Hate': 0,
        'Hate': 1,
    }

    # Train split: image root (one sub-folder per LABEL_TO_ID key) and text CSV.
    TRAIN_IMAGES_ROOT_DIR = os.path.join(
        BASE_PROJECT_DIR, "Subtask_A", "Train", "Subtask_A_Train"
    )
    TRAIN_TEXT_CSV_FILE = os.path.join(
        BASE_PROJECT_DIR, "Train_Text", "STask_A_train.csv"
    )

    # Validation split: image folder, text CSV and label CSV.
    VAL_IMAGES_DIR = os.path.join(
        BASE_PROJECT_DIR, "Subtask_A", "Evaluation", "STask_A_val_img"
    )
    VAL_TEXT_CSV_FILE = os.path.join(
        BASE_PROJECT_DIR, "Eval_Data_Text", "STask-A(index,text)val.csv"
    )
    VAL_LABELS_CSV_FILE = os.path.join(
        BASE_PROJECT_DIR, "Eval_Data_Labels", "STask-A(index,label)val.csv"
    )

    # Test split: image folder and text CSV (no labels).
    TEST_IMAGES_DIR = os.path.join(
        BASE_PROJECT_DIR, "Subtask_A", "Test", "STask_A_test_img"
    )
    TEST_TEXT_CSV_FILE = os.path.join(
        BASE_PROJECT_DIR, "Subtask_A", "Test", "STask-A(index,text)test.csv"
    )

# Config instance; `cfg_precompute` is the name the loader function and extraction loop below read from.
cfg_precompute = PrecomputeConfig()
# Suppress warnings of category UserWarning from this point on.
warnings.filterwarnings("ignore", category=UserWarning)
# Create the output folder up front; exist_ok=True keeps a re-run of the cell from failing.
os.makedirs(cfg_precompute.FEATURES_DIR, exist_ok=True)
print(f"--- Part 1: Pre-computation Script ---")
print(f"Saving features to: {cfg_precompute.FEATURES_DIR}")

def load_data_for_precompute(split):
    """Build the per-sample table for one dataset split.

    Args:
        split: One of 'train', 'val' or 'test'.

    Returns:
        pandas.DataFrame with the columns 'name', 'text', 'img_path' and the label
        column (cfg_precompute.LABEL_COL). For 'train' the label comes from the class
        folder the image sits in; for 'val' from the label CSV; 'test' rows get the
        placeholder label -1. A 'train' split with no images yields an empty DataFrame.

    Raises:
        ValueError: If `split` is not one of the three known split names.
    """
    name_col = cfg_precompute.IMAGE_NAME_COL
    text_col = cfg_precompute.TEXT_COL
    label_col = cfg_precompute.LABEL_COL

    if split == 'train':
        img_root_dir, text_csv_path = cfg_precompute.TRAIN_IMAGES_ROOT_DIR, cfg_precompute.TRAIN_TEXT_CSV_FILE
        text_df = pd.read_csv(text_csv_path).rename(columns={name_col: 'name', text_col: 'text'})
        # Build the name -> text lookup once instead of scanning the whole frame per image.
        # keep='first' mirrors the earlier behaviour of taking the first matching row.
        text_by_name = (
            text_df.drop_duplicates(subset='name', keep='first')
            .set_index('name')['text']
            .to_dict()
        )
        data_list = []
        for category, label_id in cfg_precompute.LABEL_TO_ID.items():
            folder_path = os.path.join(img_root_dir, category)
            if not os.path.isdir(folder_path):
                print(f"Warning: Training folder not found, skipping: {folder_path}")
                continue
            for ext in ('*.png', '*.jpg', '*.jpeg'):
                for img_path in glob.glob(os.path.join(folder_path, ext)):
                    img_filename = os.path.basename(img_path)
                    raw_text = text_by_name.get(img_filename)
                    # Images without a text row (or with a NaN text) get an empty string.
                    text = str(raw_text) if pd.notna(raw_text) else ""
                    data_list.append({'name': img_filename, 'text': text, label_col: label_id, 'img_path': img_path})
        return pd.DataFrame(data_list)

    if split == 'val':
        labels_df = pd.read_csv(cfg_precompute.VAL_LABELS_CSV_FILE).rename(columns={name_col: 'name', 'label': label_col})
        text_df = pd.read_csv(cfg_precompute.VAL_TEXT_CSV_FILE).rename(columns={name_col: 'name', text_col: 'text'})
        df = pd.merge(labels_df, text_df, on='name', how='inner')
        df[label_col] = df[label_col].astype(int)
        # Same normalisation as the train branch: missing text becomes "", everything is a str.
        df['text'] = df['text'].fillna("").astype(str)
        df['img_path'] = df['name'].apply(lambda x: os.path.join(cfg_precompute.VAL_IMAGES_DIR, x))
        return df

    if split == 'test':
        # Use the configured column names here too, like the train/val branches do.
        df = pd.read_csv(cfg_precompute.TEST_TEXT_CSV_FILE).rename(columns={name_col: 'name', text_col: 'text'})
        df['text'] = df['text'].fillna("").astype(str)
        df['img_path'] = df['name'].apply(lambda x: os.path.join(cfg_precompute.TEST_IMAGES_DIR, x))
        # Add a dummy label column for dataloader compatibility
        df[label_col] = -1
        return df

    # Previously an unknown split fell through and returned None, which only failed later.
    raise ValueError(f"Unknown split '{split}'. Expected one of: 'train', 'val', 'test'.")

print(f"Loading CLIP model: {cfg_precompute.CLIP_MODEL_ID}")
processor = CLIPProcessor.from_pretrained(cfg_precompute.CLIP_MODEL_ID)
model_clip = CLIPModel.from_pretrained(cfg_precompute.CLIP_MODEL_ID).to(DEVICE).eval()

# Splits handled by this run. Only 'test' is listed; add 'train' / 'val' to (re)generate those features too.
splits_to_process = ['test']


class InferenceDataset(torch.utils.data.Dataset):
    """Serves each DataFrame row as a plain dict; the loop below reads the batches by column name."""
    def __init__(self, df): self.df = df
    def __len__(self): return len(self.df)
    def __getitem__(self, idx): return self.df.iloc[idx].to_dict()


def _extract_image_features(img_paths):
    """Return one CLIP image-feature row per path, as a tensor on DEVICE.

    An image that cannot be opened keeps an all-zero row; the remaining images of the
    batch are still encoded. If encoding itself fails, the whole batch stays zero.
    """
    features = torch.zeros((len(img_paths), model_clip.projection_dim), device=DEVICE)
    images, loaded_rows = [], []
    for row, path in enumerate(img_paths):
        try:
            # convert() returns a new in-memory image, so the file handle can be closed right away.
            with Image.open(path) as img:
                images.append(img.convert("RGB"))
            loaded_rows.append(row)
        except Exception as e:
            print(f"Warning: Image '{path}' could not be loaded. Using zero features. Error: {e}")
    if images:
        try:
            image_inputs = processor(images=images, return_tensors="pt", padding=True).to(DEVICE)
            encoded = model_clip.get_image_features(**image_inputs)
            features[loaded_rows] = encoded.to(features.dtype)
        except Exception as e:
            print(f"Warning: A batch of images could not be encoded. Using zero features. Error: {e}")
    return features


for split in splits_to_process:
    print(f"\n--- Processing {split} split ---")
    df = load_data_for_precompute(split)

    if df is None or df.empty:
        print(f"Warning: No data found for split '{split}'. Skipping.")
        continue

    # The tokenizer only accepts strings; make sure missing text cannot reach it as NaN.
    df['text'] = df['text'].fillna("").astype(str)

    # Sort test set now to preserve order for final submission
    if split == 'test':
        # Handle cases where the test set might not have numeric filenames
        try:
            # Raw string: '\d' in a normal string literal is an invalid escape sequence.
            sort_key = df['name'].str.extract(r'(\d+)', expand=False).astype(int)
            df = (
                df.assign(sort_key=sort_key)
                .sort_values(by='sort_key', kind='stable')
                .drop(columns=['sort_key'])
                .reset_index(drop=True)
            )
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupts still propagate.
            print("Could not sort test files numerically, using alphabetical sort.")
            df = df.sort_values(by='name').reset_index(drop=True)

        # Save the sorted names for later
        df[['name']].to_csv(os.path.join(cfg_precompute.FEATURES_DIR, "test_names_sorted.csv"), index=False)

    all_img_features, all_txt_features, all_labels = [], [], []
    # shuffle=False keeps feature rows in the same order as the DataFrame rows.
    dataloader = torch.utils.data.DataLoader(InferenceDataset(df), batch_size=cfg_precompute.BATCH_SIZE, shuffle=False, num_workers=2)

    with torch.no_grad():
        for batch in tqdm(dataloader, desc=f"Extracting {split} features"):
            img_features = _extract_image_features(list(batch['img_path']))

            texts = list(batch['text'])
            text_inputs = processor(text=texts, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
            txt_features = model_clip.get_text_features(**text_inputs)
            all_img_features.append(img_features.cpu()); all_txt_features.append(txt_features.cpu())
            if split != 'test':
                all_labels.append(torch.tensor(batch[cfg_precompute.LABEL_COL].tolist(), dtype=torch.long))

    img_features_tensor = torch.cat(all_img_features, dim=0)
    txt_features_tensor = torch.cat(all_txt_features, dim=0)

    # The saved tensors are later matched to names/labels purely by row position.
    assert img_features_tensor.shape[0] == txt_features_tensor.shape[0] == len(df), \
        f"Feature/row count mismatch for split '{split}'"

    torch.save(img_features_tensor, os.path.join(cfg_precompute.FEATURES_DIR, f"{split}_img_features.pt"))
    torch.save(txt_features_tensor, os.path.join(cfg_precompute.FEATURES_DIR, f"{split}_txt_features.pt"))

    if split != 'test':
        labels_tensor = torch.cat(all_labels, dim=0)
        torch.save(labels_tensor, os.path.join(cfg_precompute.FEATURES_DIR, f"{split}_labels.pt"))

    print(f"Successfully saved all features for {split} split.")

# Drop the references to the CLIP model and release cached GPU memory before the training cell runs.
del model_clip, processor
torch.cuda.empty_cache()
print("\n--- PART 1 COMPLETE. YOU CAN NOW RUN CELL 3. ---")

In [None]:
# ==============================================================================
# CELL 3: PART 2 & 3 (FINAL "RADICAL SIMPLICITY" VERSION)
# ==============================================================================
# This script uses the most robust and reliable training setup: AdamW with a
# fixed learning rate. All experimental complexity has been removed.
# This is the definitive strategy to achieve a high score.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
import os
from pathlib import Path
from tqdm import tqdm
import random
import warnings
import json
import zipfile
import sys
import gc

# ==============================================================================
# SETUP AND DRIVE MOUNT (same steps as Cell 1, repeated inside Cell 3)
# ==============================================================================
# Mounts Drive and defines BASE_PROJECT_DIR / DEVICE again, so Cell 1 does not have to be run before this cell.

from google.colab import drive
import sys
import torch
import os

print("--- Step 0: Mounting Google Drive and Initial Setup ---")
# All feature, model and submission paths live under the Drive mount, so a failed mount ends the run here.
try:
    drive.mount('/content/drive')
    print("Google Drive mounted successfully.")
except Exception as e:
    sys.exit(f"CRITICAL ERROR: Could not mount Google Drive. The script cannot continue. Error: {e}")

# Define global constants that all parts will use
BASE_PROJECT_DIR = "/content/drive/.shortcut-targets-by-id/1cLgae9ycX3zn2wP-kQw-fVulnU8UEAUV/SharedTaskProject/"
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {DEVICE}")

# Training starts right below in this same cell, so the message does not send the reader to Cell 2.
print("\nSetup complete.")

# --- 1. SETUP & ADVANCED CONFIGURATION ---
print("--- Final Training & Submission Script (Radical Simplicity Version) ---")
print(f"DEVICE: {DEVICE.upper()}")

# --- CORE PATHS CONFIGURATION ---
class FinalConfig:
    """Paths and hyper-parameters for the ensemble training and submission code in this cell."""

    # --- Core paths ---
    FEATURES_DIR = Path(BASE_PROJECT_DIR, "precomputed_features", "Subtask_A_Final")
    MODELS_DIR = Path(BASE_PROJECT_DIR, "models", "Subtask_A")
    EXP_NAME = "memeclip_COATTENTION_RADICAL_SIMPLICITY"
    MODEL_SAVE_PATH = MODELS_DIR.joinpath(EXP_NAME)
    SUBMISSION_DIR = Path(BASE_PROJECT_DIR, "submissions", "Subtask_A_Radical_Simplicity")
    SUBMISSION_JSON_PATH = SUBMISSION_DIR.joinpath("submission.json")
    SUBMISSION_ZIP_PATH = SUBMISSION_DIR.joinpath("ref.zip")

    # --- Ensemble & training strategy ---
    NUM_ENSEMBLE_MODELS = 5
    NUM_EPOCHS = 40  # Upper bound on epochs; chosen generously for the small learning rate
    BATCH_SIZE = 1024
    LABEL_SMOOTHING = 0.1

    # --- Optimizer (no scheduler) ---
    LEARNING_RATE = 2e-4  # Small and constant for the whole run
    WEIGHT_DECAY = 0.1

    # --- Co-attention model architecture ---
    FEATURE_DIM = 768
    ATTENTION_HEADS = 8
    ATTENTION_LAYERS = 4
    ATTENTION_DROPOUT = 0.25
    OUTPUT_DIM = 2

    # --- Inference ---
    INFERENCE_BATCH_SIZE = 2048

    # One model is trained per seed listed here.
    SEEDS = [2277, 1365, 2614, 727, 206924]

# Config instance; `cfg` is the name the training function and pipeline code below read from.
cfg = FinalConfig()
# Make sure the checkpoint and submission folders exist (no error if they already do).
cfg.MODEL_SAVE_PATH.mkdir(parents=True, exist_ok=True)
cfg.SUBMISSION_DIR.mkdir(parents=True, exist_ok=True)

# --- 2. RE-DEFINE MODEL ARCHITECTURE & TRAINING LOGIC ---
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs with `seed`.

    Also exports the seed as PYTHONHASHSEED and, when CUDA is available, seeds all
    GPUs and switches cuDNN to deterministic, non-benchmark mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

class CoAttentionClassifier(nn.Module):
    """Classifier over a two-token sequence built from one image and one text feature vector.

    Each modality is linearly projected, the two projections are stacked into a
    length-2 sequence, passed through a transformer encoder, flattened, and fed
    to an MLP head that emits `output_dim` logits.
    """

    def __init__(self, feature_dim, num_heads, num_layers, dropout, output_dim):
        super().__init__()
        self.feature_dim = feature_dim

        # Sub-modules are created in a fixed order so that seeded initialisation stays reproducible.
        self.img_proj = nn.Linear(feature_dim, feature_dim)
        self.txt_proj = nn.Linear(feature_dim, feature_dim)

        layer_kwargs = dict(
            d_model=feature_dim,
            nhead=num_heads,
            dim_feedforward=feature_dim * 4,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_kwargs),
            num_layers=num_layers,
        )

        # The head consumes both encoded tokens concatenated, hence twice the feature width.
        fused_dim = feature_dim * 2
        head_layers = [
            nn.LayerNorm(fused_dim),
            nn.Linear(fused_dim, feature_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(feature_dim, output_dim),
        ]
        self.classifier_head = nn.Sequential(*head_layers)

    def forward(self, img_feat, txt_feat):
        """Return logits for a batch of image features and text features."""
        # Stacking on dim=1 places the image token first and the text token second.
        tokens = torch.stack((self.img_proj(img_feat), self.txt_proj(txt_feat)), dim=1)
        encoded = self.transformer_encoder(tokens)
        fused = torch.flatten(encoded, start_dim=1)
        return self.classifier_head(fused)

def train_one_model(seed, train_loader, val_loader, val_labels_np, early_stopping_patience=7):
    """Train one ensemble member and checkpoint the epoch with the best validation F1.

    Args:
        seed: Seed passed to `seed_everything`; also part of the checkpoint file name.
        train_loader: Yields (img_feat, txt_feat, labels) batches for training.
        val_loader: Yields (img_feat, txt_feat, labels) batches; the labels are ignored here.
        val_labels_np: Ground-truth validation labels, in the same order `val_loader` yields samples.
        early_stopping_patience: Stop after this many consecutive epochs without a new best F1.

    Returns:
        The best weighted validation F1 reached (-1.0 if no epoch was run).

    Side effects:
        Writes the best state_dict to cfg.MODEL_SAVE_PATH / "simple_adamw_seed_{seed}.pt".
    """
    seed_everything(seed)
    print(f"\n--- Training Model with seed {seed} ---")
    model = CoAttentionClassifier(cfg.FEATURE_DIM, cfg.ATTENTION_HEADS, cfg.ATTENTION_LAYERS, cfg.ATTENTION_DROPOUT, cfg.OUTPUT_DIM).to(DEVICE)

    # The most robust setup: AdamW with a fixed learning rate.
    optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.LEARNING_RATE, weight_decay=cfg.WEIGHT_DECAY)
    criterion = nn.CrossEntropyLoss(label_smoothing=cfg.LABEL_SMOOTHING)
    checkpoint_path = cfg.MODEL_SAVE_PATH / f"simple_adamw_seed_{seed}.pt"

    # F1 lies in [0, 1]. Starting below that range means the first epoch always writes a
    # checkpoint, so a file left over from an earlier run can never be picked up by mistake.
    best_val_f1 = -1.0
    patience_counter = 0

    for epoch in range(cfg.NUM_EPOCHS):
        model.train()
        for img_feat, txt_feat, labels in train_loader:
            img_feat, txt_feat, labels = img_feat.to(DEVICE), txt_feat.to(DEVICE), labels.to(DEVICE)

            optimizer.zero_grad()
            logits = model(img_feat, txt_feat)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        all_preds_probas = []
        with torch.no_grad():
            for img_feat, txt_feat, _ in val_loader:
                img_feat, txt_feat = img_feat.to(DEVICE), txt_feat.to(DEVICE)
                all_preds_probas.append(torch.softmax(model(img_feat, txt_feat), dim=1).cpu())
        val_preds_labels = torch.argmax(torch.cat(all_preds_probas, dim=0), dim=1)
        val_f1 = f1_score(val_labels_np, val_preds_labels.numpy(), average='weighted')

        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            patience_counter = 0
            print(f"  Epoch {epoch+1}/{cfg.NUM_EPOCHS} -> New Best Val F1: {val_f1:.4f}")
            torch.save(model.state_dict(), checkpoint_path)
        else:
            patience_counter += 1

        if patience_counter >= early_stopping_patience:
            print(f"  Early stopping triggered at epoch {epoch+1}.")
            break

    print(f"  Finished training for seed {seed}. Best Val F1: {best_val_f1:.4f}")
    return best_val_f1

# --- 3. EXECUTE TRAINING & SUBMISSION PIPELINE ---
print("\n" + "="*80)
print("--- PART 2: TRAINING SOTA CO-ATTENTION ENSEMBLE (ROBUST) ---")
print("="*80)

print("Loading pre-computed features...")
try:
    train_img_feat = torch.load(cfg.FEATURES_DIR / "train_img_features.pt")
    train_txt_feat = torch.load(cfg.FEATURES_DIR / "train_txt_features.pt")
    train_labels = torch.load(cfg.FEATURES_DIR / "train_labels.pt")
    val_img_feat = torch.load(cfg.FEATURES_DIR / "val_img_features.pt")
    val_txt_feat = torch.load(cfg.FEATURES_DIR / "val_txt_features.pt")
    val_labels = torch.load(cfg.FEATURES_DIR / "val_labels.pt")
except FileNotFoundError as e:
    sys.exit(f"FATAL ERROR: Pre-computed feature files not found. Please run Cell 2 first. Missing file: {e}")

# Validation labels as a NumPy array for sklearn's f1_score; val_loader is not shuffled, so order matches.
val_labels_np = val_labels.numpy()
train_loader = DataLoader(TensorDataset(train_img_feat, train_txt_feat, train_labels), batch_size=cfg.BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
val_loader = DataLoader(TensorDataset(val_img_feat, val_txt_feat, val_labels), batch_size=cfg.BATCH_SIZE * 2, num_workers=2)

# One model per seed; each call writes its best checkpoint under cfg.MODEL_SAVE_PATH.
for seed in cfg.SEEDS:
    train_one_model(seed, train_loader, val_loader, val_labels_np)

print("\n--- ENSEMBLE TRAINING COMPLETE ---")

print("\n" + "="*80)
print("--- PART 3: GENERATING FINAL SUBMISSION ---")
print("="*80)

print("Loading pre-computed test features...")
# Same guard as for the train/val files: fail with a clear message instead of a raw traceback.
try:
    test_img_feat = torch.load(cfg.FEATURES_DIR / "test_img_features.pt")
    test_txt_feat = torch.load(cfg.FEATURES_DIR / "test_txt_features.pt")
    test_names_df = pd.read_csv(cfg.FEATURES_DIR / "test_names_sorted.csv")
except FileNotFoundError as e:
    sys.exit(f"FATAL ERROR: Pre-computed test files not found. Please run Cell 2 for the 'test' split first. Missing file: {e}")

# Predictions are paired with names purely by row position, so the three artifacts must line up.
if not (len(test_names_df) == test_img_feat.shape[0] == test_txt_feat.shape[0]):
    sys.exit(
        "FATAL ERROR: Test names and features are out of sync "
        f"(names={len(test_names_df)}, img={test_img_feat.shape[0]}, txt={test_txt_feat.shape[0]}). "
        "Please re-run Cell 2 for the 'test' split."
    )

print("Loading trained ensemble models...")
models = []
for seed in cfg.SEEDS:
    model_path = cfg.MODEL_SAVE_PATH / f"simple_adamw_seed_{seed}.pt"
    model = CoAttentionClassifier(cfg.FEATURE_DIM, cfg.ATTENTION_HEADS, cfg.ATTENTION_LAYERS, cfg.ATTENTION_DROPOUT, cfg.OUTPUT_DIM).to(DEVICE)
    # Use a try-except block here in case one of the models failed to save (e.g., if it never improved)
    try:
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.eval()
        models.append(model)
    except FileNotFoundError:
        print(f"Warning: Model for seed {seed} not found at {model_path}. It might not have improved past epoch 0. Skipping.")

print(f"Successfully loaded {len(models)} models for ensembling.")

if not models:
    sys.exit("FATAL ERROR: No models were successfully loaded. Cannot create submission.")

# shuffle=False keeps the prediction order identical to the row order of test_names_df.
inference_loader = DataLoader(TensorDataset(test_img_feat, test_txt_feat), batch_size=cfg.INFERENCE_BATCH_SIZE, shuffle=False)
all_model_probas = []
with torch.no_grad():
    for model in tqdm(models, desc="Ensemble Inference on Test Set"):
        current_model_probas = []
        for img_feat, txt_feat in inference_loader:
            img_feat, txt_feat = img_feat.to(DEVICE), txt_feat.to(DEVICE)
            logits = model(img_feat, txt_feat)
            current_model_probas.append(torch.softmax(logits, dim=1).cpu())
        all_model_probas.append(torch.cat(current_model_probas, dim=0))

# Soft voting: average the class probabilities across models, then take the arg-max class.
ensemble_probas = torch.stack(all_model_probas, dim=0).mean(dim=0)
final_predictions = torch.argmax(ensemble_probas, dim=1).numpy()

print("\nCreating submission file in the specified format...")
submission_df = pd.DataFrame({'index': test_names_df['name'], 'prediction': final_predictions})
submission_list = submission_df.to_dict('records')

# One JSON object per line, each with the keys 'index' and 'prediction'.
with open(cfg.SUBMISSION_JSON_PATH, 'w') as f:
    for item in submission_list:
        f.write(json.dumps(item) + '\n')
print(f"submission.json created successfully at: {cfg.SUBMISSION_JSON_PATH}")

print("\nZipping submission file for CodaLab...")
with zipfile.ZipFile(cfg.SUBMISSION_ZIP_PATH, 'w', zipfile.ZIP_DEFLATED) as zf:
    zf.write(cfg.SUBMISSION_JSON_PATH, arcname='submission.json')
print(f"\n✅ Submission ready! ref.zip created at: {cfg.SUBMISSION_ZIP_PATH}")

print("\n" + "*"*80)
print("--- MISSION ACCOMPLISHED. THE DEFINITIVE SUBMISSION IS READY. ---")
print("*"*80)

--- Step 0: Mounting Google Drive and Initial Setup ---
Mounted at /content/drive
Google Drive mounted successfully.
Using device: cuda

Setup complete. You can now proceed to Cell 2.
--- Final Training & Submission Script (Radical Simplicity Version) ---
DEVICE: CUDA

--- PART 2: TRAINING SOTA CO-ATTENTION ENSEMBLE (ROBUST) ---
Loading pre-computed features...

--- Training Model with seed 2277 ---
  Epoch 1/40 -> New Best Val F1: 0.3479
  Epoch 2/40 -> New Best Val F1: 0.7586
  Epoch 3/40 -> New Best Val F1: 0.7729
  Epoch 4/40 -> New Best Val F1: 0.7904
  Epoch 5/40 -> New Best Val F1: 0.7981
  Epoch 6/40 -> New Best Val F1: 0.8098
  Epoch 7/40 -> New Best Val F1: 0.8198
  Early stopping triggered at epoch 14.
  Finished training for seed 2277. Best Val F1: 0.8198

--- Training Model with seed 1365 ---
  Epoch 1/40 -> New Best Val F1: 0.4502
  Epoch 2/40 -> New Best Val F1: 0.7613
  Epoch 3/40 -> New Best Val F1: 0.7812
  Epoch 4/40 -> New Best Val F1: 0.7967
  Epoch 5/40 -> New Bes

Ensemble Inference on Test Set: 100%|██████████| 5/5 [00:00<00:00, 23.60it/s]



Creating submission file in the specified format...
submission.json created successfully at: /content/drive/.shortcut-targets-by-id/1cLgae9ycX3zn2wP-kQw-fVulnU8UEAUV/SharedTaskProject/submissions/Subtask_A_Radical_Simplicity/submission.json

Zipping submission file for CodaLab...

✅ Submission ready! ref.zip created at: /content/drive/.shortcut-targets-by-id/1cLgae9ycX3zn2wP-kQw-fVulnU8UEAUV/SharedTaskProject/submissions/Subtask_A_Radical_Simplicity/ref.zip

********************************************************************************
--- MISSION ACCOMPLISHED. THE DEFINITIVE SUBMISSION IS READY. ---
********************************************************************************
