In [1]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c neural-networks-data-science-uc-3-m

!zipinfo neural-networks-data-science-uc-3-m.zip

!jar xvf neural-networks-data-science-uc-3-m.zip


Archive:  neural-networks-data-science-uc-3-m.zip
Zip file size: 1972451903 bytes, number of entries: 5
-rw----     5.1 fat 243321852 bx defN 25-Jan-23 14:50 new-train-metadata.csv
-rw----     5.1 fat       66 bx defN 25-Jan-23 14:50 sample_submission.csv
-rw----     5.1 fat    58961 bx defN 25-Jan-23 14:50 students-test-metadata.csv
-rw----     5.1 fat   694335 bx defN 25-Jan-23 14:50 test-image.hdf5
-rw----     5.1 fat 2829955538 bx defN 25-Jan-23 14:50 train-image.hdf5
5 files, 3074030752 bytes uncompressed, 1972451101 bytes compressed:  35.8%
 inflated: new-train-metadata.csv
 inflated: sample_submission.csv
 inflated: students-test-metadata.csv
 inflated: test-image.hdf5
 inflated: train-image.hdf5


In [None]:
import os
import h5py
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms as T
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
import torch.optim as optim
from torch.optim.lr_scheduler import OneCycleLR, CosineAnnealingWarmRestarts
from torch.amp import autocast, GradScaler
import torch.nn.utils as nn_utils
from tqdm import tqdm
import timm
import warnings
import albumentations as A
from albumentations.pytorch import ToTensorV2
from collections import defaultdict

warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# ========== CONFIG ==========
# Training configuration: device selection, input paths and hyper-parameters.
# Device preference order: CUDA, then Apple MPS, then CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
TRAIN_CSV = "new-train-metadata.csv"
TRAIN_HDF5 = "train-image.hdf5"

# Improved configuration
POS_RATIO = 0.4  # Target fraction of positives after negative down-sampling (see balance_dataset)
NUM_FOLDS = 5  # Number of stratified CV folds
TOP_K = 3  # Number of best folds (by validation AUC) recorded per model
SEED = 42  # Seed for down-sampling and the fold split
PATIENCE = 7  # Epochs without AUC improvement before early stopping
BATCH_SIZE = 16  # Smaller batch size for better generalization
MAX_EPOCHS = 25  # More epochs with early stopping
# Each entry is (timm model name, square input size in pixels).
BASE_MODELS = [
    ("efficientnetv2_rw_s", 384),  # More modern, stronger backbone
    ("convnext_small", 384),       # Different architecture family
    ("eca_nfnet_l0", 384)          # ECA-NFNet with attention
]
USE_SWA = False  # Disabled SWA (NOTE: not referenced by the visible code)
AUG_STRENGTH = 0.7  # Scales the probabilities of the augmentation groups
MIXUP_ALPHA = 0.4  # Beta(alpha, alpha) parameter; the training loop passes it to both mixup and cutmix
CUTMIX_PROB = 0.3  # NOTE: not referenced by the visible code; train_fold hard-codes its mixing probabilities
LABEL_SMOOTHING = 0.05  # Label smoothing passed to the focal loss

# ========== DATASET ==========
class ISIC_HDF5_MetaDataset(Dataset):
    """Training dataset pairing HDF5-stored images with engineered metadata.

    Each item is ``(image, meta, label, isic_id)`` where ``image`` comes from
    the encoded bytes stored under ``isic_id`` in the HDF5 file, ``meta`` is a
    standardized float32 feature vector and ``label`` is the ``target`` column.

    Args:
        df: DataFrame with ``isic_id``, ``target``, ``sex``,
            ``anatom_site_general`` and ``age_approx`` columns.
        hdf5_path: Path to the HDF5 file holding the encoded images.
        img_size: Side length used by the fallback resize when no transform
            is given.
        transform: Optional albumentations-style callable invoked as
            ``transform(image=rgb)`` and returning a dict with an ``"image"``
            entry.

    NOTE(review): the label encoders and the scaler are fitted on whatever
    frame is passed in, so train/validation/test datasets each get their own
    encoding and scaling — confirm this is intended.
    """

    def __init__(self, df, hdf5_path, img_size=384, transform=None):
        self.df = df.reset_index(drop=True)
        self.hdf5_path = hdf5_path
        self.transform = transform
        self.img_size = img_size
        self.meta = self._preprocess_metadata(self.df)

    def _preprocess_metadata(self, df):
        """Build the standardized metadata matrix (one row per sample).

        Returns:
            torch.Tensor: float32 tensor of the base features plus their
            pairwise interaction terms, standardized column-wise.
        """
        df = df.copy()
        # Enhanced feature engineering
        df['sex'] = LabelEncoder().fit_transform(df['sex'].fillna("unknown"))
        df['anatom_site_general'] = LabelEncoder().fit_transform(df['anatom_site_general'].fillna("unknown"))

        # Median imputation; fall back to 0 when every age is missing so no
        # NaN reaches the downstream sklearn transformers.
        age_median = df['age_approx'].median()
        if pd.isna(age_median):
            age_median = 0.0
        df['age_approx'] = df['age_approx'].fillna(age_median)

        # Add age bins as categorical features. Ages are clipped to the bin
        # range and the lowest edge is included so that age 0 or ages above
        # 100 get a bin instead of NaN; in-range ages keep their original bin.
        age_bins = [0, 30, 45, 60, 75, 100]
        clipped_age = df['age_approx'].clip(lower=age_bins[0], upper=age_bins[-1])
        df['age_bin'] = pd.cut(clipped_age, bins=age_bins, labels=False, include_lowest=True)

        # Create more sophisticated feature interactions
        base_features = df[['age_approx', 'sex', 'anatom_site_general', 'age_bin']].values
        poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
        interactions = poly.fit_transform(base_features)

        # Apply standardization
        meta_features = StandardScaler().fit_transform(interactions)
        return torch.tensor(meta_features, dtype=torch.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        isic_id = row["isic_id"]
        label = torch.tensor(row["target"], dtype=torch.float32)
        meta = self.meta[idx]

        # The file is opened per item, so no handle is shared between
        # DataLoader worker processes.
        with h5py.File(self.hdf5_path, 'r') as hf:
            encoded_bytes = hf[isic_id][()]
        image_bgr = cv2.imdecode(encoded_bytes, cv2.IMREAD_COLOR)
        if image_bgr is None:
            # imdecode signals failure by returning None; fail with context
            # rather than with an opaque cvtColor error.
            raise ValueError(f"Could not decode image for isic_id {isic_id!r}")
        image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

        if self.transform:
            transformed = self.transform(image=image_rgb)
            image = transformed["image"]
        else:
            # Default resize if no transform
            image = cv2.resize(image_rgb, (self.img_size, self.img_size))
            image = torch.from_numpy(image.transpose(2, 0, 1)).float() / 255.0

        return image, meta, label, isic_id

# ========== ATTENTION MODULE ==========
class SpatialAttention(nn.Module):
    """Gate a feature map with a learned single-channel sigmoid mask.

    Two 1x1 convolutions (with a channel bottleneck of ``in_channels // 8``)
    produce one attention value per spatial location; the input is multiplied
    by that map, broadcasting over channels.
    """

    def __init__(self, in_channels):
        super().__init__()
        bottleneck = in_channels // 8
        stages = [
            nn.Conv2d(in_channels, bottleneck, kernel_size=1),
            nn.BatchNorm2d(bottleneck),
            nn.ReLU(),
            nn.Conv2d(bottleneck, 1, kernel_size=1),
            nn.Sigmoid(),
        ]
        # Attribute name and layer order are kept so state-dict keys stay
        # ``conv.<index>.*``.
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        return x * self.conv(x)

# ========== MODEL ==========
class AdvancedModelMeta(nn.Module):
    """Image backbone fused with a metadata MLP, producing one logit.

    Args:
        model_name: timm model identifier for the image backbone.
        meta_dim: Width of the metadata feature vector.
        pretrained: Whether timm should load pretrained backbone weights.
    """

    def __init__(self, model_name, meta_dim=18, pretrained=True):
        super().__init__()
        # Use more powerful backbone
        self.backbone = timm.create_model(model_name, pretrained=pretrained, num_classes=0)
        feat_dim = self.backbone.num_features

        # Add attention mechanism
        self.attention = SpatialAttention(feat_dim)

        # Enhanced metadata processing path: meta_dim -> 256 -> 128 -> 64
        meta_layers = [
            nn.Linear(meta_dim, 256),
            nn.LayerNorm(256),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.LayerNorm(64),
            nn.GELU(),
        ]
        self.meta_fc = nn.Sequential(*meta_layers)

        # Combined head over concatenated image + metadata features
        head_layers = [
            nn.Dropout(0.5),
            nn.Linear(feat_dim + 64, 128),
            nn.LayerNorm(128),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x_img, x_meta):
        """Return logits of shape (batch, 1) for an image/metadata batch."""
        feats = self.backbone(x_img)
        # Attention and pooling only apply when the backbone returns a
        # spatial (B, C, H, W) map.
        # NOTE(review): timm backbones built with num_classes=0 presumably
        # return already-pooled (B, C) features, in which case this branch
        # (and self.attention) is never exercised — confirm.
        if feats.dim() == 4:
            feats = self.attention(feats)
            feats = F.adaptive_avg_pool2d(feats, 1).flatten(1)

        # Process metadata
        meta_feats = self.meta_fc(x_meta)

        # Combine features
        fused = torch.cat([feats, meta_feats], dim=1)
        return self.head(fused)

# ========== LOSS ==========
class FocalLossWithSmoothing(nn.Module):
    """Binary focal loss on logits with optional label smoothing.

    Args:
        alpha: Constant multiplier applied to every element's loss.
        gamma: Exponent of the ``(1 - p_t)`` focusing term.
        smoothing: Label-smoothing amount; targets are pulled toward 0.5.
    """

    def __init__(self, alpha=0.25, gamma=2.0, smoothing=0.05):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.smoothing = smoothing

    def forward(self, inputs, targets):
        """Return the mean focal loss for logits ``inputs`` vs ``targets``."""
        soft_targets = targets
        if self.smoothing > 0:
            # Binary smoothing: y -> y * (1 - s) + 0.5 * s
            soft_targets = targets * (1 - self.smoothing) + 0.5 * self.smoothing

        per_element = F.binary_cross_entropy_with_logits(inputs, soft_targets, reduction='none')
        # exp(-BCE) stands in for the probability assigned to the target.
        p_t = torch.exp(-per_element)
        weighted = self.alpha * (1 - p_t) ** self.gamma * per_element
        return weighted.mean()

# ========== UTILS ==========
def balance_dataset(df, pos_ratio, seed=42):
    """Down-sample negatives so positives make up roughly ``pos_ratio``.

    All rows with ``target == 1`` are kept. Negatives are sampled without
    replacement (capped at the number available) and the combined frame is
    shuffled; both steps use ``seed``.

    Args:
        df: DataFrame with a binary ``target`` column.
        pos_ratio: Desired fraction of positive rows in the result.
        seed: Random state for sampling and shuffling.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index.
    """
    positives = df[df['target'] == 1]
    negatives = df[df['target'] == 0]
    n_pos = len(positives)

    # With fewer than 200 positives, keep slightly more negatives by lowering
    # the ratio by 0.05 (never below 0.35).
    ratio = max(0.35, pos_ratio - 0.05) if n_pos < 200 else pos_ratio

    wanted_neg = int((n_pos * (1 - ratio)) / ratio)
    n_neg = min(wanted_neg, len(negatives))
    sampled_negatives = negatives.sample(n=n_neg, random_state=seed)

    combined = pd.concat([positives, sampled_negatives])
    shuffled = combined.sample(frac=1.0, random_state=seed)
    return shuffled.reset_index(drop=True)

# ========== TRANSFORMS ==========
def get_train_transforms(img_size, aug_strength=0.7):
    """Build the albumentations training pipeline.

    Args:
        img_size: Side length of the square output crop.
        aug_strength: Multiplier on the probability of each augmentation
            group (geometric, blur/noise, distortion, colour, dropout).

    Returns:
        An ``A.Compose`` ending in ImageNet normalization and ``ToTensorV2``.
    """
    crop = A.RandomResizedCrop(size=(img_size, img_size), scale=(0.8, 1.0))

    # Exactly one member of each OneOf group is applied when the group fires.
    geometric = A.OneOf(
        [
            A.HorizontalFlip(p=0.5),
            A.VerticalFlip(p=0.5),
            A.RandomRotate90(p=0.5),
            A.Transpose(p=0.5),
        ],
        p=aug_strength,
    )
    blur_or_noise = A.OneOf(
        [
            A.MotionBlur(blur_limit=3, p=0.2),
            A.MedianBlur(blur_limit=3, p=0.3),
            A.GaussianBlur(blur_limit=3, p=0.3),
            A.GaussNoise(var_limit=(10.0, 50.0), p=0.2),
        ],
        p=aug_strength*0.7,
    )
    distortion = A.OneOf(
        [
            A.OpticalDistortion(p=0.3),
            A.GridDistortion(p=0.2),
            A.ElasticTransform(p=0.2),
        ],
        p=aug_strength*0.5,
    )
    colour = A.OneOf(
        [
            A.CLAHE(clip_limit=2, p=0.5),
            A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
            A.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=30, val_shift_limit=20, p=0.5),
            A.RGBShift(r_shift_limit=20, g_shift_limit=20, b_shift_limit=20, p=0.5),
        ],
        p=aug_strength*0.8,
    )
    dropout = A.CoarseDropout(max_holes=8, max_height=32, max_width=32, min_holes=2, p=aug_strength*0.3)
    normalize = A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    return A.Compose([
        crop,
        geometric,
        blur_or_noise,
        distortion,
        colour,
        dropout,
        normalize,
        ToTensorV2(),
    ])

def get_valid_transforms(img_size):
    """Build the deterministic validation pipeline.

    Resizes to ``img_size`` x ``img_size``, applies ImageNet normalization and
    converts to a tensor; no random augmentation is involved.
    """
    steps = [
        A.Resize(img_size, img_size),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]
    return A.Compose(steps)

# ========== AUGMENTATION STRATEGIES ==========
def mixup_data(x, meta, y, alpha=0.4):
    """Apply mixup to a batch of images and metadata.

    Each sample is blended with another sample of the same batch (chosen by a
    random permutation) using a single coefficient drawn from
    ``Beta(alpha, alpha)``.

    Args:
        x: Image batch; the first dimension is the batch dimension.
        meta: Metadata batch with the same batch dimension as ``x``.
        y: Targets with the same batch dimension as ``x``.
        alpha: Beta distribution parameter; ``alpha <= 0`` disables mixing.

    Returns:
        tuple: ``(mixed_x, mixed_meta, y_a, y_b, lam)`` where ``y_a`` are the
        original targets, ``y_b`` the targets of the mixed-in samples and
        ``lam`` the weight of the original sample.
    """
    lam = float(np.random.beta(alpha, alpha)) if alpha > 0 else 1.0

    batch_size = x.size(0)
    # Put the permutation on the batch's own device instead of relying on a
    # module-level ``device`` global that may not match the inputs.
    index = torch.randperm(batch_size).to(x.device)

    mixed_x = lam * x + (1 - lam) * x[index]
    mixed_meta = lam * meta + (1 - lam) * meta[index]
    y_a, y_b = y, y[index]
    return mixed_x, mixed_meta, y_a, y_b, lam

def cutmix_data(x, meta, y, alpha=1.0):
    """Apply CutMix to an image batch and blend metadata accordingly.

    A rectangle of each image is replaced by the same rectangle from another
    sample of the batch (chosen by a random permutation). The returned
    coefficient is recomputed from the rectangle actually pasted after
    clipping to the image, and the metadata is blended linearly with it.

    Args:
        x: Image batch indexed as (batch, channel, height, width).
        meta: Metadata batch with the same batch dimension as ``x``.
        y: Targets with the same batch dimension as ``x``.
        alpha: Beta distribution parameter; ``alpha <= 0`` returns the inputs
            unchanged with ``lam == 1.0``.

    Returns:
        tuple: ``(x_mixed, meta_mixed, y_a, y_b, lam)`` with ``lam`` a plain
        Python float giving the weight of the original sample.
    """
    if alpha <= 0:
        return x, meta, y, y, 1.0

    lam = np.random.beta(alpha, alpha)

    batch_size = x.size(0)
    # Put the permutation on the batch's own device instead of relying on a
    # module-level ``device`` global that may not match the inputs.
    index = torch.randperm(batch_size).to(x.device)

    # Get cutmix dimensions
    h, w = x.shape[2], x.shape[3]
    cut_h, cut_w = int(h * np.sqrt(1 - lam)), int(w * np.sqrt(1 - lam))
    cy, cx = np.random.randint(h), np.random.randint(w)

    # Bounding box clipped to the image, as plain ints for tensor slicing
    bbx1 = int(np.clip(cx - cut_w // 2, 0, w))
    bby1 = int(np.clip(cy - cut_h // 2, 0, h))
    bbx2 = int(np.clip(cx + cut_w // 2, 0, w))
    bby2 = int(np.clip(cy + cut_h // 2, 0, h))

    # Apply cutmix to images
    x_mixed = x.clone()
    x_mixed[:, :, bby1:bby2, bbx1:bbx2] = x[index, :, bby1:bby2, bbx1:bbx2]

    # Update lambda to reflect the actual area ratio. Kept as a Python float
    # so downstream tensor arithmetic does not depend on how NumPy scalars
    # are promoted.
    lam = 1.0 - ((bbx2 - bbx1) * (bby2 - bby1) / (h * w))

    # Mix metadata too (like mixup)
    meta_mixed = lam * meta + (1 - lam) * meta[index]

    return x_mixed, meta_mixed, y, y[index], lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Blend the loss against both target sets of a mixed batch.

    Returns ``lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)``.
    """
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

# ========== TRAINING ==========
def train_fold(model_config, fold, train_idx, val_idx, df):
    """Train one backbone on one cross-validation fold.

    Runs up to ``MAX_EPOCHS`` epochs with random mixup/cutmix, focal loss and
    AdamW under a cosine warm-restart schedule. After every epoch the
    validation ROC-AUC is computed; the state dict is saved to
    ``{model_name}_fold{fold}.pt`` whenever the AUC improves, and training
    stops after ``PATIENCE`` epochs without improvement. The per-epoch history
    is written to ``history_{model_name}_fold{fold}.csv``.

    Args:
        model_config: ``(model_name, img_size)`` tuple.
        fold: Fold number, used in log messages and output file names.
        train_idx: Positional row indices of ``df`` used for training.
        val_idx: Positional row indices of ``df`` used for validation.
        df: Full (already balanced) training DataFrame.

    Returns:
        tuple: ``(best_auc, model_name)``.
    """
    model_name, img_size = model_config

    print(f"Training {model_name} with image size {img_size} on fold {fold}")

    train_df = df.iloc[train_idx].reset_index(drop=True)
    val_df = df.iloc[val_idx].reset_index(drop=True)

    train_dataset = ISIC_HDF5_MetaDataset(
        train_df, TRAIN_HDF5, img_size=img_size,
        transform=get_train_transforms(img_size, AUG_STRENGTH)
    )
    val_dataset = ISIC_HDF5_MetaDataset(
        val_df, TRAIN_HDF5, img_size=img_size,
        transform=get_valid_transforms(img_size)
    )

    train_loader = DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True,
        num_workers=2, pin_memory=True
    )
    val_loader = DataLoader(
        val_dataset, batch_size=BATCH_SIZE, shuffle=False,
        num_workers=2, pin_memory=True
    )

    # Get the number of metadata features dynamically from the dataset
    meta_dim = train_dataset.meta.shape[1]

    model = AdvancedModelMeta(model_name, meta_dim=meta_dim).to(device)

    # Enhanced loss with label smoothing
    criterion = FocalLossWithSmoothing(
        alpha=0.25, gamma=2.0, smoothing=LABEL_SMOOTHING
    )

    # Different learning rates for backbone and new layers
    param_groups = [
        {'params': model.backbone.parameters(), 'lr': 2e-5},
        {'params': model.attention.parameters(), 'lr': 5e-4},
        {'params': model.meta_fc.parameters(), 'lr': 4e-4},
        {'params': model.head.parameters(), 'lr': 4e-4}
    ]

    optimizer = optim.AdamW(param_groups, weight_decay=1e-4)

    # LR scheduler - Cosine annealing with warm restarts (stepped per epoch)
    scheduler = CosineAnnealingWarmRestarts(
        optimizer, T_0=5, T_mult=1, eta_min=1e-6
    )

    # Loss scaling is only meaningful on CUDA; disable it explicitly elsewhere
    # instead of relying on the scaler to warn and switch itself off.
    scaler = GradScaler(enabled=(device.type == "cuda"))

    # Autocast is requested with the 'cpu' device type when running on MPS.
    amp_device_type = device.type if device.type != 'mps' else 'cpu'

    best_auc = 0
    no_improve_epochs = 0
    history = []

    for epoch in range(1, MAX_EPOCHS + 1):
        model.train()
        running_loss = 0.0

        for x_img, x_meta, y, _ in tqdm(train_loader, desc=f"Fold {fold} | Epoch {epoch}"):
            x_img, x_meta, y = x_img.to(device), x_meta.to(device), y.to(device)

            # Apply mixup or cutmix with probability
            rand_prob = np.random.random()
            if rand_prob < 0.4:  # 40% chance for mixup
                x_img, x_meta, y_a, y_b, lam = mixup_data(x_img, x_meta, y, alpha=MIXUP_ALPHA)
                mixed = True
            elif rand_prob < 0.6:  # 20% chance for cutmix
                x_img, x_meta, y_a, y_b, lam = cutmix_data(x_img, x_meta, y, alpha=MIXUP_ALPHA)
                mixed = True
            else:  # 40% no mixing
                y_a, y_b, lam = y, y, 1.0
                mixed = False

            optimizer.zero_grad()
            with autocast(device_type=amp_device_type):
                logits = model(x_img, x_meta).view(-1)
                if mixed:
                    loss = mixup_criterion(criterion, logits, y_a, y_b, lam)
                else:
                    loss = criterion(logits, y)

            # Use gradient scaling with mixed precision; gradients are
            # unscaled before clipping so the norm threshold applies to the
            # true gradient values.
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            nn_utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item()

        # Update learning rate
        scheduler.step()

        # Validation phase
        model.eval()
        probs, targets = [], []
        val_loss = 0.0

        with torch.no_grad():
            for x_img, x_meta, y, _ in val_loader:
                x_img, x_meta, y = x_img.to(device), x_meta.to(device), y.to(device)

                with autocast(device_type=amp_device_type):
                    logits = model(x_img, x_meta).view(-1)
                    loss = criterion(logits, y)
                val_loss += loss.item()

                # Autocast may leave the logits in a reduced-precision dtype
                # (bfloat16 on CPU) that NumPy cannot represent; cast to
                # float32 before converting.
                preds = torch.sigmoid(logits.float()).cpu().numpy()
                probs.extend(preds)
                targets.extend(y.cpu().numpy())

        # Calculate metrics
        avg_train_loss = running_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        auc = roc_auc_score(targets, probs)

        history.append({
            'epoch': epoch,
            'train_loss': avg_train_loss,
            'val_loss': avg_val_loss,
            'auc': auc
        })

        print(f"Fold {fold} | Epoch {epoch} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | AUC: {auc:.4f}")

        # Save model on improved performance
        if auc > best_auc:
            best_auc = auc
            # Include model name in the saved file
            model_path = f"{model_name}_fold{fold}.pt"
            torch.save(model.state_dict(), model_path)
            print(f"✅ Saved model to {model_path} with AUC {auc:.4f}")
            no_improve_epochs = 0
        else:
            no_improve_epochs += 1
            if no_improve_epochs >= PATIENCE:
                print(f"⛔ Early stopping on epoch {epoch} for fold {fold}")
                break

    # Save training history
    history_df = pd.DataFrame(history)
    history_df.to_csv(f"history_{model_name}_fold{fold}.csv", index=False)

    return best_auc, model_name

# ========== MAIN ==========
# Training entry point: balance the data, train every backbone on every fold,
# then record per-fold AUCs and the best folds of each backbone.
if __name__ == "__main__":
    print(f"Using device: {device}")
    print(f"Loading data from {TRAIN_CSV}")

    df = pd.read_csv(TRAIN_CSV)
    print(f"Original dataset size: {len(df)}")

    # Balance dataset
    df = balance_dataset(df, pos_ratio=POS_RATIO, seed=SEED)
    print(f"Balanced dataset size: {len(df)}")

    # Set up cross-validation; the same seeded splitter is reused for every
    # backbone.
    skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

    # Dictionary to track performance across models and folds
    # (model name -> list of best AUCs, in fold order)
    results = defaultdict(list)

    # Train each model architecture on each fold
    for model_config in BASE_MODELS:
        model_name = model_config[0]
        print(f"\n{'='*20} Training {model_name} {'='*20}")

        for fold, (train_idx, val_idx) in enumerate(skf.split(df, df['target'])):
            print(f"\n{'='*20} {model_name} - Fold {fold} {'='*20}")
            # train_fold returns the model name it was given, so this
            # rebinding of model_name does not change its value.
            auc, model_name = train_fold(model_config, fold, train_idx, val_idx, df)
            results[model_name].append(auc)

    # Save results for all models and folds
    all_results = []
    for model_name, aucs in results.items():
        for fold, auc in enumerate(aucs):
            all_results.append({
                'model': model_name,
                'fold': fold,
                'auc': auc
            })

    pd.DataFrame(all_results).to_csv("all_fold_results.csv", index=False)

    # For each model, select top K folds
    top_folds = {}
    for model_name, aucs in results.items():
        # Indices of the TOP_K highest AUCs, best first.
        top_idx = np.argsort(aucs)[-TOP_K:][::-1]
        top_folds[model_name] = top_idx.tolist()

        # Write top folds to file, one fold index per line (read back by
        # get_top_folds in the inference cell).
        with open(f"top_folds_{model_name}.txt", "w") as f:
            for fold in top_idx:
                f.write(f"{fold}\n")

        print(f"\n✅ {model_name} - Best folds: {top_idx.tolist()} with AUCs: {[aucs[i] for i in top_idx]}")

    print("\n✅ Training complete!")
    print(f"Results saved to all_fold_results.csv")
    print(f"Top folds saved to top_folds_<model_name>.txt files")

In [None]:
import os
import h5py
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm
import warnings

warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# ========== CONFIG ==========
# Inference configuration: device selection, input/output paths, ensemble setup.
# Device preference order: CUDA, then Apple MPS, then CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
TEST_CSV = "students-test-metadata.csv"
TEST_HDF5 = "test-image.hdf5"
OUTPUT_CSV = "predictions.csv"

# Inference configuration
BATCH_SIZE = 16
TOP_K = 3  # Use the K best folds of each model for the ensemble

# Each entry is (timm model name, square input size in pixels).
BASE_MODELS = [
    ("efficientnetv2_rw_s", 384),
    ("convnext_small", 384),
    ("eca_nfnet_l0", 384)
]

# Relative weight of each architecture in the final weighted average.
MODEL_WEIGHTS = {
    "efficientnetv2_rw_s": 1.0,  # Tune these values according to validation performance
    "convnext_small": 1.2,
    "eca_nfnet_l0": 0.8
}
# ========== ATTENTION MODULE ==========
class SpatialAttention(nn.Module):
    """Gate a feature map with a learned single-channel sigmoid mask.

    Two 1x1 convolutions (with a channel bottleneck of ``in_channels // 8``)
    produce one attention value per spatial location; the input is multiplied
    by that map, broadcasting over channels.
    """

    def __init__(self, in_channels):
        super().__init__()
        bottleneck = in_channels // 8
        stages = [
            nn.Conv2d(in_channels, bottleneck, kernel_size=1),
            nn.BatchNorm2d(bottleneck),
            nn.ReLU(),
            nn.Conv2d(bottleneck, 1, kernel_size=1),
            nn.Sigmoid(),
        ]
        # Attribute name and layer order are kept so saved state-dict keys
        # (``conv.<index>.*``) still load.
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        return x * self.conv(x)

# ========== MODEL ==========
class AdvancedModelMeta(nn.Module):
    """Image backbone fused with a metadata MLP, producing one logit.

    Inference-side definition: ``pretrained`` defaults to ``False`` because
    the weights are expected to come from a saved state dict.

    Args:
        model_name: timm model identifier for the image backbone.
        meta_dim: Width of the metadata feature vector.
        pretrained: Whether timm should load pretrained backbone weights.
    """

    def __init__(self, model_name, meta_dim=18, pretrained=False):
        super().__init__()
        # Backbone
        self.backbone = timm.create_model(model_name, pretrained=pretrained, num_classes=0)
        feat_dim = self.backbone.num_features

        # Attention mechanism
        self.attention = SpatialAttention(feat_dim)

        # Metadata processing path: meta_dim -> 256 -> 128 -> 64
        meta_layers = [
            nn.Linear(meta_dim, 256),
            nn.LayerNorm(256),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.LayerNorm(64),
            nn.GELU(),
        ]
        self.meta_fc = nn.Sequential(*meta_layers)

        # Combined head over concatenated image + metadata features
        head_layers = [
            nn.Dropout(0.5),
            nn.Linear(feat_dim + 64, 128),
            nn.LayerNorm(128),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, x_img, x_meta):
        """Return logits of shape (batch, 1) for an image/metadata batch."""
        feats = self.backbone(x_img)
        # Attention and pooling only apply when the backbone returns a
        # spatial (B, C, H, W) map.
        # NOTE(review): timm backbones built with num_classes=0 presumably
        # return already-pooled (B, C) features, in which case this branch
        # (and self.attention) is never exercised — confirm.
        if feats.dim() == 4:
            feats = self.attention(feats)
            feats = F.adaptive_avg_pool2d(feats, 1).flatten(1)

        # Process metadata
        meta_feats = self.meta_fc(x_meta)

        # Combine features
        fused = torch.cat([feats, meta_feats], dim=1)
        return self.head(fused)

# ========== DATASET ==========
class ISIC_HDF5_TestDataset(Dataset):
    """Unlabelled test dataset pairing HDF5-stored images with metadata.

    Each item is ``(image, meta, isic_id)``.

    Args:
        df: DataFrame with ``isic_id``, ``sex``, ``anatom_site_general`` and
            ``age_approx`` columns.
        hdf5_path: Path to the HDF5 file holding the encoded images.
        img_size: Side length used by the fallback resize when no transform
            is given.
        transform: Optional albumentations-style callable invoked as
            ``transform(image=rgb)`` and returning a dict with an ``"image"``
            entry.

    NOTE(review): the label encoders and the scaler are fitted on the test
    frame itself, so category codes and scaling may differ from those seen
    during training — confirm this is acceptable.
    """

    def __init__(self, df, hdf5_path, img_size=384, transform=None):
        self.df = df.reset_index(drop=True)
        self.hdf5_path = hdf5_path
        self.transform = transform
        self.img_size = img_size
        self.meta = self._preprocess_metadata(self.df)

    def _preprocess_metadata(self, df):
        """Build the standardized metadata matrix (one row per sample).

        Returns:
            torch.Tensor: float32 tensor of the base features plus their
            pairwise interaction terms, standardized column-wise.
        """
        df = df.copy()
        # Apply the same preprocessing steps as in training
        df['sex'] = LabelEncoder().fit_transform(df['sex'].fillna("unknown"))
        df['anatom_site_general'] = LabelEncoder().fit_transform(df['anatom_site_general'].fillna("unknown"))

        # Median imputation; fall back to 0 when every age is missing so no
        # NaN reaches the downstream sklearn transformers.
        age_median = df['age_approx'].median()
        if pd.isna(age_median):
            age_median = 0.0
        df['age_approx'] = df['age_approx'].fillna(age_median)

        # Add age bins as categorical features. Ages are clipped to the bin
        # range and the lowest edge is included so that age 0 or ages above
        # 100 get a bin instead of NaN; in-range ages keep their original bin.
        age_bins = [0, 30, 45, 60, 75, 100]
        clipped_age = df['age_approx'].clip(lower=age_bins[0], upper=age_bins[-1])
        df['age_bin'] = pd.cut(clipped_age, bins=age_bins, labels=False, include_lowest=True)

        # Create feature interactions
        base_features = df[['age_approx', 'sex', 'anatom_site_general', 'age_bin']].values
        poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
        interactions = poly.fit_transform(base_features)

        # Apply standardization
        meta_features = StandardScaler().fit_transform(interactions)
        return torch.tensor(meta_features, dtype=torch.float32)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        isic_id = row["isic_id"]
        meta = self.meta[idx]

        # The file is opened per item, so no handle is shared between
        # DataLoader worker processes.
        with h5py.File(self.hdf5_path, 'r') as hf:
            encoded_bytes = hf[isic_id][()]
        image_bgr = cv2.imdecode(encoded_bytes, cv2.IMREAD_COLOR)
        if image_bgr is None:
            # imdecode signals failure by returning None; fail with context
            # rather than with an opaque cvtColor error.
            raise ValueError(f"Could not decode image for isic_id {isic_id!r}")
        image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

        if self.transform:
            transformed = self.transform(image=image_rgb)
            image = transformed["image"]
        else:
            # Default resize if no transform
            image = cv2.resize(image_rgb, (self.img_size, self.img_size))
            image = torch.from_numpy(image.transpose(2, 0, 1)).float() / 255.0

        return image, meta, isic_id

# ========== TEST TIME AUGMENTATION ==========
def tta_inference(model, img, meta, tta_transforms, device):
    """Average sigmoid predictions over the original batch and its TTA views.

    Args:
        model: Module called as ``model(images, meta)`` and returning logits.
        img: Image batch tensor.
        meta: Metadata batch tensor.
        tta_transforms: Iterable of callables, each mapping an image batch to
            an augmented image batch.
        device: Device on which the model is evaluated.

    Returns:
        numpy.ndarray: Element-wise mean of the probabilities from the
        untransformed batch and every augmented batch.
    """
    model.eval()

    def _predict(batch):
        # Forward pass without gradient tracking; probabilities as NumPy.
        with torch.no_grad():
            logits = model(batch.to(device), meta.to(device))
            return torch.sigmoid(logits).cpu().numpy()

    # Original prediction first, then one prediction per augmentation.
    all_probs = [_predict(img)]
    for augment in tta_transforms:
        # Each augmentation is applied to a CPU copy of the batch.
        all_probs.append(_predict(augment(img.cpu()).to(device)))

    # Average predictions
    return np.mean(all_probs, axis=0)

# ========== TRANSFORMS ==========
def get_test_transforms(img_size):
    """Build the deterministic test-time pipeline.

    Resizes to ``img_size`` x ``img_size``, applies ImageNet normalization and
    converts to a tensor.
    """
    steps = [
        A.Resize(img_size, img_size),
        A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ToTensorV2(),
    ]
    return A.Compose(steps)

def get_tta_transforms():
    """Return the test-time augmentations as tensor -> tensor callables.

    All four operate on the last two dimensions of the input: horizontal
    flip, vertical flip, 90-degree rotation and 180-degree rotation, in that
    order.
    """
    def horizontal_flip(x):
        return torch.flip(x, dims=[-1])

    def vertical_flip(x):
        return torch.flip(x, dims=[-2])

    def rotate_90(x):
        return torch.rot90(x, k=1, dims=[-2, -1])

    def rotate_180(x):
        return torch.rot90(x, k=2, dims=[-2, -1])

    return [horizontal_flip, vertical_flip, rotate_90, rotate_180]

# ========== LOAD TOP FOLDS ==========
def get_top_folds():
    """Read the best fold indices of every model in ``BASE_MODELS``.

    For each model, ``top_folds_<model_name>.txt`` is expected to contain one
    integer fold index per line; the first ``TOP_K`` are kept. When the file
    is missing, a warning is printed and folds ``0 .. TOP_K-1`` are used.

    Returns:
        dict: model name -> list of fold indices.
    """
    folds_by_model = {}
    for model_name, _ in BASE_MODELS:
        path = f"top_folds_{model_name}.txt"
        try:
            with open(path, "r") as handle:
                parsed = [int(raw.strip()) for raw in handle]
        except FileNotFoundError:
            print(f"Warning: Could not find top folds file for {model_name}. Using folds 0-{TOP_K-1}.")
            folds_by_model[model_name] = list(range(TOP_K))
        else:
            folds_by_model[model_name] = parsed[:TOP_K]
    return folds_by_model


# ========== INFERENCE ==========
def _weighted_ensemble_prediction(predictions_by_id, model_weights):
    """Weighted average of per-model predictions, keyed by model name.

    Args:
        predictions_by_id: Mapping ``isic_id -> {model_name: probability}``.
        model_weights: Mapping ``model_name -> weight``; models missing from
            it get weight 1.0.

    Returns:
        dict: ``isic_id -> weighted mean probability`` over the models that
        actually produced a prediction for that id.
    """
    final_predictions = {}
    for isic_id, per_model in predictions_by_id.items():
        weighted_sum = 0.0
        total_weight = 0.0
        for name, pred in per_model.items():
            weight = model_weights.get(name, 1.0)
            weighted_sum += pred * weight
            total_weight += weight
        final_predictions[isic_id] = weighted_sum / total_weight
    return final_predictions


def run_inference():
    """Run the full TTA + multi-fold + multi-model ensemble on the test set.

    For every architecture in ``BASE_MODELS`` the selected folds are loaded
    from ``{model_name}_fold{fold}.pt`` (folds whose file is missing are
    skipped), predictions are averaged over TTA views and folds, and the
    per-architecture means are combined with ``MODEL_WEIGHTS``. The result is
    written to ``OUTPUT_CSV`` with ``isic_id`` and ``target`` columns.

    Raises:
        RuntimeError: If no weight file could be loaded for any model, since
            there would be nothing to write.
    """
    print(f"Using device: {device}")
    print(f"Loading test data from {TEST_CSV}")

    df_test = pd.read_csv(TEST_CSV)
    print(f"Test dataset size: {len(df_test)}")

    # Prepare top folds for each model
    top_folds_dict = get_top_folds()
    print("Top folds for each model:")
    for model_name, folds in top_folds_dict.items():
        print(f"  - {model_name}: {folds}")

    # isic_id -> {model_name: fold-averaged probability}. Keying by model
    # name keeps each prediction attached to the right ensemble weight even
    # when an architecture contributes nothing because its weights are
    # missing.
    test_predictions = {}

    # The TTA callables are stateless, so build them once.
    tta_transforms = get_tta_transforms()

    # Run inference for each model and fold
    for model_name, img_size in BASE_MODELS:
        print(f"\n{'='*20} Inference for {model_name} {'='*20}")

        # Create dataset with appropriate image size
        test_dataset = ISIC_HDF5_TestDataset(
            df_test, TEST_HDF5, img_size=img_size,
            transform=get_test_transforms(img_size)
        )

        test_loader = DataLoader(
            test_dataset, batch_size=BATCH_SIZE, shuffle=False,
            num_workers=2, pin_memory=True
        )

        # Get meta dimension from the dataset
        meta_dim = test_dataset.meta.shape[1]

        # isic_id -> list of per-fold probabilities for this architecture
        fold_predictions = {}

        for fold in top_folds_dict[model_name]:
            print(f"Loading {model_name} - Fold {fold}")

            # Initialize model
            model = AdvancedModelMeta(model_name, meta_dim=meta_dim).to(device)

            # Try to load model weights
            try:
                model.load_state_dict(torch.load(f"{model_name}_fold{fold}.pt", map_location=device))
                print(f"✅ Loaded model from {model_name}_fold{fold}.pt")
            except FileNotFoundError:
                print(f"⚠️ Could not find weights for {model_name} fold {fold}, skipping...")
                continue

            # Run inference with TTA
            model.eval()
            with torch.no_grad():
                for images, meta, isic_ids in tqdm(test_loader, desc=f"Predicting with {model_name} fold {fold}"):
                    images = images.to(device)
                    meta = meta.to(device)

                    # Apply TTA
                    batch_preds = tta_inference(model, images, meta, tta_transforms, device)

                    # Store predictions for this fold
                    for isic_id, pred in zip(isic_ids, batch_preds.flatten().tolist()):
                        fold_predictions.setdefault(isic_id, []).append(pred)

        # Average predictions across folds for this model
        for isic_id, preds in fold_predictions.items():
            test_predictions.setdefault(isic_id, {})[model_name] = float(np.mean(preds))

    if not test_predictions:
        raise RuntimeError(
            "No model weights could be loaded; cannot build a submission."
        )

    # Weighted ensemble using the weight configured for each model
    final_predictions = _weighted_ensemble_prediction(test_predictions, MODEL_WEIGHTS)

    # Create submission dataframe
    submission = pd.DataFrame({
        'isic_id': list(final_predictions.keys()),
        'target': list(final_predictions.values())
    })

    submission.to_csv(OUTPUT_CSV, index=False)
    print(f"✅ Predictions saved to {OUTPUT_CSV}")
    print(f"Total predictions: {len(submission)}")

# Inference entry point: run the ensemble when this cell/script is executed
# directly.
if __name__ == "__main__":
    run_inference()