In [1]:
from pathlib import Path
import pandas as pd

trainval_labels1 = pd.read_csv('aptos2019-blindness-detection/train.csv')
trainval_imgs_dir1 = Path('aptos2019-blindness-detection/train_images/processed')

trainval_labels2 = pd.read_csv('diabetic-retinopathy-detection/trainLabels.csv')
trainval_imgs_dir2 = Path('diabetic-retinopathy-detection/train/processed')

In [2]:
# Standardize trainval_labels1
df1 = trainval_labels1.copy()
df1.columns = ['id', 'label']
df1['group'] = df1['id']
df1['img_dir'] = df1['id'].apply(lambda x: trainval_imgs_dir1 / f"{x}.png")

# Standardize trainval_labels2
df2 = trainval_labels2.copy()
df2.columns = ['id', 'label']
df2['group'] = df2['id'].apply(lambda x: x.split('_')[0])
df2['img_dir'] = df2['id'].apply(lambda x: trainval_imgs_dir2 / f"{x}.jpeg")

# Combine both
df_trainval = pd.concat([df1, df2], ignore_index=True)

In [4]:
from sklearn.model_selection import GroupShuffleSplit

# Initialize splitter for an 80/20 split
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Perform the split
train_idx, val_idx = next(splitter.split(df_trainval, groups=df_trainval['group']))

# Use the indices to create train and val DataFrames
df_train = df_trainval.iloc[train_idx].copy().drop(columns='group')
df_val = df_trainval.iloc[val_idx].copy().drop(columns='group')

df_train.to_csv('df_train.csv', index=False)
df_val.to_csv('df_val.csv', index=False)

==============================================================

In [1]:
import pandas as pd

df_train = pd.read_csv('df_train.csv')
df_val = pd.read_csv('df_val.csv')

In [2]:
from torch.utils.data import Dataset
from PIL import Image
import numpy as np

class TrainDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # id = row['id']
        label = int(row['label'])
        img_dir = row['img_dir']

        image = Image.open(img_dir).convert('RGB')  # ensure 3 channels

        if self.transform:
            image = np.array(image)
            image = self.transform(image=image)['image']

        return image, label

In [3]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

IMAGE_SIZE_TRAIN = 352
IMAGE_SIZE_VAL = 480

train_transform = A.Compose([
    A.Resize(IMAGE_SIZE_VAL, IMAGE_SIZE_VAL),
    A.HorizontalFlip(p=0.5),
    A.Rotate(limit=360, p=1.0),
    A.RandomCrop(IMAGE_SIZE_TRAIN, IMAGE_SIZE_TRAIN),

    A.RandomBrightnessContrast(
        brightness_limit=0.2,  # ±20% brightness
        contrast_limit=0.2,    # ±20% contrast
        p=0.3
    ),
    A.HueSaturationValue(
        hue_shift_limit=10,        # ±10 degrees
        sat_shift_limit=20,        # ±20%
        val_shift_limit=10,        # ±10%
        p=0.3
    ),
    A.GaussianBlur(blur_limit=(3, 5), p=0.3),
    A.Sharpen(alpha=(0.2, 0.5), lightness=(0.5, 1.0), p=0.3),

    A.Normalize(  # For model pretrained on ImageNet
        mean=[0.485, 0.456, 0.406],
        std =[0.229, 0.224, 0.225]
    ),
    ToTensorV2(),
])

val_transform = A.Compose([
    A.Resize(IMAGE_SIZE_VAL, IMAGE_SIZE_VAL),
    A.Normalize(  # For model pretrained on ImageNet
        mean=[0.485, 0.456, 0.406],
        std =[0.229, 0.224, 0.225]
    ),
    ToTensorV2(),
])

  data = fetch_version_info()


In [23]:
from torch.utils.data import DataLoader

BATCH_SIZE_TRAIN = 40
# BATCH_SIZE_VAL = 64
BATCH_SIZE_VAL = 16

train_dataset = TrainDataset(df_train, train_transform)
val_dataset = TrainDataset(df_val, val_transform)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE_TRAIN, shuffle=True, num_workers=10)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE_VAL, shuffle=False, num_workers=10)

============================================================

In [5]:
import torch
import torch.nn.functional as F

import timm
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from torch.optim.lr_scheduler import ReduceLROnPlateau

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [13]:
from torch import nn

class EfficientNetV2OrdinalClassifier(pl.LightningModule):
    def __init__(self, model_name="efficientnetv2_rw_m", lr=1e-4, num_classes=5):
        super().__init__()
        self.save_hyperparameters()
        self.num_classes = num_classes
        
        # Use timm to load pretrained backbone, remove classifier head
        self.net = timm.create_model(
            self.hparams.model_name,
            pretrained=True,
            num_classes=0,  # remove original head
            
            drop_rate=0.3,
            drop_path_rate=0.3
        )

        in_features = self.net.num_features
        self.head = nn.Linear(in_features, self.num_classes - 1)  # 4 outputs for 5 ordinal classes

    def forward(self, x):
        features = self.net(x)
        logits = self.head(features)
        return logits
    
    def predict_class(self, logits):
        probas = logits.sigmoid()
        return (probas > 0.5).sum(dim=1)
    
    def ordinal_targets(self, labels):
        """
        Converts integer class labels (0 to num_classes - 1) into ordinal binary targets.
        For example, label 2 becomes [1, 1, 0, 0] for num_classes = 5
        """
        batch_size = labels.size(0)
        num_thresholds = self.num_classes - 1  # one less than number of classes
        labels_expanded = labels.unsqueeze(1)  # Expand labels to shape (batch_size, 1)
        # Create comparison thresholds: shape (1, num_thresholds) = [0, 1, 2, 3]
        thresholds = torch.arange(num_thresholds, device=labels.device).unsqueeze(0)
        # Compare each label to thresholds: True where label > threshold
        binary_targets = labels_expanded > thresholds  # shape (batch_size, num_thresholds)
        return binary_targets.float()

    def training_step(self, batch, batch_idx):
        imgs, labels = batch
        logits = self(imgs)
        targets = self.ordinal_targets(labels)
        loss = F.binary_cross_entropy_with_logits(logits, targets)
        
        preds = (logits.sigmoid() > 0.5).sum(dim=1)
        acc = (preds == labels).float().mean()

        self.log('train_loss', loss, prog_bar=True, on_step=False, on_epoch=True)
        self.log('train_acc',  acc, prog_bar=True, on_step=False, on_epoch=True)
        
        return loss

    def validation_step(self, batch, batch_idx):
        imgs, labels = batch
        logits = self(imgs)
        targets = self.ordinal_targets(labels)
        loss = F.binary_cross_entropy_with_logits(logits, targets)

        preds = (logits.sigmoid() > 0.5).sum(dim=1)
        acc = (preds == labels).float().mean()
        
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc',  acc, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=5e-5)

        scheduler = {
            'scheduler': ReduceLROnPlateau(
                optimizer,
                mode='min',              # we're watching val_loss (lower is better)
                factor=0.5,              # reduce LR by this factor
                patience=5,              # after N epochs of no improvement
                min_lr=1e-6,             # don’t go below this
                verbose=True
            ),
            'monitor': 'val_loss',
            'interval': 'epoch',
            'frequency': 1
        }

        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

In [None]:
import mlflow.pytorch

mlflow.pytorch.autolog()

# pl.seed_everything(42)

checkpoint_cb = ModelCheckpoint(
    monitor='val_loss',
    dirpath='checkpoints/',
    filename='effnet-v2rw-m-ordinal-dropout2-l2reg2-augs-full-{epoch:02d}-{val_loss:.4f}-{val_acc:.4f}',
    save_top_k=1,
    mode='min',
)
earlystop_cb = EarlyStopping(
    monitor='val_loss',
    patience=15,
    mode='min',
)

trainer = pl.Trainer(
    max_epochs=50,
    callbacks=[earlystop_cb, checkpoint_cb],
    accelerator='auto',  # GPU if available
    precision='16-mixed',
    devices=1
)

model = EfficientNetV2OrdinalClassifier(lr=1e-4, num_classes=5)
trainer.fit(model, train_dataloader, val_dataloader)

print("✅ Best checkpoint:", checkpoint_cb.best_model_path)

---

In [9]:
from torch import nn

class AdjacentLabelSmoothingLoss(nn.Module):
    def __init__(self, smoothing: float = 0.1, num_classes: int = 5, window_size: int = 1):
        """
        Exponential-decay label smoothing for ordinal targets.

        Args:
          smoothing: total probability mass to smooth away from the true class (0 <= s < 1).
          num_classes: total number of ordinal classes.
          window_size: how many steps to consider on each side of the true class.
                       (1 for adjacent only, 2 to include distance-2 neighbors, etc.)
        """
        super().__init__()
        assert 0 <= smoothing < 1, "smoothing must be in [0,1)."
        self.smoothing = smoothing
        self.num_classes = num_classes
        self.window_size = window_size

        # Precompute smoothing distributions for each possible true class 0..num_classes-1
        # according to your two requirements:
        #   1) exp decay by distance
        #   2) center weight = 1 - smoothing
        R = smoothing / (1.0 - smoothing)  # ratio of total neighbor mass to center mass

        weight_matrix = []
        for t in range(num_classes):
            # how many valid steps on each side
            left_n  = min(window_size,             t)
            right_n = min(window_size, num_classes - 1 - t)

            # Solve for decay d so that sum(d^1..d^left_n) + sum(d^1..d^right_n) = R
            # Newton's method on f(d) = sum_{k=1..L} d^k + sum_{k=1..R} d^k - R = 0
            if left_n + right_n == 0:
                d = 0.0
            else:
                # initial guess
                d = R / (left_n + right_n)
                for _ in range(50):
                    # f(d) and f'(d)
                    f = sum(d**k for k in range(1, left_n+1)) + sum(d**k for k in range(1, right_n+1)) - R
                    fp = sum(k * d**(k-1) for k in range(1, left_n+1)) + sum(k * d**(k-1) for k in range(1, right_n+1))
                    d = max(d - f/(fp + 1e-12), 1e-12)

            # Build raw (unnormalized) weights for this true class t
            raw = torch.zeros(num_classes, dtype=torch.float64)
            for c in range(num_classes):
                dist = abs(c - t)
                if dist == 0:
                    raw[c] = 1.0
                elif dist <= window_size:
                    raw[c] = d**dist
                # else remains 0

            # Normalize so sum(raw) = 1 and center weight = 1 - smoothing
            raw = raw / raw.sum()
            weight_matrix.append(raw.float())

        # Stack into [num_classes, num_classes] tensor
        weight_matrix = torch.stack(weight_matrix, dim=0)  # weight_matrix[t] is the distribution for true class t
        self.register_buffer('weight_matrix', weight_matrix)

    def forward(self, logits: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        """
        logits: (B, C)
        target: (B,) integer tensor in [0..C-1]
        """
        # Get the precomputed soft-target distributions
        # shape → (B, C)
        true_dist = self.weight_matrix[target]

        # Standard cross-entropy with log-softmax
        log_probs = F.log_softmax(logits, dim=-1)
        loss = -(true_dist * log_probs).sum(dim=-1).mean()
        return loss
    
    def debug_dist(self, true_class: int) -> torch.Tensor:
        """
        Returns the smoothed distribution for a given true class.

        Args:
            true_class: int, between 0 and num_classes - 1.

        Returns:
            A tensor of shape (num_classes,) showing the target distribution.
        """
        if not (0 <= true_class < self.num_classes):
            raise ValueError(f"true_class must be between 0 and {self.num_classes - 1}, got {true_class}.")
        
        dist = self.weight_matrix[true_class]
        return dist

In [30]:
loss_fn = AdjacentLabelSmoothingLoss(smoothing=0.1, window_size=1)
print(loss_fn.debug_dist(0))
print(loss_fn.debug_dist(1))
print(loss_fn.debug_dist(2))

tensor([0.9000, 0.1000, 0.0000, 0.0000, 0.0000])
tensor([0.0500, 0.9000, 0.0500, 0.0000, 0.0000])
tensor([0.0000, 0.0500, 0.9000, 0.0500, 0.0000])


In [10]:
class ConvNeXtClassifier(pl.LightningModule):
    def __init__(self, model_name="convnext_small.fb_in22k_ft_in1k_384", lr=1e-4, num_classes=5, smoothing=0.1):
        super().__init__()
        self.save_hyperparameters()
        # create & swap in a new head
        self.net = timm.create_model(
            self.hparams.model_name,
            pretrained=True,
            num_classes=self.hparams.num_classes,
            
            drop_rate=0.2,
            drop_path_rate=0.2,
        )
        
        self.criterion = AdjacentLabelSmoothingLoss(
            smoothing=self.hparams.smoothing,
            num_classes=self.hparams.num_classes
        )

    def forward(self, x):
        return self.net(x)

    def predict_class(self, logits):
        return logits.argmax(dim=-1)

    def training_step(self, batch, batch_idx):
        imgs, labels = batch
        logits = self(imgs)
        loss = self.criterion(logits, labels)
        acc  = (logits.argmax(dim=-1) == labels).float().mean()

        self.log('train_loss', loss, prog_bar=True, on_step=False, on_epoch=True)
        self.log('train_acc',  acc, prog_bar=True, on_step=False, on_epoch=True)
        
        return loss

    def validation_step(self, batch, batch_idx):
        imgs, labels = batch
        logits = self(imgs)
        loss = self.criterion(logits, labels)
        acc  = (logits.argmax(dim=-1) == labels).float().mean()
        
        self.log('val_loss', loss, prog_bar=True, on_step=False, on_epoch=True)
        self.log('val_acc',  acc, prog_bar=True, on_step=False, on_epoch=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=1e-5)

        scheduler = {
            'scheduler': ReduceLROnPlateau(
                optimizer,
                mode='min',              # we're watching val_loss (lower is better)
                factor=0.5,              # reduce LR by this factor
                patience=5,              # after N epochs of no improvement
                min_lr=1e-6,             # don’t go below this
                verbose=True
            ),
            'monitor': 'val_loss',
            'interval': 'epoch',
            'frequency': 1
        }

        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

In [11]:
class EfficientNetV2Classifier(pl.LightningModule):
    def __init__(self, model_name="efficientnetv2_rw_m", lr=1e-4, num_classes=5, smoothing=0.1):
        super().__init__()
        self.save_hyperparameters()
        # create & swap in a new head
        self.net = timm.create_model(
            self.hparams.model_name,
            pretrained=True,
            num_classes=self.hparams.num_classes,
            
            drop_rate=0.4,        # 🔥 add stronger dropout (applied before final FC)
            drop_path_rate=0.3,   # 🔥 stochastic depth (helps regularize deep nets)
        )
        
        self.criterion = AdjacentLabelSmoothingLoss(
            smoothing=self.hparams.smoothing,
            num_classes=self.hparams.num_classes
        )

    def forward(self, x):
        return self.net(x)

    def predict_class(self, logits):
        return logits.argmax(dim=-1)

    def training_step(self, batch, batch_idx):
        imgs, labels = batch
        logits = self(imgs)
        loss = self.criterion(logits, labels)
        acc  = (logits.argmax(dim=-1) == labels).float().mean()

        self.log('train_loss', loss, prog_bar=True, on_step=False, on_epoch=True)
        self.log('train_acc',  acc, prog_bar=True, on_step=False, on_epoch=True)
        
        return loss

    def validation_step(self, batch, batch_idx):
        imgs, labels = batch
        logits = self(imgs)
        loss = self.criterion(logits, labels)
        acc  = (logits.argmax(dim=-1) == labels).float().mean()
        
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc',  acc, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=1e-4)

        scheduler = {
            'scheduler': ReduceLROnPlateau(
                optimizer,
                mode='min',              # we're watching val_loss (lower is better)
                factor=0.5,              # reduce LR by this factor
                patience=5,              # after N epochs of no improvement
                min_lr=1e-6,             # don’t go below this
                verbose=True
            ),
            'monitor': 'val_loss',
            'interval': 'epoch',
            'frequency': 1
        }

        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

In [12]:
class ResNet50Classifier(pl.LightningModule):
    def __init__(self, model_name="resnet50.a1_in1k", lr=1e-4, num_classes=5,
                 smoothing=0.1, window_size=1):
        super().__init__()
        self.save_hyperparameters()
        # create & swap in a new head
        self.net = timm.create_model(
            self.hparams.model_name,
            pretrained=True,
            num_classes=self.hparams.num_classes,
            
            drop_rate=0.3,        # 🔥 add stronger dropout (applied before final FC)
            # drop_path_rate=0.3,   # 🔥 stochastic depth (helps regularize deep nets)
        )
        
        self.criterion = AdjacentLabelSmoothingLoss(
            smoothing=self.hparams.smoothing,
            window_size=self.hparams.window_size,
            num_classes=self.hparams.num_classes
        )

    def forward(self, x):
        return self.net(x)

    def predict_class(self, logits):
        return logits.argmax(dim=-1)

    def training_step(self, batch, batch_idx):
        imgs, labels = batch
        logits = self(imgs)
        loss = self.criterion(logits, labels)
        
        preds = logits.argmax(dim=-1)
        acc  = (preds == labels).float().mean()

        self.log('train_loss', loss, prog_bar=True, on_step=False, on_epoch=True)
        self.log('train_acc',  acc, prog_bar=True, on_step=False, on_epoch=True)
        
        return loss

    def validation_step(self, batch, batch_idx):
        imgs, labels = batch
        logits = self(imgs)
        loss = self.criterion(logits, labels)

        preds = logits.argmax(dim=-1)
        acc  = (preds == labels).float().mean()
        
        self.log('val_loss', loss, prog_bar=True, on_epoch=True)
        self.log('val_acc',  acc, prog_bar=True, on_epoch=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=1e-4)

        scheduler = {
            'scheduler': ReduceLROnPlateau(
                optimizer,
                mode='min',              # we're watching val_loss (lower is better)
                factor=0.5,              # reduce LR by this factor
                patience=5,              # after N epochs of no improvement
                min_lr=1e-6,             # don’t go below this
                verbose=True
            ),
            'monitor': 'val_loss',
            'interval': 'epoch',
            'frequency': 1
        }

        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

In [None]:
# from timm.data import resolve_data_config
# from timm.layers import apply_test_time_pool
import mlflow.pytorch

mlflow.pytorch.autolog()

# pl.seed_everything(42)

checkpoint_cb = ModelCheckpoint(
    monitor='val_loss',
    dirpath='checkpoints/',
    filename='resnet-50-dropout-l2reg2-augs-adjsmooth2-full-{epoch:02d}-{val_loss:.4f}-{val_acc:.4f}',
    save_top_k=1,
    mode='min',
)
earlystop_cb = EarlyStopping(
    monitor='val_loss',
    patience=15,
    mode='min',
)

trainer = pl.Trainer(
    max_epochs=50,
    callbacks=[earlystop_cb, checkpoint_cb],
    accelerator='auto',  # GPU if available
    precision='16-mixed',
    devices=1,
)

model = ResNet50Classifier(lr=1e-4, num_classes=5, smoothing=0.2, window_size=2)
# checkpoint = torch.load('checkpoints/resnet-50-dropout-l2reg-augs-adjsmooth-full-epoch=22-val_loss=0.7101-val_acc=0.8264.ckpt')
# model.load_state_dict(checkpoint['state_dict'], strict=True)

# # Weird TIMM thing to deal with varied image sizes
# data_config = resolve_data_config({}, model=model)
# data_config['input_size'] = [3, IMAGE_SIZE_VAL, IMAGE_SIZE_VAL]
# model, using_test_pool = apply_test_time_pool(model, data_config)
# if using_test_pool:
#     print("✅ Test-Time Pool head is in place.")

trainer.fit(model, train_dataloader, val_dataloader)

print("✅ Best checkpoint:", checkpoint_cb.best_model_path)

INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
2025/04/28 13:05:58 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '55895ce18e1147208e09dbd5f0f57115', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pytorch workflow
/home/mauribuntu/miniconda3/envs/causal-dl-torch/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /mnt/g/Kaggle-Diabetic-Retinopathy/checkpoints exists and is not empty.

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: 
Detected KeyboardInterrupt, attempting graceful shutdown ...
INFO:lightning.pytorch.utilities.rank_zero:
Detected KeyboardInterrupt, attempting graceful shutdown ...


.

In [15]:
class ConvNeXtRegressor(pl.LightningModule):
    def __init__(self, model_name="convnext_small.fb_in22k_ft_in1k_384", lr=1e-4, num_classes=5):
        super().__init__()
        self.save_hyperparameters()
        self.net = timm.create_model(
            model_name,
            pretrained=True,    # We will manually load weights later
            num_classes=1,        # One output neuron for regression
            
            drop_rate=0.2,
            drop_path_rate=0.2,
        )

    def forward(self, x):
        return self.net(x).squeeze(1)  # Output shape [batch_size]
    
    def predict_class(self, outputs):
        return outputs.round().clamp(0, self.hparams.num_classes - 1).long()

    def training_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self(imgs)
        loss = F.mse_loss(preds, labels.float())

        preds_rounded = preds.round().clamp(0, self.hparams.num_classes - 1)
        acc = (preds_rounded == labels).float().mean()

        self.log('train_loss', loss, prog_bar=True, on_step=False, on_epoch=True)
        self.log('train_acc',  acc, prog_bar=True, on_step=False, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self(imgs)
        loss = F.mse_loss(preds, labels.float())

        preds_rounded = preds.round().clamp(0, self.hparams.num_classes - 1)
        acc = (preds_rounded == labels).float().mean()
        
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc',  acc, prog_bar=True)
        
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=1e-5)

        scheduler = {
            'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                mode='min',          # still minimizing val_loss (MSE)
                factor=0.5,
                patience=5,
                min_lr=1e-6,
                verbose=True
            ),
            'monitor': 'val_loss',   # watch val_loss (MSE) to reduce LR
            'interval': 'epoch',
            'frequency': 1
        }

        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

    @classmethod
    def load_from_classifier_ckpt(cls, path, model_name="convnext_small.fb_in22k_ft_in1k_384",
                                  lr=1e-4, num_classes=5):
        """
        Create a ConvNeXtRegressor and load weights from a classification checkpoint.
        """
        model = cls(model_name=model_name, lr=lr)
        checkpoint = torch.load(path, map_location='cpu')

        state_dict = checkpoint['state_dict']

        # Remove classification head weights (they don't match)
        filtered_state_dict = {k: v for k, v in state_dict.items() if 'head' not in k}
        model.load_state_dict(filtered_state_dict, strict=False)
        return model


In [16]:
class ResNet50Regressor(pl.LightningModule):
    def __init__(self, model_name="resnet50.a1_in1k", lr=1e-4, num_classes=5):
        super().__init__()
        self.save_hyperparameters()
        self.net = timm.create_model(
            model_name,
            pretrained=True,    # We will manually load weights later
            num_classes=1,        # One output neuron for regression
            
            drop_rate=0.3,
            # drop_path_rate=0.2,
        )

    def forward(self, x):
        return self.net(x).squeeze(1)  # Output shape [batch_size]
    
    def predict_class(self, outputs):
        return outputs.round().clamp(0, self.hparams.num_classes - 1).long()

    def training_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self(imgs)
        loss = F.mse_loss(preds, labels.float())

        preds_rounded = preds.round().clamp(0, self.hparams.num_classes - 1)
        acc = (preds_rounded == labels).float().mean()

        self.log('train_loss', loss, prog_bar=True, on_step=False, on_epoch=True)
        self.log('train_acc',  acc, prog_bar=True, on_step=False, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
        imgs, labels = batch
        preds = self(imgs)
        loss = F.mse_loss(preds, labels.float())

        preds_rounded = preds.round().clamp(0, self.hparams.num_classes - 1)
        acc = (preds_rounded == labels).float().mean()
        
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc',  acc, prog_bar=True)
        
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.lr, weight_decay=1e-4)

        scheduler = {
            'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                mode='min',          # still minimizing val_loss (MSE)
                factor=0.5,
                patience=5,
                min_lr=1e-6,
                verbose=True
            ),
            'monitor': 'val_loss',   # watch val_loss (MSE) to reduce LR
            'interval': 'epoch',
            'frequency': 1
        }

        return {'optimizer': optimizer, 'lr_scheduler': scheduler}

    @classmethod
    def load_from_classifier_ckpt(cls, path, model_name="convnext_small.fb_in22k_ft_in1k_384",
                                  lr=1e-4, num_classes=5):
        """
        Create a ConvNeXtRegressor and load weights from a classification checkpoint.
        """
        model = cls(model_name=model_name, lr=lr)
        checkpoint = torch.load(path, map_location='cpu')

        state_dict = checkpoint['state_dict']

        # Remove classification head weights (they don't match)
        filtered_state_dict = {k: v for k, v in state_dict.items() if 'fc' not in k}
        model.load_state_dict(filtered_state_dict, strict=False)
        return model

In [None]:
# from timm.data import resolve_data_config
# from timm.layers import apply_test_time_pool
import mlflow.pytorch

mlflow.pytorch.autolog()

# pl.seed_everything(42)

checkpoint_cb = ModelCheckpoint(
    monitor='val_loss',
    dirpath='checkpoints/',
    filename='resnet-50-reg-dropout-l2reg-augs-full-{epoch:02d}-{val_loss:.4f}-{val_acc:.4f}',
    save_top_k=1,
    mode='min',
)
earlystop_cb = EarlyStopping(
    monitor='val_loss',
    patience=15,
    mode='min',
)

trainer = pl.Trainer(
    max_epochs=50,
    callbacks=[earlystop_cb, checkpoint_cb],
    accelerator='auto',  # GPU if available
    precision='16-mixed',
    devices=1,
)

model = ResNet50Regressor(lr=1e-4, num_classes=5)

# # Weird TIMM thing to deal with varied image sizes
# data_config = resolve_data_config({}, model=model)
# data_config['input_size'] = [3, IMAGE_SIZE_VAL, IMAGE_SIZE_VAL]
# model, using_test_pool = apply_test_time_pool(model, data_config)
# if using_test_pool:
#     print("✅ Test-Time Pool head is in place.")

trainer.fit(model, train_dataloader, val_dataloader)

print("✅ Best checkpoint:", checkpoint_cb.best_model_path)

INFO: Using 16bit Automatic Mixed Precision (AMP)
INFO:lightning.pytorch.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
2025/04/29 00:50:37 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'fdb077ae059f40008c9cac72037baf2f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pytorch workflow
/home/mauribuntu/miniconda3/envs/causal-dl-torch/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /mnt/g/Kaggle-Diabetic-Retinopathy/checkpoints exists and is not empty.

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]



✅ Best checkpoint: /mnt/g/Kaggle-Diabetic-Retinopathy/checkpoints/resnet-50-reg-dropout-l2reg-augs-full-epoch=26-val_loss=0.3978-val_acc=0.7151.ckpt


========================================

In [19]:
models = {
    'convnext':   ('checkpoints/convnext-small-dropout-l2reg-augs-adjsmooth-full-refined-epoch=00-val_loss=0.6888-val_acc=0.8396.ckpt',
                   ConvNeXtClassifier),
    'convnext-r': ('checkpoints/convnext-reg-dropout-l2reg-augs-full-epoch=23-val_loss=0.2984-val_acc=0.8075.ckpt',
                   ConvNeXtRegressor),
    'effnet':     ('checkpoints/effnet-v2rw-m-dropout-l2reg-augs-adjsmooth-full-epoch=14-val_loss=0.7004-val_acc=0.8305.ckpt',
                   EfficientNetV2Classifier),
    'effnet-p':   ('checkpoints/effnet-v2rw-m-dropout-l2reg-augs-adjsmooth-epoch=12-val_loss=0.6849-val_acc=0.8370.ckpt',
                   EfficientNetV2Classifier),
    'effnet-o':   ('checkpoints/effnet-v2rw-m-ordinal-dropout2-l2reg2-augs-full-epoch=22-val_loss=0.1580-val_acc=0.8278.ckpt',
                   EfficientNetV2OrdinalClassifier),
    'resnet':     ('checkpoints/resnet-50-dropout-l2reg2-augs-adjsmooth2-full-epoch=39-val_loss=0.8696-val_acc=0.8299.ckpt',
                   ResNet50Classifier),
    'resnet-n':   ('checkpoints/resnet-50-dropout-l2reg-augs-adjsmooth-full-refined-epoch=00-val_loss=0.7068-val_acc=0.8268.ckpt',
                   ResNet50Classifier),
    'resnet-r':   ('checkpoints/resnet-50-reg-dropout-l2reg-augs-full-epoch=26-val_loss=0.3978-val_acc=0.7151.ckpt',
                   ResNet50Regressor),
}

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

ckpt_path, model_class = models['resnet-r']
model = model_class.load_from_checkpoint(ckpt_path, strict=False)
model.to(device)
model.eval()

In [None]:
from tqdm.notebook import tqdm

all_preds = []
all_labels = []

with torch.no_grad():
    for imgs, labels in tqdm(val_dataloader):
        imgs = imgs.to(device)

        with torch.amp.autocast(device.type):
            preds_raw = model(imgs)
        preds = model.predict_class(preds_raw)

        all_preds.append(preds.cpu())
        all_labels.append(labels.cpu())

# Concatenate into single tensors
all_preds = torch.cat(all_preds)
all_labels = torch.cat(all_labels)

  0%|          | 0/122 [00:00<?, ?it/s]

In [35]:
from sklearn.metrics import cohen_kappa_score, accuracy_score

qwk = cohen_kappa_score(all_labels, all_preds, weights='quadratic')
acc = accuracy_score(all_labels, all_preds)

print('convnext')
print(f'Validation QWK = {qwk:.5f}')
print(f'Validation Accuracy = {acc:.5f}')

convnext
Validation QWK = 0.81847
Validation Accuracy = 0.83963


In [75]:
from sklearn.metrics import cohen_kappa_score, accuracy_score

qwk = cohen_kappa_score(all_labels, all_preds, weights='quadratic')
acc = accuracy_score(all_labels, all_preds)

print('convnext-r')
print(f'Validation QWK = {qwk:.5f}')
print(f'Validation Accuracy = {acc:.5f}')

convnext-r
Validation QWK = 0.82712
Validation Accuracy = 0.80778


In [32]:
from sklearn.metrics import cohen_kappa_score, accuracy_score

qwk = cohen_kappa_score(all_labels, all_preds, weights='quadratic')
acc = accuracy_score(all_labels, all_preds)

print('effnet')
print(f'Validation QWK = {qwk:.5f}')
print(f'Validation Accuracy = {acc:.5f}')

effnet
Validation QWK = 0.75777
Validation Accuracy = 0.82858


In [86]:
from sklearn.metrics import cohen_kappa_score, accuracy_score

qwk = cohen_kappa_score(all_labels, all_preds, weights='quadratic')
acc = accuracy_score(all_labels, all_preds)

print('effnet-p')
print(f'Validation QWK = {qwk:.5f}')
print(f'Validation Accuracy = {acc:.5f}')

effnet-p
Validation QWK = 0.76999
Validation Accuracy = 0.82974


In [78]:
from sklearn.metrics import cohen_kappa_score, accuracy_score

qwk = cohen_kappa_score(all_labels, all_preds, weights='quadratic')
acc = accuracy_score(all_labels, all_preds)

print('effnet-o')
print(f'Validation QWK = {qwk:.5f}')
print(f'Validation Accuracy = {acc:.5f}')

effnet-o
Validation QWK = 0.80779
Validation Accuracy = 0.82396


In [38]:
from sklearn.metrics import cohen_kappa_score, accuracy_score

qwk = cohen_kappa_score(all_labels, all_preds, weights='quadratic')
acc = accuracy_score(all_labels, all_preds)

print('resnet')
print(f'Validation QWK = {qwk:.5f}')
print(f'Validation Accuracy = {acc:.5f}')

resnet
Validation QWK = 0.76418
Validation Accuracy = 0.82987


In [90]:
from sklearn.metrics import cohen_kappa_score, accuracy_score

qwk = cohen_kappa_score(all_labels, all_preds, weights='quadratic')
acc = accuracy_score(all_labels, all_preds)

print('resnet-n')
print(f'Validation QWK = {qwk:.5f}')
print(f'Validation Accuracy = {acc:.5f}')

resnet-n
Validation QWK = 0.77617
Validation Accuracy = 0.82678


In [22]:
from sklearn.metrics import cohen_kappa_score, accuracy_score

qwk = cohen_kappa_score(all_labels, all_preds, weights='quadratic')
acc = accuracy_score(all_labels, all_preds)

print('resnet-r')
print(f'Validation QWK = {qwk:.5f}')
print(f'Validation Accuracy = {acc:.5f}')

resnet-r
Validation QWK = 0.72849
Validation Accuracy = 0.71507


In [26]:
# Assume that val_dataloader does NOT shuffle, so order matches df_val

# Prepare an empty DataFrame
df_stack = pd.DataFrame()

# Loop each model
for model_name, (ckpt_path, model_class) in models.items():
    print(f'Processing {model_name}…')

    # 1) load & move to device
    model = model_class.load_from_checkpoint(ckpt_path, strict=False)
    model = model.to(device).eval()

    # 2) collect raw outputs in a list
    all_out = []
    with torch.no_grad():
        for imgs, _ in tqdm(val_dataloader):
            imgs = imgs.to(device)
            with torch.amp.autocast(device.type):
                out = model(imgs)
            # out: Tensor of shape [B] or [B, D]
            all_out.append(out.cpu())

    # 3) concatenate and convert to numpy
    all_out = torch.cat(all_out, dim=0).numpy()  # shape (N,) or (N, D)

    # 4) turn into stacking columns
    if all_out.ndim == 1:
        # regressor → one column
        df_stack[f'{model_name}_pred'] = all_out
    else:
        # multi‐dim output → one column per dim
        D = all_out.shape[1]
        for i in range(D):
            df_stack[f'{model_name}_{i}'] = all_out[:, i]

    # cleanup
    del model
    if device.type == 'cuda':
        torch.cuda.empty_cache()

# finally, append the true label column (order must match val_dataloader)
df_stack['label'] = df_val['label'].values
df_stack.to_csv('df_stack.csv', index=False)

Processing convnext…


/home/mauribuntu/miniconda3/envs/causal-dl-torch/lib/python3.12/site-packages/pytorch_lightning/core/saving.py:191: Found keys that are in the model state dict but not in the checkpoint: ['criterion.weight_matrix']


  0%|          | 0/487 [00:00<?, ?it/s]

Processing convnext-r…


  0%|          | 0/487 [00:00<?, ?it/s]

Processing effnet…


/home/mauribuntu/miniconda3/envs/causal-dl-torch/lib/python3.12/site-packages/pytorch_lightning/core/saving.py:191: Found keys that are in the model state dict but not in the checkpoint: ['criterion.weight_matrix']


  0%|          | 0/487 [00:00<?, ?it/s]

Processing effnet-p…


/home/mauribuntu/miniconda3/envs/causal-dl-torch/lib/python3.12/site-packages/pytorch_lightning/core/saving.py:191: Found keys that are in the model state dict but not in the checkpoint: ['criterion.weight_matrix']


  0%|          | 0/487 [00:00<?, ?it/s]

Processing effnet-o…


  0%|          | 0/487 [00:00<?, ?it/s]

Processing resnet…


  0%|          | 0/487 [00:00<?, ?it/s]

Processing resnet-n…


/home/mauribuntu/miniconda3/envs/causal-dl-torch/lib/python3.12/site-packages/pytorch_lightning/core/saving.py:191: Found keys that are in the model state dict but not in the checkpoint: ['criterion.weight_matrix']


  0%|          | 0/487 [00:00<?, ?it/s]

Processing resnet-r…


  0%|          | 0/487 [00:00<?, ?it/s]

=========================================

In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import cohen_kappa_score

import lightgbm as lgb
from lightgbm import LGBMRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from tabpfn import TabPFNRegressor

import joblib

df_stack = pd.read_csv('df_stack.csv')
num_classes = int(df_stack['label'].max() + 1)

In [11]:
import copy

def cv_score(feature_columns, model_, kf, num_classes=5, fit_kwargs={}):

    qwk_scores = []
    models_trained = []
    
    for fold, (tr_idx, te_idx) in enumerate(kf.split(df_stack), 1):
        X_tr = df_stack.iloc[tr_idx][feature_columns]
        y_tr = df_stack.iloc[tr_idx]['label']
        X_te = df_stack.iloc[te_idx][feature_columns]
        y_te = df_stack.iloc[te_idx]['label']

        model = copy.deepcopy(model_)
        model.fit(X_tr, y_tr, eval_set=[(X_te, y_te)], **fit_kwargs)

        preds = model.predict(X_te)
        preds_int = np.clip(np.rint(preds), 0, num_classes-1).astype(int)
        qwk = cohen_kappa_score(y_te, preds_int, weights='quadratic')
        
        qwk_scores.append(qwk)
        models_trained.append(copy.deepcopy(model))
        del model

    return float(np.mean(qwk_scores)), models_trained

In [70]:
# Identify models by splitting on '_'
feature_cols = [c for c in df_stack.columns if c != 'label']
models = sorted({c.split('_')[0] for c in feature_cols})

# Map each model name → list of its feature columns
features_by_model = {
    m: [c for c in feature_cols if c.startswith(m + '_')]
    for m in models
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

stacking_model = LGBMRegressor(verbose=0)

# --- Baseline
print("▶ Running baseline 5-fold CV on all features …")
base_mean, base_models = cv_score(feature_cols, stacking_model, kf, num_classes)
print(f"Baseline mean QWK: {base_mean:.4f}\n")

current_models = models.copy()
current_features = feature_cols.copy()
best_mean = base_mean

# --- Backward elimination
while True:
    print("▶ Testing removal of each model …")
    removal_results = {}
    
    for m in tqdm(current_models, desc="Models"):
        # drop m’s features
        feat = [c for c in current_features if not c.startswith(m + '_')]
        mean_qwk, _ = cv_score(feat, stacking_model, kf, num_classes)
        removal_results[m] = mean_qwk
        print(f"  Removing {m:12} → mean QWK = {mean_qwk:.4f}")

    # find best improvement
    worst_model, candidate_qwk = max(removal_results.items(), key=lambda kv: kv[1])
    if candidate_qwk > best_mean:
        print(f"\n✔ Eliminating model '{worst_model}' improved QWK: {best_mean:.4f} → {candidate_qwk:.4f}\n")
        # update state
        best_mean = candidate_qwk
        current_models.remove(worst_model)
        current_features = [c for c in current_features if not c.startswith(worst_model + '_')]
    else:
        print("\n— No single-model removal improved QWK. Elimination complete.\n")
        break

# --- Final training on selected features
print("▶ Training final 5-fold models on features from:", current_models)
final_mean, final_models = cv_score(current_features, stacking_model, kf, num_classes)
print(f"Final mean QWK: {final_mean:.4f}\n")

# --- Save each model
for i, mdl in enumerate(final_models, 1):
    path = f"stacking_models/stack-lgb_cpu-fold{i}.pkl"
    joblib.dump(mdl, path)
    print(f"✔ Saved fold {i} model to '{path}'")

▶ Running baseline 5-fold CV on all features …
Baseline mean QWK: 0.8352

▶ Testing removal of each model …


Models:   0%|          | 0/8 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8375
  Removing convnext-r   → mean QWK = 0.8391
  Removing effnet       → mean QWK = 0.8371
  Removing effnet-o     → mean QWK = 0.8399
  Removing effnet-p     → mean QWK = 0.8317
  Removing resnet       → mean QWK = 0.8361
  Removing resnet-n     → mean QWK = 0.8392
  Removing resnet-r     → mean QWK = 0.8384

✔ Eliminating model 'effnet-o' improved QWK: 0.8352 → 0.8399

▶ Testing removal of each model …


Models:   0%|          | 0/7 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8347
  Removing convnext-r   → mean QWK = 0.8389
  Removing effnet       → mean QWK = 0.8410
  Removing effnet-p     → mean QWK = 0.8333
  Removing resnet       → mean QWK = 0.8362
  Removing resnet-n     → mean QWK = 0.8391
  Removing resnet-r     → mean QWK = 0.8375

✔ Eliminating model 'effnet' improved QWK: 0.8399 → 0.8410

▶ Testing removal of each model …


Models:   0%|          | 0/6 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8350
  Removing convnext-r   → mean QWK = 0.8370
  Removing effnet-p     → mean QWK = 0.8306
  Removing resnet       → mean QWK = 0.8371
  Removing resnet-n     → mean QWK = 0.8380
  Removing resnet-r     → mean QWK = 0.8388

— No single-model removal improved QWK. Elimination complete.

▶ Training final 5-fold models on features from: ['convnext', 'convnext-r', 'effnet-p', 'resnet', 'resnet-n', 'resnet-r']
Final mean QWK: 0.8410

✔ Saved fold 1 model to 'stacking_models/stack-lgb_cpu-fold1.pkl'
✔ Saved fold 2 model to 'stacking_models/stack-lgb_cpu-fold2.pkl'
✔ Saved fold 3 model to 'stacking_models/stack-lgb_cpu-fold3.pkl'
✔ Saved fold 4 model to 'stacking_models/stack-lgb_cpu-fold4.pkl'
✔ Saved fold 5 model to 'stacking_models/stack-lgb_cpu-fold5.pkl'


In [23]:
# Identify models by splitting on '_'
feature_cols = [c for c in df_stack.columns if c != 'label']
models = sorted({c.split('_')[0] for c in feature_cols})

# Map each model name → list of its feature columns
features_by_model = {
    m: [c for c in feature_cols if c.startswith(m + '_')]
    for m in models
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

stacking_model = LGBMRegressor(verbose=-1, device='cuda')

# --- Baseline
print("▶ Running baseline 5-fold CV on all features …")
base_mean, base_models = cv_score(feature_cols, stacking_model, kf, num_classes)
print(f"Baseline mean QWK: {base_mean:.4f}\n")

current_models = models.copy()
current_features = feature_cols.copy()
best_mean = base_mean

# --- Backward elimination
while True:
    print("▶ Testing removal of each model …")
    removal_results = {}
    
    for m in tqdm(current_models, desc="Models"):
        # drop m’s features
        feat = [c for c in current_features if not c.startswith(m + '_')]
        mean_qwk, _ = cv_score(feat, stacking_model, kf, num_classes)
        removal_results[m] = mean_qwk
        print(f"  Removing {m:12} → mean QWK = {mean_qwk:.4f}")

    # find best improvement
    worst_model, candidate_qwk = max(removal_results.items(), key=lambda kv: kv[1])
    if candidate_qwk > best_mean:
        print(f"\n✔ Eliminating model '{worst_model}' improved QWK: {best_mean:.4f} → {candidate_qwk:.4f}\n")
        # update state
        best_mean = candidate_qwk
        current_models.remove(worst_model)
        current_features = [c for c in current_features if not c.startswith(worst_model + '_')]
    else:
        print("\n— No single-model removal improved QWK. Elimination complete.\n")
        break

# --- Final training on selected features
print("▶ Training final 5-fold models on features from:", current_models)
final_mean, final_models = cv_score(current_features, stacking_model, kf, num_classes)
print(f"Final mean QWK: {final_mean:.4f}\n")

# --- Save each model
for i, mdl in enumerate(final_models, 1):
    path = f"stacking_models/stack-lgb_cuda-fold{i}.pkl"
    joblib.dump(mdl, path)
    print(f"✔ Saved fold {i} model to '{path}'")

▶ Running baseline 5-fold CV on all features …
Baseline mean QWK: 0.8348

▶ Testing removal of each model …


Models:   0%|          | 0/8 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8363
  Removing convnext-r   → mean QWK = 0.8401
  Removing effnet       → mean QWK = 0.8374
  Removing effnet-o     → mean QWK = 0.8385
  Removing effnet-p     → mean QWK = 0.8317
  Removing resnet       → mean QWK = 0.8364
  Removing resnet-n     → mean QWK = 0.8388
  Removing resnet-r     → mean QWK = 0.8384

✔ Eliminating model 'convnext-r' improved QWK: 0.8348 → 0.8401

▶ Testing removal of each model …


Models:   0%|          | 0/7 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8370
  Removing effnet       → mean QWK = 0.8379
  Removing effnet-o     → mean QWK = 0.8393
  Removing effnet-p     → mean QWK = 0.8318
  Removing resnet       → mean QWK = 0.8385
  Removing resnet-n     → mean QWK = 0.8388
  Removing resnet-r     → mean QWK = 0.8383

— No single-model removal improved QWK. Elimination complete.

▶ Training final 5-fold models on features from: ['convnext', 'effnet', 'effnet-o', 'effnet-p', 'resnet', 'resnet-n', 'resnet-r']
Final mean QWK: 0.8401

✔ Saved fold 1 model to 'stacking_models/stack-lgb_cuda-fold1.pkl'
✔ Saved fold 2 model to 'stacking_models/stack-lgb_cuda-fold2.pkl'
✔ Saved fold 3 model to 'stacking_models/stack-lgb_cuda-fold3.pkl'
✔ Saved fold 4 model to 'stacking_models/stack-lgb_cuda-fold4.pkl'
✔ Saved fold 5 model to 'stacking_models/stack-lgb_cuda-fold5.pkl'


In [1]:
import sys
import re

class FilteredStream:
    def __init__(self, stream, pattern):
        self.stream = stream
        self.pattern = re.compile(pattern)
        self.buffer = ''

    def write(self, text):
        self.buffer += text
        lines = self.buffer.splitlines(True)
        for line in lines:
            if line.endswith('\n'):
                if not self.pattern.search(line):
                    self.stream.write(line)
            else:
                self.buffer = line
                break
        else:
            self.buffer = ''

    def flush(self):
        if self.buffer:
            if not self.pattern.search(self.buffer):
                self.stream.write(self.buffer)
            self.buffer = ''
        self.stream.flush()

    def __getattr__(self, attr):
        return getattr(self.stream, attr)

# Define the fucking pattern to kill
pattern = r'\[\d+\]\s+validation_0-rmse:\d+\.\d+'

# Save the original streams
original_stdout = sys.stdout
original_stderr = sys.stderr

# Set the filtered streams
sys.stdout = FilteredStream(original_stdout, pattern)
sys.stderr = FilteredStream(original_stderr, pattern)

In [None]:
# Identify models by splitting on '_'
feature_cols = [c for c in df_stack.columns if c != 'label']
models = sorted({c.split('_')[0] for c in feature_cols})

# Map each model name → list of its feature columns
features_by_model = {
    m: [c for c in feature_cols if c.startswith(m + '_')]
    for m in models
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

stacking_model = XGBRegressor(verbosity=0, device='gpu',
                              n_estimators=1000, early_stopping_rounds=100)
                            #   learning_rate=0.05)

# --- Baseline
print("▶ Running baseline 5-fold CV on all features …")
base_mean, base_models = cv_score(feature_cols, stacking_model, kf, num_classes)
print(f"Baseline mean QWK: {base_mean:.4f}\n")

current_models = models.copy()
current_features = feature_cols.copy()
best_mean = base_mean

# --- Backward elimination
while True:
    print("▶ Testing removal of each model …")
    removal_results = {}
    
    for m in tqdm(current_models, desc="Models"):
        # drop m’s features
        feat = [c for c in current_features if not c.startswith(m + '_')]
        mean_qwk, _ = cv_score(feat, stacking_model, kf, num_classes)
        removal_results[m] = mean_qwk
        print(f"  Removing {m:12} → mean QWK = {mean_qwk:.4f}")

    # find best improvement
    worst_model, candidate_qwk = max(removal_results.items(), key=lambda kv: kv[1])
    if candidate_qwk > best_mean:
        print(f"\n✔ Eliminating model '{worst_model}' improved QWK: {best_mean:.4f} → {candidate_qwk:.4f}\n")
        # update state
        best_mean = candidate_qwk
        current_models.remove(worst_model)
        current_features = [c for c in current_features if not c.startswith(worst_model + '_')]
    else:
        print("\n— No single-model removal improved QWK. Elimination complete.\n")
        break

# --- Final training on selected features
print("▶ Training final 5-fold models on features from:", current_models)
final_mean, final_models = cv_score(current_features, stacking_model, kf, num_classes)
print(f"Final mean QWK: {final_mean:.4f}\n")

# --- Save each model
for i, mdl in enumerate(final_models, 1):
    path = f"stacking_models/stack-xgb-fold{i}.pkl"
    joblib.dump(mdl, path)
    print(f"✔ Saved fold {i} model to '{path}'")

▶ Running baseline 5-fold CV on all features …
Baseline mean QWK: 0.8352

▶ Testing removal of each model …


Models:   0%|          | 0/8 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8309
  Removing convnext-r   → mean QWK = 0.8296
  Removing effnet       → mean QWK = 0.8349
  Removing effnet-o     → mean QWK = 0.8319
  Removing effnet-p     → mean QWK = 0.8256
  Removing resnet       → mean QWK = 0.8345
  Removing resnet-n     → mean QWK = 0.8326
  Removing resnet-r     → mean QWK = 0.8354

✔ Eliminating model 'resnet-r' improved QWK: 0.8352 → 0.8354

▶ Testing removal of each model …


Models:   0%|          | 0/7 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8362
  Removing convnext-r   → mean QWK = 0.8322
  Removing effnet       → mean QWK = 0.8338
  Removing effnet-o     → mean QWK = 0.8309
  Removing effnet-p     → mean QWK = 0.8282
  Removing resnet       → mean QWK = 0.8309
  Removing resnet-n     → mean QWK = 0.8334

✔ Eliminating model 'convnext' improved QWK: 0.8354 → 0.8362

▶ Testing removal of each model …


Models:   0%|          | 0/6 [00:00<?, ?it/s]

  Removing convnext-r   → mean QWK = 0.8273
  Removing effnet       → mean QWK = 0.8341
  Removing effnet-o     → mean QWK = 0.8328
  Removing effnet-p     → mean QWK = 0.8260
  Removing resnet       → mean QWK = 0.8360
  Removing resnet-n     → mean QWK = 0.8316

— No single-model removal improved QWK. Elimination complete.

▶ Training final 5-fold models on features from: ['convnext-r', 'effnet', 'effnet-o', 'effnet-p', 'resnet', 'resnet-n']
Final mean QWK: 0.8362

✔ Saved fold 1 model to 'stacking_models/stack-xgb-fold1.pkl'
✔ Saved fold 2 model to 'stacking_models/stack-xgb-fold2.pkl'
✔ Saved fold 3 model to 'stacking_models/stack-xgb-fold3.pkl'
✔ Saved fold 4 model to 'stacking_models/stack-xgb-fold4.pkl'
✔ Saved fold 5 model to 'stacking_models/stack-xgb-fold5.pkl'


In [None]:
# Identify models by splitting on '_'
feature_cols = [c for c in df_stack.columns if c != 'label']
models = sorted({c.split('_')[0] for c in feature_cols})

# Map each model name → list of its feature columns
features_by_model = {
    m: [c for c in feature_cols if c.startswith(m + '_')]
    for m in models
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

stacking_model = CatBoostRegressor(verbose=0)

# --- Baseline
print("▶ Running baseline 5-fold CV on all features …")
base_mean, base_models = cv_score(feature_cols, stacking_model, kf, num_classes)
print(f"Baseline mean QWK: {base_mean:.4f}\n")

current_models = models.copy()
current_features = feature_cols.copy()
best_mean = base_mean

# --- Backward elimination
while True:
    print("▶ Testing removal of each model …")
    removal_results = {}
    
    for m in tqdm(current_models, desc="Models"):
        # drop m’s features
        feat = [c for c in current_features if not c.startswith(m + '_')]
        mean_qwk, _ = cv_score(feat, stacking_model, kf, num_classes)
        removal_results[m] = mean_qwk
        print(f"  Removing {m:12} → mean QWK = {mean_qwk:.4f}")

    # find best improvement
    worst_model, candidate_qwk = max(removal_results.items(), key=lambda kv: kv[1])
    if candidate_qwk > best_mean:
        print(f"\n✔ Eliminating model '{worst_model}' improved QWK: {best_mean:.4f} → {candidate_qwk:.4f}\n")
        # update state
        best_mean = candidate_qwk
        current_models.remove(worst_model)
        current_features = [c for c in current_features if not c.startswith(worst_model + '_')]
    else:
        print("\n— No single-model removal improved QWK. Elimination complete.\n")
        break

# --- Final training on selected features
print("▶ Training final 5-fold models on features from:", current_models)
final_mean, final_models = cv_score(current_features, stacking_model, kf, num_classes)
print(f"Final mean QWK: {final_mean:.4f}\n")

# --- Save each model
for i, mdl in enumerate(final_models, 1):
    path = f"stack-cb_cpu-fold{i}.pkl"
    joblib.dump(mdl, path)
    print(f"✔ Saved fold {i} model to '{path}'")

▶ Running baseline 5-fold CV on all features …
Baseline mean QWK: 0.8370

▶ Testing removal of each model …


Models:   0%|          | 0/8 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8383
  Removing convnext-r   → mean QWK = 0.8382
  Removing effnet       → mean QWK = 0.8399
  Removing effnet-o     → mean QWK = 0.8406
  Removing effnet-p     → mean QWK = 0.8348
  Removing resnet       → mean QWK = 0.8406
  Removing resnet-n     → mean QWK = 0.8407
  Removing resnet-r     → mean QWK = 0.8392

✔ Eliminating model 'resnet-n' improved QWK: 0.8370 → 0.8407

▶ Testing removal of each model …


Models:   0%|          | 0/7 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8373
  Removing convnext-r   → mean QWK = 0.8410
  Removing effnet       → mean QWK = 0.8398
  Removing effnet-o     → mean QWK = 0.8383
  Removing effnet-p     → mean QWK = 0.8324
  Removing resnet       → mean QWK = 0.8420
  Removing resnet-r     → mean QWK = 0.8367

✔ Eliminating model 'resnet' improved QWK: 0.8407 → 0.8420

▶ Testing removal of each model …


Models:   0%|          | 0/6 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8379
  Removing convnext-r   → mean QWK = 0.8407
  Removing effnet       → mean QWK = 0.8413
  Removing effnet-o     → mean QWK = 0.8395
  Removing effnet-p     → mean QWK = 0.8342
  Removing resnet-r     → mean QWK = 0.8428

✔ Eliminating model 'resnet-r' improved QWK: 0.8420 → 0.8428

▶ Testing removal of each model …


Models:   0%|          | 0/5 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8369
  Removing convnext-r   → mean QWK = 0.8412
  Removing effnet       → mean QWK = 0.8394
  Removing effnet-o     → mean QWK = 0.8403
  Removing effnet-p     → mean QWK = 0.8342

— No single-model removal improved QWK. Elimination complete.

▶ Training final 5-fold models on features from: ['convnext', 'convnext-r', 'effnet', 'effnet-o', 'effnet-p']
Final mean QWK: 0.8428

✔ Saved fold 1 model to 'stack_model_fold1.pkl'
✔ Saved fold 2 model to 'stack_model_fold2.pkl'
✔ Saved fold 3 model to 'stack_model_fold3.pkl'
✔ Saved fold 4 model to 'stack_model_fold4.pkl'
✔ Saved fold 5 model to 'stack_model_fold5.pkl'


In [10]:
# Identify models by splitting on '_'
feature_cols = [c for c in df_stack.columns if c != 'label']
models = sorted({c.split('_')[0] for c in feature_cols})

# Map each model name → list of its feature columns
features_by_model = {
    m: [c for c in feature_cols if c.startswith(m + '_')]
    for m in models
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

stacking_model = CatBoostRegressor(task_type='GPU',
                                   iterations=5000, early_stopping_rounds=100,
                                   use_best_model=True,
                                   verbose=0)

# --- Baseline
print("▶ Running baseline 5-fold CV on all features …")
base_mean, base_models = cv_score(feature_cols, stacking_model, kf, num_classes)
print(f"Baseline mean QWK: {base_mean:.4f}\n")

current_models = models.copy()
current_features = feature_cols.copy()
best_mean = base_mean

# --- Backward elimination
while True:
    print("▶ Testing removal of each model …")
    removal_results = {}
    
    for m in tqdm(current_models, desc="Models"):
        # drop m’s features
        feat = [c for c in current_features if not c.startswith(m + '_')]
        mean_qwk, _ = cv_score(feat, stacking_model, kf, num_classes)
        removal_results[m] = mean_qwk
        print(f"  Removing {m:12} → mean QWK = {mean_qwk:.4f}")

    # find best improvement
    worst_model, candidate_qwk = max(removal_results.items(), key=lambda kv: kv[1])
    if candidate_qwk > best_mean:
        print(f"\n✔ Eliminating model '{worst_model}' improved QWK: {best_mean:.4f} → {candidate_qwk:.4f}\n")
        # update state
        best_mean = candidate_qwk
        current_models.remove(worst_model)
        current_features = [c for c in current_features if not c.startswith(worst_model + '_')]
    else:
        print("\n— No single-model removal improved QWK. Elimination complete.\n")
        break

# --- Final training on selected features
print("▶ Training final 5-fold models on features from:", current_models)
final_mean, final_models = cv_score(current_features, stacking_model, kf, num_classes)
print(f"Final mean QWK: {final_mean:.4f}\n")

# --- Save each model
for i, mdl in enumerate(final_models, 1):
    path = f"stack-cb_gpu-fold{i}.pkl"
    joblib.dump(mdl, path)
    print(f"✔ Saved fold {i} model to '{path}'")

▶ Running baseline 5-fold CV on all features …
Baseline mean QWK: 0.8442

▶ Testing removal of each model …


Models:   0%|          | 0/8 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8444
  Removing convnext-r   → mean QWK = 0.8434
  Removing effnet       → mean QWK = 0.8453
  Removing effnet-o     → mean QWK = 0.8463
  Removing effnet-p     → mean QWK = 0.8377
  Removing resnet       → mean QWK = 0.8452
  Removing resnet-n     → mean QWK = 0.8442
  Removing resnet-r     → mean QWK = 0.8422

✔ Eliminating model 'effnet-o' improved QWK: 0.8442 → 0.8463

▶ Testing removal of each model …


Models:   0%|          | 0/7 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8415
  Removing convnext-r   → mean QWK = 0.8431
  Removing effnet       → mean QWK = 0.8444
  Removing effnet-p     → mean QWK = 0.8365
  Removing resnet       → mean QWK = 0.8463
  Removing resnet-n     → mean QWK = 0.8451
  Removing resnet-r     → mean QWK = 0.8455

✔ Eliminating model 'resnet' improved QWK: 0.8463 → 0.8463

▶ Testing removal of each model …


Models:   0%|          | 0/6 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8443
  Removing convnext-r   → mean QWK = 0.8403
  Removing effnet       → mean QWK = 0.8454
  Removing effnet-p     → mean QWK = 0.8387
  Removing resnet-n     → mean QWK = 0.8446
  Removing resnet-r     → mean QWK = 0.8449

— No single-model removal improved QWK. Elimination complete.

▶ Training final 5-fold models on features from: ['convnext', 'convnext-r', 'effnet', 'effnet-p', 'resnet-n', 'resnet-r']
Final mean QWK: 0.8463

✔ Saved fold 1 model to 'stack-cb_gpu-fold1.pkl'
✔ Saved fold 2 model to 'stack-cb_gpu-fold2.pkl'
✔ Saved fold 3 model to 'stack-cb_gpu-fold3.pkl'
✔ Saved fold 4 model to 'stack-cb_gpu-fold4.pkl'
✔ Saved fold 5 model to 'stack-cb_gpu-fold5.pkl'


In [8]:
for i, mdl in enumerate(final_models, 1):
    path = f"stack-cb_gpu-fold{i}.pkl"
    joblib.dump(mdl, path)
    print(f"✔ Saved fold {i} model to '{path}'")

✔ Saved fold 1 model to 'stack-cb_gpu-fold1.pkl'
✔ Saved fold 2 model to 'stack-cb_gpu-fold2.pkl'
✔ Saved fold 3 model to 'stack-cb_gpu-fold3.pkl'
✔ Saved fold 4 model to 'stack-cb_gpu-fold4.pkl'
✔ Saved fold 5 model to 'stack-cb_gpu-fold5.pkl'


In [None]:
final_models[0].feature_names_

['convnext_0',
 'convnext_1',
 'convnext_2',
 'convnext_3',
 'convnext_4',
 'convnext-r_pred',
 'effnet_0',
 'effnet_1',
 'effnet_2',
 'effnet_3',
 'effnet_4',
 'effnet-p_0',
 'effnet-p_1',
 'effnet-p_2',
 'effnet-p_3',
 'effnet-p_4',
 'resnet-n_0',
 'resnet-n_1',
 'resnet-n_2',
 'resnet-n_3',
 'resnet-n_4',
 'resnet-r_pred']

In [51]:
# Identify models by splitting on '_'
feature_cols = [c for c in df_stack.columns if c != 'label']
models = sorted({c.split('_')[0] for c in feature_cols})

# Map each model name → list of its feature columns
features_by_model = {
    m: [c for c in feature_cols if c.startswith(m + '_')]
    for m in models
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

stacking_model = TabPFNRegressor()

# --- Baseline
print("▶ Running baseline 5-fold CV on all features …")
base_mean, base_models = cv_score(feature_cols, stacking_model, kf, num_classes)
print(f"Baseline mean QWK: {base_mean:.4f}\n")

current_models = models.copy()
current_features = feature_cols.copy()
best_mean = base_mean

# --- Backward elimination
while True:
    print("▶ Testing removal of each model …")
    removal_results = {}
    
    for m in tqdm(current_models, desc="Models"):
        # drop m’s features
        feat = [c for c in current_features if not c.startswith(m + '_')]
        mean_qwk, _ = cv_score(feat, stacking_model, kf, num_classes)
        removal_results[m] = mean_qwk
        print(f"  Removing {m:12} → mean QWK = {mean_qwk:.4f}")

    # find best improvement
    worst_model, candidate_qwk = max(removal_results.items(), key=lambda kv: kv[1])
    if candidate_qwk > best_mean:
        print(f"\n✔ Eliminating model '{worst_model}' improved QWK: {best_mean:.4f} → {candidate_qwk:.4f}\n")
        # update state
        best_mean = candidate_qwk
        current_models.remove(worst_model)
        current_features = [c for c in current_features if not c.startswith(worst_model + '_')]
    else:
        print("\n— No single-model removal improved QWK. Elimination complete.\n")
        break

# --- Final training on selected features
print("▶ Training final 5-fold models on features from:", current_models)
final_mean, final_models = cv_score(current_features, stacking_model, kf, num_classes)
print(f"Final mean QWK: {final_mean:.4f}\n")

# --- Save each model
for i, mdl in enumerate(final_models, 1):
    path = f"stack-tabpfn-fold{i}.pkl"
    joblib.dump(mdl, path)
    print(f"✔ Saved fold {i} model to '{path}'")

▶ Running baseline 5-fold CV on all features …
Baseline mean QWK: 0.8428

▶ Testing removal of each model …


Models:   0%|          | 0/8 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8387
  Removing convnext-r   → mean QWK = 0.8393
  Removing effnet       → mean QWK = 0.8402
  Removing effnet-o     → mean QWK = 0.8417
  Removing effnet-p     → mean QWK = 0.8345
  Removing resnet       → mean QWK = 0.8424
  Removing resnet-n     → mean QWK = 0.8431
  Removing resnet-r     → mean QWK = 0.8430

✔ Eliminating model 'resnet-n' improved QWK: 0.8428 → 0.8431

▶ Testing removal of each model …


Models:   0%|          | 0/7 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8394
  Removing convnext-r   → mean QWK = 0.8414
  Removing effnet       → mean QWK = 0.8423
  Removing effnet-o     → mean QWK = 0.8399
  Removing effnet-p     → mean QWK = 0.8336
  Removing resnet       → mean QWK = 0.8441
  Removing resnet-r     → mean QWK = 0.8437

✔ Eliminating model 'resnet' improved QWK: 0.8431 → 0.8441

▶ Testing removal of each model …


Models:   0%|          | 0/6 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8384
  Removing convnext-r   → mean QWK = 0.8405
  Removing effnet       → mean QWK = 0.8411
  Removing effnet-o     → mean QWK = 0.8395
  Removing effnet-p     → mean QWK = 0.8343
  Removing resnet-r     → mean QWK = 0.8431

— No single-model removal improved QWK. Elimination complete.

▶ Training final 5-fold models on features from: ['convnext', 'convnext-r', 'effnet', 'effnet-o', 'effnet-p', 'resnet-r']
Final mean QWK: 0.8441

✔ Saved fold 1 model to 'stack-tabpfn-fold1.pkl'
✔ Saved fold 2 model to 'stack-tabpfn-fold2.pkl'
✔ Saved fold 3 model to 'stack-tabpfn-fold3.pkl'
✔ Saved fold 4 model to 'stack-tabpfn-fold4.pkl'
✔ Saved fold 5 model to 'stack-tabpfn-fold5.pkl'


In [37]:
import torch, numpy as np
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import cohen_kappa_score

class PyTorchTabularMLPRegressor:
    """A sklearn-like wrapper around a tiny PyTorch MLP with early stopping on QWK."""
    def __init__(self,
                 hidden_dims=[128,64],
                 dropout=0.3,
                 lr=1e-3,
                 batch_size=64,
                 max_epochs=50,
                 patience=5,
                 num_classes=5,
                 random_state=None,
                 device=None):
        self.hidden_dims  = hidden_dims
        self.dropout      = dropout
        self.lr           = lr
        self.batch_size   = batch_size
        self.max_epochs   = max_epochs
        self.patience     = patience
        self.num_classes  = num_classes
        self.rs           = random_state
        self.device       = device or torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self._is_fitted   = False

    def _build_model(self, input_dim):
        layers = []
        dims = [input_dim] + self.hidden_dims
        for i in range(len(dims)-1):
            layers += [
                nn.Linear(dims[i], dims[i+1]),
                nn.ReLU(),
                nn.Dropout(self.dropout)
            ]
        layers += [nn.Linear(dims[-1], 1)]
        return nn.Sequential(*layers).to(self.device)

    def fit(self, X, y, eval_set):
        # 1) train/val split for early stopping
        X_tr, y_tr = X, y
        X_va, y_va = eval_set[0]

        # 2) DataLoaders
        ds_tr = TensorDataset(torch.from_numpy(X_tr).float(),
                              torch.from_numpy(y_tr).float())
        ds_va = TensorDataset(torch.from_numpy(X_va).float(),
                              torch.from_numpy(y_va).float())
        loader_tr = DataLoader(ds_tr, batch_size=self.batch_size,
                               shuffle=True,  pin_memory=True)
        loader_va = DataLoader(ds_va, batch_size=self.batch_size,
                               shuffle=False, pin_memory=True)

        # 3) Model, optimizer
        model     = self._build_model(X.shape[1])
        optimizer = torch.optim.AdamW(model.parameters(), lr=self.lr)

        best_qwk = -np.inf
        best_state = None
        no_improve = 0

        # 4) Training loop with early stop on QWK
        for epoch in range(1, self.max_epochs+1):
            model.train()
            for xb, yb in loader_tr:
                xb, yb = xb.to(self.device), yb.to(self.device)
                pred = model(xb).squeeze(1)
                loss = nn.functional.mse_loss(pred, yb)
                optimizer.zero_grad(); loss.backward(); optimizer.step()

            # validation pass
            model.eval()
            all_p, all_t = [], []
            with torch.no_grad():
                for xb, yb in loader_va:
                    xb = xb.to(self.device)
                    out = model(xb).cpu().numpy()
                    preds = np.clip(np.rint(out), 0, self.num_classes-1).astype(int)
                    all_p.append(preds); all_t.append(yb.numpy().astype(int))
            all_p = np.concatenate(all_p); all_t = np.concatenate(all_t)
            qwk = cohen_kappa_score(all_t, all_p, weights='quadratic')

            if qwk > best_qwk:
                best_qwk, best_state, no_improve = qwk, model.state_dict(), 0
            else:
                no_improve += 1
                if no_improve >= self.patience:
                    break

        # load best
        model.load_state_dict(best_state)
        self.model_ = model
        self._is_fitted = True
        return self

    def predict(self, X):
        if not self._is_fitted:
            raise RuntimeError("You must fit() before predict().")
        self.model_.eval()
        preds = []
        loader = DataLoader(torch.from_numpy(X).float(),
                            batch_size=self.batch_size,
                            shuffle=False)
        with torch.no_grad():
            for xb in loader:
                xb = xb.to(self.device)
                out = self.model_(xb).cpu().numpy()
                preds.append(out)
        return np.concatenate(preds)

    def predict_proba(self, X):
        # For stacking we want the raw real‐valued preds
        return self.predict(X)


In [51]:
# Identify models by splitting on '_'
feature_cols = [c for c in df_stack.columns if c != 'label']
models = sorted({c.split('_')[0] for c in feature_cols})

# Map each model name → list of its feature columns
features_by_model = {
    m: [c for c in feature_cols if c.startswith(m + '_')]
    for m in models
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

stacking_model = PyTorchTabularMLPRegressor(
    hidden_dims=[128,64],
    dropout=0.4,
    lr=1e-3,
    batch_size=512,
    max_epochs=100,
    patience=15,
    num_classes=num_classes
)

# --- Baseline
print("▶ Running baseline 5-fold CV on all features …")
base_mean, base_models = cv_score(feature_cols, stacking_model, kf, num_classes)
print(f"Baseline mean QWK: {base_mean:.4f}\n")

current_models = models.copy()
current_features = feature_cols.copy()
best_mean = base_mean

# --- Backward elimination
while True:
    print("▶ Testing removal of each model …")
    removal_results = {}
    
    for m in tqdm(current_models, desc="Models"):
        # drop m’s features
        feat = [c for c in current_features if not c.startswith(m + '_')]
        mean_qwk, _ = cv_score(feat, stacking_model, kf, num_classes)
        removal_results[m] = mean_qwk
        print(f"  Removing {m:12} → mean QWK = {mean_qwk:.4f}")

    # find best improvement
    worst_model, candidate_qwk = max(removal_results.items(), key=lambda kv: kv[1])
    if candidate_qwk > best_mean:
        print(f"\n✔ Eliminating model '{worst_model}' improved QWK: {best_mean:.4f} → {candidate_qwk:.4f}\n")
        # update state
        best_mean = candidate_qwk
        current_models.remove(worst_model)
        current_features = [c for c in current_features if not c.startswith(worst_model + '_')]
    else:
        print("\n— No single-model removal improved QWK. Elimination complete.\n")
        break

# --- Final training on selected features
print("▶ Training final 5-fold models on features from:", current_models)
final_mean, final_models = cv_score(current_features, stacking_model, kf, num_classes)
print(f"Final mean QWK: {final_mean:.4f}\n")

# --- Save each model
for i, mdl in enumerate(final_models, 1):
    path = f"stack-mlp-fold{i}.pkl"
    joblib.dump(mdl, path)
    print(f"✔ Saved fold {i} model to '{path}'")

▶ Running baseline 5-fold CV on all features …
Baseline mean QWK: 0.8416

▶ Testing removal of each model …


Models:   0%|          | 0/8 [00:00<?, ?it/s]

  Removing convnext     → mean QWK = 0.8379
  Removing convnext-r   → mean QWK = 0.8336
  Removing effnet       → mean QWK = 0.8385
  Removing effnet-o     → mean QWK = 0.8386
  Removing effnet-p     → mean QWK = 0.8328
  Removing resnet       → mean QWK = 0.8358
  Removing resnet-n     → mean QWK = 0.8333
  Removing resnet-r     → mean QWK = 0.8368

— No single-model removal improved QWK. Elimination complete.

▶ Training final 5-fold models on features from: ['convnext', 'convnext-r', 'effnet', 'effnet-o', 'effnet-p', 'resnet', 'resnet-n', 'resnet-r']
Final mean QWK: 0.8353

✔ Saved fold 1 model to 'stack-mlp-fold1.pkl'
✔ Saved fold 2 model to 'stack-mlp-fold2.pkl'
✔ Saved fold 3 model to 'stack-mlp-fold3.pkl'
✔ Saved fold 4 model to 'stack-mlp-fold4.pkl'
✔ Saved fold 5 model to 'stack-mlp-fold5.pkl'


.

In [None]:
feature_cols = [c for c in df_stack.columns if c != 'label']

['convnext_0',
 'convnext_1',
 'convnext_2',
 'convnext_3',
 'convnext_4',
 'convnext-r_pred',
 'effnet_0',
 'effnet_1',
 'effnet_2',
 'effnet_3',
 'effnet_4',
 'effnet-p_0',
 'effnet-p_1',
 'effnet-p_2',
 'effnet-p_3',
 'effnet-p_4',
 'effnet-o_0',
 'effnet-o_1',
 'effnet-o_2',
 'effnet-o_3',
 'resnet_0',
 'resnet_1',
 'resnet_2',
 'resnet_3',
 'resnet_4',
 'resnet-n_0',
 'resnet-n_1',
 'resnet-n_2',
 'resnet-n_3',
 'resnet-n_4',
 'resnet-r_pred']

In [41]:
feature_cols = ['convnext_0',
                'convnext_1',
                'convnext_2',
                'convnext_3',
                'convnext_4',
                'convnext-r_pred',
                'effnet_0',
                'effnet_1',
                'effnet_2',
                'effnet_3',
                'effnet_4',
                'effnet-p_0',
                'effnet-p_1',
                'effnet-p_2',
                'effnet-p_3',
                'effnet-p_4',
                'effnet-o_0',
                'effnet-o_1',
                'effnet-o_2',
                'effnet-o_3']
                # 'resnet_0',
                # 'resnet_1',
                # 'resnet_2',
                # 'resnet_3',
                # 'resnet_4',
                # 'resnet-n_0',
                # 'resnet-n_1',
                # 'resnet-n_2',
                # 'resnet-n_3',
                # 'resnet-n_4',
                # 'resnet-r_pred']

In [43]:
from lightgbm import LGBMRegressor

stacking_model = LGBMRegressor(verbose=-1)

final_mean, final_models = cv_score(feature_cols, stacking_model, kf, num_classes)
print(f"Final mean QWK: {final_mean:.4f}\n")

Final mean QWK: 0.8389



In [44]:
from lightgbm import LGBMRegressor

stacking_model = LGBMRegressor(verbose=-1, device='cuda')

final_mean, final_models = cv_score(feature_cols, stacking_model, kf, num_classes)
print(f"Final mean QWK: {final_mean:.4f}\n")

Final mean QWK: 0.8398



In [47]:
from catboost import CatBoostRegressor

stacking_model = CatBoostRegressor(verbose=0, task_type='GPU')

final_mean, final_models = cv_score(feature_cols, stacking_model, kf, num_classes)
print(f"Final mean QWK: {final_mean:.4f}\n")

Final mean QWK: 0.8437

