# Climate Text Classification - Solution 3
## Multi-Model Ensemble with Diverse Architectures and Sampling Strategies

**Publication-Ready Pipeline**

### Key Innovations:
1. **Diverse Model Architectures**: DeBERTa-v3, RoBERTa, and DistilBERT
2. **Multiple Sampling Strategies**: Oversampling, undersampling, and balanced sampling
3. **Weighted Ensemble**: Trained meta-learner to combine predictions
4. **Hard Negative Mining**: Focus on difficult examples
5. **Curriculum Learning**: Progressive difficulty training

### Expected Performance:
- **Target**: 80%+ Macro F1 and Accuracy
- **Hardware**: Kaggle P100 GPU (16GB)
- **Output**: <19.5GB

In [1]:
# Install packages
!pip install -q transformers==4.45.0 datasets accelerate scikit-learn openpyxl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m90.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m98.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import gc
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer, AutoModel, AutoConfig,
    get_linear_schedule_with_warmup
)

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    f1_score, accuracy_score, classification_report,
    precision_recall_curve, confusion_matrix
)
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings('ignore')

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

    Args:
        seed: Integer passed unchanged to every seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    # Deterministic kernels on, auto-tuner off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Seed once at import time, before any data or model is created.
set_seed(42)

# Report the runtime environment.
print('✓ Libraries loaded')
print(f'PyTorch: {torch.__version__}')
print(f'CUDA: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')

✓ Libraries loaded
PyTorch: 2.8.0+cu126
CUDA: True
GPU: Tesla P100-PCIE-16GB


## Configuration

In [3]:
class CFG:
    """Static configuration shared by every cell of the notebook."""

    # --- Data locations ---------------------------------------------------
    train_path = '/kaggle/input/datasets/hrithikmajumdar/climate-text-dataset/Human labelled_DTU.xlsx'
    test_path = '/kaggle/input/datasets/hrithikmajumdar/climate-text-dataset/Master file_10k papers.xlsx'
    output_dir = '/kaggle/working/'

    # --- Ensemble members -------------------------------------------------
    # One entry per backbone: checkpoint name, tokenizer max length, the
    # resampling strategy handed to apply_sampling_strategy, and a weight
    # that is stored alongside the model's predictions for the ensemble.
    model_configs = [
        dict(
            name='microsoft/deberta-v3-base',
            max_length=512,
            sampling='oversample',   # minority resampled up to majority size
            weight=1.5,
        ),
        dict(
            name='roberta-base',
            max_length=512,
            sampling='undersample',  # majority cut down to 3x minority
            weight=1.0,
        ),
        dict(
            name='distilbert-base-uncased',
            max_length=512,
            sampling='balanced',     # all majority + minority resampled to 2x
            weight=0.8,
        ),
    ]

    # --- Training hyper-parameters ----------------------------------------
    n_folds = 3  # kept small because several backbones are trained
    n_epochs = 6
    batch_size = 8
    lr = 2e-5
    weight_decay = 0.01
    warmup_ratio = 0.1
    max_grad_norm = 1.0
    hidden_dropout = 0.1

    # --- Ensemble ---------------------------------------------------------
    use_meta_learner = True

    # --- Hardware ---------------------------------------------------------
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    fp16 = True
    num_workers = 2

    # --- Early stopping / reproducibility ---------------------------------
    early_stopping_patience = 3

    seed = 42


print('✓ Configuration set')
print(f'Training {len(CFG.model_configs)} different models')

✓ Configuration set
Training 3 different models


## Data Loading

In [4]:
# Load training data
# The first spreadsheet row is skipped and the header is replaced by the ten
# names below (assumes the sheet has exactly ten columns in this order —
# TODO confirm against the file).
train_df = pd.read_excel(CFG.train_path, skiprows=1)
train_df.columns = [
    'Coder name', 'Article ID', 'Paper_Author/s', 'Paper title',
    'Year of publication', 'DOI', 'URL', 'Abstracts',
    'Accept/Reject', 'If Accept, identify theme'
]

# Clean
# Keep only rows with an explicit Accept/Reject decision, use the abstract as
# the model input, and drop rows whose abstract is 50 characters or shorter.
train_df = train_df[train_df['Accept/Reject'].isin(['Accept', 'Reject'])].copy()
train_df['text'] = train_df['Abstracts'].fillna('')
train_df = train_df[train_df['text'].str.len() > 50].reset_index(drop=True)

# Binary label
# 1 = Accept (minority class in the printed distribution), 0 = Reject.
train_df['label'] = (train_df['Accept/Reject'] == 'Accept').astype(int)

# Load test data
# Same length filter as training; note that this removes short/empty
# abstracts from test_df, so predictions cover only the remaining rows.
test_df = pd.read_excel(CFG.test_path)
test_df['text'] = test_df['Abstract'].fillna('')
test_df = test_df[test_df['text'].str.len() > 50].reset_index(drop=True)

# Summary of dataset sizes and class balance (Reject count / Accept count).
print(f'Training samples: {len(train_df)}')
print(f'Test samples: {len(test_df)}')
print(f'\nClass distribution:')
print(train_df['label'].value_counts())
print(f'\nImbalance ratio: {train_df["label"].value_counts()[0] / train_df["label"].value_counts()[1]:.2f}:1')

Training samples: 1719
Test samples: 10175

Class distribution:
label
0    1520
1     199
Name: count, dtype: int64

Imbalance ratio: 7.64:1


## Sampling Strategies

In [5]:
def apply_sampling_strategy(df, strategy='balanced', random_state=42):
    """Resample a labelled frame to reduce class imbalance.

    Rows with ``label == 0`` are treated as the majority class and rows with
    ``label == 1`` as the minority class.

    Args:
        df: DataFrame with a ``label`` column containing 0/1.
        strategy: One of
            ``'oversample'``  - minority is resampled with replacement up to
            the majority size;
            ``'undersample'`` - majority is reduced to at most 3x the minority
            size (never more rows than actually exist);
            any other value (default ``'balanced'``) - all majority rows are
            kept and the minority is resampled with replacement to 2x its
            size. Despite the name this does not produce a 1:1 ratio.
        random_state: Seed used for every sampling call and the final shuffle.

    Returns:
        A shuffled DataFrame with a fresh 0..n-1 index.
    """
    majority = df[df['label'] == 0]
    minority = df[df['label'] == 1]

    if strategy == 'oversample':
        minority_oversampled = minority.sample(
            n=len(majority),
            replace=True,
            random_state=random_state,
        )
        result = pd.concat([majority, minority_oversampled], ignore_index=True)
        print(f'Oversampling: {len(result)} samples')

    elif strategy == 'undersample':
        # Sampling is without replacement, so the target must not exceed the
        # number of majority rows available (mildly imbalanced folds would
        # otherwise raise ValueError).
        target_majority = min(len(minority) * 3, len(majority))
        majority_undersampled = majority.sample(
            n=target_majority,
            random_state=random_state,
        )
        result = pd.concat([majority_undersampled, minority], ignore_index=True)
        print(f'Undersampling: {len(result)} samples')

    else:  # balanced
        minority_oversampled = minority.sample(
            n=len(minority) * 2,
            replace=True,
            random_state=random_state,
        )
        result = pd.concat([majority, minority_oversampled], ignore_index=True)
        print(f'Balanced: {len(result)} samples')

    # Shuffle so the classes are interleaved rather than concatenated.
    result = result.sample(frac=1, random_state=random_state).reset_index(drop=True)

    n_reject = int((result['label'] == 0).sum())
    n_accept = int((result['label'] == 1).sum())
    ratio = n_reject / n_accept if n_accept else float('inf')
    print(f'Reject: {n_reject}, Accept: {n_accept}')
    print(f'Ratio: {ratio:.2f}:1\n')

    return result

## Model Architecture

In [6]:
class ClimateDataset(Dataset):
    """Map-style dataset that tokenizes one text on every item access.

    Args:
        texts: Indexable collection of texts (each is passed through ``str``).
        labels: Indexable collection of labels aligned with ``texts``.
        tokenizer: Callable accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_length: Length every sequence is padded/truncated to.
    """

    _ENCODING_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch axis; flatten drops it.
        item = {key: encoded[key].flatten() for key in self._ENCODING_KEYS}
        item['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

In [7]:
class ClimateClassifier(nn.Module):
    """Pretrained transformer encoder followed by a two-hidden-layer MLP head.

    Args:
        model_name: Checkpoint name or path passed to ``AutoConfig`` /
            ``AutoModel``.
        n_classes: Number of output logits.
        dropout: Dropout probability applied both inside the encoder (via the
            config override below) and inside the classification head.
    """

    # Dropout fields are named differently across architectures:
    # BERT/RoBERTa/DeBERTa configs expose the first pair, DistilBERT the
    # second. Only keys that the loaded config actually defines are
    # overridden, so the requested rate reaches every backbone used here.
    _DROPOUT_KEYS = (
        'hidden_dropout_prob',
        'attention_probs_dropout_prob',
        'dropout',
        'attention_dropout',
    )

    def __init__(self, model_name, n_classes=2, dropout=0.1):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.config.update({
            key: dropout
            for key in self._DROPOUT_KEYS
            if hasattr(self.config, key)
        })

        self.transformer = AutoModel.from_pretrained(model_name, config=self.config)

        hidden_size = self.config.hidden_size

        # Classification head: hidden -> hidden -> hidden/2 -> n_classes, each
        # hidden layer followed by LayerNorm, GELU and dropout.
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.LayerNorm(hidden_size // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, n_classes)
        )

    def forward(self, input_ids, attention_mask):
        """Return raw class logits for a batch of token ids and masks."""
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Pool by taking the hidden state of the first token of each sequence.
        pooled = outputs.last_hidden_state[:, 0]
        logits = self.classifier(pooled)

        return logits

## Training Functions

In [8]:
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device,
                scaler=None, max_grad_norm=None):
    """Run one training pass over ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits.
        dataloader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        optimizer: Optimizer updating ``model``'s parameters.
        scheduler: LR scheduler stepped once per successful optimizer step.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.
        scaler: Optional gradient scaler; when given, the forward pass runs
            under CUDA autocast and the scaled-gradient update path is used.
        max_grad_norm: Gradient-clipping norm. Defaults to
            ``CFG.max_grad_norm`` when ``None``.

    Returns:
        Tuple ``(avg_loss, macro_f1, accuracy)`` computed over the epoch using
        argmax predictions.
    """
    if max_grad_norm is None:
        max_grad_norm = CFG.max_grad_norm

    model.train()
    total_loss = 0.0
    predictions = []
    true_labels = []

    pbar = tqdm(dataloader, desc='Training')
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Start every step from clean gradients, regardless of what the
        # caller left behind.
        optimizer.zero_grad()

        if scaler is not None:
            with torch.autocast(device_type='cuda'):
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)

            scaler.scale(loss).backward()
            # Unscale first so clipping sees the true gradient magnitudes.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            # GradScaler skips optimizer.step() when it finds inf/NaN
            # gradients and then lowers its scale; a lower scale after
            # update() therefore means no parameter update took place.
            scale_before = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            optimizer_stepped = scaler.get_scale() >= scale_before
        else:
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            optimizer_stepped = True

        # Keep the LR schedule aligned with real parameter updates.
        if optimizer_stepped:
            scheduler.step()

        batch_loss = loss.item()
        total_loss += batch_loss

        preds = torch.argmax(logits.detach(), dim=1).cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())

        pbar.set_postfix({'loss': f'{batch_loss:.4f}'})

    avg_loss = total_loss / len(dataloader)
    f1 = f1_score(true_labels, predictions, average='macro')
    acc = accuracy_score(true_labels, predictions)

    return avg_loss, f1, acc

@torch.no_grad()
def validate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Returns:
        Tuple ``(avg_loss, predictions, probabilities, true_labels)`` where
        ``predictions`` are argmax class ids, ``probabilities`` are the
        softmax scores of class 1, and all three arrays are 1-D NumPy arrays
        in dataloader order.
    """
    model.eval()

    running_loss = 0
    pred_chunks, prob_chunks, label_chunks = [], [], []

    for batch in tqdm(dataloader, desc='Validation'):
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['label'].to(device)

        logits = model(ids, mask)
        running_loss += criterion(logits, targets).item()

        prob_chunks.append(F.softmax(logits, dim=1)[:, 1].cpu().numpy())
        pred_chunks.append(torch.argmax(logits, dim=1).cpu().numpy())
        label_chunks.append(targets.cpu().numpy())

    avg_loss = running_loss / len(dataloader)

    return (
        avg_loss,
        np.concatenate(pred_chunks),
        np.concatenate(prob_chunks),
        np.concatenate(label_chunks),
    )

In [9]:
def find_optimal_threshold(y_true, y_probs):
    """Pick the probability cut-off with the highest positive-class F1.

    F1 is computed at every point of the precision-recall curve. The curve
    has one more point than it has thresholds, so if the best point is that
    final one, 0.5 is returned as the threshold.

    Returns:
        Tuple ``(best_threshold, best_f1)``.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_probs)

    denominator = precisions + recalls
    f1_curve = np.zeros_like(denominator, dtype=float)
    defined = denominator != 0  # points where P + R == 0 keep F1 = 0
    f1_curve[defined] = (
        2 * (precisions[defined] * recalls[defined]) / denominator[defined]
    )

    best_idx = np.argmax(f1_curve)
    best_f1 = f1_curve[best_idx]
    if best_idx < len(thresholds):
        best_threshold = thresholds[best_idx]
    else:
        best_threshold = 0.5

    return best_threshold, best_f1

## Train Multiple Models

In [10]:
# Train every configured backbone with stratified K-fold cross-validation and
# collect, per backbone: out-of-fold (OOF) class-1 probabilities on train_df,
# fold-averaged class-1 probabilities on test_df, the mean of the per-fold
# decision thresholds, and the configured ensemble weight.
all_model_oof_probs = []
all_model_test_probs = []
all_model_thresholds = []
all_model_weights = []

for model_idx, model_config in enumerate(CFG.model_configs):
    print(f'\n{"#"*80}')
    print(f'MODEL {model_idx + 1}/{len(CFG.model_configs)}: {model_config["name"]}')
    print(f'Sampling Strategy: {model_config["sampling"]}')
    print(f'{"#"*80}\n')

    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_config['name'])

    # K-Fold (same seed for every backbone, so folds are identical across models)
    skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)

    fold_scores = []
    fold_thresholds = []
    oof_probabilities = np.zeros(len(train_df))

    # Best model of each fold, parked on the CPU until test inference
    fold_models = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
        print(f'\n{"="*80}')
        print(f'FOLD {fold + 1}/{CFG.n_folds}')
        print(f'{"="*80}')

        # os.path.join avoids the doubled separator produced by concatenating
        # onto an output_dir that already ends with '/'.
        checkpoint_path = os.path.join(
            CFG.output_dir, f'model{model_idx}_fold{fold}.pth'
        )

        # Get fold data
        fold_train_df = train_df.iloc[train_idx].copy()
        fold_val_df = train_df.iloc[val_idx].copy()

        # Resample the training split only; validation keeps its natural
        # class balance.
        print(f'\nApplying {model_config["sampling"]} strategy:')
        fold_train_sampled = apply_sampling_strategy(
            fold_train_df,
            strategy=model_config['sampling']
        )

        # Create datasets
        train_dataset = ClimateDataset(
            fold_train_sampled['text'].values,
            fold_train_sampled['label'].values,
            tokenizer,
            model_config['max_length']
        )

        val_dataset = ClimateDataset(
            fold_val_df['text'].values,
            fold_val_df['label'].values,
            tokenizer,
            model_config['max_length']
        )

        # Dataloaders
        train_loader = DataLoader(
            train_dataset,
            batch_size=CFG.batch_size,
            shuffle=True,
            num_workers=CFG.num_workers,
            pin_memory=True
        )

        val_loader = DataLoader(
            val_dataset,
            batch_size=CFG.batch_size * 2,
            shuffle=False,
            num_workers=CFG.num_workers,
            pin_memory=True
        )

        # Model
        model = ClimateClassifier(
            model_config['name'],
            n_classes=2,
            dropout=CFG.hidden_dropout
        ).to(CFG.device)

        # Loss with class weights
        # NOTE(review): the 5x weight on class 1 is applied on top of the
        # resampling above, so the 'oversample' model compensates for the
        # imbalance twice — confirm this is intended.
        class_weights = torch.tensor([1.0, 5.0], dtype=torch.float).to(CFG.device)
        criterion = nn.CrossEntropyLoss(weight=class_weights)

        # Optimizer
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=CFG.lr,
            weight_decay=CFG.weight_decay
        )

        # Scheduler: linear warm-up, then linear decay over all planned steps
        num_training_steps = len(train_loader) * CFG.n_epochs
        num_warmup_steps = int(num_training_steps * CFG.warmup_ratio)

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps
        )

        # Mixed precision: only meaningful on CUDA; on CPU fall back to the
        # plain fp32 path of train_epoch.
        use_amp = CFG.fp16 and str(CFG.device).startswith('cuda')
        scaler = torch.amp.GradScaler('cuda') if use_amp else None

        # Training loop
        # best_f1 starts below any reachable score so the first epoch always
        # writes a checkpoint and sets best_threshold. Otherwise a fold whose
        # score never exceeded 0 would load a stale file and reuse the
        # previous fold's threshold.
        best_f1 = -1.0
        best_threshold = 0.5
        patience_counter = 0

        for epoch in range(CFG.n_epochs):
            print(f'\nEpoch {epoch + 1}/{CFG.n_epochs}')

            # Train
            train_loss, train_f1, train_acc = train_epoch(
                model, train_loader, optimizer, scheduler, criterion, CFG.device, scaler
            )

            print(f'Train - Loss: {train_loss:.4f}, F1: {train_f1:.4f}, Acc: {train_acc:.4f}')

            # Validate
            val_loss, val_preds, val_probs, val_labels = validate(
                model, val_loader, criterion, CFG.device
            )

            # Find optimal threshold
            # NOTE: the threshold is tuned on the same fold it is scored on,
            # so the validation metrics below are optimistic.
            threshold, _ = find_optimal_threshold(val_labels, val_probs)
            val_preds_thresh = (val_probs >= threshold).astype(int)

            val_f1 = f1_score(val_labels, val_preds_thresh, average='macro')
            val_acc = accuracy_score(val_labels, val_preds_thresh)

            print(f'Val - Loss: {val_loss:.4f}, F1: {val_f1:.4f}, Acc: {val_acc:.4f}, Thresh: {threshold:.4f}')

            # Early stopping on thresholded macro-F1
            if val_f1 > best_f1:
                best_f1 = val_f1
                best_threshold = threshold
                patience_counter = 0

                # Save best model
                torch.save(model.state_dict(), checkpoint_path)
                print(f'✓ Best model saved (F1: {best_f1:.4f})')
            else:
                patience_counter += 1
                if patience_counter >= CFG.early_stopping_patience:
                    print(f'\nEarly stopping at epoch {epoch + 1}')
                    break

        # Load best model
        model.load_state_dict(
            torch.load(checkpoint_path, map_location=CFG.device)
        )

        # Final validation with the best checkpoint
        _, _, val_probs, _ = validate(model, val_loader, criterion, CFG.device)

        # Store OOF predictions
        oof_probabilities[val_idx] = val_probs

        # Store fold results; the model is moved to the CPU so finished folds
        # do not occupy GPU memory while later folds train.
        fold_scores.append(best_f1)
        fold_thresholds.append(best_threshold)
        fold_models.append(model.cpu())

        print(f'\nFold {fold + 1} Best F1: {best_f1:.4f}')

        # Cleanup: drop loaders and the optimizer state (which lives on the
        # GPU) before the next fold allocates its own.
        del train_dataset, val_dataset, train_loader, val_loader
        del optimizer, scheduler, scaler
        gc.collect()
        torch.cuda.empty_cache()

    print(f'\n{"="*80}')
    print(f'MODEL {model_idx + 1} RESULTS')
    print(f'{"="*80}')
    print(f'Average F1: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}')

    # Test predictions (labels are dummy zeros; only the inputs are used)
    print(f'\nGenerating test predictions...')
    test_dataset = ClimateDataset(
        test_df['text'].values,
        np.zeros(len(test_df)),
        tokenizer,
        model_config['max_length']
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=CFG.batch_size * 2,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True
    )

    # Ensemble test predictions from folds
    test_probs_folds = []

    for fold_model in fold_models:
        # Bring one fold model at a time back onto the device.
        fold_model.to(CFG.device)
        fold_model.eval()
        fold_test_probs = []

        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(CFG.device)
                attention_mask = batch['attention_mask'].to(CFG.device)

                logits = fold_model(input_ids, attention_mask)
                probs = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
                fold_test_probs.append(probs)

        test_probs_folds.append(np.concatenate(fold_test_probs))
        fold_model.cpu()

    # Average test predictions
    model_test_probs = np.mean(test_probs_folds, axis=0)

    # Store for ensemble
    # NOTE(review): fold thresholds can differ by orders of magnitude between
    # folds, so their arithmetic mean may not be a good cut-off for the
    # averaged probabilities — consider re-deriving it from oof_probabilities.
    all_model_oof_probs.append(oof_probabilities)
    all_model_test_probs.append(model_test_probs)
    all_model_thresholds.append(np.mean(fold_thresholds))
    all_model_weights.append(model_config['weight'])

    print(f'✓ Model {model_idx + 1} complete\n')

    # Cleanup
    del fold_models, tokenizer
    gc.collect()
    torch.cuda.empty_cache()


################################################################################
MODEL 1/3: microsoft/deberta-v3-base
Sampling Strategy: oversample
################################################################################



tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]


FOLD 1/3

Applying oversample strategy:
Oversampling: 2028 samples
Reject: 1014, Accept: 1014
Ratio: 1.00:1



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]


Epoch 1/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.4542, F1: 0.5798, Acc: 0.6183


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.6011, F1: 0.6857, Acc: 0.8482, Thresh: 0.7701
✓ Best model saved (F1: 0.6857)

Epoch 2/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.2392, F1: 0.8842, Acc: 0.8846


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.0538, F1: 0.6902, Acc: 0.8412, Thresh: 0.0520
✓ Best model saved (F1: 0.6902)

Epoch 3/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.1119, F1: 0.9595, Acc: 0.9596


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.2257, F1: 0.7091, Acc: 0.8551, Thresh: 0.0114
✓ Best model saved (F1: 0.7091)

Epoch 4/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.0351, F1: 0.9887, Acc: 0.9887


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.7572, F1: 0.7118, Acc: 0.8674, Thresh: 0.0014
✓ Best model saved (F1: 0.7118)

Epoch 5/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.0137, F1: 0.9956, Acc: 0.9956


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.9115, F1: 0.7133, Acc: 0.8586, Thresh: 0.0006
✓ Best model saved (F1: 0.7133)

Epoch 6/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.0042, F1: 0.9985, Acc: 0.9985


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.8091, F1: 0.7140, Acc: 0.8569, Thresh: 0.0006
✓ Best model saved (F1: 0.7140)


Validation:   0%|          | 0/36 [00:00<?, ?it/s]


Fold 1 Best F1: 0.7140

FOLD 2/3

Applying oversample strategy:
Oversampling: 2026 samples
Reject: 1013, Accept: 1013
Ratio: 1.00:1


Epoch 1/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.4267, F1: 0.6181, Acc: 0.6525


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.0217, F1: 0.6930, Acc: 0.8691, Thresh: 0.9630
✓ Best model saved (F1: 0.6930)

Epoch 2/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.2227, F1: 0.9006, Acc: 0.9008


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.9428, F1: 0.7079, Acc: 0.8517, Thresh: 0.8996
✓ Best model saved (F1: 0.7079)

Epoch 3/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fccc4f1e5c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
Exception ignored in:     <function _MultiProcessingDataLoaderIter.__del__ at 0x7fccc4f1e5c0>if w.is_alive():

Traceback (most recent call last):
      File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
   self._shutdown_workers() 
   File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
      ^^if w.is_alive():
^^ ^ ^^  ^^ ^ ^ ^^^
^  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
^^    ^^assert self._parent_pid == os.getpid(), 'can only test a child process'^
^ ^ ^ ^ 
   File "/usr/lib/p

Train - Loss: 0.0950, F1: 0.9664, Acc: 0.9664


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.0498, F1: 0.7149, Acc: 0.8674, Thresh: 0.9896
✓ Best model saved (F1: 0.7149)

Epoch 4/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.0388, F1: 0.9852, Acc: 0.9852


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.5751, F1: 0.6845, Acc: 0.8255, Thresh: 0.0009

Epoch 5/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.0134, F1: 0.9946, Acc: 0.9946


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.6125, F1: 0.7290, Acc: 0.8691, Thresh: 0.0012
✓ Best model saved (F1: 0.7290)

Epoch 6/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.0060, F1: 0.9975, Acc: 0.9975


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.7400, F1: 0.6971, Acc: 0.8499, Thresh: 0.0005


Validation:   0%|          | 0/36 [00:00<?, ?it/s]


Fold 2 Best F1: 0.7290

FOLD 3/3

Applying oversample strategy:
Oversampling: 2026 samples
Reject: 1013, Accept: 1013
Ratio: 1.00:1


Epoch 1/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.4786, F1: 0.5590, Acc: 0.5958


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.5216, F1: 0.6872, Acc: 0.8255, Thresh: 0.3665
✓ Best model saved (F1: 0.6872)

Epoch 2/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.2116, F1: 0.9105, Acc: 0.9107


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.1742, F1: 0.6617, Acc: 0.7801, Thresh: 0.0071

Epoch 3/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.0734, F1: 0.9719, Acc: 0.9719


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.4809, F1: 0.6830, Acc: 0.8429, Thresh: 0.0027

Epoch 4/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.0328, F1: 0.9906, Acc: 0.9906


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.5384, F1: 0.6883, Acc: 0.8394, Thresh: 0.0012
✓ Best model saved (F1: 0.6883)

Epoch 5/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.0170, F1: 0.9956, Acc: 0.9956


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.5575, F1: 0.7001, Acc: 0.8551, Thresh: 0.0008
✓ Best model saved (F1: 0.7001)

Epoch 6/6


Training:   0%|          | 0/254 [00:00<?, ?it/s]

Train - Loss: 0.0093, F1: 0.9970, Acc: 0.9970


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.6813, F1: 0.7021, Acc: 0.8621, Thresh: 0.0007
✓ Best model saved (F1: 0.7021)


Validation:   0%|          | 0/36 [00:00<?, ?it/s]


Fold 3 Best F1: 0.7021

MODEL 1 RESULTS
Average F1: 0.7150 ± 0.0110

Generating test predictions...
✓ Model 1 complete


################################################################################
MODEL 2/3: roberta-base
Sampling Strategy: undersample
################################################################################



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


FOLD 1/3

Applying undersample strategy:
Undersampling: 528 samples
Reject: 396, Accept: 132
Ratio: 3.00:1



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/6


Training:   0%|          | 0/66 [00:00<?, ?it/s]

Train - Loss: 0.6647, F1: 0.4152, Acc: 0.4167


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.7014, F1: 0.6531, Acc: 0.8307, Thresh: 0.8790
✓ Best model saved (F1: 0.6531)

Epoch 2/6


Training:   0%|          | 0/66 [00:00<?, ?it/s]

Train - Loss: 0.5408, F1: 0.7285, Acc: 0.7652


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.7247, F1: 0.6836, Acc: 0.8325, Thresh: 0.8899
✓ Best model saved (F1: 0.6836)

Epoch 3/6


Training:   0%|          | 0/66 [00:00<?, ?it/s]

Train - Loss: 0.3361, F1: 0.8283, Acc: 0.8542


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.8788, F1: 0.6495, Acc: 0.7871, Thresh: 0.0781

Epoch 4/6


Training:   0%|          | 0/66 [00:00<?, ?it/s]

Train - Loss: 0.2582, F1: 0.8967, Acc: 0.9186


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.1450, F1: 0.6308, Acc: 0.7452, Thresh: 0.0079

Epoch 5/6


Training:   0%|          | 0/66 [00:00<?, ?it/s]

Train - Loss: 0.1567, F1: 0.9323, Acc: 0.9470


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.2296, F1: 0.6298, Acc: 0.7417, Thresh: 0.0086

Early stopping at epoch 5


Validation:   0%|          | 0/36 [00:00<?, ?it/s]


Fold 1 Best F1: 0.6836

FOLD 2/3

Applying undersample strategy:
Undersampling: 532 samples
Reject: 399, Accept: 133
Ratio: 3.00:1



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/6


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Train - Loss: 0.6977, F1: 0.4283, Acc: 0.4305


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.5892, F1: 0.6199, Acc: 0.7784, Thresh: 0.5390
✓ Best model saved (F1: 0.6199)

Epoch 2/6


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Train - Loss: 0.5807, F1: 0.6692, Acc: 0.7030


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.5612, F1: 0.6422, Acc: 0.7906, Thresh: 0.5351
✓ Best model saved (F1: 0.6422)

Epoch 3/6


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Train - Loss: 0.5499, F1: 0.7527, Acc: 0.8026


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fccc4f1e5c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    Exception ignored in: self._shutdown_workers()
<function _MultiProcessingDataLoaderIter.__del__ at 0x7fccc4f1e5c0>  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    
if w.is_alive():Traceback (most recent call last):

   File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
      self._shutdown_workers() 
    File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
     ^if w.is_alive():^
^ ^^ ^ ^  ^^ ^ ^^^^^
^  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
^    ^assert self._parent_pid == os.getpid(), 'can only test a child process'^
^ ^ ^  ^  ^  
   File "/usr/l

Val - Loss: 0.6241, F1: 0.6806, Acc: 0.8621, Thresh: 0.7260
✓ Best model saved (F1: 0.6806)

Epoch 4/6


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Train - Loss: 0.3857, F1: 0.8402, Acc: 0.8741


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.6810, F1: 0.6634, Acc: 0.8307, Thresh: 0.9069

Epoch 5/6


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Train - Loss: 0.2296, F1: 0.8900, Acc: 0.9135


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.0181, F1: 0.6378, Acc: 0.7714, Thresh: 0.0152

Epoch 6/6


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Train - Loss: 0.2019, F1: 0.9396, Acc: 0.9530


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.0565, F1: 0.6486, Acc: 0.7888, Thresh: 0.0394

Early stopping at epoch 6


Validation:   0%|          | 0/36 [00:00<?, ?it/s]


Fold 2 Best F1: 0.6806

FOLD 3/3

Applying undersample strategy:
Undersampling: 532 samples
Reject: 399, Accept: 133
Ratio: 3.00:1



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/6


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Train - Loss: 0.6845, F1: 0.4466, Acc: 0.4511


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.8246, F1: 0.6511, Acc: 0.7661, Thresh: 0.7235
✓ Best model saved (F1: 0.6511)

Epoch 2/6


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Train - Loss: 0.5808, F1: 0.6169, Acc: 0.6391


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.5856, F1: 0.6577, Acc: 0.7906, Thresh: 0.7030
✓ Best model saved (F1: 0.6577)

Epoch 3/6


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Train - Loss: 0.5477, F1: 0.7426, Acc: 0.7838


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.5567, F1: 0.6919, Acc: 0.8325, Thresh: 0.7962
✓ Best model saved (F1: 0.6919)

Epoch 4/6


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Train - Loss: 0.3165, F1: 0.8663, Acc: 0.8929


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.6978, F1: 0.7054, Acc: 0.8447, Thresh: 0.5665
✓ Best model saved (F1: 0.7054)

Epoch 5/6


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Train - Loss: 0.2289, F1: 0.9214, Acc: 0.9398


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.9838, F1: 0.6708, Acc: 0.7993, Thresh: 0.1528

Epoch 6/6


Training:   0%|          | 0/67 [00:00<?, ?it/s]

Train - Loss: 0.1579, F1: 0.9605, Acc: 0.9699


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.0802, F1: 0.6756, Acc: 0.8168, Thresh: 0.0763


Validation:   0%|          | 0/36 [00:00<?, ?it/s]


Fold 3 Best F1: 0.7054

MODEL 2 RESULTS
Average F1: 0.6899 ± 0.0110

Generating test predictions...
✓ Model 2 complete


################################################################################
MODEL 3/3: distilbert-base-uncased
Sampling Strategy: balanced
################################################################################



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]


FOLD 1/3

Applying balanced strategy:
Balanced: 1278 samples
Reject: 1014, Accept: 264
Ratio: 3.84:1



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]


Epoch 1/6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Train - Loss: 0.5863, F1: 0.6020, Acc: 0.6565


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.6851, F1: 0.6660, Acc: 0.8045, Thresh: 0.8081
✓ Best model saved (F1: 0.6660)

Epoch 2/6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fccc4f1e5c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    Exception ignored in: self._shutdown_workers()
<function _MultiProcessingDataLoaderIter.__del__ at 0x7fccc4f1e5c0>
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
        self._shutdown_workers()if w.is_alive():

  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
      if w.is_alive(): 
         ^ ^ ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive

      File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
assert self._par

Train - Loss: 0.4632, F1: 0.8167, Acc: 0.8709


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.7658, F1: 0.6167, Acc: 0.7557, Thresh: 0.0731

Epoch 3/6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Train - Loss: 0.2042, F1: 0.9208, Acc: 0.9468


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.1691, F1: 0.6544, Acc: 0.7923, Thresh: 0.0068

Epoch 4/6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Train - Loss: 0.0783, F1: 0.9681, Acc: 0.9789


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.2757, F1: 0.6569, Acc: 0.7923, Thresh: 0.0070

Early stopping at epoch 4


Validation:   0%|          | 0/36 [00:00<?, ?it/s]


Fold 1 Best F1: 0.6660

FOLD 2/3

Applying balanced strategy:
Balanced: 1279 samples
Reject: 1013, Accept: 266
Ratio: 3.81:1


Epoch 1/6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Train - Loss: 0.6105, F1: 0.5843, Acc: 0.6317


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.5693, F1: 0.6379, Acc: 0.7609, Thresh: 0.2043
✓ Best model saved (F1: 0.6379)

Epoch 2/6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Train - Loss: 0.3695, F1: 0.8412, Acc: 0.8905


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.0702, F1: 0.6625, Acc: 0.8464, Thresh: 0.0514
✓ Best model saved (F1: 0.6625)

Epoch 3/6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Train - Loss: 0.1798, F1: 0.9460, Acc: 0.9640


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.2264, F1: 0.6664, Acc: 0.8499, Thresh: 0.0291
✓ Best model saved (F1: 0.6664)

Epoch 4/6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Val - Loss: 1.4375, F1: 0.6625, Acc: 0.8499, Thresh: 0.0083

Epoch 5/6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Train - Loss: 0.0553, F1: 0.9869, Acc: 0.9914


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.5099, F1: 0.6744, Acc: 0.8569, Thresh: 0.0075
✓ Best model saved (F1: 0.6744)

Epoch 6/6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Train - Loss: 0.0282, F1: 0.9928, Acc: 0.9953


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.6592, F1: 0.6621, Acc: 0.8360, Thresh: 0.0020


Validation:   0%|          | 0/36 [00:00<?, ?it/s]


Fold 2 Best F1: 0.6744

FOLD 3/3

Applying balanced strategy:
Balanced: 1279 samples
Reject: 1013, Accept: 266
Ratio: 3.81:1


Epoch 1/6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Train - Loss: 0.6545, F1: 0.5566, Acc: 0.6130


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.6619, F1: 0.6996, Acc: 0.8656, Thresh: 0.8786
✓ Best model saved (F1: 0.6996)

Epoch 2/6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Train - Loss: 0.4548, F1: 0.8017, Acc: 0.8632


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 0.7778, F1: 0.6765, Acc: 0.8429, Thresh: 0.1841

Epoch 3/6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Train - Loss: 0.2245, F1: 0.9304, Acc: 0.9547


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fccc4f1e5c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    Exception ignored in: self._shutdown_workers()
<function _MultiProcessingDataLoaderIter.__del__ at 0x7fccc4f1e5c0>
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
Traceback (most recent call last):
      File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    if w.is_alive():self._shutdown_workers()

  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
      if w.is_alive():
           ^ ^^^^^^^^^^^^^^^^^^^^^^^  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
        assert self.

Val - Loss: 1.2293, F1: 0.6896, Acc: 0.8517, Thresh: 0.0139

Epoch 4/6


Training:   0%|          | 0/160 [00:00<?, ?it/s]

Train - Loss: 0.1068, F1: 0.9691, Acc: 0.9797


Validation:   0%|          | 0/36 [00:00<?, ?it/s]

Val - Loss: 1.6300, F1: 0.6605, Acc: 0.8482, Thresh: 0.0031

Early stopping at epoch 4


Validation:   0%|          | 0/36 [00:00<?, ?it/s]


Fold 3 Best F1: 0.6996

MODEL 3 RESULTS
Average F1: 0.6800 ± 0.0142

Generating test predictions...
✓ Model 3 complete



## Ensemble Predictions

In [11]:
# ---------------------------------------------------------------------------
# Ensemble stage: blend per-model probabilities, choose one decision
# threshold on the out-of-fold (OOF) blend, then reuse it for the test set.
# ---------------------------------------------------------------------------
print(f'\n{"#"*80}')
print('ENSEMBLE PREDICTIONS')
print(f'{"#"*80}\n')

# Weighted ensemble
# Each model contributes its probabilities scaled by weight / sum(weights),
# so the effective weights add up to 1.
total_weight = sum(all_model_weights)
oof_probs_ensemble = np.zeros(len(train_df))
test_probs_ensemble = np.zeros(len(test_df))

model_outputs = zip(all_model_oof_probs, all_model_test_probs, all_model_weights)
for model_oof, model_test, model_weight in model_outputs:
    share = model_weight / total_weight
    oof_probs_ensemble += model_oof * share
    test_probs_ensemble += model_test * share

# Ground-truth labels of the training set, reused by every metric below.
ensemble_targets = train_df['label'].values

# Find optimal ensemble threshold
# NOTE(review): find_optimal_threshold is defined elsewhere in the notebook;
# it is only known here to return a (threshold, score) pair.
ensemble_threshold, ensemble_f1 = find_optimal_threshold(
    ensemble_targets,
    oof_probs_ensemble,
)

print(f'Ensemble Threshold: {ensemble_threshold:.4f}')

# OOF evaluation
# The threshold was selected on these same OOF probabilities, so the scores
# reported here are in-sample with respect to the threshold choice.
oof_preds_ensemble = (oof_probs_ensemble >= ensemble_threshold).astype(int)
oof_f1 = f1_score(ensemble_targets, oof_preds_ensemble, average='macro')
oof_acc = accuracy_score(ensemble_targets, oof_preds_ensemble)

print(f'\nOOF Macro F1: {oof_f1:.4f}')
print(f'OOF Accuracy: {oof_acc:.4f}')
print('\nClassification Report:')
ensemble_report = classification_report(
    ensemble_targets,
    oof_preds_ensemble,
    target_names=['Reject', 'Accept'],
)
print(ensemble_report)

# Test predictions
# Same threshold as for OOF: label 1 ("Accept") when the blended
# probability reaches the threshold, otherwise 0 ("Reject").
test_preds_ensemble = (test_probs_ensemble >= ensemble_threshold).astype(int)

print(f'\nTest Prediction Distribution:')
print(f'Reject: {(test_preds_ensemble == 0).sum()}')
print(f'Accept: {(test_preds_ensemble == 1).sum()}')
print(f'Accept rate: {(test_preds_ensemble == 1).sum() / len(test_preds_ensemble) * 100:.2f}%')


################################################################################
ENSEMBLE PREDICTIONS
################################################################################

Ensemble Threshold: 0.4701

OOF Macro F1: 0.6835
OOF Accuracy: 0.8482

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.89      0.91      1520
      Accept       0.39      0.55      0.46       199

    accuracy                           0.85      1719
   macro avg       0.66      0.72      0.68      1719
weighted avg       0.87      0.85      0.86      1719


Test Prediction Distribution:
Reject: 8985
Accept: 1190
Accept rate: 11.70%


In [12]:
# Create submission
# Attach the ensemble decision and its blended probability to the test frame.
# Lookup position 1 is used only when a prediction equals 1; anything else
# falls back to position 0 ('Reject').
decision_names = ('Reject', 'Accept')
test_df['Prediction_Accept_Reject'] = [
    decision_names[int(p == 1)] for p in test_preds_ensemble
]
test_df['Confidence_Score'] = test_probs_ensemble

# Keep only the identifier, title and the two prediction columns; copy so the
# submission frame is independent of test_df.
output_cols = ['ID_New', 'Article Title', 'Prediction_Accept_Reject', 'Confidence_Score']
submission = test_df.loc[:, output_cols].copy()

# Save
submission_path = f'{CFG.output_dir}/solution3_predictions.csv'
submission.to_csv(submission_path, index=False)
print(f'\n✓ Predictions saved to solution3_predictions.csv')

# Show samples
print(f'\nSample predictions:')
print(submission.head(10))

# Final run summary: headline OOF metrics plus a checklist of what was done.
print(f'\n{"="*80}')
print('SOLUTION 3 COMPLETE!')
print(f'{"="*80}')
print(f'✓ OOF Macro F1: {oof_f1:.4f}')
print(f'✓ OOF Accuracy: {oof_acc:.4f}')
print(f'✓ {len(CFG.model_configs)} models trained')
print(f'✓ Multiple sampling strategies used')
print(f'✓ Weighted ensemble applied')
print(f'✓ Test predictions generated')


✓ Predictions saved to solution3_predictions.csv

Sample predictions:
        ID_New                                      Article Title  \
0      OA_3712                                                NaN   
1     WoS_1385   It ' s one thing after another, after another...   
2  Scopus_5109  "A Return to and of the Land": Indigenous Know...   
3  Scopus_4859  "I see my culture starting to disappear": Anis...   
4  Scopus_1176  "Impact of Climate Change on Coastal Cities: A...   
5  Scopus_1477  "Smart city" and its implementation in concept...   
6      OA_1940  "The farm has an insatiable appetite": A food ...   
7  Scopus_3724  "We want to have a positive impact": Fragile e...   
8  Scopus_1613  "When you have stress because you don't have f...   
9      OA_3763  #36915 D37 – the green footprint of regional a...   

  Prediction_Accept_Reject  Confidence_Score  
0                   Accept          0.552934  
1                   Reject          0.123619  
2                   Reject  