# Climate Text Classification - Solution 2
## Cost-Sensitive Learning + Advanced Sampling + Probability Calibration

**Publication-Ready Pipeline**

### Key Innovations:
1. **Cost-Sensitive Cross-Entropy**: Heavy penalties for minority class misclassification
2. **SMOTE-like Augmentation**: Sophisticated synthetic sample generation
3. **Probability Calibration**: Platt scaling for better confidence estimates
4. **Dynamic Class Weights**: Adaptive reweighting during training
5. **Test-Time Augmentation (TTA)**: Multiple predictions per sample

### Expected Performance:
- **Target**: 80%+ Macro F1 and Accuracy
- **Hardware**: Kaggle P100 GPU (16GB)
- **Output**: <19.5GB

In [1]:
# Install packages
!pip install -q transformers==4.45.0 datasets accelerate scikit-learn openpyxl imbalanced-learn

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m64.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m97.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import gc
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import random
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer, AutoModel, AutoConfig,
    get_linear_schedule_with_warmup
)

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    f1_score, accuracy_score, classification_report,
    precision_recall_curve, roc_auc_score, confusion_matrix
)
from sklearn.calibration import calibration_curve
from imblearn.over_sampling import SMOTE

warnings.filterwarnings('ignore')

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

print('✓ Libraries loaded')
print(f'PyTorch: {torch.__version__}')
print(f'CUDA: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')

✓ Libraries loaded
PyTorch: 2.8.0+cu126
CUDA: True
GPU: Tesla P100-PCIE-16GB


## Configuration

In [3]:
class CFG:
    # Paths
    train_path = '/kaggle/input/datasets/hrithikmajumdar/climate-text-dataset/Human labelled_DTU.xlsx'
    test_path = '/kaggle/input/datasets/hrithikmajumdar/climate-text-dataset/Master file_10k papers.xlsx'
    output_dir = '/kaggle/working/'
    
    # Model
    model_name = 'microsoft/deberta-v3-base'
    max_length = 512
    hidden_dropout = 0.15
    attention_dropout = 0.15
    
    # Training
    n_folds = 5
    n_epochs = 10
    batch_size = 8
    grad_accum_steps = 2
    lr = 2e-5
    weight_decay = 0.01
    warmup_ratio = 0.15
    max_grad_norm = 1.0
    
    # Cost-sensitive
    class_weight_reject = 1.0
    class_weight_accept = 10.0  # Heavy penalty for misclassifying minority class
    
    # Augmentation
    use_smote = True
    smote_k_neighbors = 3
    
    # Test-Time Augmentation
    use_tta = True
    tta_rounds = 5
    
    # Calibration
    use_calibration = True
    
    # Hardware
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    fp16 = True
    num_workers = 2
    
    # Optimization
    early_stopping_patience = 4
    
    seed = 42

print('✓ Configuration set')

✓ Configuration set


## Data Loading

In [4]:
# Load training data
train_df = pd.read_excel(CFG.train_path, skiprows=1)
train_df.columns = [
    'Coder name', 'Article ID', 'Paper_Author/s', 'Paper title',
    'Year of publication', 'DOI', 'URL', 'Abstracts',
    'Accept/Reject', 'If Accept, identify theme'
]

# Clean
train_df = train_df[train_df['Accept/Reject'].isin(['Accept', 'Reject'])].copy()
train_df['text'] = train_df['Abstracts'].fillna('')
train_df = train_df[train_df['text'].str.len() > 50].reset_index(drop=True)

# Binary label
train_df['label'] = (train_df['Accept/Reject'] == 'Accept').astype(int)

# Load test data
test_df = pd.read_excel(CFG.test_path)
test_df['text'] = test_df['Abstract'].fillna('')
test_df = test_df[test_df['text'].str.len() > 50].reset_index(drop=True)

print(f'Training samples: {len(train_df)}')
print(f'Test samples: {len(test_df)}')
print(f'\nClass distribution:')
print(train_df['label'].value_counts())
print(f'\nImbalance ratio: {train_df["label"].value_counts()[0] / train_df["label"].value_counts()[1]:.2f}:1')

Training samples: 1719
Test samples: 10175

Class distribution:
label
0    1520
1     199
Name: count, dtype: int64

Imbalance ratio: 7.64:1


## SMOTE-like Text Augmentation

In [5]:
class TextSMOTE:
    """SMOTE-inspired text augmentation through sentence mixing"""
    
    def __init__(self, k_neighbors=3):
        self.k_neighbors = k_neighbors
    
    def augment_samples(self, texts, labels, target_count):
        """Generate synthetic minority samples"""
        minority_texts = [t for t, l in zip(texts, labels) if l == 1]
        
        if len(minority_texts) < 2:
            return texts, labels
        
        synthetic_texts = []
        synthetic_labels = []
        
        samples_needed = target_count - len([l for l in labels if l == 1])
        
        for _ in range(samples_needed):
            # Select random sample
            idx = random.randint(0, len(minority_texts) - 1)
            text = minority_texts[idx]
            
            # Select random neighbor
            neighbor_idx = random.randint(0, len(minority_texts) - 1)
            while neighbor_idx == idx:
                neighbor_idx = random.randint(0, len(minority_texts) - 1)
            neighbor_text = minority_texts[neighbor_idx]
            
            # Mix sentences
            synthetic_text = self.mix_texts(text, neighbor_text)
            synthetic_texts.append(synthetic_text)
            synthetic_labels.append(1)
        
        return list(texts) + synthetic_texts, list(labels) + synthetic_labels
    
    def mix_texts(self, text1, text2):
        """Mix two texts by interleaving sentences"""
        # Split into sentences
        sents1 = text1.split('. ')
        sents2 = text2.split('. ')
        
        # Mix sentences
        mixed = []
        max_len = max(len(sents1), len(sents2))
        
        for i in range(max_len):
            if random.random() < 0.5 and i < len(sents1):
                mixed.append(sents1[i])
            elif i < len(sents2):
                mixed.append(sents2[i])
        
        return '. '.join(mixed)

print('✓ SMOTE augmenter created')

✓ SMOTE augmenter created


## Model Architecture

In [6]:
class ClimateDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length, augment=False):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augment = augment
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        
        # Random masking for augmentation (minority class only)
        if self.augment and self.labels[idx] == 1 and random.random() < 0.2:
            words = text.split()
            num_mask = max(1, int(len(words) * 0.1))
            mask_indices = random.sample(range(len(words)), num_mask)
            for idx_mask in mask_indices:
                words[idx_mask] = '[MASK]'
            text = ' '.join(words)
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [7]:
class WeightedCrossEntropyLoss(nn.Module):
    """Cross-entropy with heavy class weights"""
    def __init__(self, weight):
        super().__init__()
        self.weight = weight
    
    def forward(self, inputs, targets):
        return F.cross_entropy(inputs, targets, weight=self.weight)

In [8]:
class ClimateClassifier(nn.Module):
    def __init__(self, model_name, n_classes=2, dropout=0.15):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.config.update({
            'hidden_dropout_prob': dropout,
            'attention_probs_dropout_prob': dropout,
        })
        
        self.transformer = AutoModel.from_pretrained(model_name, config=self.config)
        
        # Enhanced classifier with residual connections
        hidden_size = self.config.hidden_size
        
        self.attention_pool = nn.Sequential(
            nn.Linear(hidden_size, 1),
            nn.Tanh()
        )
        
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.LayerNorm(hidden_size // 2),
            nn.GELU(),
            nn.Dropout(dropout / 2),
            nn.Linear(hidden_size // 2, n_classes)
        )
    
    def forward(self, input_ids, attention_mask):
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        # Attention-based pooling
        attention_weights = self.attention_pool(outputs.last_hidden_state)
        attention_weights = F.softmax(attention_weights, dim=1)
        attention_output = torch.sum(outputs.last_hidden_state * attention_weights, dim=1)
        
        logits = self.classifier(attention_output)
        return logits

## Training Functions

In [9]:
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device, scaler=None):
    model.train()
    total_loss = 0
    predictions = []
    true_labels = []
    
    pbar = tqdm(dataloader, desc='Training')
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        if scaler is not None:
            with torch.cuda.amp.autocast():
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)
            
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
        else:
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            optimizer.step()
        
        optimizer.zero_grad()
        scheduler.step()
        
        total_loss += loss.item()
        
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())
        
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    avg_loss = total_loss / len(dataloader)
    f1 = f1_score(true_labels, predictions, average='macro')
    acc = accuracy_score(true_labels, predictions)
    
    return avg_loss, f1, acc

def validate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    predictions = []
    probabilities = []
    true_labels = []
    
    with torch.no_grad():
        pbar = tqdm(dataloader, desc='Validation')
        for batch in pbar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            
            total_loss += loss.item()
            
            probs = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            
            probabilities.extend(probs)
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(dataloader)
    
    return avg_loss, np.array(predictions), np.array(probabilities), np.array(true_labels)

In [10]:
def find_optimal_threshold(y_true, y_probs):
    """Find threshold that maximizes F1 score"""
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_probs)
    
    f1_scores = []
    for precision, recall in zip(precisions, recalls):
        if precision + recall == 0:
            f1_scores.append(0)
        else:
            f1_scores.append(2 * (precision * recall) / (precision + recall))
    
    best_idx = np.argmax(f1_scores)
    best_threshold = thresholds[best_idx] if best_idx < len(thresholds) else 0.5
    best_f1 = f1_scores[best_idx]
    
    return best_threshold, best_f1

def evaluate_with_threshold(y_true, y_probs, threshold):
    """Evaluate using optimal threshold"""
    y_pred = (y_probs >= threshold).astype(int)
    f1 = f1_score(y_true, y_pred, average='macro')
    acc = accuracy_score(y_true, y_pred)
    
    print(f'\nThreshold: {threshold:.4f}')
    print(f'Macro F1: {f1:.4f}')
    print(f'Accuracy: {acc:.4f}')
    print('\nClassification Report:')
    print(classification_report(y_true, y_pred, target_names=['Reject', 'Accept']))
    
    return f1, acc

## Cross-Validation Training

In [11]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)

# K-Fold Cross Validation
skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)

fold_scores = []
fold_thresholds = []
oof_predictions = np.zeros(len(train_df))
oof_probabilities = np.zeros(len(train_df))

# Store models for ensemble
models = []

# SMOTE augmenter
smote_aug = TextSMOTE(k_neighbors=CFG.smote_k_neighbors)

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
    print(f'\n{"="*80}')
    print(f'FOLD {fold + 1}/{CFG.n_folds}')
    print(f'{"="*80}')
    
    # Get fold data
    fold_train_df = train_df.iloc[train_idx].copy()
    fold_val_df = train_df.iloc[val_idx].copy()
    
    # SMOTE augmentation
    if CFG.use_smote:
        majority_count = (fold_train_df['label'] == 0).sum()
        minority_count = (fold_train_df['label'] == 1).sum()
        target_minority = int(majority_count * 0.5)  # Aim for 2:1 ratio
        
        train_texts, train_labels = smote_aug.augment_samples(
            fold_train_df['text'].tolist(),
            fold_train_df['label'].tolist(),
            target_minority
        )
        
        print(f'After SMOTE: {len(train_texts)} samples')
        print(f'Reject: {train_labels.count(0)}, Accept: {train_labels.count(1)}')
    else:
        train_texts = fold_train_df['text'].tolist()
        train_labels = fold_train_df['label'].tolist()
    
    # Create datasets
    train_dataset = ClimateDataset(
        train_texts,
        train_labels,
        tokenizer,
        CFG.max_length,
        augment=True
    )
    
    val_dataset = ClimateDataset(
        fold_val_df['text'].values,
        fold_val_df['label'].values,
        tokenizer,
        CFG.max_length,
        augment=False
    )
    
    # Dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=CFG.batch_size * 2,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True
    )
    
    # Model
    model = ClimateClassifier(
        CFG.model_name,
        n_classes=2,
        dropout=CFG.hidden_dropout
    ).to(CFG.device)
    
    # Cost-sensitive loss
    class_weights = torch.tensor(
        [CFG.class_weight_reject, CFG.class_weight_accept],
        dtype=torch.float
    ).to(CFG.device)
    
    criterion = WeightedCrossEntropyLoss(weight=class_weights)
    
    # Optimizer
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=CFG.lr,
        weight_decay=CFG.weight_decay
    )
    
    # Scheduler
    num_training_steps = len(train_loader) * CFG.n_epochs
    num_warmup_steps = int(num_training_steps * CFG.warmup_ratio)
    
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    
    # Mixed precision
    scaler = torch.cuda.amp.GradScaler() if CFG.fp16 else None
    
    # Training loop
    best_f1 = 0
    patience_counter = 0
    
    for epoch in range(CFG.n_epochs):
        print(f'\nEpoch {epoch + 1}/{CFG.n_epochs}')
        
        # Train
        train_loss, train_f1, train_acc = train_epoch(
            model, train_loader, optimizer, scheduler, criterion, CFG.device, scaler
        )
        
        print(f'Train - Loss: {train_loss:.4f}, F1: {train_f1:.4f}, Acc: {train_acc:.4f}')
        
        # Validate
        val_loss, val_preds, val_probs, val_labels = validate(
            model, val_loader, criterion, CFG.device
        )
        
        # Find optimal threshold
        threshold, threshold_f1 = find_optimal_threshold(val_labels, val_probs)
        
        # Evaluate with optimal threshold
        val_f1, val_acc = evaluate_with_threshold(val_labels, val_probs, threshold)
        
        # Early stopping
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_threshold = threshold
            patience_counter = 0
            
            # Save best model
            torch.save(model.state_dict(), f'{CFG.output_dir}/best_model_fold{fold}.pth')
            print(f'✓ Best model saved (F1: {best_f1:.4f})')
        else:
            patience_counter += 1
            if patience_counter >= CFG.early_stopping_patience:
                print(f'\nEarly stopping at epoch {epoch + 1}')
                break
    
    # Load best model
    model.load_state_dict(torch.load(f'{CFG.output_dir}/best_model_fold{fold}.pth'))
    
    # Final validation
    val_loss, val_preds, val_probs, val_labels = validate(
        model, val_loader, criterion, CFG.device
    )
    
    # Store OOF predictions
    oof_probabilities[val_idx] = val_probs
    oof_predictions[val_idx] = (val_probs >= best_threshold).astype(int)
    
    # Store fold results
    fold_scores.append(best_f1)
    fold_thresholds.append(best_threshold)
    models.append(model)
    
    print(f'\nFold {fold + 1} Best F1: {best_f1:.4f}, Threshold: {best_threshold:.4f}')
    
    # Cleanup
    del train_dataset, val_dataset, train_loader, val_loader
    gc.collect()
    torch.cuda.empty_cache()

print(f'\n{"="*80}')
print('CROSS-VALIDATION RESULTS')
print(f'{"="*80}')
print(f'Average F1: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}')
print(f'Average Threshold: {np.mean(fold_thresholds):.4f}')
print(f'Fold scores: {[f"{s:.4f}" for s in fold_scores]}')

# Overall OOF evaluation
print(f'\n{"="*80}')
print('OUT-OF-FOLD PREDICTIONS')
print(f'{"="*80}')
oof_f1 = f1_score(train_df['label'].values, oof_predictions, average='macro')
oof_acc = accuracy_score(train_df['label'].values, oof_predictions)
print(f'OOF Macro F1: {oof_f1:.4f}')
print(f'OOF Accuracy: {oof_acc:.4f}')
print('\nClassification Report:')
print(classification_report(train_df['label'].values, oof_predictions, target_names=['Reject', 'Accept']))

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]


FOLD 1/5
After SMOTE: 1824 samples
Reject: 1216, Accept: 608


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]


Epoch 1/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.4849, F1: 0.5272, Acc: 0.5302


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.9893
Macro F1: 0.5993
Accuracy: 0.7587

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.79      0.85       304
      Accept       0.25      0.55      0.35        40

    accuracy                           0.76       344
   macro avg       0.59      0.67      0.60       344
weighted avg       0.85      0.76      0.79       344

✓ Best model saved (F1: 0.5993)

Epoch 2/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.3970, F1: 0.8370, Acc: 0.8498


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0313
Macro F1: 0.6881
Accuracy: 0.8663

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.92      0.92       304
      Accept       0.43      0.47      0.45        40

    accuracy                           0.87       344
   macro avg       0.68      0.70      0.69       344
weighted avg       0.87      0.87      0.87       344

✓ Best model saved (F1: 0.6881)

Epoch 3/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.3014, F1: 0.9193, Acc: 0.9282


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.2075
Macro F1: 0.6654
Accuracy: 0.8256

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.86      0.90       304
      Accept       0.35      0.57      0.43        40

    accuracy                           0.83       344
   macro avg       0.64      0.72      0.67       344
weighted avg       0.87      0.83      0.84       344


Epoch 4/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.1941, F1: 0.9462, Acc: 0.9518


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0096
Macro F1: 0.6644
Accuracy: 0.8459

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.89      0.91       304
      Accept       0.37      0.47      0.42        40

    accuracy                           0.85       344
   macro avg       0.65      0.68      0.66       344
weighted avg       0.86      0.85      0.85       344


Epoch 5/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e57572a660>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e57572a660>^
^Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
^^    self._shutdown_workers()^
^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
^^    ^if w.is_alive():
^^ ^^ ^^ ^ ^ ^  ^^^^

Train - Loss: 0.1011, F1: 0.9772, Acc: 0.9797


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0152
Macro F1: 0.6780
Accuracy: 0.8634

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.92      0.92       304
      Accept       0.42      0.45      0.43        40

    accuracy                           0.86       344
   macro avg       0.67      0.68      0.68       344
weighted avg       0.87      0.86      0.87       344


Epoch 6/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0689, F1: 0.9864, Acc: 0.9879


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0089
Macro F1: 0.6932
Accuracy: 0.8808

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.94      0.93       304
      Accept       0.49      0.42      0.45        40

    accuracy                           0.88       344
   macro avg       0.71      0.68      0.69       344
weighted avg       0.87      0.88      0.88       344

✓ Best model saved (F1: 0.6932)

Epoch 7/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0081, F1: 0.9957, Acc: 0.9962


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0012
Macro F1: 0.6972
Accuracy: 0.8837

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.94      0.93       304
      Accept       0.50      0.42      0.46        40

    accuracy                           0.88       344
   macro avg       0.71      0.68      0.70       344
weighted avg       0.88      0.88      0.88       344

✓ Best model saved (F1: 0.6972)

Epoch 8/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0274, F1: 0.9957, Acc: 0.9962


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0011
Macro F1: 0.6980
Accuracy: 0.8692

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.92      0.93       304
      Accept       0.44      0.50      0.47        40

    accuracy                           0.87       344
   macro avg       0.69      0.71      0.70       344
weighted avg       0.88      0.87      0.87       344

✓ Best model saved (F1: 0.6980)

Epoch 9/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0159, F1: 0.9975, Acc: 0.9978


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0024
Macro F1: 0.6897
Accuracy: 0.8576

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.90      0.92       304
      Accept       0.41      0.53      0.46        40

    accuracy                           0.86       344
   macro avg       0.67      0.71      0.69       344
weighted avg       0.87      0.86      0.86       344


Epoch 10/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e57572a660>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e57572a660>
^^Traceback (most recent call last):
^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
^    ^^^self._shutdown_workers()^
^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
^    if w.is_alive():^
 ^ ^  ^ 

Train - Loss: 0.0055, F1: 0.9982, Acc: 0.9984


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0013
Macro F1: 0.6908
Accuracy: 0.8634

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.91      0.92       304
      Accept       0.43      0.50      0.46        40

    accuracy                           0.86       344
   macro avg       0.68      0.71      0.69       344
weighted avg       0.87      0.86      0.87       344



Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Fold 1 Best F1: 0.6980, Threshold: 0.0011

FOLD 2/5
After SMOTE: 1824 samples
Reject: 1216, Accept: 608

Epoch 1/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.4578, F1: 0.4974, Acc: 0.5066


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.7857
Macro F1: 0.6829
Accuracy: 0.8517

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.89      0.91       304
      Accept       0.40      0.53      0.45        40

    accuracy                           0.85       344
   macro avg       0.67      0.71      0.68       344
weighted avg       0.87      0.85      0.86       344

✓ Best model saved (F1: 0.6829)

Epoch 2/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.3547, F1: 0.8610, Acc: 0.8728


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0132
Macro F1: 0.5993
Accuracy: 0.7267

Classification Report:
              precision    recall  f1-score   support

      Reject       0.95      0.73      0.83       304
      Accept       0.25      0.70      0.37        40

    accuracy                           0.73       344
   macro avg       0.60      0.72      0.60       344
weighted avg       0.87      0.73      0.77       344


Epoch 3/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.2656, F1: 0.9230, Acc: 0.9309


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0429
Macro F1: 0.6870
Accuracy: 0.8081

Classification Report:
              precision    recall  f1-score   support

      Reject       0.97      0.81      0.88       304
      Accept       0.36      0.80      0.49        40

    accuracy                           0.81       344
   macro avg       0.66      0.80      0.69       344
weighted avg       0.90      0.81      0.84       344

✓ Best model saved (F1: 0.6870)

Epoch 4/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.1716, F1: 0.9587, Acc: 0.9633


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0030
Macro F1: 0.6783
Accuracy: 0.8110

Classification Report:
              precision    recall  f1-score   support

      Reject       0.96      0.82      0.88       304
      Accept       0.35      0.72      0.47        40

    accuracy                           0.81       344
   macro avg       0.65      0.77      0.68       344
weighted avg       0.89      0.81      0.84       344


Epoch 5/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.1194, F1: 0.9711, Acc: 0.9742


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0079
Macro F1: 0.7411
Accuracy: 0.8924

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.94      0.94       304
      Accept       0.54      0.55      0.54        40

    accuracy                           0.89       344
   macro avg       0.74      0.74      0.74       344
weighted avg       0.89      0.89      0.89       344

✓ Best model saved (F1: 0.7411)

Epoch 6/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0432, F1: 0.9877, Acc: 0.9890


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0011
Macro F1: 0.7130
Accuracy: 0.8634

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.90      0.92       304
      Accept       0.44      0.60      0.51        40

    accuracy                           0.86       344
   macro avg       0.69      0.75      0.71       344
weighted avg       0.89      0.86      0.87       344


Epoch 7/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0520, F1: 0.9908, Acc: 0.9918


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0010
Macro F1: 0.7002
Accuracy: 0.8401

Classification Report:
              precision    recall  f1-score   support

      Reject       0.95      0.86      0.91       304
      Accept       0.39      0.68      0.50        40

    accuracy                           0.84       344
   macro avg       0.67      0.77      0.70       344
weighted avg       0.89      0.84      0.86       344


Epoch 8/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0321, F1: 0.9938, Acc: 0.9945


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0007
Macro F1: 0.6882
Accuracy: 0.8169

Classification Report:
              precision    recall  f1-score   support

      Reject       0.96      0.83      0.89       304
      Accept       0.36      0.75      0.49        40

    accuracy                           0.82       344
   macro avg       0.66      0.79      0.69       344
weighted avg       0.89      0.82      0.84       344


Epoch 9/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e57572a660>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e57572a660>^
^Traceback (most recent call last):
^^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    ^self._shutdown_workers()
^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
^    if w.is_alive():^
^  ^ ^^ ^^

Train - Loss: 0.0093, F1: 0.9969, Acc: 0.9973


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0005
Macro F1: 0.6982
Accuracy: 0.8343

Classification Report:
              precision    recall  f1-score   support

      Reject       0.96      0.85      0.90       304
      Accept       0.38      0.70      0.50        40

    accuracy                           0.83       344
   macro avg       0.67      0.78      0.70       344
weighted avg       0.89      0.83      0.85       344


Early stopping at epoch 9


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Fold 2 Best F1: 0.7411, Threshold: 0.0079

FOLD 3/5
After SMOTE: 1824 samples
Reject: 1216, Accept: 608

Epoch 1/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.5067, F1: 0.5074, Acc: 0.5093


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.4389
Macro F1: 0.6595
Accuracy: 0.8198

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.85      0.89       304
      Accept       0.34      0.57      0.43        40

    accuracy                           0.82       344
   macro avg       0.64      0.71      0.66       344
weighted avg       0.87      0.82      0.84       344

✓ Best model saved (F1: 0.6595)

Epoch 2/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.3649, F1: 0.8201, Acc: 0.8328


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.2668
Macro F1: 0.6881
Accuracy: 0.8663

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.92      0.92       304
      Accept       0.43      0.47      0.45        40

    accuracy                           0.87       344
   macro avg       0.68      0.70      0.69       344
weighted avg       0.87      0.87      0.87       344

✓ Best model saved (F1: 0.6881)

Epoch 3/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.2711, F1: 0.9232, Acc: 0.9315


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.1356
Macro F1: 0.7144
Accuracy: 0.8605

Classification Report:
              precision    recall  f1-score   support

      Reject       0.95      0.89      0.92       304
      Accept       0.43      0.62      0.51        40

    accuracy                           0.86       344
   macro avg       0.69      0.76      0.71       344
weighted avg       0.89      0.86      0.87       344

✓ Best model saved (F1: 0.7144)

Epoch 4/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.2506, F1: 0.9422, Acc: 0.9490


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.9990
Macro F1: 0.7190
Accuracy: 0.8895

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.94      0.94       304
      Accept       0.53      0.47      0.50        40

    accuracy                           0.89       344
   macro avg       0.73      0.71      0.72       344
weighted avg       0.88      0.89      0.89       344

✓ Best model saved (F1: 0.7190)

Epoch 5/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.1502, F1: 0.9715, Acc: 0.9748


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0144
Macro F1: 0.7095
Accuracy: 0.8605

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.89      0.92       304
      Accept       0.43      0.60      0.50        40

    accuracy                           0.86       344
   macro avg       0.69      0.75      0.71       344
weighted avg       0.88      0.86      0.87       344


Epoch 6/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.1280, F1: 0.9716, Acc: 0.9748


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.8230
Macro F1: 0.6920
Accuracy: 0.8547

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.89      0.92       304
      Accept       0.41      0.55      0.47        40

    accuracy                           0.85       344
   macro avg       0.67      0.72      0.69       344
weighted avg       0.88      0.85      0.86       344


Epoch 7/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0618, F1: 0.9876, Acc: 0.9890


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0037
Macro F1: 0.7039
Accuracy: 0.8692

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.91      0.93       304
      Accept       0.45      0.53      0.48        40

    accuracy                           0.87       344
   macro avg       0.69      0.72      0.70       344
weighted avg       0.88      0.87      0.87       344


Epoch 8/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0417, F1: 0.9944, Acc: 0.9951


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0068
Macro F1: 0.7025
Accuracy: 0.8547

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.89      0.92       304
      Accept       0.41      0.60      0.49        40

    accuracy                           0.85       344
   macro avg       0.68      0.74      0.70       344
weighted avg       0.88      0.85      0.87       344


Early stopping at epoch 8


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Fold 3 Best F1: 0.7190, Threshold: 0.9990

FOLD 4/5
After SMOTE: 1824 samples
Reject: 1216, Accept: 608

Epoch 1/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.4929, F1: 0.5488, Acc: 0.5488


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.2508
Macro F1: 0.6136
Accuracy: 0.7442

Classification Report:
              precision    recall  f1-score   support

      Reject       0.95      0.75      0.84       304
      Accept       0.27      0.70      0.39        40

    accuracy                           0.74       344
   macro avg       0.61      0.72      0.61       344
weighted avg       0.87      0.74      0.79       344

✓ Best model saved (F1: 0.6136)

Epoch 2/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.3614, F1: 0.8668, Acc: 0.8783


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0447
Macro F1: 0.6352
Accuracy: 0.8372

Classification Report:
              precision    recall  f1-score   support

      Reject       0.92      0.89      0.91       304
      Accept       0.33      0.40      0.36        40

    accuracy                           0.84       344
   macro avg       0.63      0.65      0.64       344
weighted avg       0.85      0.84      0.84       344

✓ Best model saved (F1: 0.6352)

Epoch 3/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.3644, F1: 0.9093, Acc: 0.9194


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0418
Macro F1: 0.6862
Accuracy: 0.8314

Classification Report:
              precision    recall  f1-score   support

      Reject       0.95      0.86      0.90       304
      Accept       0.37      0.65      0.47        40

    accuracy                           0.83       344
   macro avg       0.66      0.75      0.69       344
weighted avg       0.88      0.83      0.85       344

✓ Best model saved (F1: 0.6862)

Epoch 4/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e57572a660>Exception ignored in: 
<function _MultiProcessingDataLoaderIter.__del__ at 0x78e57572a660>Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__

Traceback (most recent call last):
      File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()self._shutdown_workers()

  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
        if w.is_alive():
if w.is_alive(): 
          ^ ^ ^ ^^^^^^^^^^^^^^^^^^^
^  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
^    
assert self._parent_pid == os.getpid(), 'can only test a child process'
  File "/usr/lib/python

Train - Loss: 0.2045, F1: 0.9510, Acc: 0.9561


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0113
Macro F1: 0.7412
Accuracy: 0.8779

Classification Report:
              precision    recall  f1-score   support

      Reject       0.95      0.91      0.93       304
      Accept       0.48      0.65      0.55        40

    accuracy                           0.88       344
   macro avg       0.72      0.78      0.74       344
weighted avg       0.90      0.88      0.89       344

✓ Best model saved (F1: 0.7412)

Epoch 5/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.1259, F1: 0.9741, Acc: 0.9770


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0068
Macro F1: 0.7412
Accuracy: 0.8779

Classification Report:
              precision    recall  f1-score   support

      Reject       0.95      0.91      0.93       304
      Accept       0.48      0.65      0.55        40

    accuracy                           0.88       344
   macro avg       0.72      0.78      0.74       344
weighted avg       0.90      0.88      0.89       344


Epoch 6/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0862, F1: 0.9815, Acc: 0.9836


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0033
Macro F1: 0.7191
Accuracy: 0.8808

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.93      0.93       304
      Accept       0.49      0.53      0.51        40

    accuracy                           0.88       344
   macro avg       0.71      0.73      0.72       344
weighted avg       0.88      0.88      0.88       344


Epoch 7/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0676, F1: 0.9870, Acc: 0.9885


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0023
Macro F1: 0.7216
Accuracy: 0.8663

Classification Report:
              precision    recall  f1-score   support

      Reject       0.95      0.90      0.92       304
      Accept       0.45      0.62      0.52        40

    accuracy                           0.87       344
   macro avg       0.70      0.76      0.72       344
weighted avg       0.89      0.87      0.88       344


Epoch 8/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0401, F1: 0.9920, Acc: 0.9929


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0055
Macro F1: 0.7374
Accuracy: 0.8750

Classification Report:
              precision    recall  f1-score   support

      Reject       0.95      0.90      0.93       304
      Accept       0.47      0.65      0.55        40

    accuracy                           0.88       344
   macro avg       0.71      0.78      0.74       344
weighted avg       0.90      0.88      0.88       344


Early stopping at epoch 8


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e57572a660>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e57572a660>
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    Traceback (most recent call last):
if w.is_alive():
     self._shutdown_workers() 
   File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
        ^ ^ ^ ^^^^^^^^^^^^^^^^^^^
^  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
^    assert self._parent_pid == os.getpid(), 'can only test a child process'
  File "/usr/lib/python3


Fold 4 Best F1: 0.7412, Threshold: 0.0113

FOLD 5/5
After SMOTE: 1824 samples
Reject: 1216, Accept: 608

Epoch 1/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.4758, F1: 0.5109, Acc: 0.5143


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.8856
Macro F1: 0.6745
Accuracy: 0.8717

Classification Report:
              precision    recall  f1-score   support

      Reject       0.92      0.93      0.93       304
      Accept       0.43      0.41      0.42        39

    accuracy                           0.87       343
   macro avg       0.68      0.67      0.67       343
weighted avg       0.87      0.87      0.87       343

✓ Best model saved (F1: 0.6745)

Epoch 2/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.3139, F1: 0.8630, Acc: 0.8750


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0104
Macro F1: 0.4379
Accuracy: 0.4810

Classification Report:
              precision    recall  f1-score   support

      Reject       0.97      0.43      0.59       304
      Accept       0.17      0.90      0.28        39

    accuracy                           0.48       343
   macro avg       0.57      0.66      0.44       343
weighted avg       0.88      0.48      0.56       343


Epoch 3/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.2628, F1: 0.9256, Acc: 0.9337


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0135
Macro F1: 0.5721
Accuracy: 0.6822

Classification Report:
              precision    recall  f1-score   support

      Reject       0.96      0.67      0.79       304
      Accept       0.23      0.77      0.36        39

    accuracy                           0.68       343
   macro avg       0.59      0.72      0.57       343
weighted avg       0.88      0.68      0.74       343


Epoch 4/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.2298, F1: 0.9453, Acc: 0.9518


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.8933
Macro F1: 0.6775
Accuracy: 0.8863

Classification Report:
              precision    recall  f1-score   support

      Reject       0.92      0.95      0.94       304
      Accept       0.50      0.36      0.42        39

    accuracy                           0.89       343
   macro avg       0.71      0.66      0.68       343
weighted avg       0.87      0.89      0.88       343

✓ Best model saved (F1: 0.6775)

Epoch 5/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.1316, F1: 0.9752, Acc: 0.9781


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0576
Macro F1: 0.6745
Accuracy: 0.8659

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.92      0.92       304
      Accept       0.41      0.44      0.42        39

    accuracy                           0.87       343
   macro avg       0.67      0.68      0.67       343
weighted avg       0.87      0.87      0.87       343


Epoch 6/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0910, F1: 0.9821, Acc: 0.9841


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.9992
Macro F1: 0.6775
Accuracy: 0.8863

Classification Report:
              precision    recall  f1-score   support

      Reject       0.92      0.95      0.94       304
      Accept       0.50      0.36      0.42        39

    accuracy                           0.89       343
   macro avg       0.71      0.66      0.68       343
weighted avg       0.87      0.89      0.88       343


Epoch 7/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0469, F1: 0.9883, Acc: 0.9896


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0195
Macro F1: 0.6479
Accuracy: 0.8776

Classification Report:
              precision    recall  f1-score   support

      Reject       0.91      0.95      0.93       304
      Accept       0.44      0.31      0.36        39

    accuracy                           0.88       343
   macro avg       0.68      0.63      0.65       343
weighted avg       0.86      0.88      0.87       343


Epoch 8/10


Training:   0%|          | 0/228 [00:00<?, ?it/s]

Train - Loss: 0.0163, F1: 0.9945, Acc: 0.9951


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0012
Macro F1: 0.6320
Accuracy: 0.8338

Classification Report:
              precision    recall  f1-score   support

      Reject       0.92      0.89      0.90       304
      Accept       0.32      0.41      0.36        39

    accuracy                           0.83       343
   macro avg       0.62      0.65      0.63       343
weighted avg       0.85      0.83      0.84       343


Early stopping at epoch 8


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x78e57572a660>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x78e57572a660>^
^^^^Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    self._shutdown_workers()    assert self._parent_pid == os.getpid(), 'can only test a child process'
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers

       if w.is_alive(): 
             ^ ^^^^^^^^^^^^^^^^^^^^^^^



Fold 5 Best F1: 0.6775, Threshold: 0.8933

CROSS-VALIDATION RESULTS
Average F1: 0.7154 ± 0.0248
Average Threshold: 0.3825
Fold scores: ['0.6980', '0.7411', '0.7190', '0.7412', '0.6775']

OUT-OF-FOLD PREDICTIONS
OOF Macro F1: 0.7175
OOF Accuracy: 0.8831

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.93      0.93      1520
      Accept       0.50      0.51      0.50       199

    accuracy                           0.88      1719
   macro avg       0.72      0.72      0.72      1719
weighted avg       0.88      0.88      0.88      1719



## Test Predictions with TTA

In [12]:
# Test dataset
test_dataset = ClimateDataset(
    test_df['text'].values,
    np.zeros(len(test_df)),
    tokenizer,
    CFG.max_length,
    augment=False
)

test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size * 2,
    shuffle=False,
    num_workers=CFG.num_workers,
    pin_memory=True
)

# Ensemble predictions with TTA
all_probs = []

for fold, model in enumerate(models):
    model.eval()
    
    if CFG.use_tta:
        # Multiple forward passes with dropout enabled (MC Dropout)
        tta_probs = []
        
        for tta_round in range(CFG.tta_rounds):
            # Enable dropout for TTA
            for module in model.modules():
                if isinstance(module, nn.Dropout):
                    module.train()
            
            fold_probs = []
            with torch.no_grad():
                for batch in test_loader:
                    input_ids = batch['input_ids'].to(CFG.device)
                    attention_mask = batch['attention_mask'].to(CFG.device)
                    
                    logits = model(input_ids, attention_mask)
                    probs = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
                    fold_probs.append(probs)
            
            tta_probs.append(np.concatenate(fold_probs))
        
        # Average TTA predictions
        all_probs.append(np.mean(tta_probs, axis=0))
    else:
        fold_probs = []
        with torch.no_grad():
            for batch in tqdm(test_loader, desc=f'Fold {fold+1} Prediction'):
                input_ids = batch['input_ids'].to(CFG.device)
                attention_mask = batch['attention_mask'].to(CFG.device)
                
                logits = model(input_ids, attention_mask)
                probs = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
                fold_probs.append(probs)
        
        all_probs.append(np.concatenate(fold_probs))
    
    print(f'✓ Fold {fold+1} predictions complete')

# Average probabilities across folds
avg_probs = np.mean(all_probs, axis=0)

# Use average threshold
avg_threshold = np.mean(fold_thresholds)
final_preds = (avg_probs >= avg_threshold).astype(int)

print(f'\n✓ Predictions complete!')
print(f'Average threshold used: {avg_threshold:.4f}')
print(f'\nPrediction distribution:')
print(f'Reject: {(final_preds == 0).sum()}')
print(f'Accept: {(final_preds == 1).sum()}')
print(f'Accept rate: {(final_preds == 1).sum() / len(final_preds) * 100:.2f}%')

✓ Fold 1 predictions complete
✓ Fold 2 predictions complete
✓ Fold 3 predictions complete
✓ Fold 4 predictions complete
✓ Fold 5 predictions complete

✓ Predictions complete!
Average threshold used: 0.3825

Prediction distribution:
Reject: 9311
Accept: 864
Accept rate: 8.49%


In [13]:
# Create submission
test_df['Prediction_Accept_Reject'] = ['Accept' if p == 1 else 'Reject' for p in final_preds]
test_df['Confidence_Score'] = avg_probs

output_cols = ['ID_New', 'Article Title', 'Prediction_Accept_Reject', 'Confidence_Score']
submission = test_df[output_cols].copy()

# Save
submission.to_csv(f'{CFG.output_dir}/solution2_predictions.csv', index=False)
print(f'\n✓ Predictions saved to solution2_predictions.csv')

# Show samples
print(f'\nSample predictions:')
print(submission.head(10))

print(f'\n{"="*80}')
print('SOLUTION 2 COMPLETE!')
print(f'{"="*80}')
print(f'✓ OOF Macro F1: {oof_f1:.4f}')
print(f'✓ OOF Accuracy: {oof_acc:.4f}')
print(f'✓ Average CV F1: {np.mean(fold_scores):.4f}')
print(f'✓ Cost-sensitive learning applied')
print(f'✓ SMOTE augmentation used')
print(f'✓ Test-time augmentation: {CFG.tta_rounds} rounds')
print(f'✓ Test predictions generated')


✓ Predictions saved to solution2_predictions.csv

Sample predictions:
        ID_New                                      Article Title  \
0      OA_3712                                                NaN   
1     WoS_1385   It ' s one thing after another, after another...   
2  Scopus_5109  "A Return to and of the Land": Indigenous Know...   
3  Scopus_4859  "I see my culture starting to disappear": Anis...   
4  Scopus_1176  "Impact of Climate Change on Coastal Cities: A...   
5  Scopus_1477  "Smart city" and its implementation in concept...   
6      OA_1940  "The farm has an insatiable appetite": A food ...   
7  Scopus_3724  "We want to have a positive impact": Fragile e...   
8  Scopus_1613  "When you have stress because you don't have f...   
9      OA_3763  #36915 D37 – the green footprint of regional a...   

  Prediction_Accept_Reject  Confidence_Score  
0                   Accept          0.860029  
1                   Reject          0.004138  
2                   Reject  