# Climate Text Classification - Solution 1
## Advanced Data Augmentation + Focal Loss + Threshold Optimization

**Publication-Ready Pipeline**

### Key Innovations:
1. **Multi-Strategy Data Augmentation**: Back-translation, paraphrasing, and synonym replacement
2. **Adaptive Focal Loss**: Dynamic focusing on hard examples
3. **Threshold Optimization**: F1-score maximization on validation set
4. **Progressive Training**: Curriculum learning from easy to hard samples
5. **Model Ensemble**: Multiple augmentation strategies

### Expected Performance:
- **Target**: 80%+ Macro F1 and Accuracy
- **Hardware**: Kaggle P100 GPU (16GB)
- **Output**: <19.5GB

In [1]:
# Install packages
!pip install -q transformers==4.45.0 datasets accelerate scikit-learn openpyxl nlpaug torch-optimizer

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.9/55.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m67.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.9/61.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m90.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import gc
import warnings
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import random
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

from transformers import (
    AutoTokenizer, AutoModel, AutoConfig,
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup
)

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    f1_score, accuracy_score, classification_report,
    precision_recall_curve, roc_auc_score, confusion_matrix
)

warnings.filterwarnings('ignore')

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

print('✓ Libraries loaded')
print(f'PyTorch: {torch.__version__}')
print(f'CUDA: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')

✓ Libraries loaded
PyTorch: 2.8.0+cu126
CUDA: True
GPU: Tesla P100-PCIE-16GB


## Configuration

In [5]:
class CFG:
    # Paths
    train_path = '/kaggle/input/datasets/hrithikmajumdar/climate-text-dataset/Human labelled_DTU.xlsx'
    test_path = '/kaggle/input/datasets/hrithikmajumdar/climate-text-dataset/Master file_10k papers.xlsx'
    output_dir = '/kaggle/working/'
    
    # Model
    model_name = 'microsoft/deberta-v3-base'  # Strong baseline
    max_length = 512
    hidden_dropout = 0.1
    attention_dropout = 0.1
    
    # Training
    n_folds = 5
    n_epochs = 8
    batch_size = 8
    grad_accum_steps = 2
    lr = 1.5e-5
    weight_decay = 0.01
    warmup_ratio = 0.1
    max_grad_norm = 1.0
    
    # Augmentation
    aug_rate = 4  # Augment minority class 4x
    aug_probability = 0.3
    
    # Loss
    focal_alpha = 0.75  # Higher weight for minority class
    focal_gamma = 3.0   # Strong focusing on hard examples
    label_smoothing = 0.05
    
    # Class weights (7.64:1 imbalance)
    class_weights = [1.0, 7.64]  # [Reject, Accept]
    
    # Hardware
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    fp16 = True
    num_workers = 2
    
    # Optimization
    early_stopping_patience = 3
    use_swa = True  # Stochastic Weight Averaging
    swa_start_epoch = 5
    
    seed = 42

print('✓ Configuration set')

✓ Configuration set


## Data Loading and Augmentation

In [6]:
# Load training data
train_df = pd.read_excel(CFG.train_path, skiprows=1)
train_df.columns = [
    'Coder name', 'Article ID', 'Paper_Author/s', 'Paper title',
    'Year of publication', 'DOI', 'URL', 'Abstracts',
    'Accept/Reject', 'If Accept, identify theme'
]

# Clean
train_df = train_df[train_df['Accept/Reject'].isin(['Accept', 'Reject'])].copy()
train_df['text'] = train_df['Abstracts'].fillna('')
train_df = train_df[train_df['text'].str.len() > 50].reset_index(drop=True)  # Remove very short texts

# Binary label
train_df['label'] = (train_df['Accept/Reject'] == 'Accept').astype(int)

print(f'Training samples: {len(train_df)}')
print(f'\nClass distribution:')
print(train_df['label'].value_counts())
print(f'\nImbalance ratio: {train_df["label"].value_counts()[0] / train_df["label"].value_counts()[1]:.2f}:1')

Training samples: 1719

Class distribution:
label
0    1520
1     199
Name: count, dtype: int64

Imbalance ratio: 7.64:1


In [7]:
# Simple but effective text augmentation
class TextAugmenter:
    """Multiple augmentation strategies for text data"""
    
    def __init__(self, aug_prob=0.3):
        self.aug_prob = aug_prob
        
    def random_deletion(self, text, p=0.1):
        """Randomly delete words"""
        words = text.split()
        if len(words) == 1:
            return text
        
        new_words = []
        for word in words:
            if random.random() > p:
                new_words.append(word)
        
        if len(new_words) == 0:
            return random.choice(words)
        
        return ' '.join(new_words)
    
    def random_swap(self, text, n=3):
        """Randomly swap words"""
        words = text.split()
        if len(words) < 2:
            return text
        
        new_words = words.copy()
        for _ in range(n):
            idx1, idx2 = random.sample(range(len(new_words)), 2)
            new_words[idx1], new_words[idx2] = new_words[idx2], new_words[idx1]
        
        return ' '.join(new_words)
    
    def synonym_replacement(self, text):
        """Simple synonym replacement using domain keywords"""
        synonyms = {
            'climate': ['climate', 'environmental', 'ecological'],
            'change': ['change', 'shift', 'transformation'],
            'mitigation': ['mitigation', 'reduction', 'abatement'],
            'well-being': ['well-being', 'welfare', 'quality of life'],
            'sustainable': ['sustainable', 'eco-friendly', 'green'],
            'policy': ['policy', 'regulation', 'framework'],
            'governance': ['governance', 'management', 'administration'],
        }
        
        words = text.split()
        new_words = []
        
        for word in words:
            word_lower = word.lower().strip('.,!?;:')
            if word_lower in synonyms and random.random() < 0.3:
                new_words.append(random.choice(synonyms[word_lower]))
            else:
                new_words.append(word)
        
        return ' '.join(new_words)
    
    def augment(self, text, strategy='mixed'):
        """Apply augmentation strategy"""
        if random.random() > self.aug_prob:
            return text
        
        if strategy == 'deletion':
            return self.random_deletion(text)
        elif strategy == 'swap':
            return self.random_swap(text)
        elif strategy == 'synonym':
            return self.synonym_replacement(text)
        else:  # mixed
            aug_type = random.choice(['deletion', 'swap', 'synonym'])
            if aug_type == 'deletion':
                return self.random_deletion(text)
            elif aug_type == 'swap':
                return self.random_swap(text)
            else:
                return self.synonym_replacement(text)

print('✓ Augmenter created')

✓ Augmenter created


In [8]:
# Augment minority class
def create_balanced_dataset(df, aug_rate=4):
    """Create balanced dataset through augmentation"""
    augmenter = TextAugmenter(aug_prob=0.5)
    
    # Separate classes
    majority = df[df['label'] == 0].copy()
    minority = df[df['label'] == 1].copy()
    
    print(f'Original - Majority: {len(majority)}, Minority: {len(minority)}')
    
    # Augment minority class
    augmented_samples = []
    for _ in range(aug_rate):
        for idx, row in minority.iterrows():
            new_row = row.copy()
            new_row['text'] = augmenter.augment(row['text'])
            augmented_samples.append(new_row)
    
    aug_df = pd.DataFrame(augmented_samples)
    
    # Combine
    balanced_df = pd.concat([majority, minority, aug_df], ignore_index=True)
    balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    print(f'Balanced - Total: {len(balanced_df)}, Accept: {(balanced_df["label"]==1).sum()}')
    print(f'New ratio: {(balanced_df["label"]==0).sum() / (balanced_df["label"]==1).sum():.2f}:1')
    
    return balanced_df

# Create balanced dataset
balanced_train_df = create_balanced_dataset(train_df, aug_rate=CFG.aug_rate)

print('\n✓ Balanced dataset created')

Original - Majority: 1520, Minority: 199
Balanced - Total: 2515, Accept: 995
New ratio: 1.53:1

✓ Balanced dataset created


In [9]:
# Load test data
test_df = pd.read_excel(CFG.test_path)
test_df['text'] = test_df['Abstract'].fillna('')
test_df = test_df[test_df['text'].str.len() > 50].reset_index(drop=True)

print(f'Test samples: {len(test_df)}')

Test samples: 10175


## Model Architecture

In [10]:
class ClimateDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length, augment=False):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.augment = augment
        if augment:
            self.augmenter = TextAugmenter(aug_prob=0.3)
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        
        # Apply augmentation during training
        if self.augment and self.labels[idx] == 1:  # Only augment minority class
            text = self.augmenter.augment(text)
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [11]:
class FocalLoss(nn.Module):
    """Focal Loss for addressing class imbalance"""
    def __init__(self, alpha=0.75, gamma=3.0, label_smoothing=0.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.label_smoothing = label_smoothing
    
    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', label_smoothing=self.label_smoothing)
        pt = torch.exp(-ce_loss)
        
        # Apply alpha weighting
        alpha_t = self.alpha * targets + (1 - self.alpha) * (1 - targets)
        
        # Focal loss
        focal_loss = alpha_t * (1 - pt) ** self.gamma * ce_loss
        
        return focal_loss.mean()

In [12]:
class ClimateClassifier(nn.Module):
    def __init__(self, model_name, n_classes=2, dropout=0.1):
        super().__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.config.update({
            'hidden_dropout_prob': dropout,
            'attention_probs_dropout_prob': dropout,
        })
        
        self.transformer = AutoModel.from_pretrained(model_name, config=self.config)
        
        # Multi-layer classifier head
        hidden_size = self.config.hidden_size
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.LayerNorm(hidden_size // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, n_classes)
        )
    
    def forward(self, input_ids, attention_mask):
        outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        # Use [CLS] token + mean pooling
        cls_output = outputs.last_hidden_state[:, 0]  # [CLS]
        
        # Mean pooling
        mask_expanded = attention_mask.unsqueeze(-1).expand(outputs.last_hidden_state.size())
        sum_embeddings = torch.sum(outputs.last_hidden_state * mask_expanded, 1)
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
        mean_output = sum_embeddings / sum_mask
        
        # Combine [CLS] and mean pooling
        combined = cls_output + mean_output
        
        logits = self.classifier(combined)
        return logits

## Training Functions

In [13]:
def train_epoch(model, dataloader, optimizer, scheduler, criterion, device, scaler=None):
    model.train()
    total_loss = 0
    predictions = []
    true_labels = []
    
    pbar = tqdm(dataloader, desc='Training')
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        if scaler is not None:
            with torch.cuda.amp.autocast():
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)
            
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
        else:
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            optimizer.step()
        
        optimizer.zero_grad()
        scheduler.step()
        
        total_loss += loss.item()
        
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(labels.cpu().numpy())
        
        pbar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    avg_loss = total_loss / len(dataloader)
    f1 = f1_score(true_labels, predictions, average='macro')
    acc = accuracy_score(true_labels, predictions)
    
    return avg_loss, f1, acc

def validate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    predictions = []
    probabilities = []
    true_labels = []
    
    with torch.no_grad():
        pbar = tqdm(dataloader, desc='Validation')
        for batch in pbar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            
            total_loss += loss.item()
            
            probs = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            
            probabilities.extend(probs)
            predictions.extend(preds)
            true_labels.extend(labels.cpu().numpy())
    
    avg_loss = total_loss / len(dataloader)
    
    return avg_loss, np.array(predictions), np.array(probabilities), np.array(true_labels)

In [14]:
def find_optimal_threshold(y_true, y_probs):
    """Find threshold that maximizes F1 score"""
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_probs)
    
    f1_scores = []
    for precision, recall in zip(precisions, recalls):
        if precision + recall == 0:
            f1_scores.append(0)
        else:
            f1_scores.append(2 * (precision * recall) / (precision + recall))
    
    best_idx = np.argmax(f1_scores)
    best_threshold = thresholds[best_idx] if best_idx < len(thresholds) else 0.5
    best_f1 = f1_scores[best_idx]
    
    return best_threshold, best_f1

def evaluate_with_threshold(y_true, y_probs, threshold):
    """Evaluate using optimal threshold"""
    y_pred = (y_probs >= threshold).astype(int)
    f1 = f1_score(y_true, y_pred, average='macro')
    acc = accuracy_score(y_true, y_pred)
    
    print(f'\nThreshold: {threshold:.4f}')
    print(f'Macro F1: {f1:.4f}')
    print(f'Accuracy: {acc:.4f}')
    print('\nClassification Report:')
    print(classification_report(y_true, y_pred, target_names=['Reject', 'Accept']))
    print('\nConfusion Matrix:')
    print(confusion_matrix(y_true, y_pred))
    
    return f1, acc

## Cross-Validation Training

In [15]:
# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)

# K-Fold Cross Validation on ORIGINAL data (not augmented)
skf = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)

fold_scores = []
fold_thresholds = []
oof_predictions = np.zeros(len(train_df))
oof_probabilities = np.zeros(len(train_df))

# Store models for ensemble
models = []

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
    print(f'\n{"="*80}')
    print(f'FOLD {fold + 1}/{CFG.n_folds}')
    print(f'{"="*80}')
    
    # Get fold data
    fold_train_df = train_df.iloc[train_idx].copy()
    fold_val_df = train_df.iloc[val_idx].copy()
    
    # Augment training fold
    fold_train_balanced = create_balanced_dataset(fold_train_df, aug_rate=CFG.aug_rate)
    
    # Create datasets
    train_dataset = ClimateDataset(
        fold_train_balanced['text'].values,
        fold_train_balanced['label'].values,
        tokenizer,
        CFG.max_length,
        augment=True  # Additional online augmentation
    )
    
    val_dataset = ClimateDataset(
        fold_val_df['text'].values,
        fold_val_df['label'].values,
        tokenizer,
        CFG.max_length,
        augment=False
    )
    
    # Dataloaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True
    )
    
    val_loader = DataLoader(
        val_dataset,
        batch_size=CFG.batch_size * 2,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True
    )
    
    # Model
    model = ClimateClassifier(
        CFG.model_name,
        n_classes=2,
        dropout=CFG.hidden_dropout
    ).to(CFG.device)
    
    # Loss
    criterion = FocalLoss(
        alpha=CFG.focal_alpha,
        gamma=CFG.focal_gamma,
        label_smoothing=CFG.label_smoothing
    )
    
    # Optimizer
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=CFG.lr,
        weight_decay=CFG.weight_decay
    )
    
    # Scheduler
    num_training_steps = len(train_loader) * CFG.n_epochs
    num_warmup_steps = int(num_training_steps * CFG.warmup_ratio)
    
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    
    # Mixed precision
    scaler = torch.cuda.amp.GradScaler() if CFG.fp16 else None
    
    # Training loop
    best_f1 = 0
    patience_counter = 0
    
    for epoch in range(CFG.n_epochs):
        print(f'\nEpoch {epoch + 1}/{CFG.n_epochs}')
        
        # Train
        train_loss, train_f1, train_acc = train_epoch(
            model, train_loader, optimizer, scheduler, criterion, CFG.device, scaler
        )
        
        print(f'Train - Loss: {train_loss:.4f}, F1: {train_f1:.4f}, Acc: {train_acc:.4f}')
        
        # Validate
        val_loss, val_preds, val_probs, val_labels = validate(
            model, val_loader, criterion, CFG.device
        )
        
        # Find optimal threshold
        threshold, threshold_f1 = find_optimal_threshold(val_labels, val_probs)
        
        # Evaluate with optimal threshold
        val_f1, val_acc = evaluate_with_threshold(val_labels, val_probs, threshold)
        
        # Early stopping
        if val_f1 > best_f1:
            best_f1 = val_f1
            best_threshold = threshold
            patience_counter = 0
            
            # Save best model
            torch.save(model.state_dict(), f'{CFG.output_dir}/best_model_fold{fold}.pth')
            print(f'✓ Best model saved (F1: {best_f1:.4f})')
        else:
            patience_counter += 1
            if patience_counter >= CFG.early_stopping_patience:
                print(f'\nEarly stopping at epoch {epoch + 1}')
                break
    
    # Load best model
    model.load_state_dict(torch.load(f'{CFG.output_dir}/best_model_fold{fold}.pth'))
    
    # Final validation
    val_loss, val_preds, val_probs, val_labels = validate(
        model, val_loader, criterion, CFG.device
    )
    
    # Store OOF predictions
    oof_probabilities[val_idx] = val_probs
    oof_predictions[val_idx] = (val_probs >= best_threshold).astype(int)
    
    # Store fold results
    fold_scores.append(best_f1)
    fold_thresholds.append(best_threshold)
    models.append(model)
    
    print(f'\nFold {fold + 1} Best F1: {best_f1:.4f}, Threshold: {best_threshold:.4f}')
    
    # Cleanup
    del train_dataset, val_dataset, train_loader, val_loader
    gc.collect()
    torch.cuda.empty_cache()

print(f'\n{"="*80}')
print('CROSS-VALIDATION RESULTS')
print(f'{"="*80}')
print(f'Average F1: {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}')
print(f'Average Threshold: {np.mean(fold_thresholds):.4f}')
print(f'Fold scores: {[f"{s:.4f}" for s in fold_scores]}')

# Overall OOF evaluation
print(f'\n{"="*80}')
print('OUT-OF-FOLD PREDICTIONS')
print(f'{"="*80}')
oof_f1 = f1_score(train_df['label'].values, oof_predictions, average='macro')
oof_acc = accuracy_score(train_df['label'].values, oof_predictions)
print(f'OOF Macro F1: {oof_f1:.4f}')
print(f'OOF Accuracy: {oof_acc:.4f}')
print('\nClassification Report:')
print(classification_report(train_df['label'].values, oof_predictions, target_names=['Reject', 'Accept']))

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]


FOLD 1/5
Original - Majority: 1216, Minority: 159
Balanced - Total: 2011, Accept: 795
New ratio: 1.53:1


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]


Epoch 1/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0351, F1: 0.6116, Acc: 0.6121


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.6622
Macro F1: 0.6811
Accuracy: 0.8605

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.91      0.92       304
      Accept       0.41      0.47      0.44        40

    accuracy                           0.86       344
   macro avg       0.67      0.69      0.68       344
weighted avg       0.87      0.86      0.86       344


Confusion Matrix:
[[277  27]
 [ 21  19]]
✓ Best model saved (F1: 0.6811)

Epoch 2/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0130, F1: 0.9039, Acc: 0.9065


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.1440
Macro F1: 0.5592
Accuracy: 0.6570

Classification Report:
              precision    recall  f1-score   support

      Reject       0.96      0.64      0.77       304
      Accept       0.23      0.80      0.35        40

    accuracy                           0.66       344
   macro avg       0.59      0.72      0.56       344
weighted avg       0.87      0.66      0.72       344


Confusion Matrix:
[[194 110]
 [  8  32]]

Epoch 3/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0062, F1: 0.9695, Acc: 0.9707


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.3681
Macro F1: 0.6643
Accuracy: 0.8576

Classification Report:
              precision    recall  f1-score   support

      Reject       0.92      0.91      0.92       304
      Accept       0.40      0.42      0.41        40

    accuracy                           0.86       344
   macro avg       0.66      0.67      0.66       344
weighted avg       0.86      0.86      0.86       344


Confusion Matrix:
[[278  26]
 [ 23  17]]

Epoch 4/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0019, F1: 0.9927, Acc: 0.9930


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.1979
Macro F1: 0.6709
Accuracy: 0.8517

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.90      0.91       304
      Accept       0.39      0.47      0.43        40

    accuracy                           0.85       344
   macro avg       0.66      0.69      0.67       344
weighted avg       0.87      0.85      0.86       344


Confusion Matrix:
[[274  30]
 [ 21  19]]

Early stopping at epoch 4


Validation:   0%|          | 0/22 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e42a7b2e5c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       Exception ignored in: ^^<function _MultiProcessingDataLoaderIter.__del__ at 0x7e42a7b2e5c0>^
^Traceback (most recent call last):
^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
^^    ^self._shutdown_workers()^
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
^^    ^if w.is_alive():

   File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
      assert self._parent_pid == os.getpid(), 'can only test a child process'
      ^ ^^ ^ ^ ^^ ^ ^ ^ ^^ 
  File "/usr


Fold 1 Best F1: 0.6811, Threshold: 0.6622

FOLD 2/5
Original - Majority: 1216, Minority: 159
Balanced - Total: 2011, Accept: 795
New ratio: 1.53:1

Epoch 1/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0394, F1: 0.6098, Acc: 0.6106


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.2912
Macro F1: 0.6715
Accuracy: 0.8314

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.87      0.90       304
      Accept       0.36      0.57      0.44        40

    accuracy                           0.83       344
   macro avg       0.65      0.72      0.67       344
weighted avg       0.87      0.83      0.85       344


Confusion Matrix:
[[263  41]
 [ 17  23]]
✓ Best model saved (F1: 0.6715)

Epoch 2/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0182, F1: 0.8726, Acc: 0.8762


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.4257
Macro F1: 0.7328
Accuracy: 0.8866

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.93      0.94       304
      Accept       0.51      0.55      0.53        40

    accuracy                           0.89       344
   macro avg       0.73      0.74      0.73       344
weighted avg       0.89      0.89      0.89       344


Confusion Matrix:
[[283  21]
 [ 18  22]]
✓ Best model saved (F1: 0.7328)

Epoch 3/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0057, F1: 0.9741, Acc: 0.9751


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.1355
Macro F1: 0.6756
Accuracy: 0.7965

Classification Report:
              precision    recall  f1-score   support

      Reject       0.97      0.80      0.87       304
      Accept       0.34      0.80      0.48        40

    accuracy                           0.80       344
   macro avg       0.65      0.80      0.68       344
weighted avg       0.90      0.80      0.83       344


Confusion Matrix:
[[242  62]
 [  8  32]]

Epoch 4/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0028, F1: 0.9896, Acc: 0.9901


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0918
Macro F1: 0.7065
Accuracy: 0.8343

Classification Report:
              precision    recall  f1-score   support

      Reject       0.96      0.85      0.90       304
      Accept       0.39      0.75      0.51        40

    accuracy                           0.83       344
   macro avg       0.68      0.80      0.71       344
weighted avg       0.90      0.83      0.86       344


Confusion Matrix:
[[257  47]
 [ 10  30]]

Epoch 5/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e42a7b2e5c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x7e42a7b2e5c0>^^
^Traceback (most recent call last):
^^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
^^    ^self._shutdown_workers()
^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
^    ^if w.is_alive():^
^^  ^^ ^ ^^  ^ ^^

Train - Loss: 0.0017, F1: 0.9917, Acc: 0.9920


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.0985
Macro F1: 0.6981
Accuracy: 0.8227

Classification Report:
              precision    recall  f1-score   support

      Reject       0.97      0.83      0.89       304
      Accept       0.37      0.78      0.50        40

    accuracy                           0.82       344
   macro avg       0.67      0.80      0.70       344
weighted avg       0.90      0.82      0.85       344


Confusion Matrix:
[[252  52]
 [  9  31]]

Early stopping at epoch 5


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Fold 2 Best F1: 0.7328, Threshold: 0.4257

FOLD 3/5
Original - Majority: 1216, Minority: 159
Balanced - Total: 2011, Accept: 795
New ratio: 1.53:1

Epoch 1/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0322, F1: 0.6525, Acc: 0.6534


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.6074
Macro F1: 0.6360
Accuracy: 0.7849

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.81      0.87       304
      Accept       0.30      0.62      0.40        40

    accuracy                           0.78       344
   macro avg       0.62      0.72      0.64       344
weighted avg       0.87      0.78      0.81       344


Confusion Matrix:
[[245  59]
 [ 15  25]]
✓ Best model saved (F1: 0.6360)

Epoch 2/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e42a7b2e5c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    Exception ignored in: assert self._parent_pid == os.getpid(), 'can only test a child process'<function _MultiProcessingDataLoaderIter.__del__ at 0x7e42a7b2e5c0>

 Traceback (most recent call last):
    File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
       self._shutdown_workers() 
   File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
       if w.is_alive():
^ ^ ^ ^  ^ ^ ^^^^^^^^^^^^^^^^^^^

Train - Loss: 0.0149, F1: 0.8778, Acc: 0.8807


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.3065
Macro F1: 0.6359
Accuracy: 0.7529

Classification Report:
              precision    recall  f1-score   support

      Reject       0.97      0.75      0.84       304
      Accept       0.29      0.80      0.43        40

    accuracy                           0.75       344
   macro avg       0.63      0.77      0.64       344
weighted avg       0.89      0.75      0.79       344


Confusion Matrix:
[[227  77]
 [  8  32]]

Epoch 3/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0063, F1: 0.9679, Acc: 0.9692


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.3787
Macro F1: 0.6684
Accuracy: 0.8285

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.86      0.90       304
      Accept       0.35      0.57      0.44        40

    accuracy                           0.83       344
   macro avg       0.65      0.72      0.67       344
weighted avg       0.87      0.83      0.85       344


Confusion Matrix:
[[262  42]
 [ 17  23]]
✓ Best model saved (F1: 0.6684)

Epoch 4/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0028, F1: 0.9865, Acc: 0.9871


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.2585
Macro F1: 0.6746
Accuracy: 0.8343

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.87      0.90       304
      Accept       0.37      0.57      0.45        40

    accuracy                           0.83       344
   macro avg       0.65      0.72      0.67       344
weighted avg       0.87      0.83      0.85       344


Confusion Matrix:
[[264  40]
 [ 17  23]]
✓ Best model saved (F1: 0.6746)

Epoch 5/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0008, F1: 0.9943, Acc: 0.9945


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.1115
Macro F1: 0.6571
Accuracy: 0.7849

Classification Report:
              precision    recall  f1-score   support

      Reject       0.96      0.79      0.87       304
      Accept       0.32      0.75      0.45        40

    accuracy                           0.78       344
   macro avg       0.64      0.77      0.66       344
weighted avg       0.89      0.78      0.82       344


Confusion Matrix:
[[240  64]
 [ 10  30]]

Epoch 6/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0006, F1: 0.9969, Acc: 0.9970


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.1210
Macro F1: 0.6558
Accuracy: 0.7878

Classification Report:
              precision    recall  f1-score   support

      Reject       0.96      0.80      0.87       304
      Accept       0.32      0.72      0.44        40

    accuracy                           0.79       344
   macro avg       0.64      0.76      0.66       344
weighted avg       0.88      0.79      0.82       344


Confusion Matrix:
[[242  62]
 [ 11  29]]

Epoch 7/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0002, F1: 0.9984, Acc: 0.9985


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.1320
Macro F1: 0.6613
Accuracy: 0.7936

Classification Report:
              precision    recall  f1-score   support

      Reject       0.96      0.80      0.87       304
      Accept       0.33      0.72      0.45        40

    accuracy                           0.79       344
   macro avg       0.64      0.76      0.66       344
weighted avg       0.88      0.79      0.82       344


Confusion Matrix:
[[244  60]
 [ 11  29]]

Early stopping at epoch 7


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Fold 3 Best F1: 0.6746, Threshold: 0.2585

FOLD 4/5
Original - Majority: 1216, Minority: 159
Balanced - Total: 2011, Accept: 795
New ratio: 1.53:1

Epoch 1/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0332, F1: 0.6396, Acc: 0.6405


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.5446
Macro F1: 0.7252
Accuracy: 0.8983

Classification Report:
              precision    recall  f1-score   support

      Reject       0.93      0.96      0.94       304
      Accept       0.58      0.45      0.51        40

    accuracy                           0.90       344
   macro avg       0.76      0.70      0.73       344
weighted avg       0.89      0.90      0.89       344


Confusion Matrix:
[[291  13]
 [ 22  18]]
✓ Best model saved (F1: 0.7252)

Epoch 2/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0157, F1: 0.8997, Acc: 0.9030


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.2615
Macro F1: 0.7007
Accuracy: 0.8488

Classification Report:
              precision    recall  f1-score   support

      Reject       0.95      0.88      0.91       304
      Accept       0.40      0.62      0.49        40

    accuracy                           0.85       344
   macro avg       0.68      0.75      0.70       344
weighted avg       0.88      0.85      0.86       344


Confusion Matrix:
[[267  37]
 [ 15  25]]

Epoch 3/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0047, F1: 0.9792, Acc: 0.9801


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.1309
Macro F1: 0.7014
Accuracy: 0.8372

Classification Report:
              precision    recall  f1-score   support

      Reject       0.96      0.86      0.90       304
      Accept       0.39      0.70      0.50        40

    accuracy                           0.84       344
   macro avg       0.67      0.78      0.70       344
weighted avg       0.89      0.84      0.86       344


Confusion Matrix:
[[260  44]
 [ 12  28]]

Epoch 4/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0022, F1: 0.9943, Acc: 0.9945


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.1404
Macro F1: 0.6737
Accuracy: 0.8023

Classification Report:
              precision    recall  f1-score   support

      Reject       0.96      0.81      0.88       304
      Accept       0.34      0.75      0.47        40

    accuracy                           0.80       344
   macro avg       0.65      0.78      0.67       344
weighted avg       0.89      0.80      0.83       344


Confusion Matrix:
[[246  58]
 [ 10  30]]

Early stopping at epoch 4


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Fold 4 Best F1: 0.7252, Threshold: 0.5446

FOLD 5/5
Original - Majority: 1216, Minority: 160
Balanced - Total: 2016, Accept: 800
New ratio: 1.52:1

Epoch 1/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0373, F1: 0.6127, Acc: 0.6131


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.5172
Macro F1: 0.6367
Accuracy: 0.7755

Classification Report:
              precision    recall  f1-score   support

      Reject       0.95      0.79      0.86       304
      Accept       0.29      0.69      0.41        39

    accuracy                           0.78       343
   macro avg       0.62      0.74      0.64       343
weighted avg       0.88      0.78      0.81       343


Confusion Matrix:
[[239  65]
 [ 12  27]]
✓ Best model saved (F1: 0.6367)

Epoch 2/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0159, F1: 0.8809, Acc: 0.8839


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.5757
Macro F1: 0.6784
Accuracy: 0.8192

Classification Report:
              precision    recall  f1-score   support

      Reject       0.95      0.84      0.89       304
      Accept       0.35      0.69      0.47        39

    accuracy                           0.82       343
   macro avg       0.65      0.76      0.68       343
weighted avg       0.89      0.82      0.84       343


Confusion Matrix:
[[254  50]
 [ 12  27]]
✓ Best model saved (F1: 0.6784)

Epoch 3/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0053, F1: 0.9727, Acc: 0.9737


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.3041
Macro F1: 0.7226
Accuracy: 0.8630

Classification Report:
              precision    recall  f1-score   support

      Reject       0.95      0.89      0.92       304
      Accept       0.43      0.67      0.53        39

    accuracy                           0.86       343
   macro avg       0.69      0.78      0.72       343
weighted avg       0.89      0.86      0.88       343


Confusion Matrix:
[[270  34]
 [ 13  26]]
✓ Best model saved (F1: 0.7226)

Epoch 4/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0029, F1: 0.9902, Acc: 0.9906


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.3931
Macro F1: 0.6896
Accuracy: 0.8571

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.90      0.92       304
      Accept       0.40      0.54      0.46        39

    accuracy                           0.86       343
   macro avg       0.67      0.72      0.69       343
weighted avg       0.88      0.86      0.87       343


Confusion Matrix:
[[273  31]
 [ 18  21]]

Epoch 5/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7e42a7b2e5c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process


Train - Loss: 0.0008, F1: 0.9938, Acc: 0.9940


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.1297
Macro F1: 0.6828
Accuracy: 0.8513

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.89      0.91       304
      Accept       0.39      0.54      0.45        39

    accuracy                           0.85       343
   macro avg       0.66      0.71      0.68       343
weighted avg       0.88      0.85      0.86       343


Confusion Matrix:
[[271  33]
 [ 18  21]]

Epoch 6/8


Training:   0%|          | 0/252 [00:00<?, ?it/s]

Train - Loss: 0.0011, F1: 0.9969, Acc: 0.9970


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Threshold: 0.1244
Macro F1: 0.6614
Accuracy: 0.8163

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.84      0.89       304
      Accept       0.33      0.62      0.43        39

    accuracy                           0.82       343
   macro avg       0.64      0.73      0.66       343
weighted avg       0.88      0.82      0.84       343


Confusion Matrix:
[[256  48]
 [ 15  24]]

Early stopping at epoch 6


Validation:   0%|          | 0/22 [00:00<?, ?it/s]


Fold 5 Best F1: 0.7226, Threshold: 0.3041

CROSS-VALIDATION RESULTS
Average F1: 0.7072 ± 0.0243
Average Threshold: 0.4390
Fold scores: ['0.6811', '0.7328', '0.6746', '0.7252', '0.7226']

OUT-OF-FOLD PREDICTIONS
OOF Macro F1: 0.7066
OOF Accuracy: 0.8685

Classification Report:
              precision    recall  f1-score   support

      Reject       0.94      0.91      0.92      1520
      Accept       0.44      0.54      0.49       199

    accuracy                           0.87      1719
   macro avg       0.69      0.73      0.71      1719
weighted avg       0.88      0.87      0.87      1719



## Test Predictions

In [16]:
# Test dataset
test_dataset = ClimateDataset(
    test_df['text'].values,
    np.zeros(len(test_df)),
    tokenizer,
    CFG.max_length,
    augment=False
)

test_loader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size * 2,
    shuffle=False,
    num_workers=CFG.num_workers,
    pin_memory=True
)

# Ensemble predictions
all_probs = []

for fold, model in enumerate(models):
    model.eval()
    fold_probs = []
    
    with torch.no_grad():
        for batch in tqdm(test_loader, desc=f'Fold {fold+1} Prediction'):
            input_ids = batch['input_ids'].to(CFG.device)
            attention_mask = batch['attention_mask'].to(CFG.device)
            
            logits = model(input_ids, attention_mask)
            probs = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
            fold_probs.append(probs)
    
    all_probs.append(np.concatenate(fold_probs))

# Average probabilities
avg_probs = np.mean(all_probs, axis=0)

# Use average threshold
avg_threshold = np.mean(fold_thresholds)
final_preds = (avg_probs >= avg_threshold).astype(int)

print(f'\nPredictions complete!')
print(f'Average threshold used: {avg_threshold:.4f}')
print(f'\nPrediction distribution:')
print(f'Reject: {(final_preds == 0).sum()}')
print(f'Accept: {(final_preds == 1).sum()}')
print(f'Accept rate: {(final_preds == 1).sum() / len(final_preds) * 100:.2f}%')

Fold 1 Prediction:   0%|          | 0/636 [00:00<?, ?it/s]

Fold 2 Prediction:   0%|          | 0/636 [00:00<?, ?it/s]

Fold 3 Prediction:   0%|          | 0/636 [00:00<?, ?it/s]

Fold 4 Prediction:   0%|          | 0/636 [00:00<?, ?it/s]

Fold 5 Prediction:   0%|          | 0/636 [00:00<?, ?it/s]


Predictions complete!
Average threshold used: 0.4390

Prediction distribution:
Reject: 8816
Accept: 1359
Accept rate: 13.36%


In [17]:
# Create submission
test_df['Prediction_Accept_Reject'] = ['Accept' if p == 1 else 'Reject' for p in final_preds]
test_df['Confidence_Score'] = avg_probs

output_cols = ['ID_New', 'Article Title', 'Prediction_Accept_Reject', 'Confidence_Score']
submission = test_df[output_cols].copy()

# Save
submission.to_csv(f'{CFG.output_dir}/solution1_predictions.csv', index=False)
print(f'\n✓ Predictions saved to solution1_predictions.csv')

# Show samples
print(f'\nSample predictions:')
print(submission.head(10))

print(f'\n{"="*80}')
print('SOLUTION 1 COMPLETE!')
print(f'{"="*80}')
print(f'✓ OOF Macro F1: {oof_f1:.4f}')
print(f'✓ OOF Accuracy: {oof_acc:.4f}')
print(f'✓ Average CV F1: {np.mean(fold_scores):.4f}')
print(f'✓ Models saved for each fold')
print(f'✓ Test predictions generated')


✓ Predictions saved to solution1_predictions.csv

Sample predictions:
        ID_New                                      Article Title  \
0      OA_3712                                                NaN   
1     WoS_1385   It ' s one thing after another, after another...   
2  Scopus_5109  "A Return to and of the Land": Indigenous Know...   
3  Scopus_4859  "I see my culture starting to disappear": Anis...   
4  Scopus_1176  "Impact of Climate Change on Coastal Cities: A...   
5  Scopus_1477  "Smart city" and its implementation in concept...   
6      OA_1940  "The farm has an insatiable appetite": A food ...   
7  Scopus_3724  "We want to have a positive impact": Fragile e...   
8  Scopus_1613  "When you have stress because you don't have f...   
9      OA_3763  #36915 D37 – the green footprint of regional a...   

  Prediction_Accept_Reject  Confidence_Score  
0                   Accept          0.772829  
1                   Reject          0.232518  
2                   Reject  