In [None]:
# ================================================================
# IMPORT LIBRARIES AND CONFIGURE ENVIRONMENT
# ================================================================
# Ignore unnecessary warnings
import warnings
warnings.simplefilter('ignore')

# Import essential libraries
import pandas as pd
import numpy as np

# Import utilities for feature combinations and modeling
from itertools import combinations
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin

# Import PyTorch modules for neural networks
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [None]:
# ================================================================
# LOAD DATASETS
# ================================================================
# Read, in order, the competition train split, the competition test split
# and the original reference dataset from the Kaggle input directory
train, test, orig = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e11/train.csv',
        '/kaggle/input/playground-series-s5e11/test.csv',
        '/kaggle/input/loan-prediction-dataset-2025/loan_dataset_20000.csv',
    )
)

# Report (rows, columns) of each frame as a quick sanity check
print('Train Shape:', train.shape)
print('Test Shape:', test.shape)
print('Orig Shape:', orig.shape)

In [None]:
# ================================================================
# DEFINE TARGET AND CATEGORICAL VARIABLES
# ================================================================
# Name of the label column
TARGET = 'loan_paid_back'

# Columns treated as categorical by the training loop
CATS = [
    'gender', 'marital_status', 'education_level',
    'employment_status', 'loan_purpose', 'grade_subgrade',
]

# Base features: every column of the training frame except the row id and the label
BASE = list(filter(lambda name: name not in ('id', TARGET), train.columns))

In [None]:
# ================================================================
# CREATE INTERACTION FEATURES
# ================================================================
# One interaction feature per unordered pair of base features, named "<first>_<second>"
INTER = [f'{left}_{right}' for left, right in combinations(BASE, 2)]

# Fill each interaction column in all three frames with the two source values,
# rendered as strings and joined by an underscore
for inter_name, (left, right) in zip(INTER, combinations(BASE, 2)):
    for frame in (train, test, orig):
        left_text = frame[left].astype(str)
        right_text = frame[right].astype(str)
        frame[inter_name] = left_text + '_' + right_text

# Report how many interaction features were generated
print(f'{len(INTER)} Interaction Features Created.')

In [None]:
# ================================================================
# ADD MEAN AND COUNT STATISTICS FROM ORIGINAL DATA
# ================================================================
# Names of the features derived from the original dataset
ORIG = []

# For every base feature, look up two statistics computed on the original dataset:
#   orig_mean_<col>  - mean target among original rows sharing the value
#   orig_count_<col> - number of original rows sharing the value
# Values that never occur in the original dataset yield NaN in both columns.
for col in BASE:
    mean_col_name = f"orig_mean_{col}"
    count_col_name = f"orig_count_{col}"

    # Both lookups are Series indexed by the distinct values of `col`
    mean_map = orig.groupby(col)[TARGET].mean()
    count_map = orig.groupby(col).size()

    # Series.map performs the same left lookup a merge would, but without
    # copying the whole frame on every call, and re-running this cell simply
    # overwrites the columns instead of producing `_x` / `_y` duplicates.
    for frame in (train, test):
        frame[mean_col_name] = frame[col].map(mean_map)
        frame[count_col_name] = frame[col].map(count_map)

    ORIG.extend([mean_col_name, count_col_name])

# Display number of original features created
print(f'{len(ORIG)} Orig Features Created.')

In [None]:
# ================================================================
# COMBINE ALL FEATURE GROUPS
# ================================================================
# Full model input: base columns first, then original-data statistics, then interactions
FEATURES = [*BASE, *ORIG, *INTER]

# Report the size of the final feature list
print(f'{len(FEATURES)} Total Features.')

In [None]:
# ================================================================
# DEFINE DATA MATRICES AND CROSS-VALIDATION STRATEGY
# ================================================================
# Feature matrix and label vector, both taken from the training frame
X, y = train[FEATURES], train[TARGET]

# Shuffled, seeded, label-stratified fold generator used by the training loop
N_SPLITS = 5
skf = StratifiedKFold(shuffle=True, random_state=42, n_splits=N_SPLITS)

In [None]:
# ================================================================
# DEFINE TARGET ENCODER CLASS
# ================================================================
class TargetEncoder(BaseEstimator, TransformerMixin):
    """Target encoder that adds one ``TE_<col>_<agg>`` column per column/aggregation.

    ``fit_transform`` encodes the data it is fitted on out-of-fold: each row is
    encoded with statistics computed on the other internal folds only.
    ``transform`` encodes any other data with statistics learned from the whole
    fitted dataset. Both paths build their mappings with the same smoothing
    rule, so the encoded columns live on the same scale.

    Parameters
    ----------
    cols_to_encode : list of str
        Columns of ``X`` to encode.
    aggs : sequence of str, default ('mean',)
        Aggregation names understood by pandas ``agg``. Only ``'mean'`` is
        smoothed; other aggregations are used as computed.
    cv : int, default 5
        Number of internal shuffled K-Fold splits used by ``fit_transform``.
    smooth : 'auto' or number, default 'auto'
        Smoothing weight ``m`` in ``(n * mean + m * global) / (n + m)``.
        With ``'auto'``, ``m`` is the average within-category target variance
        divided by the variance of the category means (0 when that variance is
        not positive).
    drop_original : bool, default False
        Whether the encoded source columns are removed from the output.
    """

    # Column name used for the target inside the internal working frames; kept
    # unusual so it cannot overwrite a real feature column.
    _TARGET_COL = '__te_target__'

    def __init__(self, cols_to_encode, aggs=('mean',), cv=5, smooth='auto', drop_original=False):
        # Constructor arguments are stored untouched (immutable default for
        # `aggs` so instances never share one list object)
        self.cols_to_encode = cols_to_encode
        self.aggs = aggs
        self.cv = cv
        self.smooth = smooth
        self.drop_original = drop_original

        # Learned state: {col: {agg: Series value -> statistic}} and {agg: global statistic}
        self.mappings_ = {}
        self.global_stats_ = {}

    def _working_frame(self, X, y):
        """Return the columns to encode plus the target (index-aligned) in one frame."""
        return X[list(self.cols_to_encode)].assign(**{self._TARGET_COL: y})

    def _build_mapping(self, frame, col, agg_func, global_stat):
        """Return a Series mapping each value of ``col`` to its target statistic.

        For ``'mean'`` the per-category mean is shrunk towards ``global_stat``
        according to ``self.smooth``; every other aggregation is returned as is.
        """
        grouped = frame.groupby(col)[self._TARGET_COL]
        mapping = grouped.agg(agg_func)
        if agg_func != 'mean':
            return mapping

        counts = grouped.count()
        m = self.smooth
        if self.smooth == 'auto':
            variance_between = mapping.var()
            avg_variance_within = grouped.var().mean()
            # A NaN or non-positive between-category variance disables smoothing
            m = avg_variance_within / variance_between if variance_between > 0 else 0

        return (counts * mapping + m * global_stat) / (counts + m)

    def fit(self, X, y):
        """Learn the global statistics and per-category mappings from all of ``X``.

        ``y`` must be a pandas Series (its ``agg`` method is used); it is
        aligned to ``X`` by index. Returns ``self``.
        """
        frame = self._working_frame(X, y)

        # Rebuilt from scratch so a refit never keeps stale entries
        self.global_stats_ = {agg_func: y.agg(agg_func) for agg_func in self.aggs}
        self.mappings_ = {
            col: {
                agg_func: self._build_mapping(frame, col, agg_func, self.global_stats_[agg_func])
                for agg_func in self.aggs
            }
            for col in self.cols_to_encode
        }
        return self

    def transform(self, X):
        """Encode ``X`` with the mappings learned by ``fit``.

        Values without a usable learned statistic receive the global statistic
        of the fitted data. Returns a new DataFrame; ``X`` is not modified.
        """
        X_transformed = X.copy()

        for col in self.cols_to_encode:
            for agg_func in self.aggs:
                encoded = X[col].map(self.mappings_[col][agg_func])
                # Assign the filled result; an in-place fillna on the selected
                # column would operate on a temporary and can be lost.
                X_transformed[f'TE_{col}_{agg_func}'] = encoded.fillna(self.global_stats_[agg_func])

        if self.drop_original:
            X_transformed = X_transformed.drop(columns=self.cols_to_encode)
        return X_transformed

    def fit_transform(self, X, y):
        """Fit on all of ``X`` and return ``X`` with out-of-fold encodings added.

        The full-data mappings are stored for later ``transform`` calls, while
        the returned encodings of each row come from a shuffled K-Fold
        (``random_state=42``) in which that row was held out, so a row's own
        target never contributes to its encoding.
        """
        self.fit(X, y)

        frame = self._working_frame(X, y)

        # One float buffer per output column, filled position by position
        encoded = {
            f'TE_{col}_{agg_func}': np.full(len(X), np.nan)
            for col in self.cols_to_encode
            for agg_func in self.aggs
        }

        kf = KFold(n_splits=self.cv, shuffle=True, random_state=42)
        for train_idx, val_idx in kf.split(X, y):
            fold_frame = frame.iloc[train_idx]
            y_train = y.iloc[train_idx]
            X_val = X.iloc[val_idx]

            for col in self.cols_to_encode:
                for agg_func in self.aggs:
                    fold_global_stat = y_train.agg(agg_func)
                    mapping = self._build_mapping(fold_frame, col, agg_func, fold_global_stat)

                    # Values unseen in the training part fall back to the fold-level statistic
                    values = X_val[col].map(mapping).fillna(fold_global_stat)
                    encoded[f'TE_{col}_{agg_func}'][val_idx] = values.to_numpy()

        X_transformed = X.copy()
        for new_col_name, values in encoded.items():
            X_transformed[new_col_name] = values

        if self.drop_original:
            X_transformed = X_transformed.drop(columns=self.cols_to_encode)
        return X_transformed

In [None]:
# ================================================================
# DEFINE PYTORCH DATASET CLASS
# ================================================================
class TabularDataset(Dataset):
    """Dataset holding a feature table, and optionally targets, as float32 tensors."""

    def __init__(self, features, targets=None):
        """Convert ``features.values`` (and ``targets.values`` when given) to tensors."""
        self.features = torch.tensor(features.values, dtype=torch.float32)
        self.targets = (
            None
            if targets is None
            else torch.tensor(targets.values, dtype=torch.float32)
        )

    def __len__(self):
        """Number of rows in the feature tensor."""
        return self.features.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, target)`` when targets exist, otherwise just the features."""
        row = self.features[idx]
        if self.targets is None:
            return row
        return row, self.targets[idx]

In [None]:
# ================================================================
# DEFINE NEURAL NETWORK MODEL
# ================================================================
class MLPModel(nn.Module):
    """Fully connected network: input_dim -> 128 -> 64 -> 1, ReLU after each hidden layer."""

    def __init__(self, input_dim):
        """Create the three linear layers and the shared ReLU activation."""
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.out = nn.Linear(in_features=64, out_features=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return one raw output per row (no sigmoid is applied here)."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.relu(layer(hidden))
        return self.out(hidden)

In [None]:
# ================================================================
# DEFINE TRAINING HELPER FUNCTIONS
# ================================================================
# ------------------------------------------------
# Train the model for one epoch
# ------------------------------------------------
def train_one_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader`` and return the mean per-sample loss.

    Each batch loss is weighted by its batch size and the total is divided by
    ``len(loader.dataset)``. Targets get a trailing dimension added before the
    loss is computed.
    """
    model.train()

    loss_sum = 0.0
    for batch_x, batch_y in loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device).unsqueeze(1)

        # Standard step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_x), batch_y)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item() * batch_x.size(0)

    return loss_sum / len(loader.dataset)


# ------------------------------------------------
# Evaluate model performance on validation data
# ------------------------------------------------
def evaluate(model, loader, device):
    """Return sigmoid probabilities for every row of a loader yielding (features, target) pairs.

    The model is switched to evaluation mode and gradients are disabled; the
    targets in each batch are ignored. The result is a flat NumPy array in
    loader order.
    """
    model.eval()

    with torch.no_grad():
        batch_probs = [
            torch.sigmoid(model(batch_x.to(device))).cpu().numpy().ravel()
            for batch_x, _ in loader
        ]

    return np.concatenate(batch_probs)


# ------------------------------------------------
# Perform inference on test data
# ------------------------------------------------
def infer_test(model, loader, device):
    """Return sigmoid probabilities for every row of a loader yielding feature batches only.

    The model is switched to evaluation mode and gradients are disabled. The
    result is a flat NumPy array in loader order.
    """
    model.eval()

    with torch.no_grad():
        batch_probs = [
            torch.sigmoid(model(batch_x.to(device))).cpu().numpy().ravel()
            for batch_x in loader
        ]

    return np.concatenate(batch_probs)

In [None]:
# ================================================================
# DEFINE CROSS-VALIDATION TRAINING LOOP
# ================================================================
def run_cv(X, y, test_df, skf, inter_cols, cat_cols, num_epochs=24, batch_size=384, lr=5e-4):
    """Train one MLP per cross-validation fold and collect OOF and test predictions.

    Parameters
    ----------
    X, y : DataFrame, Series
        Training features and binary target.
    test_df : DataFrame
        Test features with the same columns as ``X``.
    skf : cross-validator
        Object providing ``split(X, y)`` and ``n_splits``.
    inter_cols : list of str
        Columns that are target-encoded and then dropped.
    cat_cols : list of str
        Columns converted to integer category codes.
    num_epochs, batch_size, lr : int, int, float
        Training length, mini-batch size and Adam learning rate.

    Returns
    -------
    (oof_preds, test_preds, overall_auc)
        Out-of-fold probabilities, fold-averaged test probabilities and the
        ROC AUC of the out-of-fold probabilities.

    Raises
    ------
    ValueError
        If ``num_epochs`` is smaller than 1.

    Notes
    -----
    The epoch kept for each fold is the one with the highest AUC on that
    fold's own validation data, so the reported fold and OOF AUC values are
    optimistic. Numeric features are passed to the network without scaling.
    """
    if num_epochs < 1:
        raise ValueError('num_epochs must be at least 1')

    # Select computing device (use GPU if available)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Accumulators for out-of-fold and fold-averaged test predictions
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(test_df))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):

        print(f'--- Fold {fold}/{skf.n_splits} ---')

        # Work on copies so the caller's frames are never modified
        X_train, y_train = X.iloc[train_idx].copy(), y.iloc[train_idx].copy()
        X_val, y_val = X.iloc[val_idx].copy(), y.iloc[val_idx].copy()
        X_test = test_df.copy()

        # Target-encode the interaction columns: out-of-fold on the training
        # part, with the learned mappings on validation and test
        te = TargetEncoder(
            cols_to_encode=inter_cols,
            cv=5,
            smooth='auto',
            aggs=['mean'],
            drop_original=True
        )
        X_train = te.fit_transform(X_train, y_train)
        X_val = te.transform(X_val)
        X_test = te.transform(X_test)

        # Integer-encode categoricals with the category list of the training
        # part, so a given category gets the same code in all three frames.
        # Deriving the codes separately per frame would shift them whenever a
        # frame lacks one of the levels. Unseen or missing values become -1.
        for col in cat_cols:
            train_cat = X_train[col].astype('category')
            fold_categories = train_cat.cat.categories
            X_train[col] = train_cat.cat.codes
            X_val[col] = pd.Categorical(X_val[col], categories=fold_categories).codes
            X_test[col] = pd.Categorical(X_test[col], categories=fold_categories).codes

        # Fill any remaining missing values with zeros
        X_train = X_train.fillna(0)
        X_val = X_val.fillna(0)
        X_test = X_test.fillna(0)

        # Wrap the frames as datasets and batch them; only training data is shuffled
        train_ds = TabularDataset(X_train, y_train)
        val_ds = TabularDataset(X_val, y_val)
        test_ds = TabularDataset(X_test)

        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
        test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

        # Fresh model, loss and optimizer for every fold
        input_dim = X_train.shape[1]
        model = MLPModel(input_dim=input_dim).to(device)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)

        # Best epoch so far: its AUC, its validation predictions and its weights
        best_auc = -np.inf
        best_val_preds = None
        best_state = None

        for epoch in range(1, num_epochs + 1):

            train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device)

            val_preds = evaluate(model, val_loader, device)
            val_auc = roc_auc_score(y_val, val_preds)

            print(f'Epoch {epoch:02d} - loss: {train_loss:.4f} - val_auc: {val_auc:.4f}')

            if val_auc > best_auc:
                best_auc = val_auc
                best_val_preds = val_preds.copy()
                # state_dict() returns references to the live parameters, so
                # clone them to freeze this epoch's weights
                best_state = {
                    name: tensor.detach().clone()
                    for name, tensor in model.state_dict().items()
                }

        # Store best validation predictions in out-of-fold array
        oof_preds[val_idx] = best_val_preds

        # Predict the test set with the same weights that produced the stored
        # validation predictions, not with whatever the last epoch left behind
        if best_state is not None:
            model.load_state_dict(best_state)
        fold_test_preds = infer_test(model, test_loader, device)

        # Average test predictions across all folds
        test_preds += fold_test_preds / skf.n_splits

        print(f'Fold {fold} AUC: {best_auc:.4f}')

    # Compute overall OOF AUC score
    overall_auc = roc_auc_score(y, oof_preds)

    print('====================')
    print(f'Overall OOF AUC: {overall_auc:.4f}')
    print('====================')

    return oof_preds, test_preds, overall_auc

In [None]:
# ================================================================
# MAIN EXECUTION FUNCTION
# ================================================================
def main():
    """Run the cross-validated training and write the OOF and submission CSV files."""
    oof_preds, test_preds, overall_auc = run_cv(X, y, test[FEATURES], skf, INTER, CATS)

    # Out-of-fold predictions; the file name carries the overall AUC
    oof_frame = pd.DataFrame({'id': train.id, TARGET: oof_preds})
    oof_frame.to_csv(f'oof_torch_cv_{overall_auc:.5f}.csv', index=False)

    # Test-set predictions
    submission_frame = pd.DataFrame({'id': test.id, TARGET: test_preds})
    submission_frame.to_csv('submission.csv', index=False)

In [None]:
# ================================================================
# EXECUTION ENTRY POINT
# ================================================================
# Run the whole pipeline only when this code is executed as the top-level
# program, not when it is imported as a module.
if __name__ == "__main__":
    main()