This notebook is an experiment based on [my baseline code](https://www.kaggle.com/code/masayakawamata/s5e11-xgb-baseline?scriptVersionId=272568850) and inspired by [this discussion](https://www.kaggle.com/competitions/playground-series-s5e11/discussion/614986).

The method implements "XGB over Residuals" by:
1.  Training a Logistic Regression model on the **original** dataset.
2.  Acquiring the logits from that model for the competition train and test sets (the raw `decision_function` output, i.e. the predictions before the sigmoid activation).
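3.  Passing these logits to the main XGBoost model using the `base_margin` parameter, so that boosting starts from the Logistic Regression prediction and only has to learn the remaining residual. (In this notebook the logit is additionally kept as an ordinary input feature.)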

---

### Current Results & Purpose

While I have **not yet achieved a score improvement** with this specific setup, I suspect the underlying concept could be powerful. A better score might be achievable through modifications, such as:

* Using a Neural Network or another GBDT (instead of LogReg) to generate the `base_margin`.
* Training the Stage 1 model on a feature-engineered dataset to capture more of the original data's signal, which might significantly unlock the potential of this "over residuals" approach.

I hope this notebook can serve as a useful baseline for your own experiments with this technique.

If you manage to improve performance with this method, I would be very grateful if you could share your findings in the comments or in a post-competition write-up, as your insights would be highly valuable.

In [None]:
import warnings
warnings.simplefilter('ignore')

# Load Data

In [None]:
import numpy as np, pandas as pd

# Competition train/test files plus the external "original" dataset used for Stage 1.
orig = pd.read_csv('/kaggle/input/loan-prediction-dataset-2025/loan_dataset_20000.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')

# Report (rows, columns) for each frame in the same order as before.
for label, frame in {'Train Shape:': train, 'Test Shape:': test, 'Orig Shape:': orig}.items():
    print(label, frame.shape)

train.head(3)

In [None]:
# Column bookkeeping: label, categorical inputs, all inputs, and numeric inputs.
TARGET = 'loan_paid_back'
CATS = [
    'gender',
    'marital_status',
    'education_level',
    'employment_status',
    'loan_purpose',
    'grade_subgrade',
]
# Every train column except the row id and the label, in file order.
BASE = [col for col in train.columns if col != 'id' and col != TARGET]
# Whatever is not declared categorical is treated as numeric.
NUMS = [col for col in BASE if not (col in CATS)]

# LogReg on Original Data

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Stage 1: one-hot encode the categorical columns (all other columns pass through),
# standardise everything, then fit a logistic regression on the original dataset only.
preprocessor_ohe = ColumnTransformer(
    [('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), CATS)],
    remainder='passthrough',
)

logreg_steps = [
    ('preprocess', preprocessor_ohe),
    ('scale', StandardScaler()),
    ('model', LogisticRegression(random_state=42, max_iter=1200, C=48)),
]
pipeline_logreg = Pipeline(logreg_steps)

X_orig, y_orig = orig[BASE], orig[TARGET]
pipeline_logreg.fit(X_orig, y_orig)

print("Getting logits (decision_function) for train and test data...")
# decision_function returns the pre-sigmoid score; store it as a new column on both frames.
for frame in (train, test):
    frame['logreg_logit'] = pipeline_logreg.decision_function(frame[BASE])

train[['id', 'logreg_logit']].head(3)

In [None]:
# Model inputs: the raw columns followed by the Stage-1 logit.
FEATURES = [*BASE, 'logreg_logit']

# Model

In [None]:
# Design matrix and label vector for the Stage-2 (XGBoost) model.
X, y = train[FEATURES], train[TARGET]

In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [None]:
# Hyperparameters handed to XGBoost; the training cell also reads
# 'early_stopping_rounds' back out of this dict.
params = dict(
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=5,
    colsample_bytree=0.8,
    subsample=0.8,
    n_estimators=10000,
    learning_rate=0.01,
    early_stopping_rounds=200,
    random_state=42,
    n_jobs=-1,
    enable_categorical=True,
    device='cuda',
)

In [None]:
# Cross-validated "XGB over residuals": every fold boosts on top of the Stage-1
# logit, which is handed to XGBoost as the base margin (the initial raw score).
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

oof_preds = np.zeros(len(X))      # out-of-fold probabilities, aligned with X's row order
test_preds = np.zeros(len(test))  # running mean of the per-fold test probabilities

# One shared categorical dtype per column, built from train + test together.
# Casting each split separately (as before) lets pandas assign different integer
# codes to the same level whenever a level is missing from one split, and XGBoost
# consumes those codes. A shared dtype keeps the encoding identical everywhere and
# avoids assigning into iloc slices.
cat_dtypes = {
    col: pd.concat([X[col], test[col]], ignore_index=True).astype('category').dtype
    for col in CATS
}
X_cat = X.astype(cat_dtypes)
X_test = test[FEATURES].astype(cat_dtypes)

# The test matrix does not depend on the fold, so build it once.
dtest = xgb.DMatrix(X_test, enable_categorical=True)
dtest.set_base_margin(X_test['logreg_logit'])

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f'--- Fold {fold}/{N_SPLITS} ---')

    X_train, X_val = X_cat.iloc[train_idx], X_cat.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    dval = xgb.DMatrix(X_val, label=y_val, enable_categorical=True)
    dtrain.set_base_margin(X_train['logreg_logit'])
    dval.set_base_margin(X_val['logreg_logit'])

    # The validation set is listed last so that it is the one driving early stopping.
    evals = [(dtrain, 'train'), (dval, 'eval')]

    model = xgb.train(
        params,
        dtrain,
        evals=evals,
        # Read from params (same default as the previous hard-coded value) so the
        # round budget and the early-stopping patience come from one place.
        num_boost_round=params.get('n_estimators', 10000),
        early_stopping_rounds=params.get('early_stopping_rounds', 200),
        verbose_eval=1000,
    )

    # iteration_range is half-open [begin, end) and best_iteration is 0-based,
    # so "+ 1" is required to include the best round itself.
    best_range = (0, model.best_iteration + 1)
    fold_val_preds = model.predict(dval, iteration_range=best_range)
    fold_test_preds = model.predict(dtest, iteration_range=best_range)

    oof_preds[val_idx] = fold_val_preds

    fold_score = roc_auc_score(y_val, fold_val_preds)
    print(f'Fold {fold} AUC: {fold_score:.4f}')

    # Equal-weight average of the fold models on the test set.
    test_preds += fold_test_preds / N_SPLITS

overall_auc = roc_auc_score(y, oof_preds)
print('====================')
print(f'Overall OOF AUC: {overall_auc:.4f}')
print('====================')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Gain-based importances of `model`, i.e. the booster left over from the final CV fold.
feature_importances_dict = model.get_score(importance_type='gain')
importance_df = pd.DataFrame(
    list(feature_importances_dict.items()),
    columns=['feature', 'importance'],
).sort_values('importance', ascending=False)

plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(12, 20))
# Show at most the 50 highest-gain features.
sns.barplot(data=importance_df.head(50), x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance (gain) - Last Model')
ax.set_xlabel('Importance Score (Gain)')
ax.set_ylabel('Features')
fig.tight_layout()
plt.show()

In [None]:
# Persist OOF and test predictions; the overall CV AUC is embedded in both file names.
oof_frame = pd.DataFrame({'id': train['id'], TARGET: oof_preds})
oof_frame.to_csv(f'oof_xgb_over_resid_cv_{overall_auc}.csv', index=False)

test_frame = pd.DataFrame({'id': test['id'], TARGET: test_preds})
test_frame.to_csv(f'test_xgb_over_resid_cv_{overall_auc}.csv', index=False)

In [None]:
# Optional (disabled): Optuna search over the Stage-1 LogisticRegression
# hyperparameters (C, max_iter), scored by 5-fold ROC AUC on the original dataset,
# followed by a refit that overwrites train/test['logreg_logit'].
# NOTE(review): this search pins solver='liblinear', whereas the active Stage-1
# pipeline earlier in the notebook uses LogisticRegression's default solver —
# confirm which one is intended before reusing the tuned values.
# Uncomment the whole cell to run it.

# import optuna

# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import StratifiedKFold, cross_val_score
# from sklearn.metrics import roc_auc_score

# preprocessor_ohe = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), CATS),
#     ],
#     remainder='passthrough' 
# )

# X_orig = orig[BASE]
# y_orig = orig[TARGET]

# def objective(trial):
#     logreg_c = trial.suggest_float('C', 1e-4, 1e2, log=True)
#     logreg_max_iter = trial.suggest_int('max_iter', 100, 2000, step=100)

#     pipeline = Pipeline(steps=[
#         ('preprocess', preprocessor_ohe),
#         ('scale', StandardScaler()),
#         ('model', LogisticRegression(
#             C=logreg_c,
#             max_iter=logreg_max_iter,
#             solver='liblinear', 
#             random_state=42
#         ))
#     ])

#     N_SPLITS = 5
#     skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
    
#     try:
#         scores = cross_val_score(pipeline, X_orig, y_orig, cv=skf, scoring='roc_auc')
#         return np.mean(scores)
    
#     except Exception as e:
#         print(f"Trial failed with error: {e}")
#         return 0.0
# N_TRIALS = 100

# print(f"Optuna tuning started (N_TRIALS={N_TRIALS}, CV_SPLITS=5)...")
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

# print("\nOptuna tuning finished.")
# print(f"Best trial AUC: {study.best_value:.6f}")
# print(f"Best params: {study.best_params}")

# best_params = study.best_params

# final_pipeline = Pipeline(steps=[
#     ('preprocess', preprocessor_ohe),
#     ('scale', StandardScaler()),
#     ('model', LogisticRegression(
#         C=best_params['C'],
#         max_iter=best_params['max_iter'],
#         solver='liblinear', 
#         random_state=42
#     ))
# ])

# final_pipeline.fit(X_orig, y_orig)
# train['logreg_logit'] = final_pipeline.decision_function(train[BASE])
# test['logreg_logit'] = final_pipeline.decision_function(test[BASE])

# display(train[['id', 'logreg_logit']].head(3))