In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e11/sample_submission.csv
/kaggle/input/playground-series-s5e11/train.csv
/kaggle/input/playground-series-s5e11/test.csv
/kaggle/input/loan-prediction-dataset-2025/loan_dataset_20000.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier # <-- Still using XGBoost
from sklearn.metrics import roc_auc_score

# --- CONFIGURATION (Paths and Column Names) ---
# COMP_PATH: directory holding the competition train/test CSVs.
# ORIG_FILE_PATH: external dataset that is concatenated onto the training data when available.
COMP_PATH = "/kaggle/input/playground-series-s5e11/"
ORIG_FILE_PATH = "/kaggle/input/loan-prediction-dataset-2025/loan_dataset_20000.csv"

# Name of the label column the model predicts.
TARGET_COL = 'loan_paid_back'
print(f"Target Column: {TARGET_COL}")

# --- 1. Load and Prepare ALL Data ---
train_df = pd.read_csv(COMP_PATH + "train.csv")
test_df = pd.read_csv(COMP_PATH + "test.csv")

# The external dataset is optional: when it is missing the notebook still runs,
# but only on the competition data. Say so explicitly, because later messages
# describe the score as coming from "Enriched Data".
try:
    original_df = pd.read_csv(ORIG_FILE_PATH)
except FileNotFoundError:
    original_df = None
    print(f"WARNING: {ORIG_FILE_PATH} not found; continuing WITHOUT data enrichment.")

# Keep the test ids for the submission file, then remove 'id' from both frames
# so it is not used as a feature.
test_ids = test_df['id']
train_df = train_df.drop(columns='id')
test_df = test_df.drop(columns='id')


# --- 2. ENRICHED TRAINING DATA (CONCATENATION) ---
# Runs only when the external dataset was loaded.
if original_df is not None:
    print("\nStarting Data Enrichment (Concatenation)...")

    # Stack the external rows underneath the competition rows. Columns that
    # exist in only one of the two frames are NaN for the rows that lack them.
    train_df = pd.concat([train_df, original_df], ignore_index=True)
    print(f"Combined Training Data Size: {len(train_df)} rows.")

    # Align the test set: every non-target training column that the test set
    # lacks (e.g. 'installment') is appended as an all-NaN column so the
    # imputation step can fill it later.
    # NOTE(review): these columns are never observed in the test set, so after
    # imputation they are constant there -- confirm they are worth keeping.
    test_columns = set(test_df.columns)
    new_features = [
        name for name in train_df.columns
        if name != TARGET_COL and name not in test_columns
    ]
    test_df = test_df.reindex(columns=[*test_df.columns, *new_features])

    print(f"Test data aligned with {len(new_features)} new features added (filled with NaN).")


# --- 3. FEATURE ENGINEERING (FIXED Logic) ---
print("\nApplying Feature Engineering...")


def _safe_ratio(numerator, denominator):
    """Divide two Series element-wise, yielding NaN instead of +/-inf.

    A zero denominator would otherwise produce an infinite value, which the
    median/mode imputation that follows does not repair. Mapping it to NaN
    lets that imputation treat it like any other missing value.
    """
    return (numerator / denominator).replace([np.inf, -np.inf], np.nan)


# Payment_Burden reads both 'installment' and 'monthly_income', so require both
# columns in both frames instead of checking 'installment' in train only.
burden_inputs = {'installment', 'monthly_income'}
can_compute_burden = (
    burden_inputs.issubset(train_df.columns) and burden_inputs.issubset(test_df.columns)
)

# Apply the identical transformations to train and test so the two can never drift apart.
for frame in (train_df, test_df):
    # Feature 1: Income to Loan Ratio
    frame['Income_to_Loan_Ratio'] = _safe_ratio(frame['annual_income'], frame['loan_amount'])

    # Feature 2: Annual Debt Amount
    frame['Annual_Debt_Amount'] = frame['debt_to_income_ratio'] * frame['annual_income']

    # Feature 3: Payment Burden (all-NaN in a frame whose 'installment' column is all NaN)
    if can_compute_burden:
        frame['Payment_Burden'] = _safe_ratio(frame['installment'], frame['monthly_income'])

print("Feature Engineering Complete.")


# --- 4. PREPROCESSING (Final Cleaning) ---
print("Starting Preprocessing...")
feature_cols = [col for col in train_df.columns if col != TARGET_COL]

# Imputation: the fill value for each column is computed from the training
# frame and applied to both frames (median for numeric columns, most frequent
# value otherwise). Columns are classified with is_numeric_dtype rather than by
# comparing against 'object'/'category', so pandas string dtypes are not sent
# down the median branch by mistake.
for col in feature_cols:
    if pd.api.types.is_numeric_dtype(train_df[col]):
        fill_value = train_df[col].median()
    else:
        modes = train_df[col].mode()
        if modes.empty:
            # Column is entirely missing in train: there is nothing to impute with.
            continue
        fill_value = modes.iloc[0]
    train_df[col] = train_df[col].fillna(fill_value)
    test_df[col] = test_df[col].fillna(fill_value)

# Encoding and Alignment: one-hot encode every non-numeric column.
train_df = pd.get_dummies(train_df)
test_df = pd.get_dummies(test_df)

# Align columns: the test frame gets exactly the training feature columns, in
# the same order; dummy columns it never produced are filled with 0.
train_cols_to_keep = [col for col in train_df.columns if col != TARGET_COL]
test_df = test_df.reindex(columns=train_cols_to_keep, fill_value=0)

# Split Data: hold out 20% of the rows for validation (fixed seed).
X = train_df.drop(TARGET_COL, axis=1)
y = train_df[TARGET_COL]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# --- 5. MODEL TRAINING (XGBoost for Ensemble Diversity) ---
print("\nTraining the XGBoost Model (Piece 2 of Ensemble)...")

# `use_label_encoder` is deliberately not passed: it is a deprecated argument
# that current XGBoost releases no longer use and only warn about.
# NOTE(review): no eval_set is passed to fit(), so eval_metric presumably has
# no effect here -- confirm against the XGBoost docs.
xgb_final_model = XGBClassifier(
    n_estimators=200, learning_rate=0.03, max_depth=6, subsample=0.8,
    colsample_bytree=0.8, random_state=42, n_jobs=-1,
    eval_metric='auc'
)
xgb_final_model.fit(X_train, y_train)

# Evaluate: ROC AUC on the held-out split, using column 1 of predict_proba as the score.
val_preds_xgb = xgb_final_model.predict_proba(X_val)[:, 1]
auc_score_xgb = roc_auc_score(y_val, val_preds_xgb)

# The first figure is a hard-coded number from an earlier experiment, not computed here.
# NOTE(review): it was presumably measured on a different validation split, so
# the two scores are not directly comparable.
print("Previous LGBM Score (Synthetic Only): 0.91920")
print(f"XGBoost Score (Enriched Data): {auc_score_xgb}")


# --- 6. SUBMISSION ---
print("\nCreating Submission File...")

# Score the aligned test frame; column 1 of predict_proba is used as the prediction.
test_preds_xgb = xgb_final_model.predict_proba(test_df)[:, 1]

# Start from the saved test ids and attach the predictions as the second column.
submission_df_xgb = pd.DataFrame(test_ids.rename('id'))
submission_df_xgb['loan_paid_back'] = test_preds_xgb

# Written to the current working directory, without the index column.
submission_df_xgb.to_csv('submission_xgb_enriched.csv', index=False)
print("Submission file (submission_xgb_enriched.csv) created! Submit and check the score.")

Target Column: loan_paid_back

Starting Data Enrichment (Concatenation)...
Combined Training Data Size: 613994 rows.
Test data aligned with 10 new features added (filled with NaN).

Applying Feature Engineering...
Feature Engineering Complete.
Starting Preprocessing...

Training the XGBoost Model (Piece 2 of Ensemble)...
Previous LGBM Score (Synthetic Only): 0.91920
XGBoost Score (Enriched Data): 0.9166696802259506

Creating Submission File...
Submission file (submission_xgb_enriched.csv) created! Submit and check the score.
