In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for current_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(current_dir, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd

# Use the path that works in your environment.
# Assuming you are running this where the files are accessible.
# Load the training CSV, then show its column names and a five-row preview.
# A missing file is reported with a hint instead of a traceback.
train_csv_path = '/kaggle/input/playground-series-s5e11/train.csv'
try:
    df_train = pd.read_csv(train_csv_path)
except FileNotFoundError:
    print(f"Error: The file path '{train_csv_path}' was not found.")
    print("Please check the path and make sure you are loading the correct file.")
else:
    # Column names first ...
    print("--- Loaded df_train Columns ---")
    print(list(df_train.columns))

    # ... then the first rows as a sanity check of the parsed values
    print("\n--- df_train Head (First 5 Rows) ---")
    print(df_train.head())

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import os
import sys

# --- 0. Configuration and Environment Check ---

# With the `device` parameter (XGBoost >= 2.0) the tree method stays 'hist' on
# both CPU and GPU; the old 'gpu_hist' alias is deprecated.
XGB_TREE_METHOD = 'hist'

# Heuristic GPU probe: if CuPy imports cleanly we assume a CUDA runtime is usable.
try:
    import cupy  # noqa: F401 -- imported only to probe for a CUDA environment
    XGB_DEVICE = 'cuda'
    print("Using GPU (CUDA) environment.")
except Exception:
    # `Exception` (not a bare `except:`) so KeyboardInterrupt / SystemExit still propagate.
    XGB_DEVICE = 'cpu'
    print("GPU environment not detected or failed to initialize. Using CPU.")


# Name of the label column in train.csv
TARGET = 'loan_paid_back'
N_FOLDS = 5  # Number of folds for Cross-Validation and OOF Encoding

# Best hyper-parameters found by Optuna in the original notebook (AUC: 0.926017)
OPTIMAL_PARAMS = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': XGB_TREE_METHOD,
    'device': XGB_DEVICE,
    'n_estimators': 2000,
    'learning_rate': 0.015112,
    'max_depth': 12,
    'min_child_weight': 11,
    'gamma': 0.00755,
    'subsample': 0.887,
    'colsample_bytree': 0.835,
    'colsample_bylevel': 0.812,
    'colsample_bynode': 0.701,
    'reg_alpha': 0.000305,
    'reg_lambda': 0.000495,
    'seed': 42,
}

# --- 1. Data Loading ---
INPUT_PATH = '/kaggle/input/playground-series-s5e11/'
print(f"Loading data from: {INPUT_PATH}")

# Only the two reads can raise FileNotFoundError, so only they sit in the try block.
try:
    df_train = pd.read_csv(os.path.join(INPUT_PATH, 'train.csv'))
    df_test = pd.read_csv(os.path.join(INPUT_PATH, 'test.csv'))
except FileNotFoundError as e:
    print(f"Error loading files. Check the INPUT_PATH and file names: {e}")
    sys.exit()

# Split the label off the training frame; `pop` removes the column in place.
y = df_train.pop(TARGET).astype(int)
test_ids = df_test['id']

# Stack train and test features (without 'id', which is not a feature) so that
# feature engineering is applied to both at once.
feature_frames = [frame.drop(columns='id') for frame in (df_train, df_test)]
df_full = pd.concat(feature_frames, ignore_index=True)

print(f"Train shape: {df_train.shape}, Test shape: {df_test.shape}")


# --- 2. Advanced Feature Engineering (Replicating Notebook Concepts) ---

print("\n--- Starting Advanced Feature Engineering ---")

CAT_COLS = ['gender', 'marital_status', 'education_level',
            'employment_status', 'loan_purpose', 'grade_subgrade']
NUM_COLS = ['annual_income', 'debt_to_income_ratio', 'credit_score',
            'loan_amount', 'interest_rate']

# Keep the un-imputed numeric values: the group means computed below must not be
# distorted by the -999 sentinel that replaces missing entries.
raw_numeric = df_full[NUM_COLS].copy()

# Sentinel imputation on the full dataset BEFORE splitting.
# The result is assigned back rather than using `df_full[col].fillna(..., inplace=True)`:
# that chained in-place form is deprecated and does not modify df_full under
# pandas Copy-on-Write.
df_full[NUM_COLS] = df_full[NUM_COLS].fillna(-999)

for col in CAT_COLS:
    # Categorical NaN handling: treat NaNs as their own category 'Missing'
    df_full[col] = df_full[col].fillna('Missing')

    # --- Frequency Encoding: each category is replaced by its row count ---
    count_map = df_full[col].value_counts().to_dict()
    df_full[f'{col}_freq_enc'] = df_full[col].map(count_map)

# --- Group Mean Deviation Feature Creation (Interaction Stats) ---
for group_col in ['grade_subgrade', 'education_level']:
    for agg_col in ['annual_income', 'credit_score']:
        # Mean of the observed (non-missing) values per category; NaNs are skipped.
        group_mean = raw_numeric[agg_col].groupby(df_full[group_col]).transform('mean')

        # Deviation = (imputed) value - clean group mean. Rows whose value was
        # missing keep a large negative deviation driven by the sentinel; a group
        # with no observed value at all yields NaN.
        df_full[f'{agg_col}_dev_by_{group_col}'] = df_full[agg_col] - group_mean

print("Feature Engineering Complete. Full shape:", df_full.shape)

# --- Separate back into training and testing sets ---
# Explicit copies: both frames get new columns later, and writing into a slice
# of df_full triggers SettingWithCopyWarning.
X = df_full.iloc[:len(y)].copy()
X_test = df_full.iloc[len(y):].copy()


# --- 3. Out-of-Fold (OOF) Target Encoding ---

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
oof_encoded_cols = [f'{col}_oof_enc' for col in CAT_COLS]
# Float frame pre-filled with NaN, so no object -> float cast is needed afterwards.
oof_df = pd.DataFrame(np.nan, index=X.index, columns=oof_encoded_cols, dtype=float)
global_mean = y.mean()  # Fallback for categories with no encoding available

print(f"Starting {N_FOLDS}-Fold OOF Target Encoding...")
for train_idx, val_idx in kf.split(X, y):
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold = y.iloc[train_idx]

    for col in CAT_COLS:
        # Target mean per category, computed on the training part of the fold only
        target_mean_map = y_train_fold.groupby(X_train_fold[col]).mean()

        # KFold yields positions, so write positionally (iloc) rather than by label.
        col_pos = oof_df.columns.get_loc(f'{col}_oof_enc')
        oof_df.iloc[val_idx, col_pos] = (
            X_val_fold[col].map(target_mean_map).to_numpy(dtype=float)
        )

# Categories not seen in a fold's training part fall back to the overall target mean.
oof_df = oof_df.fillna(global_mean)

# Add the OOF encoded features to the training set
X = pd.concat([X, oof_df], axis=1)

# For the test set, encode with the per-category target mean of the whole training set.
# Work on a copy so new columns are never written into a slice of another frame.
X_test = X_test.copy()
for col in CAT_COLS:
    global_target_mean = y.groupby(X[col]).mean()
    X_test[f'{col}_oof_enc'] = (
        X_test[col].map(global_target_mean).astype(float).fillna(global_mean)
    )

# Drop original categorical columns now that they are fully encoded
X = X.drop(columns=CAT_COLS)
X_test = X_test.drop(columns=CAT_COLS)

# Align the test columns with the training columns (same set, same order)
X_test = X_test[X.columns]


# --- 4. Model Training (Single XGBoost Model) ---
print("\n--- Starting XGBoost Training ---")

from sklearn.model_selection import train_test_split

# Hold out a stratified slice for early stopping. Monitoring the training data
# itself never triggers a meaningful stop, because training AUC keeps improving.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# `early_stopping_rounds` is an estimator parameter; newer XGBoost releases no
# longer accept it as a fit() keyword.
model = xgb.XGBClassifier(**OPTIMAL_PARAMS, early_stopping_rounds=100)

# NOTE: the model is fitted on the 80% split only; the remaining 20% drives early stopping.
model.fit(
    X_fit, y_fit,
    eval_set=[(X_valid, y_valid)],
    verbose=False
)

# best_iteration is zero-based, hence the +1 for a round count
print(f"Model trained with {model.best_iteration + 1} boosting rounds.")
print(f"Training AUC: {roc_auc_score(y_fit, model.predict_proba(X_fit)[:, 1]):.6f}")
print(f"Validation AUC: {roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1]):.6f}")


# --- 5. Prediction and Submission ---
# Positive-class probability for every test row
predictions = model.predict_proba(X_test)[:, 1]

# Two-column submission frame: test id plus predicted probability
submission_df = pd.DataFrame({'id': test_ids, TARGET: predictions})
submission_df.to_csv('submission_advanced_xgb.csv', index=False)

# Confirmation banner, file name and a short preview, one per line
print(
    "\n--- Submission File Created ---",
    "File: submission_advanced_xgb.csv",
    f"Prediction head:\n{submission_df.head()}",
    sep="\n",
)

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import roc_auc_score
import os

# --- 0. Configuration and Hyperparameters ---

TARGET = 'loan_paid_back'
# Accelerator selection; set to 'cpu' when no GPU is attached to the session.
XGB_DEVICE = 'cuda'
# With the `device` parameter the tree method is 'hist' on both CPU and GPU;
# the old 'gpu_hist' alias is deprecated.
XGB_TREE_METHOD = 'hist'
N_FOLDS = 5
GLOBAL_SEED = 42

# Hyper-parameters passed unchanged to xgb.XGBClassifier
OPTIMAL_PARAMS = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': XGB_TREE_METHOD,
    'device': XGB_DEVICE,
    'n_estimators': 2000,
    'learning_rate': 0.015112,
    'max_depth': 12,
    'min_child_weight': 11,
    'gamma': 0.00755,
    'subsample': 0.887,
    'colsample_bytree': 0.835,
    'colsample_bylevel': 0.812,
    'colsample_bynode': 0.701,
    'reg_alpha': 0.000305,
    'reg_lambda': 0.000495,
    'seed': GLOBAL_SEED,
    'n_jobs': -1,
}

# --- 1. Data Loading ---
INPUT_PATH = '/kaggle/input/playground-series-s5e11/'

try:
    df_train = pd.read_csv(os.path.join(INPUT_PATH, 'train.csv'))
    df_test = pd.read_csv(os.path.join(INPUT_PATH, 'test.csv'))

    # Separate the label, remember the test ids for the submission file
    y = df_train[TARGET].astype(int)
    test_ids = df_test['id']
    df_train.drop(TARGET, axis=1, inplace=True)

    # Stack train and test features (without 'id') for joint feature engineering
    df_full = pd.concat([df_train.drop('id', axis=1), df_test.drop('id', axis=1)], ignore_index=True)

except FileNotFoundError as e:
    print(f"Error loading files. Check the INPUT_PATH: {e}")
    # `exit()` is the interactive-shell helper and shuts down the notebook kernel;
    # raising SystemExit stops this cell while leaving the kernel alive.
    raise SystemExit(1)


# --- 2. Advanced Feature Engineering (With Missingness Flags) ---

print("\n--- Starting Advanced Feature Engineering ---")

CAT_COLS = ['gender', 'marital_status', 'education_level',
            'employment_status', 'loan_purpose', 'grade_subgrade']
NUM_COLS = ['annual_income', 'debt_to_income_ratio', 'credit_score',
            'loan_amount', 'interest_rate']

# Keep the un-imputed numeric values: they feed the missingness flags and the
# group means, neither of which should see the -999 sentinel.
raw_numeric = df_full[NUM_COLS].copy()

for col in NUM_COLS:
    # Missingness flag: 1 where the original value was NaN
    df_full[f'{col}_is_missing'] = raw_numeric[col].isna().astype(int)
    # Sentinel imputation. Assigned back rather than `fillna(..., inplace=True)` on
    # a column selection, which is deprecated and a no-op under pandas Copy-on-Write.
    df_full[col] = raw_numeric[col].fillna(-999)

# Categorical NaN handling: treat NaNs as their own category 'Missing'
for col in CAT_COLS:
    df_full[col] = df_full[col].fillna('Missing')

    # --- Frequency Encoding (Feature 1) ---
    count_map = df_full[col].value_counts().to_dict()
    df_full[f'{col}_freq_enc'] = df_full[col].map(count_map)

# --- Group Mean Deviation Feature Creation (Feature 2) ---
for group_col in ['grade_subgrade', 'education_level']:
    for agg_col in ['annual_income', 'credit_score']:
        # Group mean over observed values only (NaNs skipped), so the sentinel
        # cannot drag the mean down.
        group_mean = raw_numeric[agg_col].groupby(df_full[group_col]).transform('mean')
        df_full[f'{agg_col}_dev_by_{group_col}'] = df_full[agg_col] - group_mean

# --- Robust Scaling on all numeric features (including imputed ones) ---
numeric_features_to_scale = [c for c in df_full.columns if c not in CAT_COLS]
scaler = RobustScaler()
df_full[numeric_features_to_scale] = scaler.fit_transform(df_full[numeric_features_to_scale])

print("Feature Engineering Complete. Full shape:", df_full.shape)

# --- Separate back into training and testing sets ---
# Explicit copies: both frames get new columns later, and writing into a slice
# of df_full triggers SettingWithCopyWarning.
X = df_full.iloc[:len(y)].copy()
X_test = df_full.iloc[len(y):].copy()


# --- 3. Out-of-Fold (OOF) Target Encoding (Feature 3) ---

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=GLOBAL_SEED)
oof_encoded_cols = [f'{col}_oof_enc' for col in CAT_COLS]
# Float frame pre-filled with NaN, so no object -> float cast is needed afterwards.
oof_df = pd.DataFrame(np.nan, index=X.index, columns=oof_encoded_cols, dtype=float)
global_mean = y.mean()  # Fallback for categories with no encoding available

print(f"Starting {N_FOLDS}-Fold OOF Target Encoding...")
for train_idx, val_idx in kf.split(X, y):
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold = y.iloc[train_idx]

    for col in CAT_COLS:
        # Target mean per category, computed on the training part of the fold only
        target_mean_map = y_train_fold.groupby(X_train_fold[col]).mean()

        # KFold yields positions, so write positionally (iloc) rather than by label.
        col_pos = oof_df.columns.get_loc(f'{col}_oof_enc')
        oof_df.iloc[val_idx, col_pos] = (
            X_val_fold[col].map(target_mean_map).to_numpy(dtype=float)
        )

# Categories not seen in a fold's training part fall back to the overall target mean.
oof_df = oof_df.fillna(global_mean)

# Add the OOF encoded features to the training set
X = pd.concat([X, oof_df], axis=1)

# For the test set, encode with the per-category target mean of the whole training set.
# Work on a copy so new columns are never written into a slice of another frame.
X_test = X_test.copy()
for col in CAT_COLS:
    global_target_mean = y.groupby(X[col]).mean()
    X_test[f'{col}_oof_enc'] = (
        X_test[col].map(global_target_mean).astype(float).fillna(global_mean)
    )

# Drop original categorical columns
X = X.drop(columns=CAT_COLS)
X_test = X_test.drop(columns=CAT_COLS)

# Align the test columns with the training columns (same set, same order)
X_test = X_test[X.columns]


# --- 4. Model Training (Single XGBoost Model) ---
print("\n--- Starting XGBoost Training ---")

from sklearn.model_selection import train_test_split

# Hold out a stratified slice for early stopping. Monitoring the training data
# itself never triggers a meaningful stop, because training AUC keeps improving.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=GLOBAL_SEED
)

# `early_stopping_rounds` is an estimator parameter; newer XGBoost releases no
# longer accept it as a fit() keyword.
model = xgb.XGBClassifier(
    **OPTIMAL_PARAMS,
    enable_categorical=False,
    early_stopping_rounds=100,
)

# NOTE: the model is fitted on the 80% split only; the remaining 20% drives early stopping.
model.fit(
    X_fit, y_fit,
    eval_set=[(X_valid, y_valid)],
    verbose=False
)

# best_iteration is zero-based, hence the +1 for a round count
print(f"Model trained with {model.best_iteration + 1} boosting rounds.")
print(f"Training AUC: {roc_auc_score(y_fit, model.predict_proba(X_fit)[:, 1]):.6f}")
print(f"Validation AUC: {roc_auc_score(y_valid, model.predict_proba(X_valid)[:, 1]):.6f}")


# --- 5. Prediction and Submission ---
# Positive-class probability for every test row
predictions = model.predict_proba(X_test)[:, 1]

# One name for both the write and the log line, so the reported file is always
# the file that was actually written (the message used to name a different file).
SUBMISSION_FILE = 'submission_final_advanced_xgb.csv'

submission_df = pd.DataFrame({'id': test_ids, TARGET: predictions})
submission_df.to_csv(SUBMISSION_FILE, index=False)

print("\n--- Submission File Created ---")
print(f"File: {SUBMISSION_FILE}")
print(f"Prediction head:\n{submission_df.head()}")

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler
import os

# --- 0. Configuration and Hyperparameters ---

TARGET = 'loan_paid_back'
N_FOLDS = 5
GLOBAL_SEED = 42

# Hyper-parameters passed unchanged to lgb.LGBMClassifier
LGB_PARAMS = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'n_estimators': 3000,
    'learning_rate': 0.01,
    'num_leaves': 16,
    'max_depth': 6,
    'min_child_samples': 20,
    'subsample': 0.8,
    # Row subsampling is only applied when the bagging frequency is non-zero;
    # without this entry 'subsample' is silently ignored by LightGBM.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'n_jobs': -1,
    'seed': GLOBAL_SEED,
    # Use 'gpu' if available and performing better than 'cpu' on your system
    'device_type': 'cpu'
}

# --- 1. Data Loading ---
INPUT_PATH = '/kaggle/input/playground-series-s5e11/'
try:
    df_train = pd.read_csv(os.path.join(INPUT_PATH, 'train.csv'))
    df_test = pd.read_csv(os.path.join(INPUT_PATH, 'test.csv'))

    # Separate the label, remember the test ids for the submission file
    y = df_train[TARGET].astype(int)
    test_ids = df_test['id']
    df_train.drop(TARGET, axis=1, inplace=True)

    # Stack train and test features (without 'id') for joint feature engineering
    df_full = pd.concat([df_train.drop('id', axis=1), df_test.drop('id', axis=1)], ignore_index=True)

except FileNotFoundError as e:
    print(f"Error loading files. Check the INPUT_PATH: {e}")
    # `exit()` is the interactive-shell helper and shuts down the notebook kernel;
    # raising SystemExit stops this cell while leaving the kernel alive.
    raise SystemExit(1)


# --- 2. Feature Engineering for LightGBM ---

print("\n--- Starting LightGBM-Optimized Feature Engineering ---")

CAT_COLS = ['gender', 'marital_status', 'education_level',
            'employment_status', 'loan_purpose', 'grade_subgrade']
NUM_COLS = ['annual_income', 'debt_to_income_ratio', 'credit_score',
            'loan_amount', 'interest_rate']

# Keep the un-imputed numeric values: they feed the missingness flags and the
# group means, neither of which should see the -999 sentinel.
raw_numeric = df_full[NUM_COLS].copy()

# A. Missingness Flags
for col in NUM_COLS:
    df_full[f'{col}_is_missing'] = raw_numeric[col].isna().astype(int)
    # Sentinel imputation. Assigned back rather than `fillna(..., inplace=True)` on
    # a column selection, which is deprecated and a no-op under pandas Copy-on-Write.
    df_full[col] = raw_numeric[col].fillna(-999)

# B. Categorical Missingness and Type Casting (native LightGBM categorical support)
for col in CAT_COLS:
    # NaNs become their own category, then the column is cast to 'category' dtype
    df_full[col] = df_full[col].fillna('Missing').astype('category')

# C. Group Deviation Features
for group_col in ['grade_subgrade', 'education_level']:
    for agg_col in ['annual_income', 'credit_score']:
        # Group mean over observed values only (NaNs skipped). observed=True limits
        # the grouping to categories that actually occur, which is all `transform`
        # needs and avoids the categorical-groupby FutureWarning.
        group_mean = (
            raw_numeric[agg_col]
            .groupby(df_full[group_col], observed=True)
            .transform('mean')
        )
        df_full[f'{agg_col}_dev_by_{group_col}'] = df_full[agg_col] - group_mean

# D. Robust Scaling on all numeric features
numeric_features_to_scale = [c for c in df_full.columns if c not in CAT_COLS]
scaler = RobustScaler()
df_full[numeric_features_to_scale] = scaler.fit_transform(df_full[numeric_features_to_scale])

print("Feature Engineering Complete. Full shape:", df_full.shape)

# Separate back into training and testing sets (explicit copies, not views of df_full)
X = df_full.iloc[:len(y)].copy()
X_test = df_full.iloc[len(y):].copy()


# --- 3. Model Training (LightGBM with 5-Fold Cross-Validation) ---

print("\n--- Starting LightGBM Training (5-Fold CV) ---")

kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=GLOBAL_SEED)
# Out-of-fold predictions for the training rows, running fold average for the test rows
oof_preds, test_preds = np.zeros(len(X)), np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Fit options: validate on the held-out fold, stop after 100 rounds without
    # improvement, and hand LightGBM the categorical columns for native handling.
    fit_options = dict(
        eval_set=[(X_val, y_val)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],
        categorical_feature=CAT_COLS,
    )
    model = lgb.LGBMClassifier(**LGB_PARAMS)
    model.fit(X_train, y_train, **fit_options)

    # Positive-class probabilities: stored for the fold, accumulated for the test set
    val_proba = model.predict_proba(X_val)
    oof_preds[val_idx] = val_proba[:, 1]
    test_preds += model.predict_proba(X_test)[:, 1] / N_FOLDS

    print(f"Fold {fold+1} finished. Best iteration: {model.best_iteration_}")

oof_auc = roc_auc_score(y, oof_preds)
print(f"\nOverall OOF AUC (LightGBM): {oof_auc:.6f}")


# --- 4. Prediction and Submission ---
# test_preds already holds the fold-averaged probabilities.

submission_df = pd.DataFrame({'id': test_ids, TARGET: test_preds})
submission_df.to_csv('submission_lgbm_advanced.csv', index=False)

# Confirmation banner, file name and a short preview, one per line
print(
    "\n--- Submission File Created ---",
    "File: submission_lgbm_advanced.csv",
    f"Prediction head:\n{submission_df.head()}",
    sep="\n",
)

In [None]:

import seaborn as sns

In [None]:
# Load the competition training set into `train`.
train  = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap
corr = train.select_dtypes(include=['number']).corr()
sns.heatmap(data=corr, annot=True, cmap='coolwarm')

In [None]:
# Load the competition test set into `test`.
test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame
train.info()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Work on a copy of the training dataframe `train`
train = train.copy()

# Split the columns by dtype: object columns are categorical, everything else numeric
categorical_cols = train.select_dtypes(include=['object']).columns.tolist()
numeric_cols = train.select_dtypes(exclude=['object']).columns.tolist()

# One-hot encode every categorical column, keeping all levels (no drop_first)
train_encoded = pd.get_dummies(train, columns=categorical_cols, drop_first=False)

# Correlation matrix over all encoded columns, rounded to 2 decimals for readability
corr_matrix = train_encoded.corr().round(2)

# Draw the matrix as a zero-centred heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(24, 20))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Full Feature Correlation Matrix (All Features from Training Data)')
plt.show()


In [None]:
# Correlation of every encoded column with the target, strongest positive first
corr_with_target = corr_matrix['loan_paid_back'].sort_values(ascending=False)

# The first 15 entries are the most positive, the last 15 the most negative
print("Top Positive Correlations (features that increase likelihood of repayment):")
print(corr_with_target.iloc[:15])

print("\nTop Negative Correlations (features that decrease likelihood of repayment):")
print(corr_with_target.iloc[-15:])


In [None]:
import matplotlib.pyplot as plt

# Keep only the entries whose absolute correlation with the target exceeds 0.1
strong_mask = corr_with_target.abs() > 0.1
top_features = corr_with_target[strong_mask]

# Bar chart of the retained correlations on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
top_features.plot(kind='bar', color='red', ax=ax)
ax.set_title('Features Most Correlated with Loan Repayment')
ax.set_ylabel('Correlation with loan_paid_back')
plt.show()


In [None]:
# Column dtypes, non-null counts and memory usage of the training frame
train.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Redraw the one-hot correlation matrix at a smaller figure size
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix after One-Hot Encoding')
plt.show()


In [None]:
# Column dtypes, non-null counts and memory usage of the training frame
train.info()

In [None]:
import matplotlib.pyplot as plt

# One 30-bin histogram per numeric column of the training frame
hist_options = dict(bins=30, figsize=(15, 10), color='blue')
train.hist(**hist_options)
plt.gcf().suptitle("Train() Data Distributions", fontsize=16)
plt.show()

In [None]:
# Name of the label column in the training data
target = 'loan_paid_back'

In [None]:
import warnings
# Silence FutureWarnings issued from within seaborn so the plot output stays readable
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

def convert_all_to_numeric(train, test):
    """
    Return numeric copies of ``train`` and ``test``.

    Columns present in both frames are processed together: object/category
    columns are label-encoded with one shared mapping (labels are the string
    form of the values, sorted, numbered from 0), all other columns are coerced
    with ``pd.to_numeric(errors="coerce")``. Columns that exist only in
    ``train`` (e.g. the target) are converted the same way on their own.
    Infinities are then turned into NaN and NaNs are filled with each frame's
    own column means.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Input frames; neither is modified.

    Returns
    -------
    tuple
        ``(train_numeric, test_numeric, num_cols)`` where ``num_cols`` lists the
        numeric columns of the converted training frame, in column order.
    """
    import numpy as np
    import pandas as pd

    def _is_label_column(series):
        # Object and category columns are label-encoded; the same rule is used for
        # shared and train-only columns so the two branches cannot disagree.
        return series.dtype == "O" or str(series.dtype).startswith("category")

    def _label_codes(labels):
        # Sorted unique labels -> consecutive integer codes (label-encoder ordering)
        return {label: code for code, label in enumerate(sorted(set(labels)))}

    tr, te = train.copy(), test.copy()

    # Columns shared by both frames (train-only columns such as the target are handled below)
    common_cols = sorted(set(tr.columns).intersection(set(te.columns)))

    for col in common_cols:
        if _is_label_column(tr[col]):
            # The mapping is built on train + test so every label in either frame has a code.
            # NaN becomes the string "nan" and is encoded like any other label.
            tr_labels = tr[col].astype(str)
            te_labels = te[col].astype(str)
            code_of = _label_codes(pd.concat([tr_labels, te_labels], axis=0))
            tr[col] = tr_labels.map(code_of).astype("int64")
            te[col] = te_labels.map(code_of).astype("int64")
        else:
            tr[col] = pd.to_numeric(tr[col], errors="coerce")
            te[col] = pd.to_numeric(te[col], errors="coerce")

    # Train-only columns such as the target
    for col in set(tr.columns) - set(te.columns):
        if _is_label_column(tr[col]):
            tr_labels = tr[col].astype(str)
            tr[col] = tr_labels.map(_label_codes(tr_labels)).astype("int64")
        else:
            tr[col] = pd.to_numeric(tr[col], errors="coerce")

    # Clean infinities / NaNs: inf -> NaN, then NaN -> column mean of the same frame
    tr = tr.replace([np.inf, -np.inf], np.nan)
    te = te.replace([np.inf, -np.inf], np.nan)
    tr = tr.fillna(tr.mean(numeric_only=True))
    te = te.fillna(te.mean(numeric_only=True))

    num_cols = tr.select_dtypes(include="number").columns.tolist()
    return tr, te, num_cols


def dist_plots(train, test, num_features):
    """
    Plot a KDE (left) and a boxplot (right) per feature, comparing train and test.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain every column listed in ``num_features``.
    num_features : sequence of str
        Columns to plot, one subplot row each. An empty sequence only prints a
        notice and returns without creating a figure.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    print("\nDistribution analysis (all numeric/object columns converted)\n")

    num_features = list(num_features)
    n = len(num_features)
    if n == 0:
        # plt.subplots(0, 2) raises ValueError, so there is nothing to draw
        print("No features to plot.")
        return

    # Long-form frame with a "Source" column that tells train rows from test rows
    df = pd.concat(
        [train[num_features].assign(Source="Train"),
         test[num_features].assign(Source="Test")],
        axis=0, ignore_index=True
    )

    # squeeze=False always yields a 2-D axes array, including for a single feature
    fig, axes = plt.subplots(
        n, 2,
        figsize=(18, n * 4),
        gridspec_kw={"hspace": 0.3, "wspace": 0.2, "width_ratios": [0.70, 0.30]},
        squeeze=False
    )

    for i, col in enumerate(num_features):
        # KDE
        ax = axes[i, 0]
        sns.kdeplot(data=df, x=col, hue="Source",
                    palette=["#3cb371", "#0483ff"], ax=ax, linewidth=2)
        ax.set(xlabel="", ylabel="")
        ax.set_title(f"{col}")
        ax.grid()

        # Boxplot
        ax = axes[i, 1]
        sns.boxplot(data=df, y=col, x="Source", width=0.5,
                    linewidth=1, fliersize=1, ax=ax, palette=["#3cb371", "b"])
        ax.set(xlabel="", ylabel="")
        ax.set_title(f"{col}")
        ax.set_xticklabels(["Train", "Test"])

    plt.tight_layout()
    plt.show()
    
# Step 1: convert every column of both frames to numeric
tr_all, te_all, numeric_cols = convert_all_to_numeric(train, test)

print("Numeric columns used for distribution plots:", numeric_cols, sep="\n")

# Step 2: plot train vs. test distributions for every numeric column except the target
plot_features = [name for name in numeric_cols if name != target]
dist_plots(tr_all, te_all, plot_features)

In [None]:
# Every column of the training file, target included
train_cols = [
    'id', 'annual_income', 'debt_to_income_ratio', 'credit_score',
    'loan_amount', 'interest_rate', 'gender', 'marital_status',
    'education_level', 'employment_status', 'loan_purpose',
    'grade_subgrade', 'loan_paid_back',
]

# Print a separator, a heading and the full value counts for each column
separator = '**==' * 20
for column_name in train_cols:
    print(separator)
    print(f"Unique Values of column {column_name}")
    print(train[column_name].value_counts())
    

In [None]:
# Summary statistics of the training frame
train.describe()

**There's a better way to do this**

In [None]:
# Per column: number of distinct values and the ten most frequent ones (NaN counted too).
# The heading glyph was mis-encoded and printed as garbage; it is restored here.
for col in train_cols:
    print(f"\n{'='*80}")
    print(f"🔹 Column: {col}")
    print(f"Unique values: {train[col].nunique()}")
    print(train[col].value_counts(dropna=False).head(10))  # top 10 most common


In [None]:
# Two-column table: column name and its number of distinct values
summary = (
    train[train_cols]
    .nunique()
    .rename_axis('Column')
    .reset_index(name='Unique_Values')
)
summary


In [None]:
from ydata_profiling import ProfileReport

# Build a profiling report over the selected training columns and embed it in the notebook output
profile = ProfileReport(train[train_cols], title="Loan Data Report")
profile.to_notebook_iframe()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Target column
target = 'loan_paid_back'

# Skip high-cardinality columns
max_uniques = 30

# Select columns to visualize (exclude ID and target)
features = [c for c in train.columns if c not in ['id', target]]

# One figure per low-cardinality feature
for col in features:
    n_unique = train[col].nunique()
    dtype = train[col].dtype

    # Skip columns with too many unique values
    if n_unique > max_uniques:
        continue

    print(f"\n{'='*80}")
    print(f"Feature: {col} | Unique values: {n_unique} | Type: {dtype}")

    plt.figure(figsize=(7, 4))

    # pandas' dtype helpers accept extension dtypes (category, string, nullable
    # integers), which np.issubdtype rejects with a TypeError. Booleans are
    # excluded so the test keeps the original np.number semantics.
    is_numeric = (
        pd.api.types.is_numeric_dtype(dtype)
        and not pd.api.types.is_bool_dtype(dtype)
    )

    if is_numeric:
        # Numeric feature -> boxplot of its distribution per target class
        sns.boxplot(x=target, y=col, data=train, palette="Set2")
        plt.title(f"{col} vs. Loan Paid Back (Boxplot)")
    else:
        # Categorical feature -> mean of the target (repayment rate) per category
        temp = train.groupby(col)[target].mean().sort_values(ascending=False)
        sns.barplot(x=temp.index, y=temp.values, palette="viridis")
        plt.title(f"Repayment Probability by {col}")
        plt.ylabel("Mean(loan_paid_back)")
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
from scipy import stats
import numpy as np

target = 'loan_paid_back'

# Select numeric features only (target and id excluded)
numeric_cols = train.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [c for c in numeric_cols if c not in [target, 'id']]

anova_results = []

for col in numeric_cols:
    # Feature values split by target class, NaNs removed
    group1 = train.loc[train[target] == 1, col].dropna()
    group0 = train.loc[train[target] == 0, col].dropna()

    # Skip features where either class has fewer than two observations, matching
    # the guard used in the combined ANOVA + Chi-square cell.
    if len(group1) < 2 or len(group0) < 2:
        continue

    # Run ANOVA (F-test)
    f_stat, p_val = stats.f_oneway(group1, group0)

    anova_results.append({
        'Feature': col,
        'F-Statistic': f_stat,
        'p-Value': p_val
    })

# Create summary DataFrame; explicit columns keep the sort working even when no
# feature produced a result.
anova_df = pd.DataFrame(
    anova_results, columns=['Feature', 'F-Statistic', 'p-Value']
).sort_values(by='p-Value')
anova_df['Significant'] = anova_df['p-Value'] < 0.05

# The heading glyph was mis-encoded and printed as garbage; it is restored here.
print("🔹 ANOVA Results:")
display(anova_df)


In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder

target = 'loan_paid_back'

anova_results = []
chi2_results = []

# Split features into numeric & categorical (target and id excluded from both)
numeric_cols = train.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [c for c in numeric_cols if c not in [target, 'id']]

categorical_cols = [c for c in train.columns if c not in numeric_cols + [target, 'id']]

# --- 1. ANOVA for numeric features ---
for col in numeric_cols:
    group1 = train.loc[train[target] == 1, col].dropna()
    group0 = train.loc[train[target] == 0, col].dropna()
    # Only test features with more than one observation in each class
    if len(group1) > 1 and len(group0) > 1:
        f_stat, p_val = stats.f_oneway(group1, group0)
        anova_results.append({
            'Feature': col,
            'Test': 'ANOVA (numeric)',
            'Statistic': f_stat,
            'p-Value': p_val
        })

# --- 2. Chi-square for categorical features ---
for col in categorical_cols:
    # Category x target contingency table over rows where both are present
    temp = train[[col, target]].dropna()
    contingency = pd.crosstab(temp[col], temp[target])

    # Chi-square test (needs at least two categories)
    if contingency.shape[0] > 1:
        chi2, p_val, dof, ex = stats.chi2_contingency(contingency)
        chi2_results.append({
            'Feature': col,
            'Test': 'Chi-Square (categorical)',
            'Statistic': chi2,
            'p-Value': p_val
        })

# --- 3. Combine results ---
# Explicit columns keep the 'p-Value' lookups working even when neither test
# produced a row.
results_df = pd.DataFrame(
    anova_results + chi2_results,
    columns=['Feature', 'Test', 'Statistic', 'p-Value']
)
results_df['Significant'] = results_df['p-Value'] < 0.05
results_df = results_df.sort_values(by='p-Value')

# The heading glyph was mis-encoded and printed as garbage; it is restored here.
print("🔹 Feature Significance (ANOVA + Chi-Square):")
display(results_df)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Sort by test statistic (largest first) and plot one horizontal bar per feature
sorted_df = results_df.sort_values(by='Statistic', ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(
    data=sorted_df,
    x='Statistic',
    y='Feature',
    hue='Test',
    dodge=False,
    palette='viridis'
)
plt.title("Feature Significance (ANOVA + Chi-Square)")
# The superscript two in this axis label was mis-encoded; it is restored here.
plt.xlabel("Test Statistic (F / Chi²)")
plt.ylabel("Feature")
plt.legend(title="Test Type")
plt.tight_layout()
plt.show()


In [None]:
# Reload the three competition files: training set, test set and sample submission
competition_dir = '/kaggle/input/playground-series-s5e11'
train, test, sub = (
    pd.read_csv(f'{competition_dir}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation overview of the training frame `train`.

# Split the columns by dtype: object columns are treated as categorical,
# every other column as numeric.
categorical_cols = list(train.select_dtypes(include=['object']).columns)
numeric_cols = list(train.select_dtypes(exclude=['object']).columns)

# Expand each categorical column into indicator columns, keeping every level
# (drop_first=False), then compute pairwise correlations over all resulting
# columns, rounded to two decimals for readability.
train_encoded = pd.get_dummies(train, columns=categorical_cols, drop_first=False)
corr_matrix = train_encoded.corr().round(2)

# Render the matrix as a diverging heatmap centred on zero.
fig, ax = plt.subplots(figsize=(24, 20))
sns.heatmap(corr_matrix, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Full Feature Correlation Matrix (All Features from Training Data)')
plt.show()


## Check new ones

In [None]:
# # ==============================================================
# # 1. Imports
# # ==============================================================
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.metrics import roc_auc_score, roc_curve
# # Updated Imports: LightGBM, XGBoost, and the new CatBoost
# import lightgbm as lgb
# import xgboost as xgb
# from catboost import CatBoostClassifier 
# import matplotlib.pyplot as plt
# import seaborn as sns
# import warnings
# warnings.filterwarnings('ignore')

# # ==============================================================
# # 2. Load Data
# # ==============================================================
# # NOTE: File paths assume a Kaggle environment; adjust if running elsewhere
# train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
# test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
# sub = pd.read_csv('/kaggle/input/playground-series-s5e11/sample_submission.csv')

# # ==============================================================
# # 3. Basic Setup
# # ==============================================================
# X = train.drop(columns=['loan_paid_back', 'id'])
# y = train['loan_paid_back']
# X_test = test.drop(columns=['id'])

# # Identify categorical and numeric columns
# cat_cols = X.select_dtypes(include=['object']).columns.tolist()
# num_cols = X.select_dtypes(exclude=['object']).columns.tolist()

# print("Categorical:", cat_cols)
# print("Numerical:", num_cols)

# # ==============================================================
# # 4. Feature Preprocessing (Only Numeric/XGBoost/LightGBM)
# # NOTE: We keep the encoded data for models that require it (LGBM, XGBoost)
# # ==============================================================
# ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# ct = ColumnTransformer(
#     transformers=[('ohe', ohe, cat_cols)],
#     remainder='passthrough'
# )

# # Fit-transform train, transform test
# X_encoded = ct.fit_transform(X)
# X_test_encoded = ct.transform(X_test)

# # Convert to DataFrame with aligned columns
# encoded_cols = ct.get_feature_names_out()
# X_encoded = pd.DataFrame(X_encoded, columns=encoded_cols)
# X_test_encoded = pd.DataFrame(X_test_encoded, columns=encoded_cols)

# # ==============================================================
# # 5. Split Train/Validation for Curves
# # ==============================================================
# X_train, X_val, y_train, y_val = train_test_split(
#     X_encoded, y, test_size=0.2, random_state=42, stratify=y
# )

# # Create a second split using the UN-ENCODED data for CatBoost
# X_train_cat, X_val_cat, y_train_cat, y_val_cat = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )
# X_test_cat = X_test.copy()

# # ==============================================================
# # 6. CatBoost Classifier (The Special Something)
# # CatBoost natively handles categorical features, so we use the un-encoded data.
# # ==============================================================
# print("\n--- 6. CatBoost Classifier ---")
# # CatBoost's advanced optimization includes ordering principle and native feature handling
# cat_model = CatBoostClassifier(
#     iterations=600,
#     learning_rate=0.05,
#     eval_metric='AUC',
#     random_seed=42,
#     verbose=50,
#     early_stopping_rounds=30,
#     # Use the column names identified in Section 3
#     cat_features=cat_cols 
# )

# cat_model.fit(
#     X_train_cat, y_train_cat,
#     eval_set=[(X_val_cat, y_val_cat)]
# )

# y_val_pred_cat = cat_model.predict_proba(X_val_cat)[:, 1]
# auc_cat = roc_auc_score(y_val_cat, y_val_pred_cat)
# print(f"CatBoost AUC: {auc_cat:.4f}")

# # Plotting the training curve
# evals_result_cat = cat_model.get_evals_result()
# plt.figure(figsize=(6,5))
# plt.plot(evals_result_cat['validation']['AUC'], label='Validation AUC')
# plt.title('CatBoost AUC Training Curve')
# plt.xlabel('Iteration')
# plt.ylabel('AUC')
# plt.legend()
# plt.savefig('catboost_training_curve.png')
# plt.close()

# # Predict test
# cat_pred = cat_model.predict_proba(X_test_cat)[:, 1]
# pd.DataFrame({'id': test['id'], 'loan_paid_back': cat_pred}).to_csv('catboost_predictions.csv', index=False)


# # ==============================================================
# # 7. LightGBM (FIXED: Callbacks and Plotting)
# # ==============================================================
# print("\n--- 7. LightGBM ---")
# lgb_train = lgb.Dataset(X_train, y_train)
# lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

# params = {
#     'objective': 'binary',
#     'metric': 'auc',
#     'learning_rate': 0.05,
#     'num_leaves': 31,
#     'verbose': -1,
#     'seed': 42
# }

# evals_result_lgb = {}
# early_stopping = lgb.early_stopping(stopping_rounds=30, verbose=50)
# record_eval = lgb.record_evaluation(evals_result_lgb)

# lgb_model = lgb.train(
#     params,
#     lgb_train,
#     valid_sets=[lgb_train, lgb_val], 
#     num_boost_round=300,
#     callbacks=[early_stopping, record_eval]
# )

# y_val_pred_lgb = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration)
# auc_lgb = roc_auc_score(y_val, y_val_pred_lgb)
# print(f"LightGBM AUC: {auc_lgb:.4f}")

# lgb.plot_metric(evals_result_lgb) 
# plt.title('LightGBM AUC Training Curve')
# plt.savefig('lgbm_training_curve.png')
# plt.close()

# lgb_pred = lgb_model.predict(X_test_encoded, num_iteration=lgb_model.best_iteration)
# pd.DataFrame({'id': test['id'], 'loan_paid_back': lgb_pred}).to_csv('lgbm_predictions.csv', index=False)


# # ==============================================================
# # 8. XGBoost (FIXED: Eval Set and Plotting)
# # ==============================================================
# print("\n--- 8. XGBoost ---")
# xgb_model = xgb.XGBClassifier(
#     objective='binary:logistic',
#     eval_metric='auc',
#     use_label_encoder=False,
#     learning_rate=0.05,
#     n_estimators=300,
#     max_depth=6,
#     random_state=42
# )

# xgb_model.fit(
#     X_train, y_train,
#     eval_set=[(X_train, y_train), (X_val, y_val)], 
#     early_stopping_rounds=30, 
#     verbose=False
# )

# y_val_pred_xgb = xgb_model.predict_proba(X_val)[:,1]
# auc_xgb = roc_auc_score(y_val, y_val_pred_xgb)
# print(f"XGBoost AUC: {auc_xgb:.4f}")

# results = xgb_model.evals_result()
# plt.figure(figsize=(6,5))
# plt.plot(results['validation_0']['auc'], label='Train AUC')
# plt.plot(results['validation_1']['auc'], label='Validation AUC') 
# plt.title('XGBoost AUC Training Curve')
# plt.xlabel('Boosting Round')
# plt.ylabel('AUC')
# plt.legend()
# plt.savefig('xgboost_training_curve.png')
# plt.close()

# xgb_pred = xgb_model.predict_proba(X_test_encoded)[:,1]
# pd.DataFrame({'id': test['id'], 'loan_paid_back': xgb_pred}).to_csv('xgboost_predictions.csv', index=False)


# # ==============================================================
# # 9. Weighted Averaging Ensemble (Advanced Ensemble)
# # Simple ensemble using equal weights for demonstration
# # ==============================================================
# print("\n--- 9. Weighted Averaging Ensemble ---")

# # Combine predictions on the test set
# ensemble_pred = (
#     lgb_pred * 0.33 +
#     xgb_pred * 0.33 +
#     cat_pred * 0.34 # Slightly higher weight for CatBoost (arbitrary for demonstration)
# )

# # Create final submission file
# pd.DataFrame({
#     'id': test['id'], 
#     'loan_paid_back': ensemble_pred
# }).to_csv('ensemble_predictions.csv', index=False)

# # ==============================================================
# # 10. Summary
# # ==============================================================
# print(f"\nModel Summary (Validation AUC):")
# print(f"CatBoost:            {auc_cat:.4f} (Used un-encoded data)")
# print(f"LightGBM:            {auc_lgb:.4f} (Used encoded data)")
# print(f"XGBoost:             {auc_xgb:.4f} (Used encoded data)")
# print("Ensemble predictions saved to 'ensemble_predictions.csv'.")
# print("All individual predictions and training curves saved.")

In [None]:
# # Food for thought
# Model Summary (Validation AUC):
# CatBoost:            0.9169 (Used un-encoded data)
# LightGBM:            0.9200 (Used encoded data)
# XGBoost:             0.9187 (Used encoded data)
# Ensemble predictions saved to 'ensemble_predictions.csv'.
# All individual predictions and training curves saved.

In [None]:
# !kaggle competitions submit -c playground-series-s5e11 -f submission.csv -m "LightGBM AUC 0.9204"
