# Init

In [11]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas 
import numpy 
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("seaborn-v0_8")
sns.set_palette("crest")

In [2]:
sample = pandas.read_csv("sample_submission.csv")
train = pandas.read_csv("train.csv")
test = pandas.read_csv("test.csv")

# XG BOOST 

In [159]:
X = train.drop(columns=['id', 'loan_paid_back']).copy()
y = train['loan_paid_back'].copy()

def build_pipeline(X,cols):
    X_sub = X[cols]
    num_cols = X_sub.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_cols = X_sub.select_dtypes(include=['object']).columns.tolist()

    numeric_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
    ])
    categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ])
    preprocessor = ColumnTransformer([
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ], remainder="drop")

    clf = Pipeline([
        ("pre", preprocessor),
        ("clf", XGBClassifier(
            use_label_encoder=False,
            eval_metric="auc",
            n_estimators=300,
            learning_rate=0.03,
            max_depth=4,
            min_child_weight=3,
            gamma=0.2,
            subsample=0.8,
            colsample_bytree=0.8,
            scale_pos_weight=4,          # <---- IMPORTANT
            n_jobs=-1,
            random_state=42
        ))
    ])
    return clf

In [128]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, roc_auc_score,f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.calibration import calibration_curve
from sklearn.model_selection import cross_val_predict

def get_metrics(X,cols,y = train['loan_paid_back'].copy()):
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    base_clf = build_pipeline(X,cols)
    base_auc = cross_val_score(base_clf, X[cols], y, scoring="roc_auc", cv=cv, n_jobs=-1).mean()
    base_acc = cross_val_score(base_clf, X[cols], y, scoring="accuracy", cv=cv, n_jobs=-1).mean()


    # ensure labels are integer
    y = y.astype(int)

    # get cross-validated predictions and probabilities (uses base_clf / XGB pipeline)
    preds = cross_val_predict(base_clf, X[cols], y, cv=cv, method="predict", n_jobs=-1)
    probs = cross_val_predict(base_clf, X[cols], y, cv=cv, method="predict_proba", n_jobs=-1)[:, 1]

    sns.set_style("whitegrid")


    fig, axes = plt.subplots(2, 2, figsize=(12, 9))
    ax = axes.ravel()

    # overall histogram + KDE
    sns.histplot(probs, bins=40, kde=True, stat="density", color="C0", ax=ax[0])
    ax[0].set_title("Predicted probability (P(class=1)) — overall")
    ax[0].set_xlabel("P(class=1)")

    # class-wise distributions
    sns.histplot(probs[y == 0], bins=40, color="C1", label="true class 0", stat="density", alpha=0.6, ax=ax[1])
    sns.histplot(probs[y == 1], bins=40, color="C2", label="true class 1", stat="density", alpha=0.6, ax=ax[1])
    ax[1].legend()
    ax[1].set_title("Predicted probabilities by true class")

    # calibration curve
    prob_true, prob_pred = calibration_curve(y, probs, n_bins=10)
    ax[2].plot(prob_pred, prob_true, marker="o")
    ax[2].plot([0, 1], [0, 1], "--", color="gray")
    ax[2].set_xlabel("Mean predicted prob")
    ax[2].set_ylabel("Fraction of positives")
    ax[2].set_title("Calibration curve")

    # empirical CDF
    sorted_p = np.sort(probs)
    ecdf = np.arange(1, len(sorted_p) + 1) / len(sorted_p)
    ax[3].plot(sorted_p, ecdf)
    ax[3].set_xlabel("P(class=1)")
    ax[3].set_ylabel("ECDF")
    ax[3].set_title("Empirical CDF of predicted probabilities")

    plt.tight_layout()
    plt.show()

    # quick numeric summaries
    print("Model used: XGBClassifier (wrapped in pipeline)")
    print("Mean prob:", np.mean(probs).round(4), "Median prob:", np.median(probs).round(4))
    for t in (0.25, 0.5, 0.75):
        print(f"Fraction <= {t}: {(probs <= t).mean():.3f}")
    try:
        prec_class_0 = precision_score(y, preds, pos_label=0)
        recall_class_0 = recall_score(y, preds, pos_label=0)
        f1 =f1_score(y, preds, pos_label=0)
        auc = roc_auc_score(y, probs)
        print(f"Precision (class 0): {prec_class_0:.4f}")
        print(f"Recall    (class 0): {recall_class_0:.4f}")
        print(f"F1: {f1}")
        print(f"AUC    : {auc:.4f}")
    except NameError:
        pass



In [168]:
X_test

Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_to_income,effective_burden,effective_income,rate_stress,emi_estimate
0,10.267507,0.049,626,11461.42,14.73,Female,Single,High School,Employed,Other,D5,0.398228,561.60958,28780.099,16442.814520,14068.893050
1,10.749943,0.093,732,15492.25,12.85,Female,Married,Master's,Employed,Other,C1,0.332264,1440.77925,46625.483,18518.740500,16589.617708
2,10.914286,0.367,611,3796.41,13.29,Male,Single,Bachelor's,Employed,Debt consolidation,D1,0.069082,1393.28247,54954.257,4622.774980,4204.524075
3,10.152128,0.110,671,6574.30,9.57,Female,Single,Bachelor's,Employed,Debt consolidation,C3,0.256362,723.17300,25643.740,6197.326128,5243.004250
4,10.133434,0.081,688,17696.89,12.80,Female,Married,PhD,Employed,Business,C1,0.703105,1433.44809,25168.721,22353.745319,18876.682667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254564,11.438600,0.068,744,29704.00,13.48,Female,Single,Bachelor's,Employed,Debt consolidation,B2,0.319962,2019.87200,92835.038,35005.150297,33367.493333
254565,10.796458,0.091,634,20284.33,9.58,Female,Married,High School,Employed,Debt consolidation,D4,0.415267,1845.87403,48845.561,17998.855164,16193.656783
254566,9.936415,0.096,718,26387.55,9.00,Male,Single,Master's,Employed,Debt consolidation,C4,1.276702,2533.20480,20667.616,23900.766998,19790.662500
254567,10.437231,0.094,739,11107.36,9.81,Male,Single,Bachelor's,Employed,Business,C2,0.325680,1044.09184,34104.184,10439.856998,9080.266800


# Feature Engineering

In [29]:
y = y.astype(int)
obj_cols = X.select_dtypes(include=["object"]).columns.tolist()
for col in obj_cols:
    X[col] = X[col].apply(lambda v: v.lower() if isinstance(v, str) else v)


In [89]:
X_altered = train.drop(columns=['id','loan_paid_back']).copy()
#batch 3
X_altered["grade"] = X_altered["grade_subgrade"].str[0]
X_altered["subgrade_num"] = X_altered["grade_subgrade"].str[1:].astype(int)
#batch 1
X_altered['loan_to_income'] = X_altered['loan_amount'] / X_altered['annual_income']
X_altered['effective_burden'] = X_altered['debt_to_income_ratio'] * X_altered['loan_amount']
X_altered['effective_income'] = X_altered['annual_income'] - (1-X_altered['debt_to_income_ratio'])
X_altered["annual_income"] = np.log1p(X_altered["annual_income"])
#batch 2 
X_altered["rate_stress"] = X_altered['interest_rate'] * X_altered['loan_amount'] / X_altered['annual_income']
X_altered["emi_estimate"] = X_altered['interest_rate']/12 * X_altered['loan_amount']


In [194]:
base_clf = build_pipeline(X_altered, batch_two_cols)
base_clf.fit(X_altered[batch_two_cols],y)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,steps,"[('pre', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [195]:
t_probs = base_clf.predict_proba(X_altered[batch_two_cols])[:,1]
t_preds = (t_probs>0.5).astype(int)

In [196]:
prec0   = precision_score(y, t_preds, pos_label=0)
recall0 = recall_score(y, t_preds, pos_label=0)
f1_0    = f1_score(y, t_preds, pos_label=0)
auc     = roc_auc_score(y,t_probs)

print("Model: CatBoostClassifier")
print(f"Precision (class 0): {prec0:.4f}")
print(f"Recall    (class 0): {recall0:.4f}")
print(f"F1        (class 0): {f1_0:.4f}")
print(f"AUC       : {auc:.4f}")

Model: CatBoostClassifier
Precision (class 0): 0.9695
Recall    (class 0): 0.5019
F1        (class 0): 0.6614
AUC       : 0.9163


In [197]:
t2_probs = base_clf.predict_proba(X_test[batch_two_cols])[:,1]
test_preds = (t2_probs>0.1).astype(int)
sample["loan_paid_back"] = t2_probs
sample.to_csv("submission_xgtboost.csv", index=False)

In [119]:
batch_one_cols = ['annual_income',
 'debt_to_income_ratio',
 'credit_score',
 'loan_amount',
 'interest_rate',
 'gender',
 'marital_status',
 'education_level',
 'employment_status',
 'loan_purpose',
 'grade_subgrade', 'loan_to_income', 'effective_burden','effective_income']

batch_two_cols = batch_one_cols+['rate_stress','emi_estimate']

batch_three_cols = batch_two_cols+['grade','subgrade_num']
batch_three_cols.remove('grade_subgrade')

# Catboost

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from catboost import CatBoostClassifier
import numpy as np

def build_catboost_pipeline(X_sup, cols):
    X = X_sup[cols]
    num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()

    # numeric imputation only
    preprocessor = ColumnTransformer([
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", "passthrough", cat_cols),
    ])

    # We need a helper wrapper so we can dynamically inject cat indices
    def catboost_with_cats():
        # numeric come first, then categorical (ColumnTransformer order)
        cat_indices = list(range(len(num_cols), len(num_cols) + len(cat_cols)))

        return CatBoostClassifier(
            iterations=600,
            learning_rate=0.05,
            depth=6,
            loss_function="Logloss",
            eval_metric="AUC",
            thread_count=-1,
            random_seed=42,
            l2_leaf_reg=3,
            border_count=128,
            class_weights=[4, 1],
            cat_features=cat_indices,   # ← IMPORTANT FIX
        )

    clf = Pipeline([
        ("pre", preprocessor),
        ("clf", catboost_with_cats()),
    ])

    return clf


In [None]:
from joblib import parallel_config
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

clf = build_catboost_pipeline(X_altered, batch_two_cols)

# numeric labels
y = train['loan_paid_back'].astype(int)

with parallel_config(backend='threading', prefer="threads", temp_folder=None, tqdm=True):
    # This call will now show a tqdm progress bar
    probs = cross_val_predict(clf, X_altered, y, cv=cv, method="predict_proba")[:,1]
preds = (probs > 0.5).astype(int)

prec0   = precision_score(y, preds, pos_label=0)
recall0 = recall_score(y, preds, pos_label=0)
f1_0    = f1_score(y, preds, pos_label=0)
auc     = roc_auc_score(y, probs)

print("Model: CatBoostClassifier")
print(f"Precision (class 0): {prec0:.4f}")
print(f"Recall    (class 0): {recall0:.4f}")
print(f"F1        (class 0): {f1_0:.4f}")
print(f"AUC       : {auc:.4f}")


In [None]:
clf.fit(X_altered[batch_two_cols], y)

In [163]:
t_probs = clf.predict_proba(X_altered[batch_two_cols])[:,1]
t_preds = (t_probs>0.5).astype(int)

In [164]:
prec0   = precision_score(y, t_preds, pos_label=0)
recall0 = recall_score(y, t_preds, pos_label=0)
f1_0    = f1_score(y, t_preds, pos_label=0)
auc     = roc_auc_score(y,t_probs)

print("Model: CatBoostClassifier")
print(f"Precision (class 0): {prec0:.4f}")
print(f"Recall    (class 0): {recall0:.4f}")
print(f"F1        (class 0): {f1_0:.4f}")
print(f"AUC       : {auc:.4f}")

Model: CatBoostClassifier
Precision (class 0): 0.6285
Recall    (class 0): 0.7903
F1        (class 0): 0.7002
AUC       : 0.9197


In [148]:
X_test =test.drop(columns=['id',]).copy()
#batch 1
X_test['loan_to_income'] = X_test['loan_amount'] / X_test['annual_income']
X_test['effective_burden'] = X_test['debt_to_income_ratio'] * X_test['loan_amount']
X_test['effective_income'] = X_test['annual_income'] - (1-X_test['debt_to_income_ratio'])
X_test["annual_income"] = np.log1p(X_test["annual_income"])
#batch 2 
X_test["rate_stress"] = X_test['interest_rate'] * X_test['loan_amount'] / X_test['annual_income']
X_test["emi_estimate"] = X_test['interest_rate']/12 * X_test['loan_amount']


In [199]:
test_probs = clf.predict_proba(X_test[batch_two_cols])[:,1]

In [200]:
test_preds = (test_probs>0.1).astype(int)
sample["loan_paid_back"] = test_probs


In [201]:
sample.to_csv("submission_catboost.csv", index=False)

In [174]:
X_test[batch_two_cols].head(5)


Unnamed: 0,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_to_income,effective_burden,effective_income,rate_stress,emi_estimate
0,10.267507,0.049,626,11461.42,14.73,Female,Single,High School,Employed,Other,D5,0.398228,561.60958,28780.099,16442.81452,14068.89305
1,10.749943,0.093,732,15492.25,12.85,Female,Married,Master's,Employed,Other,C1,0.332264,1440.77925,46625.483,18518.7405,16589.617708
2,10.914286,0.367,611,3796.41,13.29,Male,Single,Bachelor's,Employed,Debt consolidation,D1,0.069082,1393.28247,54954.257,4622.77498,4204.524075
3,10.152128,0.11,671,6574.3,9.57,Female,Single,Bachelor's,Employed,Debt consolidation,C3,0.256362,723.173,25643.74,6197.326128,5243.00425
4,10.133434,0.081,688,17696.89,12.8,Female,Married,PhD,Employed,Business,C1,0.703105,1433.44809,25168.721,22353.745319,18876.682667


# FIXING SHIT

In [None]:
X_lat = X_altered[batch_two_cols]
y =train["loan_paid_back"].astype(int)

In [177]:
from sklearn.model_selection import train_test_split

X_dev, X_val, y_dev, y_val = train_test_split(
    X_lat[batch_two_cols],
    train["loan_paid_back"].astype(int),
    test_size=0.2,
    random_state=42,
    stratify=train["loan_paid_back"]
)

In [178]:
from catboost import CatBoostClassifier, Pool

cat_cols = X_dev.select_dtypes(include='object').columns.tolist()

train_pool = Pool(X_dev, y_dev, cat_features=cat_cols)
val_pool   = Pool(X_val, y_val, cat_features=cat_cols)

model = CatBoostClassifier(
    iterations=800,
    learning_rate=0.05,
    depth=6,
    loss_function="Logloss",
    eval_metric="AUC",
    class_weights=[4,1],
    random_seed=42,
    verbose=False
)

model.fit(train_pool, eval_set=val_pool)


<catboost.core.CatBoostClassifier at 0x2b2f2f18560>

In [217]:
val_probs_cat = model.predict_proba(val_pool)[:, 1]
val_preds = (val_probs_cat > 0.5).astype(int)
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score

print("VAL Precision_0:", precision_score(y_val, val_preds, pos_label=0))
print("VAL Recall_0:", recall_score(y_val, val_preds, pos_label=0))
print("VAL F1_0:", f1_score(y_val, val_preds, pos_label=0))
print("VAL AUC:", roc_auc_score(y_val, val_probs_cat))
print("VAL Accuracy:", accuracy_score(y_val, val_preds))

VAL Precision_0: 0.6397287083277096
VAL Recall_0: 0.7932635983263598
VAL F1_0: 0.7082710699342498
VAL AUC: 0.9221920062911193
VAL Accuracy: 0.8685342469212703


In [188]:
full_pool= Pool(X_lat, y, cat_features=cat_cols)
model = CatBoostClassifier(
    iterations=800,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    class_weights=[4,1],
    eval_metric="AUC",
    random_seed=42,
    verbose=False
)
model.fit(full_pool)

<catboost.core.CatBoostClassifier at 0x2b2f2b5a120>

In [211]:
test_pool = Pool(X_test[batch_two_cols], cat_features=cat_cols)
test_probs_cat = model.predict_proba(test_pool)[:, 1]


In [191]:
submission = pandas.DataFrame({
    "id": test["id"],
    "loan_paid_back": test_probs
})
submission.to_csv("submission_please_work.csv", index=False)

# LGBM

In [202]:
import lightgbm as lgb

# Copy engineered features
X_lgb = X_lat.copy()
y_lgb = train["loan_paid_back"].astype(int)

# Convert categorical columns to category dtype
cat_cols = ['gender', 'marital_status', 'education_level',
            'employment_status', 'loan_purpose', 'grade_subgrade']

for col in cat_cols:
    X_lgb[col] = X_lgb[col].astype("category")


In [203]:
from sklearn.model_selection import train_test_split

X_dev, X_val, y_dev, y_val = train_test_split(
    X_lgb, y_lgb,
    test_size=0.2,
    random_state=42,
    stratify=y_lgb
)


In [204]:
train_set = lgb.Dataset(X_dev, label=y_dev, categorical_feature=cat_cols)
val_set   = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_cols)

In [205]:
params = {
    "objective": "binary",
    "metric": "auc",
    "boosting_type": "gbdt",
    "learning_rate": 0.03,
    "num_leaves": 64,
    "max_depth": -1,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "lambda_l2": 2.0,
    "min_data_in_leaf": 40,
    "verbosity": -1,
    "is_unbalance": True,  # handles class imbalance
    "seed": 42
}

In [207]:
lgb_model = lgb.train(
    params,
    train_set,
    valid_sets=[train_set, val_set],
    valid_names=["train", "val"],
    num_boost_round=3000,
    callbacks=[
        lgb.early_stopping(stopping_rounds=200),
        lgb.log_evaluation(period=200)
    ],
)
# ..

Training until validation scores don't improve for 200 rounds
[200]	train's auc: 0.924402	val's auc: 0.919687
[400]	train's auc: 0.930121	val's auc: 0.920664
[600]	train's auc: 0.9348	val's auc: 0.920977
[800]	train's auc: 0.938972	val's auc: 0.920993
[1000]	train's auc: 0.942595	val's auc: 0.921026
[1200]	train's auc: 0.946102	val's auc: 0.92105
Early stopping, best iteration is:
[1154]	train's auc: 0.945305	val's auc: 0.921091


In [216]:
val_probs_lgbm = lgb_model.predict(X_val)
val_preds = (val_probs_lgbm > 0.5).astype(int)

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

print("VAL AUC:", roc_auc_score(y_val, val_probs_lgbm))
print("VAL Precision_0:", precision_score(y_val, val_preds, pos_label=0))
print("VAL Recall_0:", recall_score(y_val, val_preds, pos_label=0))
print("VAL F1_0:", f1_score(y_val, val_preds, pos_label=0))


VAL AUC: 0.9210913002817662
VAL Precision_0: 0.6471420193866639
VAL Recall_0: 0.7849372384937239
VAL F1_0: 0.7094102743481632


In [209]:
test_lgb = X_test[batch_two_cols].copy()
for col in cat_cols:
    test_lgb[col] = test_lgb[col].astype("category")

test_probs_lgb = lgb_model.predict(test_lgb)


In [210]:
submission = pandas.DataFrame({
    "id": test["id"],
    "loan_paid_back": test_probs_lgb
})
submission.to_csv("submission_lgbm.csv", index=False)

In [None]:
final_probs = (test_probs_lgb + test_probs_cat) / 2


In [218]:
best_auc = 0
best_w = None

for w in np.linspace(0.1, 0.9, 17):
    blended = w * val_probs_cat + (1-w) * val_probs_lgbm
    score = roc_auc_score(y_val, blended)
    if score > best_auc:
        best_auc = score
        best_w = w

print(best_w, best_auc)


0.7000000000000001 0.9224152037261726


In [223]:
blended = ((1-best_w) * test_probs_lgb + best_w*test_probs_cat)

In [225]:
submission = pandas.DataFrame({
    "id": test["id"],
    "loan_paid_back": blended
})
submission.to_csv("submission_ensemble_blend.csv", index=False)