In [None]:
# Loan Default Prediction - Model Build

In [1]:
!pip install xgboost onnxmltools onnx scikit-learn skl2onnx

Collecting onnxmltools
  Downloading onnxmltools-1.14.0-py2.py3-none-any.whl.metadata (8.1 kB)
Collecting onnx
  Downloading onnx-1.18.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting skl2onnx
  Downloading skl2onnx-1.19.1-py3-none-any.whl.metadata (3.8 kB)
Downloading onnxmltools-1.14.0-py2.py3-none-any.whl (352 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m352.5/352.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnx-1.18.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m78.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading skl2onnx-1.19.1-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: onnx, onnxmltools, skl2onnx
Successfully installed onnx-1.18.0 onnxmltools-1.14.0 sk

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, accuracy_score
from xgboost import XGBClassifier
import joblib
import onnxmltools
from onnxmltools.convert.common.data_types import FloatTensorType
import onnx

In [3]:
# STEP 1: Load Dataset
df = pd.read_csv('/content/Loan_default_Selected_8_features.csv')
df = df.dropna()

feature_cols = ['Age', 'Income', 'LoanAmount', 'CreditScore',
                'MonthsEmployed', 'InterestRate', 'DTIRatio', 'LoanTerm']
target_col = 'Default'


In [4]:
X = df[feature_cols]
y = df[target_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)


In [5]:
# STEP 2: Train Scikit-learn XGB
xgb_model = XGBClassifier(random_state=8, use_label_encoder=False, eval_metric="logloss")
xgb_model.fit(X_train.values, y_train.values)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [6]:
# Save Model
joblib.dump(xgb_model, "loan_default_xgb_model.pkl")

['loan_default_xgb_model.pkl']

In [7]:
# Export to ONNX
initial_type = [('float_input', FloatTensorType([None, X_train.shape[1]]))]
onnx_model = onnxmltools.convert_xgboost(xgb_model, initial_types=initial_type)
onnx.save_model(onnx_model, "Loan_default_prediction_model.onnx")
print("✅ Scikit-learn model saved: .pkl and .onnx")

✅ Scikit-learn model saved: .pkl and .onnx


In [8]:
# STEP 3: Find Best Threshold
y_probs = xgb_model.predict_proba(X_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, y_probs)

In [9]:
# Select threshold maximizing Youden’s J (tpr - fpr)
best_thresh = thresholds[np.argmax(tpr - fpr)]
print(f"Best Threshold: {best_thresh:.4f}")

joblib.dump(best_thresh, "loan_default_threshold.pkl")


Best Threshold: 0.5070


['loan_default_threshold.pkl']

In [10]:
# STEP 4: Train Spark XGB
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from xgboost.spark import SparkXGBClassifier


In [11]:
spark = SparkSession.builder.appName("Loan_Default_Prediction_XGB").getOrCreate()
spark_df = spark.createDataFrame(df)


In [12]:
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
spark_df = assembler.transform(spark_df).select("features", target_col)


In [13]:
train, test = spark_df.randomSplit([0.7, 0.3], seed=42)

In [14]:
spark_xgb = SparkXGBClassifier(
    features_col="features", label_col=target_col,
    num_workers=2, max_depth=5, eta=0.1, num_round=200
)
spark_xgb_model = spark_xgb.fit(train)

INFO:XGBoost-PySpark:Running xgboost-3.0.4 on 2 workers with
	booster params: {'objective': 'binary:logistic', 'device': 'cpu', 'max_depth': 5, 'eta': 0.1, 'num_round': 200, 'nthread': 1}
	train_call_kwargs_params: {'verbose_eval': True, 'num_boost_round': 100}
	dmatrix_kwargs: {'nthread': 1, 'missing': nan}
INFO:XGBoost-PySpark:Finished xgboost training!


In [15]:
# Save Spark Booster as ONNX
booster = spark_xgb_model.get_booster()
booster.save_model("spark_xgb_model.json")

initial_type = [('float_input', FloatTensorType([None, len(feature_cols)]))]
onnx_model_spark = onnxmltools.convert_xgboost(booster, initial_types=initial_type)
onnx.save_model(onnx_model_spark, "loan_default_spark_xgb_model.onnx")

print("✅ Spark XGB model saved: .onnx")

✅ Spark XGB model saved: .onnx


In [16]:
spark.stop()