### Set up MLflow tracking

In [26]:
import os

import mlflow

# Tracking server endpoint. The standard MLFLOW_TRACKING_URI environment variable
# overrides the default, so the notebook can be pointed at another server without
# editing this cell; when it is unset the original server is used.
tracking_uri = os.environ.get("MLFLOW_TRACKING_URI", "http://54.236.35.141:5000/")
mlflow.set_tracking_uri(tracking_uri)

# Select the experiment that subsequent logging calls are recorded under.
# As the cell's last expression, the returned Experiment object is displayed.
mlflow.set_experiment("model-monitors")


<Experiment: artifact_location='s3://loan-prediction-artifacts/2', creation_time=1768247401172, experiment_id='2', last_update_time=1768247401172, lifecycle_stage='active', name='model-monitors', tags={'mlflow.experimentKind': 'custom_model_development'}>

## Train a model

In [27]:
import pandas as pd

# Location of the training data, relative to the notebook's working directory.
TRAIN_CSV = "dataset/train.csv"

# Read the whole training set into memory as a DataFrame.
df = pd.read_csv(TRAIN_CSV)

In [28]:
# Display the loaded frame as the cell output.
df

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade,loan_paid_back
0,0,29367.99,0.084,736,2528.42,13.67,Female,Single,High School,Self-employed,Other,C3,1.0
1,1,22108.02,0.166,636,4593.10,12.92,Male,Married,Master's,Employed,Debt consolidation,D3,0.0
2,2,49566.20,0.097,694,17005.15,9.76,Male,Single,High School,Employed,Debt consolidation,C5,1.0
3,3,46858.25,0.065,533,4682.48,16.10,Female,Single,High School,Employed,Debt consolidation,F1,1.0
4,4,25496.70,0.053,665,12184.43,10.21,Male,Married,High School,Employed,Other,D1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
593989,593989,23004.26,0.152,703,20958.37,10.92,Female,Single,High School,Employed,Business,C3,1.0
593990,593990,35289.43,0.105,559,3257.24,14.62,Male,Single,Bachelor's,Employed,Debt consolidation,F5,1.0
593991,593991,47112.64,0.072,675,929.27,14.13,Female,Married,Bachelor's,Employed,Debt consolidation,C1,1.0
593992,593992,76748.44,0.067,740,16290.40,9.87,Male,Single,Bachelor's,Employed,Debt consolidation,B2,1.0


In [29]:
# List the column names as a plain Python list.
df.columns.tolist()

['id',
 'annual_income',
 'debt_to_income_ratio',
 'credit_score',
 'loan_amount',
 'interest_rate',
 'gender',
 'marital_status',
 'education_level',
 'employment_status',
 'loan_purpose',
 'grade_subgrade',
 'loan_paid_back']

In [30]:
# Partition the column names by dtype: object-typed columns are treated as
# categorical, everything else as numerical.
object_frame = df.select_dtypes(include=["object"])
non_object_frame = df.select_dtypes(exclude=["object"])

cat_cols = list(object_frame.columns)
num_cols = list(non_object_frame.columns)

print(f"Categorical columns: {cat_cols}")
print(f"Numerical columns: {num_cols}")

Categorical columns: ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade']
Numerical columns: ['id', 'annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate', 'loan_paid_back']


In [31]:
# Summary statistics for the categorical columns.
df[cat_cols].describe()

Unnamed: 0,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
count,593994,593994,593994,593994,593994,593994
unique,3,4,5,5,8,30
top,Female,Single,Bachelor's,Employed,Debt consolidation,C3
freq,306175,288843,279606,450645,324695,58695


In [32]:
# Summary statistics for the numerical columns.
df[num_cols].describe()

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,loan_paid_back
count,593994.0,593994.0,593994.0,593994.0,593994.0,593994.0,593994.0
mean,296996.5,48212.202976,0.120696,680.916009,15020.297629,12.356345,0.79882
std,171471.442236,26711.942078,0.068573,55.424956,6926.530568,2.008959,0.400883
min,0.0,6002.43,0.011,395.0,500.09,3.2,0.0
25%,148498.25,27934.4,0.072,646.0,10279.62,10.99,1.0
50%,296996.5,46557.68,0.096,682.0,15000.22,12.37,1.0
75%,445494.75,60981.32,0.156,719.0,18858.58,13.68,1.0
max,593993.0,393381.74,0.627,849.0,48959.95,20.99,1.0


In [33]:
from sklearn.model_selection import train_test_split

# Name of the label column; every other column is used as a feature.
TARGET = "loan_paid_back"

y = df[TARGET]
# NOTE(review): `id` stays in X and is therefore used as a model feature; it looks
# like a row identifier -- confirm whether it should be excluded.
X = df.drop(columns=[TARGET])

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [34]:
# Target encoding of the categorical columns.
# The encoder is fitted on the training split only (using y_train) and the fitted
# mapping is then applied to the test split, so test labels are never seen by it.
# NOTE: both assignments overwrite the categorical columns of X_train / X_test in
# place, so re-running this cell without first re-running the split cell would feed
# already-encoded values back into the encoder.
import category_encoders as ce
encoder = ce.TargetEncoder(cols=cat_cols)
X_train[cat_cols] = encoder.fit_transform(X_train[cat_cols], y_train)
X_test[cat_cols] = encoder.transform(X_test[cat_cols])
# Display the encoded training frame as the cell output.
X_train

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
29245,29245,46557.68,0.196,637,17736.94,14.24,0.796315,0.797130,0.810160,0.078334,0.797570,0.713594
272587,272587,36459.09,0.092,703,2746.15,12.98,0.796315,0.800087,0.789336,0.894382,0.811056,0.835771
464723,464723,58848.71,0.077,659,7286.64,10.40,0.802359,0.799222,0.789336,0.894382,0.797570,0.721923
135096,135096,49174.48,0.078,739,23461.42,10.94,0.802359,0.800087,0.810160,0.894382,0.824578,0.843676
39097,39097,54492.87,0.093,691,8378.22,9.40,0.796315,0.799222,0.810160,0.894382,0.797570,0.851439
...,...,...,...,...,...,...,...,...,...,...,...,...
110268,110268,63490.98,0.063,668,16960.02,9.89,0.796315,0.800087,0.789336,0.894382,0.797570,0.721923
259178,259178,49384.88,0.082,659,7862.06,12.62,0.796315,0.799222,0.810160,0.894382,0.778130,0.715089
365838,365838,38999.04,0.111,703,19900.32,11.08,0.796315,0.799222,0.789336,0.078334,0.797570,0.861390
131932,131932,53920.25,0.050,626,20306.85,14.29,0.796315,0.799222,0.803659,0.894382,0.800642,0.715089


In [35]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree with only the seed set explicitly; every other hyperparameter is
# left at the library default.
model = DecisionTreeClassifier(random_state=42)

In [36]:
# Fit the tree on the training split; the fitted estimator is shown as the cell output.
model.fit(X_train, y_train)

0,1,2
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`.",'gini'
,"splitter  splitter: {""best"", ""random""}, default=""best"" The strategy used to choose the split at each node. Supported strategies are ""best"" to choose the best split and ""random"" to choose the best random split.",'best'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: int, float or {""sqrt"", ""log2""}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at  each split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. note::  The search for a split does not stop until at least one  valid partition of the node samples is found, even if it requires to  effectively inspect more than ``max_features`` features.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the randomness of the estimator. The features are always randomly permuted at each split, even if ``splitter`` is set to ``""best""``. When ``max_features < n_features``, the algorithm will select ``max_features`` at random at each split before finding the best split among them. But the best found split may vary across different runs, even if ``max_features=n_features``. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed to an integer. See :term:`Glossary ` for details.",42
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0


In [37]:
# Show the first training row as a plain Python list of feature values.
X_train.iloc[0].tolist()

[29245.0,
 46557.68,
 0.196,
 637.0,
 17736.94,
 14.24,
 0.796314823659536,
 0.7971295060080107,
 0.8101600329548193,
 0.07833383003599184,
 0.7975704405061509,
 0.7135939741750359]

In [38]:
# Sanity-check prediction for the first training row.
# iloc[[0]] (note the double brackets) keeps a one-row DataFrame, so the column names
# seen during fit travel with the values. A bare list carries no names and makes
# scikit-learn emit an "X does not have valid feature names" UserWarning.
model.predict(X_train.iloc[[0]])



array([0.])

In [39]:
# Display the held-out labels.
y_test

404674    1.0
549728    1.0
125237    0.0
512666    1.0
101001    1.0
         ... 
213431    1.0
71672     1.0
565660    0.0
440849    1.0
262133    1.0
Name: loan_paid_back, Length: 178199, dtype: float64

In [40]:
# Display the held-out features.
X_test

Unnamed: 0,id,annual_income,debt_to_income_ratio,credit_score,loan_amount,interest_rate,gender,marital_status,education_level,employment_status,loan_purpose,grade_subgrade
404674,404674,52470.61,0.241,724,27172.82,13.35,0.796315,0.800087,0.832303,0.894382,0.797570,0.851439
549728,549728,28424.82,0.033,779,15895.96,11.57,0.796315,0.800087,0.803659,0.894382,0.802576,0.936005
125237,125237,25229.55,0.195,569,15216.06,13.83,0.802359,0.799222,0.803289,0.894382,0.797570,0.633715
512666,512666,91612.58,0.166,659,13166.90,12.47,0.802359,0.799222,0.803289,0.894382,0.782000,0.733615
101001,101001,79712.79,0.079,767,23642.37,11.23,0.796315,0.799222,0.789336,0.894382,0.778130,0.915598
...,...,...,...,...,...,...,...,...,...,...,...,...
213431,213431,38768.55,0.130,683,2280.72,13.15,0.796315,0.800087,0.810160,0.894382,0.782000,0.835771
71672,71672,25684.40,0.042,753,6014.89,10.36,0.802359,0.799222,0.789336,0.894382,0.797570,0.936005
565660,565660,14145.21,0.087,667,1352.92,12.94,0.802359,0.800087,0.789336,0.078334,0.797570,0.721923
440849,440849,20676.22,0.084,674,26015.86,10.97,0.796315,0.799222,0.810160,0.894382,0.797570,0.851439


In [41]:
from sklearn.metrics import accuracy_score, recall_score, precision_score

# Predict once on the held-out split, then score those predictions three ways.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

# Report the scores in a fixed order: accuracy, recall, precision.
for label, value in (
    ("Accuracy", accuracy),
    ("Recall", recall),
    ("Precision", precision),
):
    print(f"{label} on test set:", value)



Accuracy on test set: 0.8480126151100735
Recall on test set: 0.8993821339601131
Precision on test set: 0.9090546980581834


In [42]:
# Record the run configuration. No run is started explicitly in this cell; the
# logging calls go to whichever run MLflow's module-level API has active.
run_params = {
    "model_type": "DecisionTreeClassifier",
    "random_state": 42,
    "test_size": 0.3,
}
mlflow.log_params(run_params)

# Record the evaluation scores, one metric per call, in a fixed order.
test_metrics = {
    "accuracy": accuracy,
    "recall": recall,
    "precision": precision,
}
for metric_name, metric_value in test_metrics.items():
    mlflow.log_metric(metric_name, metric_value)

## Model Saving

### ONNX

In [43]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Model input signature: a float tensor with a free batch dimension (None) and one
# column per feature. The width is read from the fitted model itself, so the ONNX
# signature always matches what the model was trained on, even if preprocessing
# changes the column count relative to the raw frame.
n_features = model.n_features_in_
initial_type = [('float_input', FloatTensorType([None, n_features]))]

# Create the ONNX model from the fitted estimator.
onx = convert_sklearn(
    model,
    initial_types=initial_type)

# Write the serialized ONNX graph to disk.
with open("model.onnx", "wb") as f:
    f.write(onx.SerializeToString())

### Pickle

In [44]:
import pickle

# Persist the fitted estimator next to the notebook in pickle format.
with open("model.pkl", "wb") as pickle_file:
    pickle.dump(model, pickle_file)

### Version model in MLflow registry

In [45]:
# Log the fitted scikit-learn model to the active run under the name "model".
# NOTE(review): no registered_model_name is passed, so this presumably logs the model
# to the run without creating a Model Registry version -- confirm that is intended.
mlflow.sklearn.log_model(model, "model")
# Log the ONNX export of the same model under "onnx-model".
mlflow.onnx.log_model(onnx_model=onx, artifact_path="onnx-model")
# Close the active run so later logging calls do not attach to it.
mlflow.end_run()



🏃 View run painted-snail-143 at: http://54.236.35.141:5000/#/experiments/2/runs/8ba77eab225b4e3e92fe2ec9b801ec9a
🧪 View experiment at: http://54.236.35.141:5000/#/experiments/2
