In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import importlib
import functions_sandbox as sand
importlib.reload(sand)
import pickle
import time
from imblearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    KFold,
    RandomizedSearchCV,
    StratifiedKFold
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

---

## ML modeling

#### Preprocessing

In [7]:
path_train = r"D:\IT_projects\Turing_Colledge\train_df.csv"

In [8]:
df_train = pd.read_csv(path_train)
df_train.head(3)

Unnamed: 0,target,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_annuity,name_type_suite,name_income_type,name_education_type,...,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,active_credit_count,total_debt_all,prol_credits,credit_to_income
0,1,Cash loans,M,N,Y,0,10.114619,Unaccompanied,Working,Secondary / secondary special,...,0,0,0,0,0,1,1.098612,positive,0.0,1.101238
1,0,Cash loans,F,N,N,0,10.482892,Family,State servant,Higher education,...,0,0,0,0,0,0,0.693147,0,0.0,1.756262
2,0,Revolving loans,M,Y,Y,0,8.817446,Unaccompanied,Working,Secondary / secondary special,...,0,0,0,0,0,0,0.0,0,0.0,1.098612


Since we have our clean prepared data, we can split it into three parts:  
data train, data validation and data test. Data validation will have 15% of  
whole dataset as well as data test. 70% of dataset will be to train models

In [9]:
X = df_train.drop(columns="target", axis=1)
y = df_train["target"]

X_train, X_remain, y_train, y_remain = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_remain, y_remain, test_size=0.5, stratify=y_remain, random_state=42
)

I will also prepare my preprocessor for pipeline which will be used in whole  
model training process

In [10]:
num_selector = selector(dtype_include=np.number)
cat_selector = selector(dtype_include=[object, "category"])
numeric = num_selector(X)
categoric = cat_selector(X)

cat_preprocessor = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=True)
)

num_preprocessor = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)

preprocessor = ColumnTransformer(
    [
        ("categorical", cat_preprocessor, categoric),
        ("numerical", num_preprocessor, numeric)
    ]
)

---

#### Model Selection

At this part we will select 5 models to work with:  
KNN, Logistic Regression, Ridge Classifier, Random Forest, XGBoost

In [11]:
start = time.time()

results = []

models = {
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(solver="saga", max_iter=100, class_weight="balanced"),
    "Ridge Classifier": RidgeClassifier(solver="lsqr", class_weight="balanced"),
    "Random Forest": RandomForestClassifier(n_jobs=6, n_estimators=50, class_weight="balanced"),
    "XGBoost": XGBClassifier(tree_method="hist", n_jobs=6),
}

for model_name, model in models.items():
    t_start = time.time()

    pipeline = make_pipeline(preprocessor, model)

    score = cross_validate(pipeline, X_train, y_train, cv=5)

    _ = pipeline.fit(X_train, y_train)
    target_predicted = pipeline.predict(X_test)

    test_score = pipeline.score(X_val, y_val)
    cv_mean = round(score['test_score'].mean(), 3)
    cv_std = score['test_score'].std()
    precision, recall, f1 = sand.model_result_calc(y_test, target_predicted, pos_label=0)
    model_time = round((time.time() - t_start) / 60, 2)

    results.append({
        "Model":model_name,
        "Test_score":test_score,
        "cv_score_mean":cv_mean,
        "cv_std":cv_std,
        "precision":precision,
        "recall":recall,
        "f1":f1,
        "model_time":model_time
    })

stop = round((time.time() - start) / 60, 2)
print(f"Total time to complete: {stop} minutes")

results_df = pd.DataFrame(results)
results_df



Total time to complete: 12.41 minutes


Unnamed: 0,Model,Test_score,cv_score_mean,cv_std,precision,recall,f1,model_time
0,KNN,0.91428,0.914,0.000358,0.920661,0.991203,0.954631,4.18
1,Logistic Regression,0.689401,0.69,0.00298,0.959702,0.695305,0.806384,4.92
2,Ridge Classifier,0.694105,0.694,0.0029,0.959838,0.699455,0.809217,0.93
3,Random Forest,0.919266,0.919,6.5e-05,0.919326,1.0,0.957968,1.71
4,XGBoost,0.91892,0.919,0.000212,0.920615,0.997689,0.957603,0.67


In [12]:
# start = time.time()

# results = []

# models = {
#     "KNN": KNeighborsClassifier(),
#     "Logistic Regression": LogisticRegression(solver="saga", max_iter=100),
#     "Ridge Classifier": RidgeClassifier(),
#     "Random Forest": RandomForestClassifier(n_jobs=6, n_estimators=50),
#     "XGBoost": XGBClassifier(tree_method="hist", n_jobs=6),
# }

# for model_name, model in models.items():
#     pipeline = make_pipeline(preprocessor, model)

#     score = cross_validate(pipeline, X_train, y_train, cv=5)

#     _ = pipeline.fit(X_train, y_train)
#     target_predicted = pipeline.predict(X_test)

#     print(f"Model name: {model_name}")
#     print(f"Score: {pipeline.score(X_val, y_val)}")
#     print(f"CV score: {score['test_score'].mean():.3f} +- {score['test_score'].std()}")
#     print(f"{sand.model_result_calc(y_test, target_predicted, pos_label=0)}\n")
# stop = round((time.time() - start) / 60, 2)
# print(f"Total time to complete: {stop} minutes")

Given the high overall performance, especially in recall and F1 score, RandomForest  
is the best choice for this dataset. It offers the best balance of precision,  
recall, and generalization ability as raw model for further updates.  
Hyperparameters will be added to increase overall model's performance and  
decision assurance

In [13]:
rf_model = RandomForestClassifier()

pipeline = make_pipeline(preprocessor, rf_model)

rf_hparams = {
    "randomforestclassifier__max_depth": [2, 5, 10, 20],
    "randomforestclassifier__n_estimators": [1, 10, 25, 50, 100],
    "randomforestclassifier__max_leaf_nodes": [10, 100, 1000],
    "randomforestclassifier__min_samples_split": [10, 20, 50, 100],
    "randomforestclassifier__min_samples_leaf": [5, 10, 20, 50, 100],
    "randomforestclassifier__class_weight": ["balanced"],
}

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

start = time.time()
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=rf_hparams,
    n_iter=20,
    cv=stratified_kfold,
    verbose=2,
    random_state=42,
    n_jobs=6,
)

random_search.fit(X_train, y_train)
print(f"Best parameters for RandomForest: {random_search.best_params_}")
stop = round((time.time() - start) / 60, 2)
print(f"Total time to complete: {stop} minutes")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters for RandomForest: {'randomforestclassifier__n_estimators': 50, 'randomforestclassifier__min_samples_split': 100, 'randomforestclassifier__min_samples_leaf': 50, 'randomforestclassifier__max_leaf_nodes': 1000, 'randomforestclassifier__max_depth': 20, 'randomforestclassifier__class_weight': 'balanced'}
Total time to complete: 13.37 minutes


We have updated our RandomForestClassifier model with hyperparameters and using  
randomized search technique found best parameters for the model:  
- max depth: 20  
- n estimators: 50  
- min samples split: 100  
- min samples leaf: 50  
- max leaf nodes: 1000  
- class weight: balanced  
Now we can check the accuracy of the model with best parameters

In [14]:
# RandomForest accuracy
best_est = random_search.best_estimator_
pred_best = best_est.predict(X_test)
rf_accuracy = accuracy_score(y_test, pred_best)
print(f"Calculated accuracy: {rf_accuracy:.4f}")

Calculated accuracy: 0.7491


We got decent result of ~0.74.  

Let's look at some insights of trained model through classification report

In [16]:
print(classification_report(y_test, pred_best))

              precision    recall  f1-score   support

           0       0.96      0.76      0.85     42403
           1       0.18      0.59      0.28      3724

    accuracy                           0.75     46127
   macro avg       0.57      0.68      0.56     46127
weighted avg       0.89      0.75      0.80     46127



The results indicate imbalanced performance between classes, with overall  
accuracy at 0.74. For class 0, the model achieves strong metrics: precision  
of 0.95, recall of 0.75, and an F1-score of 0.84, reflecting reliable predictions  
for the majority class. In contrast, class 1 metrics are weak, with precision  
at 0.17, recall at 0.59, and an F1-score of 0.27, highlighting high false  
positives and limited precision.  
The macro average F1-score of 0.55 shows disparities in performance across  
classes, while the weighted average F1-score of 0.80 is skewed by the  
dominance of class 0. These results suggest the model heavily favors the  
majority class and struggles with the minority class. These results were shows  
that even models hyperparameter "class weight" didn't have strong positive  
influence to the final results

---

After the results I wanted to see features importance to the model. Some  
features might be removed if importance is very low

In [17]:
best_rf_model = random_search.best_estimator_.named_steps['randomforestclassifier']

importances = best_rf_model.feature_importances_

feature_names = preprocessor.get_feature_names_out()
feature_importances_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

feature_importances_df.head(5)

Unnamed: 0,Feature,Importance
150,numerical__ext_source_3,0.181555
149,numerical__ext_source_2,0.1714
130,numerical__client_age,0.056179
131,numerical__years_employed,0.048219
154,numerical__days_last_phone_change,0.041308


Herein we can see few main features with the highest importance to the model.  
At the EDA part, I thought that flag_document features might be not that important,  
so I checked importance specially targeting only flag_document importance

In [18]:
flag_document = feature_importances_df[
    feature_importances_df["Feature"].str.contains("flag_document")
]
flag_document

Unnamed: 0,Feature,Importance
156,numerical__flag_document_3,0.010708
161,numerical__flag_document_8,0.001556
159,numerical__flag_document_6,0.001207
169,numerical__flag_document_16,0.000208
171,numerical__flag_document_18,0.000126
158,numerical__flag_document_5,8e-05
166,numerical__flag_document_13,7.6e-05
164,numerical__flag_document_11,1.7e-05
162,numerical__flag_document_9,1.1e-05
165,numerical__flag_document_12,0.0


As we can see, flag_documents importance is very low and these could be removed 
from dataset to increase model's performance

In [33]:
flag_doc = X_train.columns[X_train.columns.str.contains("flag_document")]
X_updated = X.drop(columns=flag_doc)
X_train_v1 = X_train.drop(columns=flag_doc)
X_test_v1 = X_test.drop(columns=flag_doc)

In [34]:
num_selector = selector(dtype_include=np.number)
cat_selector = selector(dtype_include=[object, "category"])
numeric = num_selector(X_updated)
categoric = cat_selector(X_updated)

cat_preprocessor = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=True)
)

num_preprocessor = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)

preprocessor = ColumnTransformer(
    [
        ("categorical", cat_preprocessor, categoric),
        ("numerical", num_preprocessor, numeric)
    ]
)

pipeline = make_pipeline(preprocessor, rf_model)

In [35]:
rs = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=rf_hparams,
    n_iter=20,
    cv=stratified_kfold,
    verbose=2,
    random_state=42,
    n_jobs=6,
)

rs.fit(X_train_v1, y_train)

best_est = rs.best_estimator_
pred_best = best_est.predict(X_test_v1)
rf_accuracy = accuracy_score(y_test, pred_best)
print(f"Calculated accuracy: {rf_accuracy:.4f}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Calculated accuracy: 0.7475


In [None]:
clean_test = pd.read_csv("clean_test.csv")

In [None]:
to_predict = clean_test.iloc[5]

In [None]:
pred_target = best_est.predict_proba(clean_test.iloc[199:200])[:, 1]
prediction = pred_target >= 0.5

print(f"Prediction probality of default loan is: {pred_target}")
print(f"Final prediction is: {prediction}")

Prediction probality of default loan is: [0.58804818]
Final prediction is: [ True]


In [None]:
model = best_est
model.fit(X_train_v1, y_train)

In [None]:
# with open("model.pkl", "wb") as f:
#     pickle.dump(model, f)