In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import importlib
import functions_sandbox as sand
importlib.reload(sand)
import pickle
import time
from imblearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    KFold,
    RandomizedSearchCV,
    StratifiedKFold
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

---

## ML modeling

#### Preprocessing

In [6]:
path_train = r"D:\IT_projects\Turing_Colledge\train_df.csv"

In [7]:
df_train = pd.read_csv(path_train)
df_train.head(3)

Unnamed: 0,target,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_annuity,name_type_suite,name_income_type,name_education_type,...,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,active_credit_count,total_debt_all,prol_credits,credit_to_income
0,1,Cash loans,M,N,Y,0,10.114619,Unaccompanied,Working,Secondary / secondary special,...,0,0,0,0,0,1,1.098612,positive,0.0,1.101238
1,0,Cash loans,F,N,N,0,10.482892,Family,State servant,Higher education,...,0,0,0,0,0,0,0.693147,0,0.0,1.756262
2,0,Revolving loans,M,Y,Y,0,8.817446,Unaccompanied,Working,Secondary / secondary special,...,0,0,0,0,0,0,0.0,0,0.0,1.098612


Since we have our clean prepared data, we can split it into three parts:  
data train, data validation and data test. Data validation will have 15% of  
whole dataset as well as data test. 70% of dataset will be to train models

In [None]:
X = df_train.drop(columns="target", axis=1)
y = df_train["target"]

X_train, X_remain, y_train, y_remain = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_remain, y_remain, test_size=0.5, stratify=y_remain, random_state=42
)

I will also prepare my preprocessor for pipeline which will be used in whole  
model training process

In [None]:
num_selector = selector(dtype_include=np.number)
cat_selector = selector(dtype_include=[object, "category"])
numeric = num_selector(X)
categoric = cat_selector(X)

cat_preprocessor = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore", sparse_output=True)
)

num_preprocessor = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)

preprocessor = ColumnTransformer(
    [
        ("categorical", cat_preprocessor, categoric),
        ("numerical", num_preprocessor, numeric)
    ]
)

---

#### Model Selection

At this part we will select 5 models to work with:  
KNN, Logistic Regression, Ridge Classifier, Random Forest, XGBoost

In [None]:
start = time.time()

results = []

models = {
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(solver="saga", max_iter=100, class_weight="balanced"),
    "Ridge Classifier": RidgeClassifier(solver="lsqr", class_weight="balanced"),
    "Random Forest": RandomForestClassifier(n_jobs=6, n_estimators=50, class_weight="balanced"),
    "XGBoost": XGBClassifier(tree_method="hist", n_jobs=6),
}

for model_name, model in models.items():
    t_start = time.time()

    pipeline = make_pipeline(preprocessor, model)

    score = cross_validate(pipeline, X_train, y_train, cv=5)

    _ = pipeline.fit(X_train, y_train)
    target_predicted = pipeline.predict(X_test)

    test_score = pipeline.score(X_val, y_val)
    cv_mean = round(score['test_score'].mean(), 3)
    cv_std = score['test_score'].std()
    precision, recall, f1 = sand.model_result_calc(y_test, target_predicted, pos_label=0)
    model_time = round((time.time() - t_start) / 60, 2)

    results.append({
        "Model":model_name,
        "Test_score":test_score,
        "cv_score_mean":cv_mean,
        "cv_std":cv_std,
        "precision":precision,
        "recall":recall,
        "f1":f1,
        "model_time":model_time
    })

stop = round((time.time() - start) / 60, 2)
print(f"Total time to complete: {stop} minutes")

results_df = pd.DataFrame(results)
results_df

In [None]:
# start = time.time()

# results = []

# models = {
#     "KNN": KNeighborsClassifier(),
#     "Logistic Regression": LogisticRegression(solver="saga", max_iter=100),
#     "Ridge Classifier": RidgeClassifier(),
#     "Random Forest": RandomForestClassifier(n_jobs=6, n_estimators=50),
#     "XGBoost": XGBClassifier(tree_method="hist", n_jobs=6),
# }

# for model_name, model in models.items():
#     pipeline = make_pipeline(preprocessor, model)

#     score = cross_validate(pipeline, X_train, y_train, cv=5)

#     _ = pipeline.fit(X_train, y_train)
#     target_predicted = pipeline.predict(X_test)

#     print(f"Model name: {model_name}")
#     print(f"Score: {pipeline.score(X_val, y_val)}")
#     print(f"CV score: {score['test_score'].mean():.3f} +- {score['test_score'].std()}")
#     print(f"{sand.model_result_calc(y_test, target_predicted, pos_label=0)}\n")
# stop = round((time.time() - start) / 60, 2)
# print(f"Total time to complete: {stop} minutes")

Model name: KNN
Score: 0.9138090824837812
CV score: 0.914 +- 0.0003092010317753735
Precision score: 0.92
Recall score: 0.99
F1 score: 0.95
None





Model name: Logistic Regression
Score: 0.9194998617953596
CV score: 0.919 +- 0.0002631536616727057
Precision score: 0.92
Recall score: 1.00
F1 score: 0.96
None

Model name: Random Forest
Score: 0.9194998617953596
CV score: 0.919 +- 5.9537897520459764e-05
Precision score: 0.92
Recall score: 1.00
F1 score: 0.96
None

Model name: XGBoost
Score: 0.919337268100743
CV score: 0.919 +- 0.00026733269073546504
Precision score: 0.92
Recall score: 1.00
F1 score: 0.96
None

Total time to complete: 14.78 minutes


Given the high overall performance, especially in recall and F1 score, RandomForest  
is the best choice for this dataset. It offers the best balance of precision,  
recall, and generalization ability as raw model for further updates.  
Hyperparameters will be added to increase overall model's performance and  
decision assurance

In [None]:
rf_model = RandomForestClassifier()

pipeline = make_pipeline(preprocessor, rf_model)

rf_hparams = {
    "randomforestclassifier__max_depth": [2, 5, 10, 20],
    "randomforestclassifier__n_estimators": [1, 10, 25, 50, 100],
    "randomforestclassifier__max_leaf_nodes": [10, 100, 1000],
    "randomforestclassifier__min_samples_split": [10, 20, 50, 100],
    "randomforestclassifier__min_samples_leaf": [5, 10, 20, 50, 100],
    "randomforestclassifier__class_weight": ["balanced"],
}

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

start = time.time()
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=rf_hparams,
    n_iter=20,
    cv=stratified_kfold,
    verbose=2,
    random_state=42,
    n_jobs=6,
)

random_search.fit(X_train, y_train)
print(f"Best parameters for RandomForest: {random_search.best_params_}")
stop = round((time.time() - start) / 60, 2)
print(f"Total time to complete: {stop} minutes")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


4 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Vykintas\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Vykintas\AppData\Local\Programs\Python\Python311\Lib\site-packages\imblearn\utils\fixes.py", line 85, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Vykintas\AppData\Local\Programs\Python\Python311\Lib\site-packages\imblearn\pipeline.py", line 329, in fit
    Xt, yt = self._fit(X, y, r

Best parameters for RandomForest: {'randomforestclassifier__n_estimators': 100, 'randomforestclassifier__min_samples_split': 20, 'randomforestclassifier__min_samples_leaf': 20, 'randomforestclassifier__max_leaf_nodes': 1000, 'randomforestclassifier__max_depth': 10, 'randomforestclassifier__class_weight': 'balanced'}
Total time to complete: 31.86 minutes


We have updated our RandomForestClassifier model with hyperparameters and using  
randomized search technique found best parameters for the model:  
- max depth: 20  
- n estimators: 50  
- min samples split: 100  
- min samples leaf: 50  
- max leaf nodes: 1000  
- class weight: balanced  
Now we can check the accuracy of the model with best parameters

In [None]:
# RandomForest accuracy
best_est = random_search.best_estimator_
pred_best = best_est.predict(X_test)
rf_accuracy = accuracy_score(y_test, pred_best)
print(f"Calculated accuracy: {rf_accuracy:.4f}")

Calculated accuracy: 0.7090


We got decent result of ~0.74.  

Let's look at some insights of trained model through classification report

In [5]:
print(classification_report(X_test, pred_best))

NameError: name 'X_test' is not defined

The results indicate imbalanced performance between classes, with overall  
accuracy at 0.74. For class 0, the model achieves strong metrics: precision  
of 0.95, recall of 0.75, and an F1-score of 0.84, reflecting reliable predictions  
for the majority class. In contrast, class 1 metrics are weak, with precision  
at 0.17, recall at 0.59, and an F1-score of 0.27, highlighting high false  
positives and limited precision.  
The macro average F1-score of 0.55 shows disparities in performance across  
classes, while the weighted average F1-score of 0.80 is skewed by the  
dominance of class 0. These results suggest the model heavily favors the  
majority class and struggles with the minority class. These results were shows  
that even models hyperparameter "class weight" didn't have strong positive  
influence to the final results

---

After the results I wanted to see features importance to the model. Some  
features might be removed if importance is very low

In [15]:
best_rf_model = random_search.best_estimator_.named_steps['randomforestclassifier']

importances = best_rf_model.feature_importances_

feature_names = preprocessor.get_feature_names_out()
feature_importances_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

feature_importances_df.head(5)

Unnamed: 0,Feature,Importance
149,standard_scaler__ext_source_2,0.211414
150,standard_scaler__ext_source_3,0.201333
130,standard_scaler__client_age,0.048033
131,standard_scaler__years_employed,0.046992
154,standard_scaler__days_last_phone_change,0.036414


Herein we can see few main features with the highest importance to the model.  
At the EDA part, I thought that flag_document features might be not that important,  
so I checked importance specially targeting only flag_document importance

In [16]:
flag_document = feature_importances_df[
    feature_importances_df["Feature"].str.contains("flag_document")
]
flag_document

Unnamed: 0,Feature,Importance
156,standard_scaler__flag_document_3,0.011713
159,standard_scaler__flag_document_6,0.001803
161,standard_scaler__flag_document_8,0.001538
169,standard_scaler__flag_document_16,0.000561
158,standard_scaler__flag_document_5,0.000286
171,standard_scaler__flag_document_18,0.000222
166,standard_scaler__flag_document_13,0.000157
164,standard_scaler__flag_document_11,0.00014
167,standard_scaler__flag_document_14,0.000135
162,standard_scaler__flag_document_9,4.5e-05


As we can see, flag_documents importance is very low and these could be removed 
from dataset to increase model's performance

In [None]:
flag_doc = X_train.columns[X_train.columns.str.contains("flag_document")]
X_train_v1 = X_train.drop(columns=flag_doc)
# data_test_v1 = data_test.drop(columns=flag_doc)

In [None]:
random_search.fit(X_train_v1, y_train)

best_est = random_search.best_estimator_
pred_best = best_est.predict(X_test)
rf_accuracy = accuracy_score(y_test, pred_best)
print(f"Calculated accuracy: {rf_accuracy:.4f}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
clean_test = pd.read_csv("clean_test.csv")

In [None]:
to_predict = clean_test.iloc[5]

In [None]:
pred_target = best_est.predict_proba(clean_test.iloc[199:200])[:, 1]
prediction = pred_target >= 0.5

print(f"Prediction probality of default loan is: {pred_target}")
print(f"Final prediction is: {prediction}")

Prediction probality of default loan is: [0.58804818]
Final prediction is: [ True]


In [None]:
model = best_est
model.fit(X_train_v1, y_train)

In [None]:
# with open("model.pkl", "wb") as f:
#     pickle.dump(model, f)