In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import importlib
import functions_sandbox as sand
importlib.reload(sand)
import pickle
import time
from imblearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    KFold,
    RandomizedSearchCV,
    StratifiedKFold
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

---

## ML modeling

#### Preprocessing

In [2]:
path_train = r"D:\IT_projects\Turing_Colledge\train_df.csv"

In [3]:
df_train = pd.read_csv(path_train)
df_train.head(3)

Unnamed: 0,target,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_annuity,name_type_suite,name_income_type,name_education_type,...,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,active_credit_count,total_debt_all,prol_credits,credit_to_income
0,1,Cash loans,M,N,Y,0,10.114619,Unaccompanied,Working,Secondary / secondary special,...,0,0,0,0,0,1,1.098612,positive,0.0,1.101238
1,0,Cash loans,F,N,N,0,10.482892,Family,State servant,Higher education,...,0,0,0,0,0,0,0.693147,0,0.0,1.756262
2,0,Revolving loans,M,Y,Y,0,8.817446,Unaccompanied,Working,Secondary / secondary special,...,0,0,0,0,0,0,0.0,0,0.0,1.098612


In [4]:
X = df_train.drop(columns="target", axis=1)
y = df_train["target"]

num_selector = selector(dtype_include=np.number)
cat_selector = selector(dtype_include=[object, "category"])
numeric = num_selector(X)
categoric = cat_selector(X)

cat_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
num_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("one-hot-encoder", cat_preprocessor, categoric),
        ("standard_scaler", num_preprocessor, numeric)
    ]
)

data_train, data_test, target_train, target_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

---

#### Model Selection

At this part we will select 5 models to work with:  
KNN, Logistic Regression, SVC, Random Forest, XGBoost

In [6]:
pipeline = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=50, min_samples_split=100, min_samples_leaf=50, max_leaf_nodes=1000, max_depth=20, class_weight='balanced'))
_ = pipeline.fit(data_train, target_train)
pred_proba = pipeline.predict_proba(data_test)
predictions = pipeline.predict(data_test)

In [None]:
start = time.time()
models = {
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(solver="saga", max_iter=100),
    # "Ridge": Ridge(),
    # "Lasso": Lasso(),
    "Random Forest": RandomForestClassifier(n_jobs=-1, n_estimators=50),
    "XGBoost": XGBClassifier(tree_method="hist", n_jobs=-1),
}

for model_name, model in models.items():
    pipeline = make_pipeline(preprocessor, model)

    score = cross_validate(pipeline, data_train, target_train, cv=5)

    _ = pipeline.fit(data_train, target_train)
    target_predicted = pipeline.predict(data_test)

    print(f"Model name: {model_name}")
    print(f"Score: {pipeline.score(data_test, target_test)}")
    print(f"CV score: {score['test_score'].mean():.3f} +- {score['test_score'].std()}")
    print(f"{sand.model_result_calc(target_test, target_predicted, pos_label=0)}\n")
stop = round((time.time() - start) / 60, 2)
print(f"Total time to complete: {stop} minutes")

Model name: KNN
Score: 0.9132562639220851
CV score: 0.914 +- 0.00020764640836885102
Precision score: 0.92
Recall score: 0.99
F1 score: 0.95
None





Model name: Logistic Regression
Score: 0.9193535274702047
CV score: 0.919 +- 0.00026567901462431446
Precision score: 0.92
Recall score: 1.00
F1 score: 0.96
None

Model name: Random Forest
Score: 0.9195811586426679
CV score: 0.919 +- 6.774549917304057e-05
Precision score: 0.92
Recall score: 1.00
F1 score: 0.96
None

Model name: XGBoost
Score: 0.9191421556672033
CV score: 0.919 +- 0.00023047357962757367
Precision score: 0.92
Recall score: 1.00
F1 score: 0.96
None

Total time to complete: 33.72 minutes


In [None]:
rf_model = RandomForestClassifier()

pipeline = make_pipeline(preprocessor, rf_model)

rf_hparams = {
    "randomforestclassifier__max_depth": [2, 5, 10, 20],
    "randomforestclassifier__n_estimators": [1, 10, 25, 50, 100],
    "randomforestclassifier__max_leaf_nodes": [10, 100, 1000],
    "randomforestclassifier__min_samples_split": [10, 20, 50, 100],
    "randomforestclassifier__min_samples_leaf": [5, 10, 20, 50, 100],
    "randomforestclassifier__class_weight": ["balanced"],
}

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

start = time.time()
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=rf_hparams,
    n_iter=20,
    cv=stratified_kfold,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

random_search.fit(data_train, target_train)
print(f"Best parameters for RandomForest: {random_search.best_params_}")
stop = round((time.time() - start) / 60, 2)
print(f"Total time to complete: {stop} minutes")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters for RandomForest: {'randomforestclassifier__n_estimators': 50, 'randomforestclassifier__min_samples_split': 100, 'randomforestclassifier__min_samples_leaf': 50, 'randomforestclassifier__max_leaf_nodes': 1000, 'randomforestclassifier__max_depth': 20, 'randomforestclassifier__class_weight': 'balanced'}
Total time to complete: 11.34 minutes


In [None]:
# RandomForest accuracy
best_est = random_search.best_estimator_
pred_best = best_est.predict(data_test)
rf_accuracy = accuracy_score(target_test, pred_best)
print(f"Calculated accuracy: {rf_accuracy:.4f}")

Calculated accuracy: 0.7448


In [19]:
# Cross-validation for score check
cross = cross_validate(best_est, data_train, target_train, cv=5)

print(f"CV: {cross['test_score'].mean():.4f}")

CV: 0.7509


In [20]:
print(classification_report(target_test, pred_best))

              precision    recall  f1-score   support

           0       0.95      0.75      0.84     56554
           1       0.17      0.59      0.27      4949

    accuracy                           0.74     61503
   macro avg       0.56      0.67      0.55     61503
weighted avg       0.89      0.74      0.80     61503



---

After the results I wanted to see features importance to the model. Some  
features might be removed if importance is very low

In [34]:
# After fitting the random search
best_rf_model = random_search.best_estimator_.named_steps['randomforestclassifier']

# Get feature importances
importances = best_rf_model.feature_importances_

# Align feature importances with feature names
feature_names = preprocessor.get_feature_names_out()
feature_importances_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
})

# Sort the DataFrame by importance
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

# Display the DataFrame
feature_importances_df.head(5)

Unnamed: 0,Feature,Importance
151,standard_scaler__ext_source_3,0.174079
150,standard_scaler__ext_source_2,0.16901
131,standard_scaler__client_age,0.052419
132,standard_scaler__years_employed,0.047214
155,standard_scaler__days_last_phone_change,0.043489


Herein we can see few main features with the highest importance to the model.  
At the EDA part, I thought that flag_document features might be not that important,  
so I checked importance specially targeting only flag_document importance

In [None]:
flag_document = feature_importances_df[
    feature_importances_df["Feature"].str.contains("flag_document")
]
flag_document

As we can see, flag_documents importance is very low and these could be removed 
from dataset to increase model's performance

In [49]:
flag_doc = data_train.columns[data_train.columns.str.contains("flag_document")]
data_train_v1 = data_train.drop(columns=flag_doc)
data_test_v1 = data_test.drop(columns=flag_doc)

In [None]:
random_search.fit(data_train, target_train)

best_est = random_search.best_estimator_
pred_best = best_est.predict(data_test)
rf_accuracy = accuracy_score(target_test, pred_best)
print(f"Calculated accuracy: {rf_accuracy:.4f}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
clean_test = pd.read_csv("clean_test.csv")

In [41]:
to_predict = clean_test.iloc[5]

In [52]:
pred_target = best_est.predict_proba(clean_test.iloc[199:200])[:, 1]
prediction = pred_target >= 0.5

print(f"Prediction probality of default loan is: {pred_target}")
print(f"Final prediction is: {prediction}")

Prediction probality of default loan is: [0.58804818]
Final prediction is: [ True]


In [None]:
model = best_est
model.fit(data_train, target_train)

In [None]:
with open("model.pkl", "wb") as f:
    pickle.dump(model, f)