In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import importlib
import functions_sandbox as sand
importlib.reload(sand)
import pickle
import time
import scipy.stats as stats
from scipy.stats import anderson, spearmanr, normaltest, ttest_ind, shapiro, levene
from statsmodels.stats.proportion import proportions_ztest
from imblearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mutual_info_score, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.model_selection import (
    train_test_split,
    cross_validate,
    KFold,
    RandomizedSearchCV,
    StratifiedKFold
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from xgboost import XGBClassifier

---

## ML modeling

#### Preprocessing

In [2]:
path_train = r"D:\IT_projects\Turing_Colledge\train_df.csv"
# path_train = r"C:\Users\vykintas.palskys\OneDrive - Thermo Fisher Scientific\Desktop\p4\train_df.csv"

In [3]:
df_train = pd.read_csv(path_train)
df_train.head(3)

Unnamed: 0,sk_id_curr,target,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_annuity,name_type_suite,name_income_type,...,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year,active_credit_count,total_debt_all,prol_credits,credit_to_income
0,100002,1,Cash loans,M,N,Y,0,2.408261,Unaccompanied,Working,...,0,0,0,0,0,1,1.098612,positive,0.0,1.101238
1,100003,0,Cash loans,F,N,N,0,2.440858,Family,State servant,...,0,0,0,0,0,0,0.693147,0,0.0,1.756262
2,100004,0,Revolving loans,M,Y,Y,0,2.284161,Unaccompanied,Working,...,0,0,0,0,0,0,0.0,0,0.0,1.098612


In [4]:
X = df_train.drop(columns="target", axis=1)
y = df_train["target"]

num_selector = selector(dtype_include=np.number)
cat_selector = selector(dtype_include=[object, "category"])
numeric = num_selector(X)
categoric = cat_selector(X)

cat_preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
num_preprocessor = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("one-hot-encoder", cat_preprocessor, categoric),
        ("standard_scaler", num_preprocessor, numeric)
    ]
)

data_train, data_test, target_train, target_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

---

#### Model Selection

At this part we will select 5 models to work with:  
KNN, Logistic Regression, SVC, Random Forest, XGBoost

In [None]:
start = time.time()
models = {
    "KNN": KNeighborsClassifier(),
    "Logistic Regression": LogisticRegression(solver="saga", max_iter=100),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "Random Forest": RandomForestClassifier(n_jobs=-1, n_estimators=50),
    "XGBoost": XGBClassifier(tree_method="hist", n_jobs=-1),
}

for model_name, model in models.items():
    pipeline = make_pipeline(preprocessor, model)

    score = cross_validate(pipeline, data_train, target_train, cv=5)

    _ = pipeline.fit(data_train, target_train)
    target_predicted = pipeline.predict(data_test)

    print(f"Model name: {model_name}")
    print(f"Score: {pipeline.score(data_test, target_test)}")
    print(f"CV score: {score['test_score'].mean():.3f} +- {score['test_score'].std()}")
    print(f"{sand.model_result_calc(target_test, target_predicted, pos_label=0)}\n")
print(f"Total time to complete: {time.time() - start:.4f} seconds")

Model name: KNN
Score: 0.9132562639220851
CV score: 0.914 +- 0.00020764640836885102
Precision score: 0.92
Recall score: 0.99
F1 score: 0.95
None

Model name: Random Forest
Score: 0.9194998617953596
CV score: 0.919 +- 4.198627698568423e-05
Precision score: 0.92
Recall score: 1.00
F1 score: 0.96
None

Model name: XGBoost
Score: 0.9191421556672033
CV score: 0.919 +- 0.00023047357962757367
Precision score: 0.92
Recall score: 1.00
F1 score: 0.96
None

Total time to complete: 493.7042 seconds


In [None]:
rf_model = RandomForestClassifier()

pipeline = make_pipeline(preprocessor, rf_model)

rf_hparams = {
    "randomforestclassifier__max_depth": [2, 5, 10, 20],
    "randomforestclassifier__n_estimators": [1, 10, 25, 50, 100],
    "randomforestclassifier__max_leaf_nodes": [10, 100, 1000],
    "randomforestclassifier__min_samples_split": [10, 20, 50, 100],
    "randomforestclassifier__min_samples_leaf": [5, 10, 20, 50, 100],
    "randomforestclassifier__class_weight": ["balanced"],
}

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

start = time.time()
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=rf_hparams,
    n_iter=100,
    cv=stratified_kfold,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
print(f"Total time to complete: {time.time() - start:.4f} seconds")

random_search.fit(data_train, target_train)
print(f"Best parameters for RandomForest: {random_search.best_params_}")

Total time to complete: 0.0010 seconds
Fitting 5 folds for each of 100 candidates, totalling 500 fits
