# Model selection

## The data has been preprocessed; now let's select the best models

In [None]:
from sklearn.metrics import f1_score, balanced_accuracy_score, recall_score, precision_score,roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
from time import time

In [None]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on a held-out set.

    Intended for binary targets: the positive-class score is taken from
    column 1 of ``predict_proba``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``. Ranking metrics are added when it also
        exposes ``predict_proba`` or, failing that, ``decision_function``.
    X_test : array-like
        Held-out features.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    dict
        ``f1``, ``precision``, ``recall`` and ``weighted_accuracy`` (balanced
        accuracy), plus ``roc_auc`` and ``pr_auc`` when a continuous score is
        available.
    """
    y_pred = model.predict(X_test)

    # Prefer probabilities; fall back to raw decision values so that models
    # without ``predict_proba`` still get ranking metrics.
    y_score = None
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)

    # zero_division=0 keeps the value the default produces when no positive
    # is predicted (0.0) but does not emit a warning for every such model.
    results = {
        "f1": f1_score(y_test, y_pred, zero_division=0),
        "precision": precision_score(y_test, y_pred, zero_division=0),
        "recall": recall_score(y_test, y_pred, zero_division=0),
        "weighted_accuracy": balanced_accuracy_score(y_test, y_pred),
    }

    if y_score is not None:
        results["roc_auc"] = roc_auc_score(y_test, y_score)
        results["pr_auc"] = average_precision_score(y_test, y_score)

    return results

In [None]:
def run_experiment(model, data, model_name, target='BANKR'):
    """Fit ``model`` on a stratified 75/25 split of ``data`` and score it.

    Parameters
    ----------
    model : estimator
        Unfitted estimator exposing ``fit``; it is fitted in place.
    data : pandas.DataFrame
        Features plus the label column named by ``target``.
    model_name : str
        Label stored under the ``"model"`` key of the result.
    target : str, default 'BANKR'
        Name of the label column in ``data``.

    Returns
    -------
    dict
        The metrics from ``evaluate_model`` plus ``"model"`` (``model_name``)
        and ``"time"`` (seconds spent in ``fit``).
    """
    # Honour the ``target`` argument; it used to be ignored in favour of a
    # hard-coded 'BANKR' column.
    X = data.drop(target, axis=1)
    y = data[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, stratify=y, random_state=42
    )

    # Only the fit itself is timed, not the split or the evaluation.
    start = time()
    model.fit(X_train, y_train)
    elapsed = time() - start

    metrics = evaluate_model(model, X_test, y_test)
    metrics["model"] = model_name
    metrics["time"] = elapsed

    return metrics

In [None]:
# Label column passed as `target=` to every run_experiment call below.
target = 'BANKR'

# Data loading

In [None]:
# Single place to change where the preprocessed CSV files live.
DATA_DIR = 'drive/MyDrive'

# data_lin feeds logistic regression and the SVMs, data_catboost feeds
# CatBoost, data_non_lin feeds KNN and the other tree-based models.
data_lin = pd.read_csv(f'{DATA_DIR}/data_lin.csv')
data_catboost = pd.read_csv(f'{DATA_DIR}/data_catboost.csv')
data_non_lin = pd.read_csv(f'{DATA_DIR}/data_non_lin.csv')

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings on the full linear-model dataset.
log_reg_metrics = run_experiment(
    model=LogisticRegression(),
    data=data_lin,
    model_name='LogisticRegression',
    target=target,
)

# SVM Linear

In [None]:
from sklearn.svm import SVC

In [None]:
# NOTE: this plain random sample is overwritten by the stratified split in the
# next cell. It is seeded so that, if used, a re-run picks the same rows.
data_svm_lim = data_lin.sample(50000, random_state=42)

In [None]:
# Keep 5% of the rows for the SVM, stratified on the target column so the
# subset preserves the original class ratio. The column name comes from the
# shared `target` variable rather than a second hard-coded 'BANKR'.
data_svm_lim, _ = train_test_split(
    data_lin,
    train_size=0.05,
    stratify=data_lin[target],
    random_state=42
)

In [None]:
# Label the run with its kernel: the RBF run further down would otherwise share
# the bare 'SVM' name and the two rows could not be told apart in the summary.
lin_svm_metrics = run_experiment(SVC(kernel='linear'), data_svm_lim, 'SVM (linear)', target=target)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# SVM RBF

In [None]:
from sklearn.svm import SVC

In [None]:
# NOTE: this plain random sample is overwritten by the stratified split in the
# next cell. It is seeded so that, if used, a re-run picks the same rows.
data_svm_lim = data_lin.sample(50000, random_state=42)

In [None]:
# Keep 5% of the rows for the SVM, stratified on the target column so the
# subset preserves the original class ratio. The column name comes from the
# shared `target` variable rather than a second hard-coded 'BANKR'.
data_svm_lim, _ = train_test_split(
    data_lin,
    train_size=0.05,
    stratify=data_lin[target],
    random_state=42
)

In [None]:
# Label the run with its kernel: the linear run above would otherwise share the
# bare 'SVM' name and the two rows could not be told apart in the summary.
rbf_svm_metrics = run_experiment(SVC(kernel='rbf'), data_svm_lim, 'SVM (RBF)', target=target)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Subsample 30,000 rows for KNN; seeded so a re-run scores the same rows.
data_knn_lim = data_non_lin.sample(30000, random_state=42)

In [None]:
# K-nearest neighbours with default settings on the KNN subsample.
knn_metrics = run_experiment(
    model=KNeighborsClassifier(),
    data=data_knn_lim,
    model_name='KNN',
    target=target,
)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Single decision tree on the full non-linear dataset. The tree is seeded so
# that repeated runs build the same tree and report the same metrics.
tree_metrics = run_experiment(
    DecisionTreeClassifier(random_state=42),
    data_non_lin,
    'DecisionTreeClassifier',
    target=target,
)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Subsample 60,000 rows for the random forest; seeded so a re-run uses the
# same rows.
data_random_forest_lim = data_non_lin.sample(60000, random_state=42)

In [None]:
# class_weight='balanced' reweights the classes; random_state makes the
# bootstrap and feature sampling repeatable between runs.
random_forest_model = RandomForestClassifier(class_weight='balanced', random_state=42)

In [None]:
# Random forest on its 60,000-row subsample.
random_forest_metrics = run_experiment(
    model=random_forest_model,
    data=data_random_forest_lim,
    model_name='RandomForestClassifier',
    target=target,
)

# CatBoost

In [None]:
# %pip installs into the environment of the running kernel, which `!pip` does
# not guarantee.
%pip install -q catboost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Custom GPU configuration. It has its own name so the baseline assignment
# below no longer overwrites it.
catboost_tuned_model = CatBoostClassifier(
    iterations=1400,                 # number of boosting iterations (trees)
    learning_rate=0.015,             # learning rate
    depth=4,                         # tree depth
    eval_metric='BalancedAccuracy',  # evaluation metric
    verbose=100,                     # print a log line every 100 iterations
    scale_pos_weight=250,
    l2_leaf_reg=9.5,
    border_count=32,
    random_strength=0.76,
    bagging_temperature=0.82,
    task_type="GPU",
    devices="0",
)

# Baseline with default hyperparameters; this is the model the experiment
# below is run with.
catboost_model = CatBoostClassifier(verbose=100,
                                    eval_metric='BalancedAccuracy')

In [None]:
# CatBoost on its dedicated dataset.
catboost_metrics = run_experiment(
    model=catboost_model,
    data=data_catboost,
    model_name='CatBoostClassifier',
    target=target,
)

Learning rate set to 0.152424
0:	learn: 0.5000000	total: 1.1s	remaining: 18m 21s
100:	learn: 0.5385571	total: 1m	remaining: 9m 2s
200:	learn: 0.5790352	total: 1m 56s	remaining: 7m 44s
300:	learn: 0.6116319	total: 2m 48s	remaining: 6m 31s
400:	learn: 0.6414572	total: 3m 40s	remaining: 5m 29s
500:	learn: 0.6704303	total: 4m 29s	remaining: 4m 28s
600:	learn: 0.6936515	total: 5m 22s	remaining: 3m 34s
700:	learn: 0.7138901	total: 6m 12s	remaining: 2m 38s
800:	learn: 0.7339156	total: 7m 3s	remaining: 1m 45s
900:	learn: 0.7507456	total: 7m 53s	remaining: 52.1s
999:	learn: 0.7660844	total: 8m 44s	remaining: 0us


In [None]:
# Metrics pasted by hand (presumably from earlier CatBoost runs), keyed by the
# dataset named in the original comments. Stored as a dict of plain floats so
# the cell is valid Python: the raw paste began with an indented literal and
# used `np` without importing it. The misspelled key 'weighted_accruacy' is
# corrected to 'weighted_accuracy', the key evaluate_model produces.
catboost_reference_results = {
    'data_catboost': {
        'f1': 0.058322237017310256,
        'precision': 0.030211063594978618,
        'recall': 0.8390804597701149,
        'weighted_accuracy': 0.8617704870960032,
        'roc_auc': 0.9266421377016133,
        'pr_auc': 0.10814448830057924,
        'model': 'CatBoostClassifier',
        'time': 20.932273149490356,
    },
    'data_non_lin': {
        'f1': 0.058063656944999334,
        'precision': 0.030077262693156734,
        'recall': 0.8352490421455939,
        'weighted_accuracy': 0.8598547782837428,
        'roc_auc': 0.9276545504787987,
        'pr_auc': 0.10901435170380142,
        'model': 'CatBoostClassifier',
        'time': 22.173586130142212,
    },
}
catboost_reference_results

In [None]:
# Training-set curve of the metric CatBoost tracked while fitting.
evals_result = catboost_model.get_evals_result()
train_metric = evals_result['learn']['BalancedAccuracy']
plt.plot(train_metric, label='train')
plt.xlabel('Iteration')
# The plotted series is BalancedAccuracy; the axis used to say 'Logloss'.
plt.ylabel('BalancedAccuracy')
plt.title('CatBoost training curve')
plt.legend()
plt.show()


# LightGBM

In [None]:
# %pip installs into the environment of the running kernel, which `!pip` does
# not guarantee.
%pip install -q lightgbm

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# Custom GPU configuration whose comments map its settings onto the CatBoost
# parameter names. It has its own name so the baseline assignment below no
# longer overwrites it.
lgbm_tuned_model = LGBMClassifier(
    n_estimators=1400,
    learning_rate=0.015,
    max_depth=4,
    objective='binary',
    class_weight={0: 1, 1: 250},  # counterpart of scale_pos_weight
    reg_lambda=9.5,
    num_leaves=2**4,              # should match max_depth
    max_bin=32,                   # counterpart of border_count
    subsample_freq=1,
    subsample=0.82,               # rough counterpart of bagging_temperature
    colsample_bytree=0.76,        # used in place of random_strength (this note sat on the `device` line; presumably it belongs here)
    device='gpu'
)

# Baseline with default hyperparameters (CPU); this is the model the
# experiment below is run with.
lgbm_model = LGBMClassifier()

In [None]:
# LightGBM on the full non-linear dataset.
lgbm_metrics = run_experiment(
    model=lgbm_model,
    data=data_non_lin,
    model_name='LGBMClassifier',
    target=target,
)

[LightGBM] [Info] Number of positive: 2347, number of negative: 547606
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.167792 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 48374
[LightGBM] [Info] Number of data points in the train set: 549953, number of used features: 197
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.004268 -> initscore=-5.452418
[LightGBM] [Info] Start training from score -5.452418


# XGBoost

In [None]:
# %pip installs into the environment of the running kernel, which `!pip` does
# not guarantee; -q matches the other install cells.
%pip install -q xgboost



In [None]:
from xgboost import XGBClassifier

In [None]:
# Custom configuration. It has its own name so the baseline assignment below
# no longer overwrites it.
xgboost_tuned_model = XGBClassifier(
    n_estimators=1400,
    learning_rate=0.015,
    max_depth=4,
    scale_pos_weight=250,
    reg_lambda=9.5,
    max_bin=32,
    colsample_bytree=0.76,
    subsample=0.82,
    eval_metric="aucpr",
    tree_method="hist",
)

# Baseline with default hyperparameters; this is the model the experiment
# below is run with.
xgboost_model = XGBClassifier()

In [None]:
# XGBoost on the full non-linear dataset.
xgboost_metrics = run_experiment(
    model=xgboost_model,
    data=data_non_lin,
    model_name='XGBoost',
    target=target,
)

In [None]:
# One row per experiment, indexed by the model name stored in each metrics dict.
models_statistics = pd.DataFrame(
    [
        log_reg_metrics,
        lin_svm_metrics,
        rbf_svm_metrics,
        knn_metrics,
        tree_metrics,
        random_forest_metrics,
        catboost_metrics,
        lgbm_metrics,
        xgboost_metrics,
    ]
).set_index('model')

In [None]:
# Rank the models by balanced accuracy, best first.
models_statistics.sort_values('weighted_accuracy', ascending=False)

Unnamed: 0_level_0,f1,precision,recall,weighted_accruacy,roc_auc,pr_auc,time
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
DecisionTreeClassifier,0.076397,0.069001,0.085568,0.540308,0.520491,0.00986,256.927801
XGBoost,0.086548,0.164286,0.058748,0.528733,0.833775,0.062004,75.817216
LGBMClassifier,0.058552,0.073786,0.048531,0.522959,0.905399,0.055228,65.422262
CatBoostClassifier,0.038369,0.313725,0.020434,0.510121,0.922931,0.10421,537.551985
LogisticRegression,0.019417,0.195122,0.010217,0.505018,0.822843,0.058617,7.063285
KNN,0.0,0.0,0.0,0.5,0.559988,0.010869,0.022454
SVM,0.0,0.0,0.0,0.5,,,46.514925
SVM,0.0,0.0,0.0,0.5,,,21.346785
RandomForestClassifier,0.0,0.0,0.0,0.499766,0.851279,0.021204,29.158497
