# Final model

#### Let's fit and save the best model

## Imports

In [1]:
!pip install catboost -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [14]:
import pickle

import pandas as pd
from catboost import CatBoostClassifier
from catboost.utils import get_gpu_device_count
from sklearn.metrics import (
    average_precision_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split



## Data loading

In [3]:
# Name of the label column; it is separated from the feature columns at load time.
TARGET = "BANKR"

In [4]:
# Read the dataset and separate the label column from the feature columns.
data = pd.read_csv('drive/MyDrive/data_catboost.csv')
y = data[TARGET]
X = data.drop(columns=TARGET)

# Stratified hold-out split: 25% of the rows form the test set; the seed is
# fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

## Model training

In [5]:
def set_to_gpu(params):
    """Point a CatBoost parameter dict at the GPU when one is available.

    The mapping is modified in place: ``task_type`` becomes ``"GPU"`` and
    ``devices`` becomes ``"0"`` if ``get_gpu_device_count()`` reports at least
    one device; otherwise they are set to ``"CPU"`` and ``""``.

    Args:
        params: Mutable mapping of model parameters.

    Returns:
        The same ``params`` object that was passed in.
    """
    if get_gpu_device_count() > 0:
        task_type, devices = "GPU", "0"
    else:
        task_type, devices = "CPU", ""

    params["task_type"] = task_type
    params["devices"] = devices
    return params

In [6]:
# Parameters passed to CatBoostClassifier for the final model.
# NOTE: task_type / devices listed here are only placeholders; set_to_gpu()
# below overwrites both depending on whether a GPU is detected.
best_params = dict(
    iterations=1486,
    learning_rate=0.028711863825961668,
    depth=3,
    scale_pos_weight=304.53677886131595,
    l2_leaf_reg=14.072790632515524,
    border_count=78,
    random_strength=0.47114280903385825,
    bagging_temperature=0.5886678131427623,
    task_type="GPU",
    devices="0",
    verbose=250,
    eval_metric="BalancedAccuracy",
    loss_function="Logloss",
)

best_params = set_to_gpu(best_params)

In [7]:
# Build the classifier from best_params and train it on the training split only;
# the test split stays untouched for the evaluation section.
model = CatBoostClassifier(**best_params)
model.fit(X=X_train, y=y_train)

Default metric period is 5 because BalancedAccuracy is/are not implemented for GPU
Metric BalancedAccuracy is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.8130280	total: 44.5ms	remaining: 1m 6s
250:	learn: 0.8606713	total: 4.32s	remaining: 21.3s
500:	learn: 0.8734271	total: 8.38s	remaining: 16.5s
750:	learn: 0.8822416	total: 11.2s	remaining: 11s
1000:	learn: 0.8903859	total: 14.3s	remaining: 6.93s
1250:	learn: 0.8969527	total: 19.4s	remaining: 3.65s
1485:	learn: 0.9022362	total: 22.5s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7df301b782c0>

## Model evaluation

In [8]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Label-based metrics are computed from ``model.predict``. If the model also
    exposes ``predict_proba``, the second column of its output (index 1,
    presumably the positive-class probability) is used for the ranking metrics.

    Requires ``average_precision_score`` (together with the other metric
    functions) to be imported from ``sklearn.metrics`` at the top of the
    notebook.

    Args:
        model: Fitted estimator with ``predict`` and, optionally,
            ``predict_proba``.
        X_test: Features to predict on.
        y_test: True labels aligned with ``X_test``.

    Returns:
        dict mapping metric name to a plain ``float``. Always contains
        ``"f1"``, ``"precision"``, ``"recall"`` and ``"weighted_accuracy"``
        (balanced accuracy); ``"roc_auc"`` and ``"pr_auc"`` are added only when
        ``predict_proba`` is available.
    """
    y_pred = model.predict(X_test)

    # Cast to float so every value has the same type regardless of whether the
    # metric function returns a Python float or a NumPy scalar.
    results = {
        "f1": float(f1_score(y_test, y_pred)),
        "precision": float(precision_score(y_test, y_pred)),
        "recall": float(recall_score(y_test, y_pred)),
        "weighted_accuracy": float(balanced_accuracy_score(y_test, y_pred)),
    }

    # Ranking metrics need scores rather than hard labels.
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
        results["roc_auc"] = float(roc_auc_score(y_test, y_proba))
        results["pr_auc"] = float(average_precision_score(y_test, y_proba))

    return results

In [11]:
# Score the fitted model on the held-out test split; the returned metrics dict
# is the cell's output.
evaluate_model(model=model, X_test=X_test, y_test=y_test)

{'f1': 0.05510094987458366,
 'precision': 0.028467029231815093,
 'recall': 0.855683269476373,
 'weighted_accuracy': np.float64(0.8652070715037383),
 'roc_auc': np.float64(0.9270615432628764),
 'pr_auc': np.float64(0.10780198557485542)}

## Saving the model

In [15]:
# Persist the fitted model as a pickle file.
# NOTE: only unpickle this file from a trusted location; loading a pickle can
# execute arbitrary code.
with open('drive/MyDrive/best_cb_model.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)