In [2]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from catboost import CatBoostClassifier
   
    
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
)
    
from sklearn.pipeline import Pipeline

from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    f1_score,
    confusion_matrix,
    recall_score,
    precision_score,
    accuracy_score
)
    
from sklearn.feature_selection import (
    SelectFromModel,
)

from sklearn_pandas import DataFrameMapper, gen_features
from sklearn.base import BaseEstimator, TransformerMixin
    

sns.set(style="darkgrid", font_scale=1.4)

RANDOM_STATE = 42

* Загрузим данные

In [3]:
data = pd.read_csv("heart_train.csv")

In [4]:
data = data[(data["Blood sugar"] > data["Blood sugar"].quantile(0.01)) & (data["Blood sugar"] < data["Blood sugar"].quantile(0.99))]
data = data[(data["Heart rate"] > data["Heart rate"].quantile(0.01)) & (data["Heart rate"] < data["Heart rate"].quantile(0.99))]

* Разобьм данные на выборки

In [5]:
X = data.drop(columns=["Heart Attack Risk (Binary)"])
y = data["Heart Attack Risk (Binary)"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
    stratify=y
)

In [7]:
num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
cat_cols = data.select_dtypes(include=["object"]).columns.tolist()

In [8]:
class num_cust_transformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):

        data = X.copy()

        data = data.drop(columns=["Unnamed: 0","id"])

        data[["Stress Level"]] = data[["Stress Level"]].fillna(1)
        data = data.fillna(0)

        data["Sleep Hours Per Day"] = round(data["Sleep Hours Per Day"],2)

        #data = data[(data["Blood sugar"] > data["Blood sugar"].quantile(0.01)) & (data["Blood sugar"] < data["Blood sugar"].quantile(0.99))]
        #data = data[(data["Heart rate"] > data["Heart rate"].quantile(0.01)) & (data["Heart rate"] < data["Heart rate"].quantile(0.99))]

        return data


In [9]:
class cat_cust_transformer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):

        data = X.copy()

        data["Gender"] = data["Gender"].str.replace("1.0","Male").str.replace("0.0","Female")

        return data

### Применим вышенаписанные преобразования для набора данных

In [10]:
gen_num_cols = (
    num_cols,
    [
        num_cust_transformer()
    ],
    {"alias":f"num_col"}
)



In [11]:
gen_cat_cols = (
    cat_cols,
    [
        cat_cust_transformer()
    ],
    {"alias":"cat_col"}
)

In [12]:
preprocess_mapper = DataFrameMapper(
    
    [
        gen_num_cols,
        gen_cat_cols
    ],
    input_df=True,
    df_out=True
)

### Составим кастомный CatBoost

In [13]:
class CustCatBoostClassifier(CatBoostClassifier):

    
    def fit(self, X_train, y_train, **fit_params):
        
        cat_features = X_train.select_dtypes(include=["object"]).columns.tolist()
        
        print(1)

        return super().fit(
            X_train,
            y_train,
            cat_features=cat_features,
            **fit_params
        )

### Составим кастомный селектор

In [14]:
class CustCatBoostSelector(SelectFromModel):

    def transform(self, X):
        
        important_features_indices = list(self.get_support(indices=True))

        
        _X = X.iloc[:, important_features_indices].copy()

        return _X

### Создадим кастомный Pipeline для модели

In [15]:
final_pipe = Pipeline([
    ("preprocessor",preprocess_mapper),
    (
        "selector",
        CustCatBoostSelector(
            estimator=CustCatBoostClassifier(verbose=False),
            max_features=2
        )
    ),
    (
        "model",
        CustCatBoostClassifier(
            learning_rate=0.001,
            random_state=RANDOM_STATE,
            scale_pos_weight=2,
            verbose=True
        )
    )
])

In [16]:
param_grid = (
    {
        "model__depth":[5],
        "selector__max_features":[3],
        "model__max_bin":[250],
        "model__l2_leaf_reg":[6]
    }
)


gs = GridSearchCV(
    final_pipe,
    param_grid,
    scoring="f1",
    n_jobs=-1,
    cv=3,
    verbose=True

)

In [17]:
gs.fit(X_train,y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits




1
1
0:	learn: 0.6931367	total: 2.22ms	remaining: 2.22s
1:	learn: 0.6931223	total: 4.16ms	remaining: 2.07s
2:	learn: 0.6931124	total: 6.28ms	remaining: 2.08s
3:	learn: 0.6931062	total: 8.34ms	remaining: 2.08s
4:	learn: 0.6930967	total: 10.4ms	remaining: 2.06s
5:	learn: 0.6930767	total: 12.4ms	remaining: 2.05s
6:	learn: 0.6930475	total: 14.6ms	remaining: 2.07s
7:	learn: 0.6930266	total: 16.7ms	remaining: 2.07s
8:	learn: 0.6930202	total: 18.8ms	remaining: 2.07s
9:	learn: 0.6929993	total: 20.8ms	remaining: 2.06s
10:	learn: 0.6929784	total: 22.7ms	remaining: 2.04s
11:	learn: 0.6929517	total: 24.8ms	remaining: 2.04s
12:	learn: 0.6929448	total: 26.9ms	remaining: 2.04s
13:	learn: 0.6929341	total: 28.9ms	remaining: 2.03s
14:	learn: 0.6929232	total: 30.9ms	remaining: 2.03s
15:	learn: 0.6928984	total: 33.1ms	remaining: 2.03s
16:	learn: 0.6928831	total: 35.1ms	remaining: 2.03s
17:	learn: 0.6928455	total: 37.1ms	remaining: 2.02s
18:	learn: 0.6928371	total: 39ms	remaining: 2.01s
19:	learn: 0.6928300

In [18]:
gs.best_score_

np.float64(0.49323475023372093)

In [19]:
f1_score(y_test, gs.predict(X_test))



0.5156059991892987

In [20]:
confusion_matrix(y_test, gs.predict(X_test))



array([[ 231, 1110],
       [  85,  636]])

In [21]:
accuracy_score(y_test, gs.predict(X_test))



0.4204655674102813

### Вывод:

В рамках этой записной книжки я реализовал модель CatBoost. Сделал для нее Pipeline. Метрика качества стала лучше чем у прошлой модели.