In [1]:
import time
import numpy as np
from numpy import exp
import pandas as pd
import catboost as cb
import os

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", 30)

## Useful Functions

In [2]:
def get_input(data_path: str) -> pd.DataFrame:
    """
    Считывание данных и вывод основной информации о наборе данных.

    Parameters
    ----------
    data_path: str
        Название файла.

    Returns
    -------
    data: pandas.core.frame.DataFrame
        Загруженный набор данных в pandas.DataFrame

    """
    base_path = "data"
    data = pd.read_csv(f"{base_path}/{data_path}")
    data.columns = [col.lower() for col in data.columns]
    print(f"{data_path}: shape = {data.shape[0]} rows, {data.shape[1]} cols")

    return data

In [3]:
def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Кросс-валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    cv: KFold or StratifiedKFold generator.
        Объект KFold / StratifiedKFold для определения
        стратегии кросс-валидации модели.

    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimators: list
        Список с объектами обученной модели.

    oof_preds: np.array
        Вектор OOF-прогнозов.

    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
    X[categorical] = X[categorical].astype(str)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train, categorical,
            eval_set=[(x_train, y_train), (x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

In [4]:
def catboost_hold_out_validation(params, X, y, split_params = [0.7, 0.2, 0.1], categorical = None):
    """
    Hold-Out валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    split_params: List[float], optional, default = [0.7, 0.2, 0.1]
        Параметры (доли) разбиения выборки.
        Опциональный параметр, по умолчанию, равен [0.7, 0.2, 0.1].
    
    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimator: catboost.core.CatBoostClassifier
        Обученный классификатор catboost.

    test_prediction: np.array, optional
        Вектор прогнозов для тестовой выборки.
        Опциональный объект, возвращается только, если split_params
        содержит 3 значения.

    """
    numeric = list(set(x_train.columns) - set(categorical))
    x_train, x_valid = train_test_split(
        X, train_size=split_params[0], random_state=27
    )
    y_train, y_valid = train_test_split(
        y, train_size=split_params[0], random_state=27
    )

    if len(split_params) == 3:
        test_size = int(split_params[2] * X.shape[0])

        x_valid, x_test = train_test_split(
            x_valid, test_size=test_size, random_state=72
        )
        y_valid, y_test = train_test_split(
            y_valid, test_size=test_size, random_state=72
        )

    model = cb.CatBoostClassifier(**params)
    model.fit(
        x_train, y_train, categorical,
        eval_set=[(x_train, y_train), (x_valid, y_valid)]
    )

    print("="*80)
    valid_score = roc_auc_score(y_valid, model.predict_proba(x_valid)[:, 1])
    print(f"Valid Score = {round(valid_score, 4)}")

    if len(split_params) == 3:

        test_prediction = model.predict_proba(x_test)[:, 1]
        test_score = roc_auc_score(y_test, test_prediction)
        print(f"Test Score = {round(test_score, 4)}")

        return estimator, test_prediction

    else:
        return estimator

## Base Tables

In [5]:
train = get_input("train.csv")
test = get_input("test.csv")

data = pd.concat([train, test], axis=0)
data = data.reset_index(drop=True)
data.head(n=2)

train.csv: shape = 617 rows, 58 cols
test.csv: shape = 5 rows, 57 cols


Unnamed: 0,id,ab,af,ah,am,ar,ax,ay,az,bc,bd,bn,bp,bq,br,...,eu,fc,fd,fe,fi,fl,fr,fs,gb,ge,gf,gh,gi,gl,class
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,22.5984,175.638726,152.707705,823.928241,...,3.828384,13.39464,10.265073,9028.291921,3.58345,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1.0
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.63219,0.025578,13.51779,1.2299,5496.92824,19.4205,155.86803,14.75472,51.216883,...,52.26048,17.175984,0.29685,6785.003474,10.358927,0.173229,0.49706,0.568932,9.292698,72.611063,27981.56275,29.13543,32.131996,21.978,0.0


In [6]:
train["class"].value_counts()

0    509
1    108
Name: class, dtype: int64

## greeks

In [7]:
greeks = get_input("greeks.csv")
greeks.head(n=2)

greeks.csv: shape = 617 rows, 6 cols


Unnamed: 0,id,alpha,beta,gamma,delta,epsilon
0,000ff2bfdfe9,B,C,G,D,3/19/2019
1,007255e47698,A,C,M,B,Unknown


## baseline

In [8]:
features_to_drop = ["id", "class"]


target, test_id = train["class"], test["id"]
train = train.drop(features_to_drop, axis=1)
test = test.drop(["id"], axis=1)

categorial = train.dtypes[train.dtypes == "object"].index
numerical = list(set(train.columns) - set(categorial))

train = train.replace(np.inf, np.nan)
train = train.replace(-np.inf, np.nan)

In [9]:
list(categorial)

['ej']

## KFold

In [10]:
cb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 10,
    "max_depth": 6,
    "l2_leaf_reg": 10,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 42
}

cv = KFold(n_splits=5, random_state=1234123, shuffle=True)

estimators, oof_preds = catboost_cross_validation(
    params=cb_params, X=train, y=target, cv=cv, categorical=list(categorial)
)

Thu Aug  3 03:21:51 2023, Cross-Validation, 617 rows, 56 cols
0:	test: 0.7753101	test1: 0.7617739	best: 0.7617739 (0)	total: 146ms	remaining: 4m 51s
10:	test: 0.9700018	test1: 0.8653846	best: 0.8712716 (8)	total: 201ms	remaining: 36.3s
20:	test: 0.9740075	test1: 0.8720565	best: 0.8763736 (13)	total: 254ms	remaining: 23.9s
30:	test: 0.9782209	test1: 0.8795133	best: 0.8814757 (28)	total: 291ms	remaining: 18.5s
40:	test: 0.9784583	test1: 0.9026688	best: 0.9026688 (40)	total: 319ms	remaining: 15.3s
50:	test: 0.9805056	test1: 0.9116954	best: 0.9128728 (48)	total: 349ms	remaining: 13.3s
60:	test: 0.9824343	test1: 0.9191523	best: 0.9191523 (60)	total: 373ms	remaining: 11.8s
70:	test: 0.9840959	test1: 0.9207221	best: 0.9226845 (67)	total: 399ms	remaining: 10.8s
80:	test: 0.9848674	test1: 0.9164050	best: 0.9226845 (67)	total: 421ms	remaining: 9.96s
90:	test: 0.9862619	test1: 0.9222920	best: 0.9226845 (67)	total: 443ms	remaining: 9.3s
100:	test: 0.9873598	test1: 0.9218995	best: 0.9226845 (67)	to

In [11]:
oof_score = roc_auc_score(
    target, oof_preds
)
print(f"OOF-score = {round(oof_score, 5)}")
# [0.72194, 0.72659, 0.73283, 0.72053, 0.72657]
# OOF-score = 0.72481

OOF-score = 0.87364


## Подготовка прогноза

In [13]:
y_pred = np.zeros((test.shape[0], 2))
test[numerical] = test[numerical].astype(float)
test[categorial] = test[categorial].astype(str)

for estimator in estimators:
    y_pred += estimator.predict_proba(test)
y_pred

array([[2.9765253, 2.0234747],
       [2.9765253, 2.0234747],
       [2.9765253, 2.0234747],
       [2.9765253, 2.0234747],
       [2.9765253, 2.0234747]])

In [14]:
sigmoid = lambda x: 1/ (1+ exp(-x))
prob = sigmoid(y_pred)
prob

array([[0.95150228, 0.88323982],
       [0.95150228, 0.88323982],
       [0.95150228, 0.88323982],
       [0.95150228, 0.88323982],
       [0.95150228, 0.88323982]])

In [15]:
# y_pred_1 = np.zeros(test.shape[0])
# test[numerical] = test[numerical].astype(float)
# test[categorial] = test[categorial].astype(str)
# y_pred_0 = np.zeros(test.shape[0])
# test[numerical] = test[numerical].astype(float)
# test[categorial] = test[categorial].astype(str)

# for estimator in estimators:
#     print(estimator.predict_proba(test))
#     y_pred_1 += estimator.predict_proba(test)[:,1]
#     y_pred_0 += estimator.predict_proba(test)[:,-1]
# y_pred_1, y_pred_0

In [16]:
if os.path.isdir("predict") == False:
    os.mkdir("predict")
    print("Папка predict создана")

In [17]:
d = pd.read_csv("data\sample_submission.csv")
d.head()

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5


In [18]:
pred = pd.DataFrame({
    "id": test_id,
    "class_0": prob[:,0],
    "class_1": prob[:,1]
})
pred.to_csv("predict/baseline_submit.csv", index=False)

In [19]:
pred

Unnamed: 0,id,class_0,class_1
0,00eed32682bb,0.951502,0.88324
1,010ebe33f668,0.951502,0.88324
2,02fa521e1838,0.951502,0.88324
3,040e15f562a2,0.951502,0.88324
4,046e85c7cc7f,0.951502,0.88324
