In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split,  KFold
import catboost as cb
import time
import math
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
np.random.seed(13)

In [3]:
df_train = pd.read_csv('../data/assignment_train.csv')
df_test = pd.read_csv('../data/assignment_test.csv')

In [4]:
def catboost_cross_validation(params, X, y, cv, categorical = None):
    """
    Кросс-валидация для модели catbooost.

    Parameters
    ----------
    params: dict
        Словарь гиперпараметров модели.

    X: pandas.core.frame.DataFrame
        Матрица признако для обучения модели.

    y: pandas.core.frame.Series
        Вектор целевой переменной для обучения модели.

    cv: KFold or StratifiedKFold generator.
        Объект KFold / StratifiedKFold для определения
        стратегии кросс-валидации модели.

    categorical: str, optional, default = None
        Список категориальных признаков.
        Опциональный параметр, по умолчанию, не используется.

    Returns
    -------
    estimators: list
        Список с объектами обученной модели.

    oof_preds: np.array
        Вектор OOF-прогнозов.

    """
    estimators, folds_scores = [], []
    oof_preds = np.zeros(X.shape[0])

    print(f"{time.ctime()}, Cross-Validation, {X.shape[0]} rows, {X.shape[1]} cols")
    X[categorical] = X[categorical].astype(str)

    for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):

        x_train, x_valid = X.loc[train_idx], X.loc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        model = cb.CatBoostClassifier(**params)
        model.fit(
            x_train, y_train, categorical,
#             eval_set=[(x_train, y_train), (x_valid, y_valid)]
        )
        oof_preds[valid_idx] = model.predict_proba(x_valid)[:, 1]
        score = roc_auc_score(y_valid, oof_preds[valid_idx])
        print(f"Fold {fold+1}, Valid score = {round(score, 5)}")
        folds_scores.append(round(score, 5))
        estimators.append(model)

    print(f"Score by each fold: {folds_scores}")
    print("="*65)
    return estimators, oof_preds

In [5]:
cb_params = {
    "n_estimators": 2000,
    "learning_rate": 0.05,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "GPU",
    "max_bin": 20,
    "silent": True,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 17
}

In [6]:
def get_score(df_train):
    train = df_train.drop(['isFraud', "TransactionID", "TransactionDT"], axis=1)
    target = df_train['isFraud']

    categorial = train.dtypes[train.dtypes == "object"].index
    cv = KFold(n_splits=5, random_state=13, shuffle=True)

    estimators, oof_preds = catboost_cross_validation(
        params=cb_params, X=train, y=target, cv=cv, categorical=categorial
    )
    
    oof_score = roc_auc_score(target, oof_preds)
    
    print(f"OOF-score = {round(oof_score, 5)}")
    return estimators, oof_preds

In [7]:
estimators, oof_preds = get_score(df_train)

Sat Sep 26 15:34:13 2020, Cross-Validation, 50001 rows, 391 cols
Fold 1, Valid score = 0.87622
Fold 2, Valid score = 0.90444
Fold 3, Valid score = 0.89173
Fold 4, Valid score = 0.89442
Fold 5, Valid score = 0.88596
Score by each fold: [0.87622, 0.90444, 0.89173, 0.89442, 0.88596]
OOF-score = 0.8897


In [8]:
start_date = pd.to_datetime('2017-12-01')
DATE = 'TransactionDT'
df_train[DATE] = pd.to_datetime(df_train[DATE], unit='s', origin=start_date)
df_train[DATE + '_Y'] = df_train[DATE].dt.year
df_train[DATE + '_M'] = df_train[DATE].dt.month
df_train[DATE + '_WD'] = df_train[DATE].dt.weekday
df_train[DATE + '_H'] = df_train[DATE].dt.hour
df_train[DATE + '_D'] = df_train[DATE].dt.day

In [9]:
estimators, oof_preds = get_score(df_train)

Sat Sep 26 15:44:18 2020, Cross-Validation, 50001 rows, 396 cols
Fold 1, Valid score = 0.87678
Fold 2, Valid score = 0.90482
Fold 3, Valid score = 0.88937
Fold 4, Valid score = 0.89613
Fold 5, Valid score = 0.88413
Score by each fold: [0.87678, 0.90482, 0.88937, 0.89613, 0.88413]
OOF-score = 0.88943


In [10]:
def get_group_val_card(X):
    cards = [f'card{i}' for i in range(1,7)]
    res = X[['TransactionAmt']+cards].copy()
    for i in cards:
        m = res.groupby(i)['TransactionAmt'].mean()
        res[f'{i}_mean'] = res[i].map(m)
        res[f'{i}_diff'] = res['TransactionAmt'] - res[f'{i}_mean']
        res[f'{i}_rel'] = res['TransactionAmt'] / res[f'{i}_mean']
    return res.drop(['TransactionAmt']+cards, axis=1)

df_train = pd.concat([df_train,  get_group_val_card(df_train)], axis=1)

In [11]:
estimators, oof_preds = get_score(df_train)

Sat Sep 26 15:54:08 2020, Cross-Validation, 50001 rows, 414 cols
Fold 1, Valid score = 0.87891
Fold 2, Valid score = 0.90643
Fold 3, Valid score = 0.89125
Fold 4, Valid score = 0.89721
Fold 5, Valid score = 0.88511
Score by each fold: [0.87891, 0.90643, 0.89125, 0.89721, 0.88511]
OOF-score = 0.89108


In [14]:
def process_card(df_train):
    cards = [f'card{i}' for i in range(1,7)]
    for i in cards:
        freq_encoder = df_train[i].value_counts(normalize=True)
        df_train[i + "_freq_enc"] = df_train[i].map(freq_encoder)        

    return df_train

df_train = process_card(df_train)

In [15]:
estimators, oof_preds = get_score(df_train)

Sat Sep 26 16:06:57 2020, Cross-Validation, 50001 rows, 420 cols
Fold 1, Valid score = 0.87691
Fold 2, Valid score = 0.90407
Fold 3, Valid score = 0.89064
Fold 4, Valid score = 0.89192
Fold 5, Valid score = 0.88724
Score by each fold: [0.87691, 0.90407, 0.89064, 0.89192, 0.88724]
OOF-score = 0.88963


In [16]:
value_colum = "TransactionAmt"
df_train['Log_' + value_colum] = np.log(df_train[value_colum])
df_train['LogFloor_' + value_colum] = df_train[value_colum].apply(lambda x: math.floor(np.log(x)))
df_train['LogRemainder_' + value_colum] = df_train['Log_' + value_colum] - df_train['LogFloor_' + value_colum]

In [17]:
estimators, oof_preds = get_score(df_train)

Sat Sep 26 16:17:22 2020, Cross-Validation, 50001 rows, 423 cols
Fold 1, Valid score = 0.87964
Fold 2, Valid score = 0.90278
Fold 3, Valid score = 0.89216
Fold 4, Valid score = 0.89158
Fold 5, Valid score = 0.88678
Score by each fold: [0.87964, 0.90278, 0.89216, 0.89158, 0.88678]
OOF-score = 0.89007


In [26]:
pca = PCA(n_components = 4)
scalar = StandardScaler()

value_colum =  df_train.drop(['TransactionID', 'TransactionDT', 'isFraud'], axis=1).select_dtypes(include=[np.number]).columns
X_std = scalar.fit_transform(df_train[value_colum].apply(lambda x: x.fillna(x.mean())))
X_std_pca = pca.fit_transform(X_std)

v_col = 'pca'

df_train[[v_col + '_c1', v_col + '_c2', v_col + '_c3', v_col + '_c4']] = \
    pd.DataFrame(X_std_pca, columns = [v_col + '_c1', v_col + '_c2', v_col + '_c3', v_col + '_c4'])

In [28]:
estimators, oof_preds = get_score(df_train)

Sat Sep 26 16:33:12 2020, Cross-Validation, 50001 rows, 427 cols
Fold 1, Valid score = 0.8736
Fold 2, Valid score = 0.90579
Fold 3, Valid score = 0.89279
Fold 4, Valid score = 0.89293
Fold 5, Valid score = 0.88365
Score by each fold: [0.8736, 0.90579, 0.89279, 0.89293, 0.88365]
OOF-score = 0.88927


In [78]:
def calculate_permutation_importance(estimators, x_valid: pd.DataFrame,
                                     y_valid: pd.Series) -> pd.Series:   

    scores = {}
    y_pred = np.zeros(x_valid.shape[0])
    for estimator in estimators:
        y_pred += estimator.predict_proba(x_valid)[:, 1]    
    y_pred = y_pred / len(estimators)
        
    base_score = roc_auc_score(y_valid, y_pred)    

    for feature in x_valid.columns:
        x_valid_copy = x_valid.copy()
        x_valid_copy[feature] = np.random.permutation(x_valid_copy[feature])

        y_pred = np.zeros(x_valid_copy.shape[0])
        for estimator in estimators:
            y_pred += estimator.predict_proba(x_valid_copy)[:, 1]    
        y_pred = y_pred / len(estimators)
        score = roc_auc_score(y_valid, y_pred)    

        scores[feature] = base_score - score

    scores = pd.Series(scores)
    scores = scores.sort_values(ascending=False)

    return scores

In [79]:
train = df_train.drop(['isFraud', "TransactionID", "TransactionDT"], axis=1).fillna(-9999)
target = df_train['isFraud']

In [101]:
perm_importance = calculate_permutation_importance(estimators, train, target)
col = perm_importance[perm_importance > 0.01].index.to_list() + ['isFraud', "TransactionID", "TransactionDT"]
df_train_x = df_train[col]

In [102]:
estimators, oof_preds = get_score(df_train_x)

Sat Sep 26 22:47:06 2020, Cross-Validation, 50001 rows, 70 cols
Fold 1, Valid score = 0.88042
Fold 2, Valid score = 0.90104
Fold 3, Valid score = 0.89266
Fold 4, Valid score = 0.89507
Fold 5, Valid score = 0.88546
Score by each fold: [0.88042, 0.90104, 0.89266, 0.89507, 0.88546]
OOF-score = 0.89012


Вывод: catboost под капотом имеет много функций и сильно повлиять на предсказание довольно сложно, как показывает эксперимент добавление новых признаков может ухучшать результат и улучшать. Когда признаков становится более 200 имеет смысл провести отбор признаков