In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
import warnings
# Suppress every Python warning so that cell outputs stay readable.
warnings.simplefilter("ignore")

# Seed NumPy's global RNG; any call that falls back to it becomes repeatable
# on a fresh top-to-bottom run.
SEED = 42
np.random.seed(SEED)

In [2]:
# Load the training table and separate the label from the features.
df = pd.read_csv('train.csv')
y = df['target']

# Drop the identifier and the label, then replace every missing value with
# the sentinel -9999 (built as a new frame instead of mutating in place).
df = df.drop(columns=['ID', 'target']).fillna(-9999)

# Positional indices of the columns passed to CatBoost as categorical:
# every column with fewer than 30000 distinct values.
cat_features_ids = [
    position
    for position, n_distinct in enumerate(df.nunique())
    if n_distinct < 30000
]

In [3]:
# Hold out 10% of the rows as the final test set.
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.1,random_state = 42)
# Split the remaining rows 75/25 into fitting and validation parts.
# random_state is pinned here as well: without it the split is drawn from
# NumPy's global RNG, so re-running this cell (or running it after anything
# else has consumed that RNG) would silently produce a different
# train/validation partition and a different score.
train, val, y_train, y_val = train_test_split(train, y_train, test_size = 0.25, random_state = 42)

In [4]:
# Baseline: a single CatBoost model fitted on `train`, with `val` passed as
# the evaluation set, scored on the held-out `test` rows.
catboost_params = dict(
    learning_rate=0.1,
    iterations=100,
    random_seed=42,
    eval_metric='AUC',
    logging_level='Silent',
)
clf = CatBoostClassifier(**catboost_params)
clf.fit(train, y_train, cat_features=cat_features_ids, eval_set=(val, y_val))

# predict_proba returns one column per class; column 1 is the one scored.
prediction = clf.predict_proba(test)
single_model_auc = roc_auc_score(y_test, prediction[:, 1])
print('Roc-auc score with Catboost:', single_model_auc)

Roc-auc score with Catboost: 0.7841281938499387


In [5]:
# 10-fold bagging: refit `clf` (created in an earlier cell) once per fold of
# the 90% training part, predict the held-out test rows each time, and score
# the average of the ten probability vectors.
kfold = KFold(n_splits=10)
train, test, y_train, y_test = train_test_split(df, y, test_size = 0.1,random_state = 42)

pred = []
for fit_rows, eval_rows in kfold.split(train):
    # Rows used for fitting in this fold.
    fold_X, fold_y = train.iloc[fit_rows, :], y_train.iloc[fit_rows]
    # The left-out fold is handed to CatBoost as its evaluation set.
    eval_X, eval_y = train.iloc[eval_rows, :], y_train.iloc[eval_rows]

    clf.fit(fold_X, fold_y, cat_features=cat_features_ids, eval_set=(eval_X, eval_y))

    # Keep column 1 of the class-probability matrix for the test rows.
    prediction = clf.predict_proba(test)
    pred.append(prediction[:, 1])

# Average the per-fold test probabilities element-wise before scoring.
bagged_proba = np.mean(pred, axis = 0)
print('Roc-auc score with Catboost:', roc_auc_score(y_test, bagged_proba))

Roc-auc score with Catboost: 0.7930162585925847
