In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score

# Load the raw competition data.
# NOTE(review): train.csv is read with the default RangeIndex while test.csv
# uses its first column as the index -- confirm train.csv has no id column,
# otherwise that column ends up in X as a feature.
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv", index_col=[0])

# Per-column count of missing values in the training data. Print only the
# columns that actually contain NaNs (a bare expression in the middle of a
# cell is evaluated and silently discarded, so it has to be printed).
null_value_stats = train_df.isnull().sum(axis=0)
print(null_value_stats[null_value_stats != 0])

# Replace missing values with the sentinel -999 in both frames. Reassigning
# (instead of inplace=True) keeps the statement chainable and explicit.
train_df = train_df.fillna(-999)
test_df = test_df.fillna(-999)

# Features are every column except the target column 'Label'.
X = train_df.drop(columns='Label')
y = train_df['Label']

# Hold out 25% of the training rows for validation (fixed seed).
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.75, random_state=42)

X_test = test_df

In [None]:
import hyperopt
from numpy.random import default_rng
from hyperopt import hp

def hyperopt_objective(params):
    """Hyperopt loss for one CatBoost configuration.

    Runs CatBoost cross-validation on the full training data (module-level
    ``X`` and ``y``) with the given parameters and scores the configuration
    by the best mean test PRAUC reached over the boosting iterations.

    Args:
        params: dict of CatBoostClassifier keyword arguments, as sampled by
            hyperopt from the search space. The result is read from the
            ``'test-PRAUC-mean'`` column of the CV output, so ``eval_metric``
            is expected to be ``"PRAUC"``.

    Returns:
        ``1 - best_mean_test_PRAUC``; hyperopt minimises the objective, so a
        higher PRAUC yields a lower loss.
    """
    # Work on a copy so the caller's dict is never modified.
    params = dict(params)
    if 'depth' in params:
        # hp.quniform samples floats (e.g. 7.0); tree depth is an integer
        # parameter, so convert it explicitly before handing it to CatBoost.
        params['depth'] = int(round(params['depth']))

    model = CatBoostClassifier(**params)

    cv_data = cv(
        Pool(X, y),
        model.get_params(),
        logging_level='Silent',
    )
    best_prauc = np.max(cv_data['test-PRAUC-mean'])

    return 1 - best_prauc  # hyperopt minimises, so invert the score

# Parameter dict handed to hyperopt: the four hp.* entries are the tuned
# dimensions, every other entry is a fixed CatBoost setting that is passed
# through to the objective unchanged.
params = dict(
    iterations=10000,
    l2_leaf_reg=hp.loguniform('l2_leaf_reg', np.log(1e-5), np.log(10)),
    model_size_reg=hp.loguniform('model_size_reg', np.log(1e-5), np.log(10)),
    # Sampled in steps of 1 between 4 and 10; note the sampled value is a float.
    depth=hp.quniform('depth', 4, 10, 1),
    learning_rate=hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
    eval_metric="PRAUC",
    loss_function=metrics.CrossEntropy(),
    random_seed=42,
    logging_level='Silent',
    use_best_model=True,
    od_type='Iter',
    od_wait=40,
    task_type="GPU",
)

# Records every evaluated configuration and its loss.
trials = hyperopt.Trials()

# TPE search, 50 evaluations, seeded so the sampled sequence is repeatable.
best = hyperopt.fmin(
    fn=hyperopt_objective,
    space=params,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=default_rng(123),
)

In [4]:
# Show the values hyperopt.fmin selected for the tuned hyper-parameters.
# Note that 'depth' comes back as a float (see the printed output).
print(best)

{'depth': 7.0, 'l2_leaf_reg': 1.0884939894237269, 'learning_rate': 0.0731510479644652, 'model_size_reg': 0.011454331570092468}


In [None]:
# Overwrite the hyperopt search-space entries in `params` with the tuned
# values. Later cells reuse `params`, so it is updated in place on purpose.
# NOTE(review): after this cell `params` no longer holds hp.* expressions, so
# the hyperopt search cell must be re-run from its `params = ...` definition.
params.update(best)
# fmin returns the quniform-sampled depth as a float (e.g. 7.0); tree depth
# is an integer parameter, so convert it explicitly.
params['depth'] = int(round(params['depth']))
# Use a much longer early-stopping patience for the final fit than the 40
# iterations used during the search.
params['od_wait'] = 1000

best_model = CatBoostClassifier(
    **params,
    custom_metric=[
        "AUC", "Accuracy", "Logloss", "NormalizedGini", "BalancedAccuracy", "CrossEntropy"],
)
print(best_model.get_params())

# Cross-validate the tuned configuration on the full training data and plot
# the learning curves.
cv_data = cv(Pool(X, y), best_model.get_params(), plot=True)

# Fit on the 75% split and monitor the held-out 25% as the eval set.
train_pool = Pool(X_train, y_train)
validate_pool = Pool(X_validation, y_validation)
best_model.fit(train_pool, eval_set=validate_pool)

# The reported score is the partial ROC AUC restricted to FPR <= 0.01,
# not accuracy, so label it as such.
validation_pauc = roc_auc_score(
    y_validation,
    best_model.predict_proba(X_validation)[:, 1],
    max_fpr=0.01,
)
print('Best model validation partial ROC AUC (max_fpr=0.01): {:.4}'.format(
    validation_pauc))

# TODO: stack, document and plot, then review the results.

In [17]:
# Persist the fitted model so later cells can reload it.
best_model.save_model('catboost_model_best.dump')

# Score the test rows; column 1 of predict_proba is written as 'Label'.
pred_test = best_model.predict_proba(test_df.to_numpy())[:, 1]

# Two-column submission file: the test frame's index as 'id', plus 'Label'.
submission = pd.DataFrame(data={'id': test_df.index, 'Label': pred_test})
submission.to_csv("submission-catboost.csv", index=False)

In [26]:
# Write the model's predictions for the training rows.
# Reload the saved model first so this cell does not rely on the in-memory fit.
best_model.load_model('catboost_model_best.dump')

# Column 1 of predict_proba for every training row in X.
pred_test = best_model.predict_proba(X)[:, 1]

# Same two-column layout as the test submission; 'id' is the train frame's index.
submission = pd.DataFrame(data={'id': train_df.index, 'Label': pred_test})
submission.to_csv("train_submission.csv", index=False)

{'iterations': 10000, 'l2_leaf_reg': 1.0884939894237269, 'model_size_reg': 0.011454331570092468, 'depth': 7.0, 'learning_rate': 0.0731510479644652, 'eval_metric': 'PRAUC', 'loss_function': CrossEntropy(use_weights=True [mandatory=False]), 'random_seed': 42, 'logging_level': 'Silent', 'use_best_model': True, 'od_type': 'Iter', 'od_wait': 1000, 'task_type': 'GPU'}


In [None]:
# Ensemble over many seeds: train one model per seed, each on its own 90/10
# train/validation split, and keep (score, seed, model) for every run.
# NOTE(review): every seed is scored on a different validation split, so the
# ranking below compares scores that are only roughly comparable.
models = []
for seed in range(30):
    # Build a per-seed copy instead of mutating the shared `params` dict, so
    # re-running other cells is not affected by the last seed used here.
    seed_params = {**params, 'random_seed': seed}
    # depth may still be the float returned by hyperopt (e.g. 7.0).
    seed_params['depth'] = int(round(seed_params['depth']))

    X_train_final, X_validation_final, y_train_final, y_validation_final = train_test_split(
        X, y, train_size=0.90, random_state=seed)

    model = CatBoostClassifier(**seed_params)
    train_pool_final = Pool(X_train_final, y_train_final)
    validation_pool_final = Pool(X_validation_final, y_validation_final)
    model.fit(train_pool_final, eval_set=validation_pool_final)

    # Partial ROC AUC restricted to FPR <= 0.01 on this seed's validation split.
    score = roc_auc_score(
        y_validation_final,
        model.predict_proba(X_validation_final)[:, 1],
        max_fpr=0.01,
    )
    print('Seed {} validation partial ROC AUC (max_fpr=0.01): {:.4}'.format(
        seed, score))
    models.append((score, seed, model))

# Best-scoring models first.
models = sorted(models, key=lambda entry: entry[0], reverse=True)
    

In [49]:
# Average the test-set probabilities of the top-k seed models.
# `models` is sorted best-first, so the top-k models are the FIRST k entries
# (`models[:top_k]`); slicing with `models[3:]` would instead drop the three
# best models and average all the weaker ones.
top_k = 3  # try the top 5, then top 3, then top 2

# Convert the test frame once instead of once per model.
test_features = test_df.to_numpy()

model_preds = np.array([
    model.predict_proba(test_features)[:, 1]
    for score, seed, model in models[:top_k]
])

# Mean over models -> one averaged probability per test row.
avg = np.mean(model_preds, axis=0)

submission = pd.DataFrame({'id': test_df.index, 'Label': avg})
submission.set_index('id').to_csv(f"submission-catboost-ensemble_{top_k}.csv")