In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Column names of the continuous features: cont0 .. cont10.
cont_features = ["cont{}".format(position) for position in range(11)]
# Column names of the categorical features: cat0 .. cat18.
cat_features = ["cat{}".format(position) for position in range(19)]
# Target column of the training data; later cells use it for the stratified
# fold splits and for ROC AUC scoring.
target = train["target"]

In [4]:
from category_encoders import CatBoostEncoder, LeaveOneOutEncoder
from sklearn.preprocessing import LabelEncoder

# Per-model lists of categorical feature column names; they start empty and
# are filled once the encoded columns have been created.
xgb_cat_features, lgb_cat_features, cb_cat_features, ridge_cat_features = [], [], [], []

# Names of the columns produced by leave-one-out and label encoding.
loo_features, le_features = [], []

def label_encode(train, test, column):
    """Add a label-encoded copy of ``column`` to both frames.

    The encoder is fitted on the unique values of ``column`` found in
    ``train`` and in ``test``, so values that occur in only one of the two
    frames are still known to it. The encoded values are stored in a new
    column named ``"<column>_le"`` on both frames (the frames are modified
    in place).

    Args:
        train: training DataFrame containing ``column``.
        test: test DataFrame containing ``column``.
        column: name of the column to encode.

    Returns:
        The name of the newly added column.
    """
    encoded_name = "{}_le".format(column)
    # Fit on the values seen in either frame.
    known_values = train[column].unique().tolist() + test[column].unique().tolist()
    encoder = LabelEncoder().fit(known_values)
    for frame in (train, test):
        frame[encoded_name] = encoder.transform(frame[column])
    return encoded_name

def loo_encode(train, test, column):
    """Add a LeaveOneOutEncoder-encoded copy of ``column`` to both frames.

    The encoder is fitted on ``train[column]`` against ``train['target']``
    and its ``transform`` output is stored in a new column named
    ``"<column>_loo"`` on both frames (the frames are modified in place).

    NOTE(review): ``transform`` is called without the target for the training
    rows as well. Per the category_encoders documentation that is the mode
    meant for unseen data, so the training rows presumably do not receive
    leave-one-out values. The encoder is also fitted on every training row
    before any cross-validation split is made. Confirm that the resulting
    target leakage into validation folds is acceptable.

    Args:
        train: training DataFrame containing ``column`` and ``'target'``.
        test: test DataFrame containing ``column``.
        column: name of the categorical column to encode.

    Returns:
        The name of the newly added column.
    """
    encoded_name = "{}_loo".format(column)
    encoder = LeaveOneOutEncoder()
    encoder.fit(train[column], train['target'])
    for frame in (train, test):
        frame[encoded_name] = encoder.transform(frame[column])
    return encoded_name

In [5]:
# Encode every categorical column twice (leave-one-out and label encoding) and
# build the per-model lists of categorical feature names.
#
# All lists are rebuilt from scratch here instead of being appended to, so
# re-running this cell does not duplicate entries. Duplicated names would
# repeat columns in the model inputs and would make the positional
# `range(len(..._cat_features))` categorical indices passed to LightGBM and
# CatBoost extend over the continuous columns.
loo_features = []
le_features = []
for feature in cat_features:
    # Both helpers add the encoded column to `train` and `test` in place and
    # return the name of the new column.
    loo_features.append(loo_encode(train, test, feature))
    le_features.append(label_encode(train, test, feature))

xgb_cat_features = list(loo_features)    # XGBoost: leave-one-out encoded columns
lgb_cat_features = list(le_features)     # LightGBM: label-encoded columns
cb_cat_features = list(cat_features)     # CatBoost: original categorical columns
ridge_cat_features = list(loo_features)  # Ridge: leave-one-out encoded columns

In [6]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Cross-validation configuration shared by all level-1 models.
random_state = 2021
n_folds = 10
k_fold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)

n_train_rows = len(train)
n_test_rows = len(test)

# Input columns per model: that model's categorical columns first, then the
# continuous columns.
xgb_features = xgb_cat_features + cont_features
lgb_features = lgb_cat_features + cont_features
cb_features = cb_cat_features + cont_features
ridge_features = ridge_cat_features + cont_features

# Per-model prediction buffers: one value per training row (filled fold by
# fold) and one value per test row (accumulated over the folds).
xgb_train_preds, xgb_test_preds = np.zeros(n_train_rows), np.zeros(n_test_rows)
lgb_train_preds, lgb_test_preds = np.zeros(n_train_rows), np.zeros(n_test_rows)
cb_train_preds, cb_test_preds = np.zeros(n_train_rows), np.zeros(n_test_rows)
ridge_train_preds, ridge_test_preds = np.zeros(n_train_rows), np.zeros(n_test_rows)

# Level-1 models. For every stratified fold, XGBoost, LightGBM, CatBoost and
# Ridge are fitted on the training part of the fold. Each model's predictions
# for the held-out part are written into its *_train_preds array (out-of-fold
# predictions) and 1/n_folds of its test-set predictions is added to its
# *_test_preds array, so the test predictions end up averaged over the folds.
#
# Naming: `test_idx` indexes the held-out validation rows of `train`; it does
# not refer to the `test` frame.
for fold, (train_idx, test_idx) in enumerate(k_fold.split(train, target)):
    print("--> Fold {}".format(fold + 1))
    y_train = target.iloc[train_idx]
    y_valid = target.iloc[test_idx]
    
    # Every model uses its own column selection (see the *_features lists).
    lgb_X_train, lgb_X_valid = train[lgb_features].iloc[train_idx], train[lgb_features].iloc[test_idx]
    xgb_X_train, xgb_X_valid = train[xgb_features].iloc[train_idx], train[xgb_features].iloc[test_idx]
    cb_X_train, cb_X_valid = train[cb_features].iloc[train_idx], train[cb_features].iloc[test_idx]
    ridge_X_train, ridge_X_valid = train[ridge_features].iloc[train_idx], train[ridge_features].iloc[test_idx]
    
    # --- XGBoost ---------------------------------------------------------
    # Hyperparameters presumably come from an earlier tuning run - TODO
    # confirm provenance.
    xgb_model = XGBClassifier(
        seed=random_state,
        n_estimators=10000,
        verbosity=1,
        eval_metric="auc",
        tree_method="gpu_hist",
        gpu_id=0,
        alpha=9.037672745139417,
        colsample_bytree=0.6204453741210664,
        gamma=0.7655610995827371,
        reg_lambda=6.854931929134254,
        learning_rate=0.013401479391378243,
        max_bin=304,
        max_depth=14,
        min_child_weight=1.5513425169835457,
        subsample=0.8303017072175757,
    )
    # NOTE(review): early stopping monitors (xgb_X_valid, y_valid), the same
    # rows that are scored below, so the printed fold AUC is not measured on
    # data unseen during model selection. The same applies to the LightGBM
    # and CatBoost fits further down.
    xgb_model.fit(
        xgb_X_train,
        y_train,
        eval_set=[(xgb_X_valid, y_valid)], 
        verbose=0,
        early_stopping_rounds=200
    )
    
    # Positive-class probability for the held-out rows and for the test set.
    train_oof_preds = xgb_model.predict_proba(xgb_X_valid)[:,1]
    test_oof_preds = xgb_model.predict_proba(test[xgb_features])[:,1]
    xgb_train_preds[test_idx] = train_oof_preds
    xgb_test_preds += test_oof_preds / n_folds
    # NOTE(review): `average` presumably has no effect for a binary target - confirm.
    print(": XGB - ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_preds, average="micro")))


    # --- LightGBM --------------------------------------------------------
    # `cat_feature` is given as column positions 0..len(lgb_cat_features)-1.
    # That selects the label-encoded columns only because lgb_features lists
    # lgb_cat_features before cont_features.
    lgb_model = LGBMClassifier(
        cat_feature=[x for x in range(len(lgb_cat_features))],
        random_state=random_state,
        cat_l2=25.999876242730252,
        cat_smooth=89.2699690675538,
        colsample_bytree=0.2557260109926193,
        early_stopping_round=200,
        learning_rate=0.00918685483594994,
        max_bin=788,
        max_depth=81,
        metric="auc",
        min_child_samples=292,
        min_data_per_group=177,
        n_estimators=1600000,
        n_jobs=-1,
        num_leaves=171,
        reg_alpha=0.7115353581785044,
        reg_lambda=5.658115293998945,
        subsample=0.9262904583735796,
        subsample_freq=1,
        verbose=-1,
    )
    lgb_model.fit(
        lgb_X_train,
        y_train,
        eval_set=[(lgb_X_valid, y_valid)], 
        verbose=0,
    )
    
    train_oof_preds = lgb_model.predict_proba(lgb_X_valid)[:,1]
    test_oof_preds = lgb_model.predict_proba(test[lgb_features])[:,1]
    lgb_train_preds[test_idx] = train_oof_preds
    lgb_test_preds += test_oof_preds / n_folds
    print(": LGB - ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_preds, average="micro")))
    
    
    # --- CatBoost --------------------------------------------------------
    # Uses the original categorical columns. `cat_features` is positional as
    # well and relies on cb_features listing cb_cat_features first.
    cb_model = CatBoostClassifier(
        verbose=0,
        eval_metric="AUC",
        loss_function="Logloss",
        random_state=random_state,
        num_boost_round=20000,
        od_type="Iter",
        od_wait=200,
        task_type="GPU",
        devices="0",
        cat_features=[x for x in range(len(cb_cat_features))],
        bagging_temperature=1.288692494969795,
        grow_policy="Depthwise",
        l2_leaf_reg=9.847870133539244,
        learning_rate=0.01877982653902465,
        max_depth=8,
        min_data_in_leaf=1,
        penalties_coefficient=2.1176668909602734,
    )
    cb_model.fit(
        cb_X_train,
        y_train,
        eval_set=[(cb_X_valid, y_valid)], 
        verbose=0,
    )
    
    train_oof_preds = cb_model.predict_proba(cb_X_valid)[:,1]
    test_oof_preds = cb_model.predict_proba(test[cb_features])[:,1]
    cb_train_preds[test_idx] = train_oof_preds
    cb_test_preds += test_oof_preds / n_folds
    print(": CB - ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_preds, average="micro")))

    
    # --- Ridge -----------------------------------------------------------
    ridge_model = RidgeClassifier(
        random_state=random_state,
    )
    ridge_model.fit(
        ridge_X_train,
        y_train,
    )

    # Ridge is scored with decision_function output rather than with
    # probabilities, so its stored predictions are on a different scale than
    # those of the three boosted models.
    train_oof_preds = ridge_model.decision_function(ridge_X_valid)
    test_oof_preds = ridge_model.decision_function(test[ridge_features])
    ridge_train_preds[test_idx] = train_oof_preds
    ridge_test_preds += test_oof_preds / n_folds
    print(": Ridge - ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_preds, average="micro")))
    print("")
    
# ROC AUC of each model over its complete out-of-fold prediction vector.
print("--> Overall metrics")
print(": XGB - ROC AUC Score = {}".format(roc_auc_score(target, xgb_train_preds, average="micro")))
print(": LGB - ROC AUC Score = {}".format(roc_auc_score(target, lgb_train_preds, average="micro")))
print(": CB - ROC AUC Score = {}".format(roc_auc_score(target, cb_train_preds, average="micro")))
print(": Ridge - ROC AUC Score = {}".format(roc_auc_score(target, ridge_train_preds, average="micro")))


--> Fold 1




: XGB - ROC AUC Score = 0.8983017846431489




: LGB - ROC AUC Score = 0.8992752749691963
: CB - ROC AUC Score = 0.8969627750077146
: Ridge - ROC AUC Score = 0.8766369762926142

--> Fold 2




: XGB - ROC AUC Score = 0.8959564270898941




: LGB - ROC AUC Score = 0.8965719305867795
: CB - ROC AUC Score = 0.894896330360522
: Ridge - ROC AUC Score = 0.8758266069616415

--> Fold 3




: XGB - ROC AUC Score = 0.8940539594014382




: LGB - ROC AUC Score = 0.895886292280639
: CB - ROC AUC Score = 0.8927082989283495
: Ridge - ROC AUC Score = 0.8728614118821589

--> Fold 4




: XGB - ROC AUC Score = 0.8942548198343717




: LGB - ROC AUC Score = 0.8952394120091154
: CB - ROC AUC Score = 0.8928892610593528
: Ridge - ROC AUC Score = 0.8731456945335946

--> Fold 5




: XGB - ROC AUC Score = 0.8960471278527358




: LGB - ROC AUC Score = 0.8971458428093266
: CB - ROC AUC Score = 0.8952166205500075
: Ridge - ROC AUC Score = 0.8777665458860092

--> Fold 6




: XGB - ROC AUC Score = 0.897114734236636




: LGB - ROC AUC Score = 0.8982421838933615
: CB - ROC AUC Score = 0.8961538779342406
: Ridge - ROC AUC Score = 0.8768702891611878

--> Fold 7




: XGB - ROC AUC Score = 0.8976791053175636




: LGB - ROC AUC Score = 0.8990042825802195
: CB - ROC AUC Score = 0.896403813617131
: Ridge - ROC AUC Score = 0.875852497009206

--> Fold 8




: XGB - ROC AUC Score = 0.8973837778816753




: LGB - ROC AUC Score = 0.8978977897671531
: CB - ROC AUC Score = 0.8967892426412977
: Ridge - ROC AUC Score = 0.8761802854403709

--> Fold 9




: XGB - ROC AUC Score = 0.894070636426787




: LGB - ROC AUC Score = 0.8943696958642415
: CB - ROC AUC Score = 0.8923963686506299
: Ridge - ROC AUC Score = 0.873680243840537

--> Fold 10




: XGB - ROC AUC Score = 0.8944077382563806




: LGB - ROC AUC Score = 0.8958117194942317
: CB - ROC AUC Score = 0.893722748560412
: Ridge - ROC AUC Score = 0.8762643441471586

--> Overall metrics
: XGB - ROC AUC Score = 0.8959058932774282
: LGB - ROC AUC Score = 0.8969254992537874
: CB - ROC AUC Score = 0.8948060924258656
: Ridge - ROC AUC Score = 0.8754986716769508


In [7]:
# Cross-validation configuration for the level-2 (stacking) model.
random_state = 2021
n_folds = 10
k_fold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)

# Level-2 input columns: one per level-1 model.
features = ["xgb", "lgb", "cb", "ridge"]
level1_oof_preds = (xgb_train_preds, lgb_train_preds, cb_train_preds, ridge_train_preds)
level1_test_preds = (xgb_test_preds, lgb_test_preds, cb_test_preds, ridge_test_preds)

# Training table: out-of-fold predictions of every level-1 model plus the target.
l1_train = pd.DataFrame(
    data={model_name: preds.tolist() for model_name, preds in zip(features, level1_oof_preds)}
)
l1_train["target"] = target.tolist()

# Test table: fold-averaged test predictions of every level-1 model.
l1_test = pd.DataFrame(
    data={model_name: preds.tolist() for model_name, preds in zip(features, level1_test_preds)}
)

# Buffers for the level-2 out-of-fold and test predictions.
train_preds = np.zeros(l1_train.shape[0])
test_preds = np.zeros(l1_test.shape[0])

In [8]:
# Level-2 model: a RidgeClassifier fitted per fold on the level-1 predictions.
# Decision scores for the held-out rows go into `train_preds`; the test-set
# scores are averaged over the folds into `test_preds`.
for fold, (fit_rows, holdout_rows) in enumerate(k_fold.split(l1_train, target)):
    print("--> Fold {}".format(fold + 1))
    y_train, y_valid = target.iloc[fit_rows], target.iloc[holdout_rows]

    x_train = l1_train[features].iloc[fit_rows]
    x_valid = l1_train[features].iloc[holdout_rows]

    model = RidgeClassifier(random_state=random_state)
    model.fit(x_train, y_train)

    holdout_scores = model.decision_function(x_valid)
    test_scores = model.decision_function(l1_test[features])
    train_preds[holdout_rows] = holdout_scores
    test_preds += test_scores / n_folds

    fold_auc = roc_auc_score(y_valid, holdout_scores, average="micro")
    print(": ROC AUC Score = {}".format(fold_auc))
    print("")

# ROC AUC over the complete out-of-fold score vector.
overall_auc = roc_auc_score(target, train_preds, average="micro")
print("--> Overall metrics")
print(": ROC AUC Score = {}".format(overall_auc))

--> Fold 1
: ROC AUC Score = 0.899600387087311

--> Fold 2
: ROC AUC Score = 0.8970327015324786

--> Fold 3
: ROC AUC Score = 0.8959350137322821

--> Fold 4
: ROC AUC Score = 0.8955916924138633

--> Fold 5
: ROC AUC Score = 0.8974905622017265

--> Fold 6
: ROC AUC Score = 0.8985930947562503

--> Fold 7
: ROC AUC Score = 0.8992157740818357

--> Fold 8
: ROC AUC Score = 0.8985173933299797

--> Fold 9
: ROC AUC Score = 0.8949465069503907

--> Fold 10
: ROC AUC Score = 0.8960272166774221

--> Overall metrics
: ROC AUC Score = 0.8972760782048211


In [9]:
# Read the sample submission file; its `target` column is filled with the
# stacked test predictions in a later cell.
submission = pd.read_csv('sample_submission.csv')
# Disabled export under a different file name, kept for reference only.
# submission["target"] = test_preds.tolist()
# submission.to_csv("ensemble_model_1.csv", index=False)

In [11]:
# Write the level-2 test predictions in the submission layout.
submission['target'] = test_preds
submission.to_csv('model_cv_4_test_roc.csv', index=False)

# Also store the level-2 out-of-fold predictions for the training rows.
sub2 = pd.Series(train_preds, name='target').to_frame()
sub2.to_csv('model_cv_4_train_roc.csv', index=False)