In [1]:
import numpy as np
import pandas as pd

import warnings

# Install a global "ignore" filter so warnings are not shown in the cells below.
warnings.filterwarnings('ignore')

In [2]:
# Read the competition's train, test and sample-submission CSV files.
train = pd.read_csv('/kaggle/input/../input/tabular-playground-series-mar-2021/train.csv')
test = pd.read_csv('/kaggle/input/../input/tabular-playground-series-mar-2021/test.csv')
submit = pd.read_csv('/kaggle/input/../input/tabular-playground-series-mar-2021/sample_submission.csv')

In [3]:
def datainfo(df):
    """Summarise every column of ``df`` as one row of a new DataFrame.

    For each column the summary holds its name, the number of distinct
    values (``nunique``), its dtype, the count of missing values and the
    first five distinct values in order of appearance.

    Returns:
        DataFrame with columns ``name``, ``nunique``, ``dtype``,
        ``missing`` and ``value:5``.
    """
    summary_rows = []
    for name in df.columns:
        series = df[name]
        summary_rows.append((
            name,
            series.nunique(),
            series.dtype,
            series.isna().sum(),
            series.unique()[:5],
        ))
    return pd.DataFrame(
        summary_rows,
        columns=['name','nunique','dtype','missing','value:5'],
    )
# Show the per-column summary of the training frame as this cell's output.
datainfo(train)

Unnamed: 0,name,nunique,dtype,missing,value:5
0,id,300000,int64,0,"[0, 1, 2, 3, 4]"
1,cat0,2,object,0,"[A, B]"
2,cat1,15,object,0,"[I, K, A, F, L]"
3,cat2,19,object,0,"[A, G, C, O, D]"
4,cat3,13,object,0,"[B, A, C, D, G]"
5,cat4,20,object,0,"[B, E, H, I, D]"
6,cat5,84,object,0,"[BI, AB, BU, M, T]"
7,cat6,16,object,0,"[A, K, C, I, G]"
8,cat7,51,object,0,"[S, W, E, Y, G]"
9,cat8,61,object,0,"[Q, AD, BM, Y, AG]"


In [4]:
# Columns derived by this notebook's encoders are named "<col>_le" / "<col>_loo"
# and are written back into `train`. Excluding those suffixes keeps this cell
# idempotent: re-running it after the encoding cell still yields only the raw
# categorical columns, instead of sweeping the encoded copies into cat_cols.
_encoded_suffixes = ('_le', '_loo')

# Raw categorical feature columns (name contains 'cat').
cat_cols = [
    col for col in train.columns
    if 'cat' in col and not col.endswith(_encoded_suffixes)
]
# Numeric feature columns (name contains 'cont').
num_cols = [col for col in train.columns if 'cont' in col]
# Label column used by every model below.
target = train['target']

In [5]:
from category_encoders import LeaveOneOutEncoder
from sklearn.preprocessing import LabelEncoder

# Per-model lists of categorical feature column names; populated after encoding.
xgb_cat_features, lgb_cat_features, cb_cat_features, knn_cat_features = [], [], [], []

# Names of the columns produced by the leave-one-out and label encoders.
loo_features, le_features = [], []

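# Label encoding is not commonly applied to the features (X), but it is sometimes used.
# Here the encoder is fit on the values of both train and test together.
# I expected it to be fit on train only and then used to transform train and test; fitting on both looks like it could leak data.
# On the other hand, if it were fit on train only and not on test, that would overfit to train, so fitting on both may be necessary.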

def label_encode(train_df, test_df, column):
    """Add a ``<column>_le`` label-encoded feature to both frames in place.

    The ``LabelEncoder`` is fit on the distinct values of ``column`` taken
    from ``train_df`` and ``test_df`` together, then used to transform each
    frame's column.

    Args:
        train_df: training frame; gains the new column.
        test_df: test frame; gains the new column.
        column: name of the column to encode.

    Returns:
        The name of the column that was added.
    """
    encoder = LabelEncoder()
    seen_values = train_df[column].unique().tolist() + test_df[column].unique().tolist()
    encoder.fit(seen_values)

    encoded_name = "{}_le".format(column)
    for frame in (train_df, test_df):
        frame[encoded_name] = encoder.transform(frame[column])
    return encoded_name

# Leave-one-out encoding is fit on train only.

def loo_encode(train_df, test_df, column):
    """Add a ``<column>_loo`` feature to both frames using a LeaveOneOutEncoder.

    The encoder is fit on ``train_df[column]`` against ``train_df["target"]``
    only. Both frames are then transformed without passing the target, and
    both are modified in place.

    NOTE(review): ``transform`` is called without ``y`` for the training
    frame as well, so training rows presumably receive the same
    category-level encoding as test rows rather than per-row leave-one-out
    values -- confirm against the category_encoders documentation before
    relying on either behaviour.

    Args:
        train_df: training frame; must contain ``column`` and ``"target"``.
        test_df: test frame; must contain ``column``.
        column: name of the column to encode.

    Returns:
        The name of the column that was added.
    """
    encoded_name = "{}_loo".format(column)
    encoder = LeaveOneOutEncoder()
    encoder.fit(train_df[column], train_df["target"])
    for frame in (train_df, test_df):
        frame[encoded_name] = encoder.transform(frame[column])
    return encoded_name

# Encode every categorical column both ways. The two encoders are called in
# the same interleaved order per column so the columns appended to train/test
# keep their order.
for cat_col in cat_cols:
    loo_name = loo_encode(train, test, cat_col)
    le_name = label_encode(train, test, cat_col)
    loo_features.append(loo_name)
    le_features.append(le_name)

# XGBoost uses the leave-one-out encoded columns.
xgb_cat_features += loo_features
# Label encoding seems to work well for LightGBM.
lgb_cat_features += le_features
# CatBoost is a categorical-aware model, so it is given the raw categorical columns as-is.
cb_cat_features += cat_cols
# KNN uses the leave-one-out encoded columns as well.
knn_cat_features += loo_features

# Generate Level 1 Models

In [15]:
import warnings
warnings.filterwarnings("ignore")

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Cross-validation configuration shared by all level-1 models.
random_state = 2021
n_folds = 5
k_fold = StratifiedKFold(shuffle=True, n_splits=n_folds, random_state=random_state)

n_train_rows = len(train.index)
n_test_rows = len(test.index)

# Feature lists: each model's categorical columns first, then the numeric columns.
xgb_features = xgb_cat_features + num_cols
lgb_features = lgb_cat_features + num_cols
cb_features = cb_cat_features + num_cols
knn_features = knn_cat_features + num_cols

# Out-of-fold predictions on train (one slot per training row).
xgb_train_preds = np.zeros(n_train_rows)
lgb_train_preds = np.zeros(n_train_rows)
cb_train_preds = np.zeros(n_train_rows)
knn_train_preds = np.zeros(n_train_rows)

# Fold-averaged predictions on test (accumulated inside the CV loop).
xgb_test_preds = np.zeros(n_test_rows)
lgb_test_preds = np.zeros(n_test_rows)
cb_test_preds = np.zeros(n_test_rows)
knn_test_preds = np.zeros(n_test_rows)

def _fold_frames(features, fit_rows, valid_rows):
    """Return the (fit, validation) slices of ``train[features]`` for one fold."""
    selected = train[features]
    return selected.iloc[fit_rows], selected.iloc[valid_rows]


def _collect_fold_predictions(model, x_valid, x_test, y_valid, valid_rows,
                              oof_preds, test_preds, report, proba_col=1):
    """Predict with a fitted fold model, record the results and print the fold AUC.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        x_valid: validation-fold features.
        x_test: test features (same columns as ``x_valid``).
        y_valid: validation-fold labels.
        valid_rows: positional indices of the validation rows in ``train``.
        oof_preds: array receiving the validation predictions (written in place).
        test_preds: array accumulating ``prediction / n_folds`` (updated in place).
        report: format string with one ``{}`` placeholder for the AUC value.
        proba_col: column of ``predict_proba`` output to keep.
    """
    valid_proba = model.predict_proba(x_valid)[:, proba_col]
    test_proba = model.predict_proba(x_test)[:, proba_col]
    oof_preds[valid_rows] = valid_proba
    # In-place add: the caller's array ends up holding the mean over all folds.
    test_preds += test_proba / n_folds
    print(report.format(roc_auc_score(y_valid, valid_proba, average="micro")))


# Train the four level-1 models on each fold, collecting out-of-fold
# predictions for train and fold-averaged predictions for test.
for fold, (train_index, test_index) in enumerate(k_fold.split(train, target)):
    print("--> Fold {}".format(fold + 1))
    y_train = target.iloc[train_index]
    y_valid = target.iloc[test_index]

    # --- XGBoost ---
    xgb_x_train, xgb_x_valid = _fold_frames(xgb_features, train_index, test_index)
    xgb_model = XGBClassifier(
        seed=random_state,
        eval_metric="auc"
    )
    xgb_model.fit(
        xgb_x_train,
        y_train,
        eval_set=[(xgb_x_valid, y_valid)],
    )
    _collect_fold_predictions(
        xgb_model, xgb_x_valid, test[xgb_features], y_valid, test_index,
        xgb_train_preds, xgb_test_preds, ": XGB - ROC AUC Score = {}",
    )

    # --- LightGBM ---
    lgb_x_train, lgb_x_valid = _fold_frames(lgb_features, train_index, test_index)
    lgb_model = LGBMClassifier(
        # Categorical features appear to be addressed by position, which is
        # why the categorical columns were placed first in lgb_features.
        cat_feature=list(range(len(lgb_cat_features))),
        random_state=random_state,
        metric="auc"
    )
    lgb_model.fit(
        lgb_x_train,
        y_train,
        eval_set=[(lgb_x_valid, y_valid)],
    )
    _collect_fold_predictions(
        lgb_model, lgb_x_valid, test[lgb_features], y_valid, test_index,
        lgb_train_preds, lgb_test_preds, ": LGB - ROC AUC Score = {}",
    )

    # --- CatBoost ---
    cb_x_train, cb_x_valid = _fold_frames(cb_features, train_index, test_index)
    cb_model = CatBoostClassifier(
        eval_metric="AUC",
        loss_function="Logloss",
        random_state=random_state,
        # Positional indices of the categorical columns (listed first in cb_features).
        cat_features=list(range(len(cb_cat_features)))
    )
    cb_model.fit(
        cb_x_train,
        y_train,
        eval_set=[(cb_x_valid, y_valid)],
        verbose=0,
    )
    _collect_fold_predictions(
        cb_model, cb_x_valid, test[cb_features], y_valid, test_index,
        cb_train_preds, cb_test_preds, ": CB - ROC AUC Score = {}",
    )

    # --- k-nearest neighbours ---
    knn_x_train, knn_x_valid = _fold_frames(knn_features, train_index, test_index)
    knn_model = KNeighborsClassifier()
    knn_model.fit(
        knn_x_train,
        y_train,
    )
    _collect_fold_predictions(
        knn_model, knn_x_valid, test[knn_features], y_valid, test_index,
        knn_train_preds, knn_test_preds, ": KNN - ROC AUC Score = {}",
        proba_col=-1,  # last predict_proba column, as in the original KNN stanza
    )
    
# Report the ROC AUC of each model's out-of-fold predictions over the full training set.
print("--> Overall metrics")
overall_reports = [
    (": XGB - ROC AUC Score = {}", xgb_train_preds),
    (": LGB - ROC AUC Score = {}", lgb_train_preds),
    (": CB - ROC AUC Score = {}", cb_train_preds),
    (": KNN - ROC AUC Score = {}", knn_train_preds),
]
for report, oof_preds in overall_reports:
    print(report.format(roc_auc_score(target, oof_preds, average="micro")))

--> Fold 1
[0]	validation_0-auc:0.85994
[1]	validation_0-auc:0.86685
[2]	validation_0-auc:0.86939
[3]	validation_0-auc:0.87389
[4]	validation_0-auc:0.87557
[5]	validation_0-auc:0.87715
[6]	validation_0-auc:0.87831
[7]	validation_0-auc:0.87946
[8]	validation_0-auc:0.88057
[9]	validation_0-auc:0.88145
[10]	validation_0-auc:0.88246
[11]	validation_0-auc:0.88333
[12]	validation_0-auc:0.88394
[13]	validation_0-auc:0.88432
[14]	validation_0-auc:0.88511
[15]	validation_0-auc:0.88531
[16]	validation_0-auc:0.88573
[17]	validation_0-auc:0.88586
[18]	validation_0-auc:0.88613
[19]	validation_0-auc:0.88648
[20]	validation_0-auc:0.88696
[21]	validation_0-auc:0.88715
[22]	validation_0-auc:0.88755
[23]	validation_0-auc:0.88784
[24]	validation_0-auc:0.88795
[25]	validation_0-auc:0.88804
[26]	validation_0-auc:0.88815
[27]	validation_0-auc:0.88830
[28]	validation_0-auc:0.88854
[29]	validation_0-auc:0.88859
[30]	validation_0-auc:0.88866
[31]	validation_0-auc:0.88875
[32]	validation_0-auc:0.88884
[33]	vali

[72]	validation_0-auc:0.88777
[73]	validation_0-auc:0.88779
[74]	validation_0-auc:0.88776
[75]	validation_0-auc:0.88777
[76]	validation_0-auc:0.88775
[77]	validation_0-auc:0.88775
[78]	validation_0-auc:0.88774
[79]	validation_0-auc:0.88775
[80]	validation_0-auc:0.88771
[81]	validation_0-auc:0.88766
[82]	validation_0-auc:0.88765
[83]	validation_0-auc:0.88766
[84]	validation_0-auc:0.88770
[85]	validation_0-auc:0.88766
[86]	validation_0-auc:0.88765
[87]	validation_0-auc:0.88762
[88]	validation_0-auc:0.88763
[89]	validation_0-auc:0.88765
[90]	validation_0-auc:0.88765
[91]	validation_0-auc:0.88760
[92]	validation_0-auc:0.88763
[93]	validation_0-auc:0.88764
[94]	validation_0-auc:0.88764
[95]	validation_0-auc:0.88775
[96]	validation_0-auc:0.88774
[97]	validation_0-auc:0.88774
[98]	validation_0-auc:0.88771
[99]	validation_0-auc:0.88774
: XGB - ROC AUC Score = 0.8877365552589255
[1]	valid_0's auc: 0.850163
[2]	valid_0's auc: 0.854319
[3]	valid_0's auc: 0.857463
[4]	valid_0's auc: 0.860013
[5]	v

[50]	valid_0's auc: 0.889527
[51]	valid_0's auc: 0.889641
[52]	valid_0's auc: 0.889701
[53]	valid_0's auc: 0.889775
[54]	valid_0's auc: 0.889955
[55]	valid_0's auc: 0.890005
[56]	valid_0's auc: 0.890095
[57]	valid_0's auc: 0.890194
[58]	valid_0's auc: 0.890266
[59]	valid_0's auc: 0.890304
[60]	valid_0's auc: 0.8904
[61]	valid_0's auc: 0.890433
[62]	valid_0's auc: 0.890506
[63]	valid_0's auc: 0.890559
[64]	valid_0's auc: 0.890622
[65]	valid_0's auc: 0.890703
[66]	valid_0's auc: 0.890739
[67]	valid_0's auc: 0.890784
[68]	valid_0's auc: 0.890822
[69]	valid_0's auc: 0.890829
[70]	valid_0's auc: 0.890901
[71]	valid_0's auc: 0.890988
[72]	valid_0's auc: 0.891026
[73]	valid_0's auc: 0.891074
[74]	valid_0's auc: 0.891108
[75]	valid_0's auc: 0.891146
[76]	valid_0's auc: 0.891178
[77]	valid_0's auc: 0.891223
[78]	valid_0's auc: 0.891255
[79]	valid_0's auc: 0.89132
[80]	valid_0's auc: 0.891401
[81]	valid_0's auc: 0.891442
[82]	valid_0's auc: 0.891447
[83]	valid_0's auc: 0.891517
[84]	valid_0's au

[18]	validation_0-auc:0.88370
[19]	validation_0-auc:0.88386
[20]	validation_0-auc:0.88397
[21]	validation_0-auc:0.88427
[22]	validation_0-auc:0.88462
[23]	validation_0-auc:0.88502
[24]	validation_0-auc:0.88508
[25]	validation_0-auc:0.88529
[26]	validation_0-auc:0.88543
[27]	validation_0-auc:0.88547
[28]	validation_0-auc:0.88555
[29]	validation_0-auc:0.88565
[30]	validation_0-auc:0.88566
[31]	validation_0-auc:0.88577
[32]	validation_0-auc:0.88594
[33]	validation_0-auc:0.88602
[34]	validation_0-auc:0.88603
[35]	validation_0-auc:0.88630
[36]	validation_0-auc:0.88633
[37]	validation_0-auc:0.88644
[38]	validation_0-auc:0.88644
[39]	validation_0-auc:0.88645
[40]	validation_0-auc:0.88652
[41]	validation_0-auc:0.88672
[42]	validation_0-auc:0.88679
[43]	validation_0-auc:0.88677
[44]	validation_0-auc:0.88684
[45]	validation_0-auc:0.88685
[46]	validation_0-auc:0.88683
[47]	validation_0-auc:0.88686
[48]	validation_0-auc:0.88691
[49]	validation_0-auc:0.88691
[50]	validation_0-auc:0.88693
[51]	valid

In [21]:
# Level-2 (stacking) configuration; the same seed and fold count are set again here.
random_state = 2021
n_folds = 5
k_fold = StratifiedKFold(shuffle=True, n_splits=n_folds, random_state=random_state)

# One meta-feature per level-1 model.
features = ["xgb", "lgb", "cb", "knn"]

# Level-1 out-of-fold predictions become the level-2 training features.
l1_train = pd.DataFrame(data={
    name: preds.tolist()
    for name, preds in zip(
        features,
        (xgb_train_preds, lgb_train_preds, cb_train_preds, knn_train_preds),
    )
})
# Level-1 fold-averaged test predictions become the level-2 test features.
l1_test = pd.DataFrame(data={
    name: preds.tolist()
    for name, preds in zip(
        features,
        (xgb_test_preds, lgb_test_preds, cb_test_preds, knn_test_preds),
    )
})

# Level-2 out-of-fold predictions on train and fold-averaged predictions on test.
train_preds = np.zeros(len(l1_train.index))
test_preds = np.zeros(len(l1_test.index))

# Fit the XGBoost meta-model on the level-1 predictions, fold by fold.
for fold_no, (fit_rows, valid_rows) in enumerate(k_fold.split(l1_train, target), start=1):
    print("--> Fold {}".format(fold_no))

    meta_features = l1_train[features]
    x_fit = meta_features.iloc[fit_rows]
    x_holdout = meta_features.iloc[valid_rows]
    y_fit = target.iloc[fit_rows]
    y_holdout = target.iloc[valid_rows]

    meta_model = XGBClassifier(random_state=random_state)
    meta_model.fit(x_fit, y_fit)

    # Keep the last predict_proba column for both the held-out rows and the test set.
    holdout_proba = meta_model.predict_proba(x_holdout)[:, -1]
    test_proba = meta_model.predict_proba(l1_test[features])[:, -1]

    train_preds[valid_rows] = holdout_proba
    # In-place add so test_preds ends up as the mean over all folds.
    test_preds += test_proba / n_folds

    fold_auc = roc_auc_score(y_holdout, holdout_proba, average="micro")
    print(": ROC AUC Score = {}".format(fold_auc))
    print("")

print("--> Overall metrics")
overall_auc = roc_auc_score(target, train_preds, average="micro")
print(": ROC AUC Score = {}".format(overall_auc))

--> Fold 1
: ROC AUC Score = 0.8950426009976062

--> Fold 2
: ROC AUC Score = 0.8924715393794299

--> Fold 3
: ROC AUC Score = 0.8941091390989113

--> Fold 4
: ROC AUC Score = 0.8957667876272667

--> Fold 5
: ROC AUC Score = 0.8918412936173982

--> Overall metrics
: ROC AUC Score = 0.8937907560189443


In [24]:
# submit["target"] = test_preds.tolist()
# submit.to_csv("20210313_stacking_submit.csv", index=False)

In [None]:
# submit.to_csv('all_gmmclass_submission.csv',index=False)