In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,roc_auc_score, log_loss, auc
from sklearn.model_selection import StratifiedKFold, KFold,cross_val_predict,cross_val_score,cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import optuna
import lightgbm as lgbm
import xgboost as xgb
import pickle
import category_encoders as ce
from catboost import CatBoostClassifier, Pool
from sklearn.feature_selection import SelectKBest, f_classif,chi2,SelectPercentile
from scipy.stats import rankdata
from sklearn.mixture import GaussianMixture 
from scipy.sparse import hstack
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve

import string
import gc 
import re

In [2]:
# Read the competition's train, test and sample-submission CSV files, in that order.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-sep-2021/train.csv',
        '../input/tabular-playground-series-sep-2021/test.csv',
        '../input/tabular-playground-series-sep-2021/sample_solution.csv',
    )
)

In [3]:
# Feature columns are every training column whose name contains the letter 'f'.
features = list(filter(lambda column: 'f' in column, train.columns))
# Label column of the training frame.
target = train['claim']

In [4]:
# Add two row-wise summary columns to both frames: the number of missing
# feature values and the standard deviation across the feature columns.
# `features` is only extended afterwards, so neither statistic includes
# the other.
for frame in (train, test):
    frame['missing'] = frame[features].isna().sum(axis=1)
    frame['std'] = frame[features].std(axis=1)

# Register the new columns as model features (in-place list mutation).
features.extend(['missing', 'std'])
# Keep an independent copy of the training missing-count column.
missing = train['missing'].copy()

In [5]:
# Impute missing feature values with column means learned from the TRAINING
# data only, and apply those same means to the test set. Filling the test set
# with its own means would give the two frames different preprocessing (the
# scaler that follows is fit on train only) and make test predictions depend
# on the composition of the test batch.
train_feature_means = train[features].mean()
train[features] = train[features].fillna(train_feature_means)
test[features] = test[features].fillna(train_feature_means)

In [6]:
# Standardise the feature columns: statistics are learned from the training
# frame only, then the same fitted scaler is applied to both frames.
scaler = StandardScaler()
scaler.fit(train[features])
for frame in (train, test):
    frame[features] = scaler.transform(frame[features])

In [7]:
def reduce_memory_usage(df: pd.DataFrame, verbose: bool = True, allow_float16: bool = True) -> pd.DataFrame:
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    The frame is modified in place and also returned. Only columns whose dtype
    is one of int8/16/32/64 or float16/32/64 are touched; every other column
    (object, bool, unsigned, categorical, ...) is left as is.

    Integer columns are cast to the first of int8, int16, int32, int64 whose
    inclusive [min, max] range contains the column's min and max. Float columns
    are cast to float16, then float32, on the same range test, and fall back to
    float64 otherwise (this includes columns whose min/max is NaN or infinite).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; mutated in place.
    verbose : bool, default True
        When true, print the final memory footprint and percentage reduction.
    allow_float16 : bool, default True
        When false, float columns are never cast below float32. The range test
        only checks magnitude, not precision, so float16 can round values
        noticeably; disable it when that matters.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == "int":
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's limit still fits.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                # NaN min/max fail both comparisons, leaving the float64 fallback.
                if c_min >= limits.min and c_max <= limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Avoid dividing by zero when the frame reports no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

In [8]:
# Downcast both frames (train first, then test); each call prints its own
# memory report and returns the frame it was given.
train_df, test_df = (
    reduce_memory_usage(frame, verbose=True) for frame in (train, test)
)

Mem. usage decreased to 223.82 Mb (74.9% reduction)
Mem. usage decreased to 114.83 Mb (74.8% reduction)


In [9]:
# Drop the original names (the data stays reachable through train_df / test_df)
# and run a garbage-collection pass; its return value is the cell output.
del train, test
gc.collect()

21

In [10]:
# Seed used for the cross-validation splitter.
random_seed = 2001
# Model inputs: everything except the identifier (and, for train, the label).
X = train_df.drop(columns=['id', 'claim']).copy()
X_test = test_df.drop(columns=['id']).copy()
# Labels as a plain array so they can be indexed with fold index arrays.
y = train_df['claim'].values

In [11]:
# Fixed LightGBM hyperparameters.
params_lgbm = dict(
    reg_lambda=0.017312313250320625,
    reg_alpha=40.471907180619546,
    colsample_bytree=0.21300900004915263,
    subsample=0.7388658620715864,
    learning_rate=0.01,
    num_leaves=123,
    min_child_samples=180,
    max_depth=11,
)

# Fixed XGBoost hyperparameters. Built from pairs because 'lambda' is a
# Python keyword and cannot be passed as a keyword argument.
xgb_params = dict([
    ('lambda', 0.20897942008143003),
    ('alpha', 0.006422496271380073),
    ('colsample_bytree', 0.38388359320106824),
    ('colsample_bynode', 0.5029183650213261),
    ('colsample_bylevel', 0.7593124262153282),
    ('subsample', 0.5901796218736816),
    ('eta', 0.01),
    ('grow_policy', 'depthwise'),
    ('max_depth', 7),
    ('min_child_weight', 5),
    ('max_bin', 347),
    ('deterministic_histogram', False),
])

# Fixed CatBoost hyperparameters.
catboost_params = dict(
    learning_rate=0.5395805682481578,
    depth=1,
    l2_leaf_reg=2.7553037523696142,
    random_strength=14.893441746059075,
    bagging_temperature=1,
    border_count=128,
    grow_policy='SymmetricTree',
    min_data_in_leaf=207,
)

In [12]:
# # for the fixed learning rate, use the opt n iterations and tune the tree hyperparameters
# def objective(trial,data=X,target=y):
 
#     param_space = { 
#                'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
#                 'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
#                 'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.9),
#                 'colsample_bynode': trial.suggest_float('colsample_bynode', 0.1, 0.9),
#                 'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.1, 0.9),
#                 'subsample': trial.suggest_float('subsample', 0.1, 0.9),
#                 'eta':trial.suggest_float('eta', 1e-2, 1e-2),
#                 'grow_policy': trial.suggest_categorical("grow_policy", ['depthwise','lossguide']),
#                 'max_depth': trial.suggest_int('max_depth',2,25),
#                 'seed': 2001,
#                 'min_child_weight': trial.suggest_int('min_child_weight', 0, 300),
#                 'max_bin': trial.suggest_int('max_bin', 256, 512),
#                 'deterministic_histogram':trial.suggest_categorical('deterministic_histogram',[False]),
#                 "tree_method" : "gpu_hist",
#                 "predictor" : 'gpu_predictor',
#                 "objective" : 'binary:logistic',
#                 "eval_metric" : 'auc'
#                 }

#     #X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=.1,random_state=2021,stratify=y)
#     k=5
#     seed_list=[2001]
#     kf = StratifiedKFold(n_splits=k,shuffle=True,random_state=random_seed)
#     oof = np.zeros(len(X))
#     score_list = []
#     fold=1

#     splits = list(kf.split(X,y))
#     for train_idx, val_idx in splits:
#         X_train, X_val = X.iloc[train_idx,:], X.iloc[val_idx,:]
#         y_train, y_val = y[train_idx], y[val_idx]

#         val_preds_list = []

#         for seed in seed_list:
#             # fit and run model
#             param_space['seed'] = seed
#             dtrain = xgb.DMatrix(data=X_train, label=y_train)
#             dval = xgb.DMatrix(data=X_val, label=y_val)
#             dtest = xgb.DMatrix(data=X_test)
#             xgb.set_config(verbosity=0)


#             model = xgb.train(param_space, dtrain,\
#                            evals=[(dtrain,'train'),(dval,'val')],\
#                            verbose_eval=False,
#                            early_stopping_rounds=100,
#                            num_boost_round=100000)




#             val_preds_list.append(model.predict(dval))
#             #test_preds_list.append(model.predict_proba(X_test)[:,1])

#         oof[val_idx] = np.mean(val_preds_list,axis=0)
#         score = roc_auc_score(y_val, oof[val_idx])
#         #print(f"fold: {fold},logloss: {score}")
#         score_list.append(score)
#         fold +=1

#     cv_logloss = np.mean(score_list)

#     return cv_logloss

In [13]:
# study = optuna.create_study(direction='maximize')
# study.optimize(objective,n_trials= 30)
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [14]:
# params_lgbm['boosting_type'] = 'gbdt'
# params_lgbm['device_type'] = 'gpu'
# params_lgbm ['objective'] = 'binary'
# params_lgbm ['metric'] = 'auc'
# params_lgbm ['verbosity'] = -1
# params_lgbm ['n_estimators']= 50000
# #params_lgbm["cat_feature"] = cat_features

# name = 'lighgbm_3seeds_5fold'
# k=5
# seed_list=[2001,2002,2003]
# kf = StratifiedKFold(n_splits=k,shuffle=True,random_state=random_seed)
# oof = np.zeros(len(X))
# test_preds_list = []
# score_list = []
# fold=1
  
# splits = list(kf.split(X,y))
# fold = 1
# for train_idx, val_idx in splits:
#     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#     y_train, y_val = y[train_idx], y[val_idx]

#     val_preds_list = []

#     for seed in seed_list:

#     # fit and run model
#         params_lgbm['random_state'] = seed

#         model = lgbm.LGBMClassifier(**params_lgbm)

#         model.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_val,y_val)],
#                   early_stopping_rounds=200,
#                   eval_metric='auc',
#                   eval_names=['train','val'],verbose=200)


#         val_preds_list.append(model.predict_proba(X_val)[:, 1])
#         test_preds_list.append(model.predict_proba(X_test)[:, 1])

#     oof[val_idx] = np.mean(val_preds_list,axis=0)
#     score = roc_auc_score(y_val, oof[val_idx])
#     print(f"fold: {fold},auc: {score}")
#     score_list.append(score)
#     # print(f"fold: {fold}, class0 tr %: {y_train.value_counts()[0]/len(y_train)}, class0 val %: {y_val.value_counts()[0]/len(y_val)} ")
#     fold +=1

# cv_auc = np.mean(score_list)
# print(f"{name} ,auc: {cv_auc}")

# preds= np.mean(test_preds_list,axis=0)


# file_name_oof = name +"_oof.csv"
# file_name_test = name + "_test.csv"
# with open(file_name_oof, "wb") as fp:
#       pickle.dump(oof, fp)

# #files.download(file_name_oof)

# sample['claim'] = preds
# sample.to_csv(file_name_test,index=None)
# #files.download(file_name_test)

In [15]:
# params_xgb = xgb_params
# params_xgb["tree_method"] = "gpu_hist"
# params_xgb["predictor"] = 'gpu_predictor'
# params_xgb["eval_metric"] ='auc'
# params_xgb['objective'] = 'binary:logistic'

# name = 'xgboost_3seeds_5fold'
# k=5
# seed_list=[2001, 2002, 2003]
# kf = StratifiedKFold(n_splits=k,shuffle=True,random_state=2001)
# oof = np.zeros(len(train_df))
# test_preds_list = []
# score_list = []
# fold=1

# splits = list(kf.split(X,y))
# fold = 1
# for train_idx, val_idx in splits:
#     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#     y_train, y_val = y[train_idx], y[val_idx]

#     val_preds_list = []

#     for seed in seed_list:

#         # fit and run model
#         params_xgb['seed'] = seed

#         dtrain = xgb.DMatrix(data=X_train, label=y_train)
#         dval = xgb.DMatrix(data=X_val, label=y_val)
#         dtest = xgb.DMatrix(data=X_test)

#         model = xgb.train(params_xgb, dtrain,\
#                            evals=[(dtrain,'train'),(dval,'val')],\
#                            verbose_eval=100,
#                            early_stopping_rounds=100,
#                            num_boost_round=100000)

#         val_preds_list.append(model.predict(dval))
#         test_preds_list.append(model.predict(dtest))

#     oof[val_idx] = np.mean(val_preds_list,axis=0)
#     score = roc_auc_score(y_val, oof[val_idx])
#     print(f"fold: {fold},auc: {score}")
#     score_list.append(score)

#     # print(f"fold: {fold}, class0 tr %: {y_train.value_counts()[0]/len(y_train)}, class0 val %: {y_val.value_counts()[0]/len(y_val)} ")
#     fold +=1

# cv_logloss = np.mean(score_list)
# print(f"{name} ,auc: {cv_logloss}")

# preds= np.mean(test_preds_list,axis=0)


# file_name_oof = name + "_oof.csv"
# file_name_test = name + "_test.csv"
# with open(file_name_oof, "wb") as fp:
#       pickle.dump(oof, fp)

# # files.download(file_name_oof)

# sample['claim'] = preds
# sample.to_csv(file_name_test,index=None)
# # files.download(file_name_test) 

In [16]:
# # for the fixed learning rate, use the opt n iterations and tune the tree hyperparameters
# def objective(trial,data=X,target=y):

#     train_pool = Pool(data, label=target)
#     train_pool.quantize(task_type="GPU")
    
#     param_space = {
#         "od_type" : "Iter",
#         "od_wait" : 1000,
#         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-2, 1),
#         "depth": trial.suggest_int("depth", 1, 16),
#         "l2_leaf_reg": trial.suggest_loguniform('l2_leaf_reg', 1, 10),
#         "random_strength": trial.suggest_float("random_strength",1,20),
#         "bagging_temperature": trial.suggest_int("bagging_temperature",0,1),
#         "border_count": trial.suggest_int("border_count",128,128),
#         "grow_policy":trial.suggest_categorical("grow_policy",["Depthwise","SymmetricTree","Lossguide"]),
#         'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300)

#         }

#     #X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=.1,random_state=2021,stratify=y)
#     k=3
#     seed_list=[2001]
#     kf = StratifiedKFold(n_splits=k,shuffle=True,random_state=random_seed)
#     oof = np.zeros(len(data))
#     score_list = []
#     fold=1

#     splits = list(kf.split(X,y))
#     for train_idx, val_idx in splits:
# #         X_train, X_val = X[train_idx,:], X[val_idx,:]
# #         y_train, y_val = y[train_idx], y[val_idx]
#         tr_pool = train_pool.slice(train_idx)
#         val_pool = train_pool.slice(val_idx)

#         val_preds_list = []


#         for seed in seed_list:
#           # fit and run model
#             param_space['random_state'] = seed
# #             param_space["cat_features"] = cat_features

#             model = CatBoostClassifier(**param_space,
#                                     task_type="GPU",
#                                     thread_count=-1,
#                                      iterations=50000,
#                                     eval_metric = "AUC",     
#                                      use_best_model=True)

#             model.fit(tr_pool, eval_set = [(val_pool)],
#                   use_best_model=True,
#                   early_stopping_rounds=600,
#                   verbose=False)


#             val_preds_list.append(model.predict_proba(val_pool)[:, 1])
#             #test_preds_list.append(model.predict_proba(X_test)[:,1])

#         oof[val_idx] = np.mean(val_preds_list,axis=0)
#         score = roc_auc_score(target[val_idx], oof[val_idx])
#         print(f"fold: {fold},AUC: {score}")
#         score_list.append(score)
#         fold +=1

#     cv_logloss = np.mean(score_list)

#     return cv_logloss

In [17]:
# study = optuna.create_study(direction='maximize')
# study.optimize(objective,n_trials= 30)
# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [18]:
# study.best_params

In [19]:
# CatBoost training cell: stratified K-fold cross-validation, several seeds per
# fold. Produces out-of-fold predictions (`oof`), per-fold AUC scores, and a
# test-set prediction averaged over every (fold, seed) model; then writes the
# OOF array and a submission file to disk.

# Wrap the full training matrix in a CatBoost Pool and quantize it once; the
# per-fold pools below are slices of this pool.
train_pool = Pool(X, label=y)
train_pool.quantize(task_type="GPU")

# NOTE: this is an alias, not a copy - the key assignments below (and the
# per-seed 'random_state' inside the loop) also modify `catboost_params`.
params_cb = catboost_params
# params_cb ["loss_function"] = 'Classification'
params_cb ["od_wait"] = 1000
params_cb ["od_type"] = 'Iter'
#params_cb ["min_data_in_leaf"] = 1
#params_cb ["max_ctr_complexity"] = 15
params_cb ["task_type"] = "GPU"
# params_cb["cat_features"] = cat_features
 

# Prefix for the two output files written at the end of the cell.
name = 'catboost_3seeds_5fold'
k=5
seed_list=[2001, 2002, 2003]
kf = StratifiedKFold(n_splits=k,shuffle=True,random_state=random_seed)
# One out-of-fold prediction slot per training row.
oof = np.zeros(len(X))
# Collects one test-prediction vector per (fold, seed) model.
test_preds_list = []
# Collects one validation AUC per fold.
score_list = []
fold=1
  
# Materialise the fold index pairs up front.
splits = list(kf.split(X,y))
# Redundant with the `fold=1` assignment a few lines above.
fold = 1
for train_idx, val_idx in splits:
#     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#     y_train, y_val = y[train_idx], y[val_idx]
    tr_pool = train_pool.slice(train_idx)
    val_pool = train_pool.slice(val_idx)

    # Validation predictions for this fold, one entry per seed.
    val_preds_list = []

    for seed in seed_list:

        # fit and run model
        params_cb['random_state'] = seed

        model = CatBoostClassifier(**params_cb,
                thread_count = -1,
                iterations=50000,
                use_best_model=True,
                eval_metric = "AUC",          
        )

        # NOTE(review): `early_stopping_rounds=600` is passed here while
        # `od_wait=1000` is set in params_cb - confirm against the CatBoost
        # docs which of the two governs stopping.
        model.fit(tr_pool, eval_set = [(val_pool)],
                  use_best_model=True,
#                   eval_set=[(X_val,y_val)],
                  early_stopping_rounds=600,
                  verbose=100)



        # Keep only the second probability column (index 1) for both the
        # validation fold and the test set.
        val_preds_list.append(model.predict_proba(val_pool)[:, 1])
        test_preds_list.append(model.predict_proba(X_test)[:, 1])

    # Seed-averaged validation prediction becomes this fold's OOF prediction.
    oof[val_idx] = np.mean(val_preds_list,axis=0)
    score = roc_auc_score(y[val_idx], oof[val_idx])
    print(f"fold: {fold},AUC: {score}")
    score_list.append(score)
    # print(f"fold: {fold}, class0 tr %: {y_train.value_counts()[0]/len(y_train)}, class0 val %: {y_val.value_counts()[0]/len(y_val)} ")
    fold +=1

# Despite the variable name, this is the mean of the per-fold ROC AUC scores.
cv_logloss = np.mean(score_list)
print(f"{name} ,AUC: {cv_logloss}")

# Final test prediction: mean over all len(splits) * len(seed_list) models.
preds= np.mean(test_preds_list,axis=0)

file_name_oof = name + "_oof.csv"
file_name_test = name + "_test.csv"
# NOTE: the OOF array is written with pickle, so the "_oof.csv" file is a
# binary pickle and must be read back with pickle.load, not pd.read_csv.
with open(file_name_oof, "wb") as fp:
      pickle.dump(oof, fp)

# files.download(file_name_oof)

# Write the submission: the sample frame with its 'claim' column replaced.
sample['claim'] = preds
sample.to_csv(file_name_test,index=None)
# files.download(file_name_test) 

0:	learn: 0.7726570	test: 0.7731213	best: 0.7731213 (0)	total: 20.1ms	remaining: 16m 45s
100:	learn: 0.8115785	test: 0.8105234	best: 0.8105234 (100)	total: 1.51s	remaining: 12m 26s
200:	learn: 0.8136023	test: 0.8122422	best: 0.8122697 (198)	total: 3.46s	remaining: 14m 18s
300:	learn: 0.8146376	test: 0.8128727	best: 0.8128965 (279)	total: 5.06s	remaining: 13m 55s
400:	learn: 0.8153970	test: 0.8134669	best: 0.8135015 (397)	total: 6.58s	remaining: 13m 34s
500:	learn: 0.8159433	test: 0.8138816	best: 0.8138849 (499)	total: 8.07s	remaining: 13m 17s
600:	learn: 0.8162851	test: 0.8141946	best: 0.8141980 (599)	total: 9.53s	remaining: 13m 3s
700:	learn: 0.8165432	test: 0.8144152	best: 0.8144180 (698)	total: 11s	remaining: 12m 51s
800:	learn: 0.8168261	test: 0.8145818	best: 0.8145929 (796)	total: 12.4s	remaining: 12m 43s
900:	learn: 0.8170504	test: 0.8146870	best: 0.8146870 (900)	total: 14.5s	remaining: 13m 11s
1000:	learn: 0.8172369	test: 0.8147525	best: 0.8147608 (993)	total: 16.8s	remaining: 1