In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv
/kaggle/input/tabular-playground-series-aug-2021/train.csv
/kaggle/input/tabular-playground-series-aug-2021/test.csv


In [8]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import catboost as ctb
import lightgbm as lgb

import time

In [3]:
# Read the competition CSVs (indexed by "id"), then split the "loss" column
# off the training frame so `train` holds features only and `target` the label.
train_csv = "/kaggle/input/tabular-playground-series-aug-2021/train.csv"
test_csv = "/kaggle/input/tabular-playground-series-aug-2021/test.csv"

train, test = (pd.read_csv(csv_path, index_col="id") for csv_path in (train_csv, test_csv))

target = train["loss"]
train = train.drop(columns="loss")

In [37]:
# --- Shared training configuration ------------------------------------------
N_ESTIMATORS = 5000            # passed to XGBoost as n_estimators below
lr = 0.005                     # NOTE(review): not referenced by xgb_params below
SEED = 299792458               # random_state of the KFold splitters
SPLITS = 10                    # n_splits of the XGBoost / LightGBM KFold
VERBOSE = 500                  # intended logging interval for training output
EARLY_STOPPING_ROUNDS = 100    # early-stopping patience passed to every fit()

# --- XGBoost hyper-parameters -------------------------------------------------
# NOTE(review): `eta` is documented by XGBoost as an alias of `learning_rate`,
# yet both are set here with different values (0.01 vs 0.02) -- confirm which
# one is intended and drop the other.
xgb_params = dict(
    n_estimators=N_ESTIMATORS,
    learning_rate=0.01,
    tree_method='gpu_hist',
    gpu_id=0,
    max_depth=11,
    subsample=0.98,
    colsample_bytree=0.6,
    n_jobs=4,
    booster='gbtree',
    reg_lambda=32,
    reg_alpha=7,
    objective="reg:squarederror",
    min_child_weight=19,
    importance_type="total_gain",
    eta=0.02,
)

In [17]:
# K-fold cross-validation for XGBoost.
#   train_oof : out-of-fold prediction for every training row
#   test_oof  : test-set prediction averaged over all folds
train_oof = np.zeros(train.shape[0])
test_oof = np.zeros(test.shape[0])

importances = pd.DataFrame()  # placeholder; nothing is written to it in this cell

kfd = KFold(n_splits=SPLITS, random_state=SEED, shuffle=True)
# Take the averaging weight from the splitter itself so it can never drift
# from the actual number of folds (previously a hard-coded 10).
n_folds = kfd.get_n_splits()

for fold, (train_ids, valid_ids) in enumerate(kfd.split(X=train, y=target)):
    X_train, X_valid = train.iloc[train_ids], train.iloc[valid_ids]
    y_train, y_valid = target.iloc[train_ids], target.iloc[valid_ids]

    start = time.time()

    xgb_model = xgb.XGBRegressor(**xgb_params)
    xgb_model.fit(X_train,
                  y_train,
                  eval_set=[(X_valid, y_valid)],
                  eval_metric="rmse",
                  early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                  verbose=VERBOSE)

    # Predictions for the held-out rows fill their slots in the OOF vector.
    oof_preds = xgb_model.predict(X_valid)
    train_oof[valid_ids] = oof_preds

    # Each fold contributes an equal share, so test_oof ends up as the fold mean.
    test_oof += xgb_model.predict(test) / n_folds

    print(f"Fold: {fold} ---- Rmse: {mean_squared_error(y_valid, oof_preds, squared=False):.6f} ---- time elapsed: {time.time()-start:.1f}s")


print("Final rmse: ", mean_squared_error(target, train_oof, squared=False))

[0]	validation_0-rmse:10.08388
[500]	validation_0-rmse:7.83917
[1000]	validation_0-rmse:7.82026
[1500]	validation_0-rmse:7.81347
[1977]	validation_0-rmse:7.81031
Fold: 0 ---- Rmse: 7.810154 ---- time elapsed: 149.5s
[0]	validation_0-rmse:10.15828
[500]	validation_0-rmse:7.91214
[1000]	validation_0-rmse:7.89712
[1500]	validation_0-rmse:7.89217
[1600]	validation_0-rmse:7.89222
Fold: 1 ---- Rmse: 7.892147 ---- time elapsed: 118.5s
[0]	validation_0-rmse:10.07831
[500]	validation_0-rmse:7.83684
[1000]	validation_0-rmse:7.82292
[1347]	validation_0-rmse:7.82049
Fold: 2 ---- Rmse: 7.820337 ---- time elapsed: 99.0s
[0]	validation_0-rmse:10.11840
[500]	validation_0-rmse:7.86952
[1000]	validation_0-rmse:7.84862
[1500]	validation_0-rmse:7.84282
[1733]	validation_0-rmse:7.84254
Fold: 3 ---- Rmse: 7.841591 ---- time elapsed: 132.0s
[0]	validation_0-rmse:10.11836
[500]	validation_0-rmse:7.85472
[1000]	validation_0-rmse:7.84170
[1384]	validation_0-rmse:7.83990
Fold: 4 ---- Rmse: 7.839470 ---- time ela

In [31]:
# Persist the XGBoost out-of-fold predictions alongside the training-row ids.
tmp = pd.DataFrame()
tmp["xgb_oof"] = train_oof
tmp["id"] = train.index
tmp.to_csv('xgb_oof.csv', index=False)

# Write the XGBoost submission file.
# test_oof is already the mean over folds: inside the training loop every fold
# adds its prediction divided by the number of folds. It is therefore written
# as-is; dividing by 10 again here made every submitted value 10x too small.
predictions = pd.DataFrame()
predictions["id"] = test.index
predictions["loss"] = test_oof
predictions.to_csv('xgb_submission.csv', index=False)

In [26]:
# Snapshot the XGBoost results under model-specific names so later in-place
# updates of train_oof / test_oof cannot alter them.
xgb_oof = train_oof.copy()
# test_oof is already averaged over the folds by the training loop; the extra
# "/ 10" that used to be here scaled the predictions down a second time.
xgb_pred = test_oof.copy()

In [33]:
# LightGBM hyper-parameters.
# NOTE(review): there is no n_estimators / num_iterations entry, so the
# estimator's default round count applies; the recorded training log reports
# "Did not meet early stopping" -- check whether N_ESTIMATORS was meant to be used.
lgb_params = dict(
    lambda_l1=0.19673487505279366,
    lambda_l2=6.205681774095499e-05,
    num_leaves=20,
    learning_rate=0.1229039615047327,
    feature_fraction=0.8566649457461354,
    bagging_fraction=0.9999164419693399,
    bagging_freq=10,
    min_child_samples=92,
    num_threads=5,
)

In [38]:
# K-fold cross-validation for LightGBM.
#   lgb_oof  : out-of-fold prediction for every training row
#   lgb_pred : test-set prediction averaged over all folds
lgb_oof = np.zeros(train.shape[0])
lgb_pred = np.zeros(test.shape[0])

importances = pd.DataFrame()  # placeholder; nothing is written to it in this cell

kfd = KFold(n_splits=SPLITS, random_state=SEED, shuffle=True)
n_folds = kfd.get_n_splits()  # averaging weight taken from the splitter itself

for fold, (train_ids, valid_ids) in enumerate(kfd.split(X=train, y=target)):
    X_train, X_valid = train.iloc[train_ids], train.iloc[valid_ids]
    y_train, y_valid = target.iloc[train_ids], target.iloc[valid_ids]

    start = time.time()

    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(X_train,
                  y_train,
                  eval_set=[(X_valid, y_valid)],
                  eval_metric="rmse",
                  early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                  verbose=VERBOSE)

    # Predictions for the held-out rows fill their slots in the OOF vector.
    oof_preds = lgb_model.predict(X_valid)
    lgb_oof[valid_ids] = oof_preds

    # Use the LightGBM model fitted in THIS fold. The previous code called
    # xgb_model.predict(test), so lgb_pred was just the leftover XGBoost
    # model's prediction and never reflected LightGBM at all.
    lgb_pred += lgb_model.predict(test) / n_folds

    print(f"Fold: {fold} ---- Rmse: {mean_squared_error(y_valid, oof_preds, squared=False):.6f} ---- time elapsed: {time.time()-start:.1f}s")


print("Final rmse: ", mean_squared_error(target, lgb_oof, squared=False))

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[95]	valid_0's rmse: 7.83826	valid_0's l2: 61.4384
Fold: 0 ---- Rmse: 7.838265 ---- time elapsed: 26.1s
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 7.91056	valid_0's l2: 62.577
Fold: 1 ---- Rmse: 7.910560 ---- time elapsed: 26.6s
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 7.83456	valid_0's l2: 61.3803
Fold: 2 ---- Rmse: 7.834556 ---- time elapsed: 27.5s
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's rmse: 7.86845	valid_0's l2: 61.9125
Fold: 3 ---- Rmse: 7.868449 ---- time elapsed: 27.3s
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[91]	valid_0's rmse: 7.8521	vali

In [41]:
# Persist the LightGBM out-of-fold predictions alongside the training-row ids.
# The column is named "lgb_oof" (it was mislabeled "xgb_oof"), matching the
# name used for the same data in the combined oof.csv.
tmp = pd.DataFrame()
tmp["lgb_oof"] = lgb_oof
tmp["id"] = train.index
tmp.to_csv('lgb_oof.csv', index=False)

# Write the LightGBM submission file from the fold-averaged test predictions.
predictions = pd.DataFrame()
predictions["id"] = test.index
predictions["loss"] = lgb_pred
predictions.to_csv('lgb_submission.csv', index=False)

In [64]:
# CatBoost hyper-parameters.
# 'eval_metric' used to appear twice in this literal (same value both times);
# the duplicate key has been removed, so the resulting dict is unchanged.
ctb_params = {
    'iterations': 8195,
    'od_wait': 2000,
    'learning_rate': 0.02039421755643651,
    'reg_lambda': 95.14582565179668,
    'subsample': 0.6044381624463067,
    'random_strength': 15.077418882976177,
    'depth': 12,
    'min_data_in_leaf': 5,
    'leaf_estimation_iterations': 4,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'leaf_estimation_method': 'Newton',
    'random_state': 42,
    'task_type': "GPU",
}

In [65]:
# K-fold cross-validation for CatBoost.
#   ctb_oof  : out-of-fold prediction for every training row
#   ctb_pred : test-set prediction averaged over all folds
ctb_oof = np.zeros(train.shape[0])
ctb_pred = np.zeros(test.shape[0])

importances = pd.DataFrame()  # placeholder; nothing is written to it in this cell

# CatBoost is cross-validated with 5 folds here rather than SPLITS.
kfd = KFold(n_splits=5, random_state=SEED, shuffle=True)
# The averaging weight is read back from the splitter so the fold count lives
# in exactly one place (it used to be repeated as a second literal 5).
n_folds = kfd.get_n_splits()

for fold, (train_ids, valid_ids) in enumerate(kfd.split(X=train, y=target)):
    X_train, X_valid = train.iloc[train_ids], train.iloc[valid_ids]
    y_train, y_valid = target.iloc[train_ids], target.iloc[valid_ids]

    start = time.time()

    ctb_model = ctb.CatBoostRegressor(**ctb_params)
    ctb_model.fit(X_train,
                  y_train,
                  eval_set=[(X_valid, y_valid)],
                  early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                  verbose=VERBOSE)

    # Predictions for the held-out rows fill their slots in the OOF vector.
    oof_preds = ctb_model.predict(X_valid)
    ctb_oof[valid_ids] = oof_preds

    # Each fold contributes an equal share, so ctb_pred ends up as the fold mean.
    ctb_pred += ctb_model.predict(test) / n_folds

    print(f"Fold: {fold} ---- Rmse: {mean_squared_error(y_valid, oof_preds, squared=False):.6f} ---- time elapsed: {time.time()-start:.1f}s")


print("Final rmse: ", mean_squared_error(target, ctb_oof, squared=False))

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 7.9358008	test: 7.9542042	best: 7.9542042 (0)	total: 59.4ms	remaining: 8m 6s
500:	learn: 7.4805314	test: 7.9126225	best: 7.9126225 (500)	total: 27.1s	remaining: 6m 56s
1000:	learn: 7.0665016	test: 7.8993367	best: 7.8993367 (1000)	total: 54.7s	remaining: 6m 33s
1500:	learn: 6.6760209	test: 7.8951244	best: 7.8948569 (1493)	total: 1m 21s	remaining: 6m 3s
2000:	learn: 6.3086391	test: 7.8912512	best: 7.8911967 (1990)	total: 1m 48s	remaining: 5m 36s
bestTest = 7.889787703
bestIteration = 2229
Shrink model to first 2230 iterations.
Fold: 0 ---- Rmse: 7.889788 ---- time elapsed: 132.9s
0:	learn: 7.9400252	test: 7.9365887	best: 7.9365887 (0)	total: 59.5ms	remaining: 8m 7s
500:	learn: 7.4813755	test: 7.8908333	best: 7.8908333 (500)	total: 26.4s	remaining: 6m 45s
1000:	learn: 7.0758487	test: 7.8744784	best: 7.8744784 (1000)	total: 53.7s	remaining: 6m 26s
1500:	learn: 6.6776542	test: 7.8694384	best: 7.8694231 (1499)	total: 1m 20s	remaining: 6m
2000:	learn: 6.2879508	test: 7.8655699	best:

In [73]:
# Gather the out-of-fold predictions of all three models into one frame
# (one column per model plus the training-row id) and write it to disk.
tmp = pd.DataFrame(
    {
        "xgb_oof": xgb_oof,
        "lgb_oof": lgb_oof,
        "ctb_oof": ctb_oof,
        "id": train.index,
    }
)
tmp.to_csv('oof.csv', index=False)

# Write the CatBoost submission file from the fold-averaged test predictions.
predictions = pd.DataFrame({"id": test.index, "loss": ctb_pred})
predictions.to_csv('ctb_submission.csv', index=False)

In [72]:
# Display the most recently built `tmp` frame as the cell output.
tmp

Unnamed: 0,xgb_oof,lgb_oof,ctb_oof,id
0,8.474790,7.362822,7.757325,0
1,7.472493,7.602264,7.069909,1
2,4.581374,5.282008,5.417567,2
3,6.699088,7.808072,6.956544,3
4,7.032556,6.478108,6.834692,4
...,...,...,...,...
249995,7.602599,7.902640,7.834418,249995
249996,4.600126,5.388126,6.208942,249996
249997,6.836284,6.951775,6.238810,249997
249998,5.746559,6.483632,6.559118,249998


In [66]:
# Inspect the CatBoost test predictions accumulated across the CV folds.
ctb_pred

array([7.36794592, 5.83224401, 8.23223294, ..., 6.53224891, 6.7890644 ,
       7.04423691])