# Boosting Algorithms for Tabular Regression

Using CatBoost and XGBoost for regression, and tuning both models with Optuna for improved accuracy.

## Importing Libraries

In [None]:
import numpy as np 
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, train_test_split, RepeatedKFold, StratifiedKFold
from catboost import CatBoostRegressor
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler


## Loading, Exploring and Preprocessing Data

In [None]:
train_data = pd.read_csv("../input/tabular-playground-series-aug-2021/train.csv")
valid_data = pd.read_csv("../input/tabular-playground-series-aug-2021/test.csv")

In [None]:
train_data.drop('id', axis=1, inplace=True)
#test_data.drop('id', axis=1, inplace=True)

In [None]:
X = train_data.drop('loss', axis=1)
y = train_data['loss']

In [None]:
ss = StandardScaler()
X = ss.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

In [None]:
valid_ids = valid_data['id']
valid_data = valid_data.drop('id',axis=1)

In [None]:
# Apply the scaler already fitted on the training features. Refitting it on the
# test set would standardise the test features with different means/variances
# than the data the models are trained on.
valid_data = ss.transform(valid_data)

## Training Models and doing Hyperparameter Tuning

In [None]:
def objective(trial, data=X, target=y):
    """Optuna objective: hold-out RMSE of a CatBoost regressor for one trial.

    Args:
        trial: Optuna trial that supplies the sampled hyperparameters.
        data: Feature matrix to split; defaults to the module-level ``X``.
        target: Target values to split; defaults to the module-level ``y``.

    Returns:
        RMSE of the fitted model on the 25% hold-out part of the split.
    """
    # The fixed seed gives every trial the same 75/25 split.
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        data, target, test_size=0.25, random_state=42
    )

    # Hyperparameters sampled for this trial.
    searched = {
        'iterations': trial.suggest_int("iterations", 1000, 20000),
        'od_wait': trial.suggest_int('od_wait', 500, 2000),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.02, 1),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 1e-5, 100),
        'subsample': trial.suggest_uniform('subsample', 0, 1),
        'random_strength': trial.suggest_uniform('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int(
            'leaf_estimation_iterations', 1, 15
        ),
    }
    # Settings held constant across all trials.
    fixed = {
        'loss_function': 'RMSE',
        'task_type': "GPU",
        'eval_metric': 'RMSE',
        'leaf_estimation_method': 'Newton',
        'bootstrap_type': 'Bernoulli',
    }

    regressor = CatBoostRegressor(**searched, **fixed)
    # NOTE(review): early_stopping_rounds presumably takes precedence over the
    # sampled `od_wait` during this fit — confirm against the CatBoost docs.
    regressor.fit(
        fit_X,
        fit_y,
        eval_set=[(holdout_X, holdout_y)],
        early_stopping_rounds=100,
        verbose=False,
    )

    holdout_preds = regressor.predict(holdout_X)
    return np.sqrt(mean_squared_error(holdout_y, holdout_preds))

In [None]:
# Flag kept as-is; it is not read anywhere in the visible code.
OPTUNA_OPTIMIZATION = True

# Search for the CatBoost hyperparameters with the lowest hold-out RMSE.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print('Number of finished trials:', len(study.trials))
print('Best trial: score {}, params {}'.format(study.best_value, study.best_params))

In [None]:
# Final CatBoost configuration: the best sampled hyperparameters plus the
# constant settings used during tuning and a fixed seed.
cat_params = {
    **study.best_trial.params,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'bootstrap_type': 'Bernoulli',
    'leaf_estimation_method': 'Newton',
    'random_state': 42,
    'task_type': 'GPU',
}

In [None]:
kf = StratifiedKFold(n_splits = 10 , shuffle = True , random_state = 42)

In [None]:
test_preds=None

for fold, (tr_index , val_index) in enumerate(kf.split(X , y)):
    
    print("⁙" * 10)
    print(f"Fold {fold + 1}")
    
    x_train,x_val = X[tr_index] , X[val_index]
    y_train,y_val = y[tr_index] , y[val_index]
        
    eval_set = [(x_val, y_val)]
    
    model =CatBoostRegressor(**cat_params)
    model.fit(x_train, y_train, eval_set = eval_set, verbose = False)
    
    train_preds = model.predict(x_train)    
    val_preds = model.predict(x_val)
    
    print(np.sqrt(mean_squared_error(y_val, val_preds)))
    
    if test_preds is None:
        test_preds = model.predict(valid_data)
    else:
        test_preds += model.predict(valid_data)

In [None]:
test_preds /= 10

In [None]:
test_preds

In [None]:
submission = pd.read_csv("../input/tabular-playground-series-aug-2021/sample_submission.csv")

In [None]:
submission['loss']=test_preds

In [None]:
submission.to_csv('catboost.csv',index=False)

In [None]:
model.predict(valid_data)

In [None]:
def objective_xgb(trial,data=X,target=y):
    """Optuna objective: hold-out RMSE of an XGBoost regressor for one trial.

    Args:
        trial: Optuna trial that supplies the sampled hyperparameters.
        data: Feature matrix to split; defaults to the module-level ``X``.
        target: Target values to split (also used for stratification);
            defaults to the module-level ``y``.

    Returns:
        RMSE of the fitted model on the 15% validation part of the split.
    """
    # Seed the split so every trial is scored on the same validation rows;
    # otherwise differences between trials mix hyperparameter effects with
    # split noise. This mirrors the seeded split in `objective`.
    X_train, X_valid, y_train, y_valid = train_test_split(
        data, target, stratify=target, test_size=0.15, random_state=42)

    param_grid = {
        'max_depth': trial.suggest_int('max_depth', 6, 15),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 1.0, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000, 100),
        'eta': 0.01,
        'reg_alpha': trial.suggest_int('reg_alpha', 1, 50),
        'reg_lambda': trial.suggest_int('reg_lambda', 5, 100),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 20),
    }

    # `eval_metric` is given to the constructor: passing it to `fit()` is
    # deprecated in XGBoost and rejected by newer releases.
    reg = XGBRegressor(tree_method='gpu_hist', eval_metric='rmse', **param_grid)
    # TODO: PRUNING (e.g. optuna.integration.XGBoostPruningCallback)
    reg.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False)

    # Explicit square root instead of `squared=False`, which newer
    # scikit-learn releases no longer accept; same form as in `objective`.
    return np.sqrt(mean_squared_error(y_valid, reg.predict(X_valid)))

In [None]:
# Flag kept as-is; it is not read anywhere in the visible code.
OPTUNA_OPTIMIZATION = True

# Search for the XGBoost hyperparameters with the lowest hold-out RMSE.
# This rebinds `study`, replacing the CatBoost study object.
study = optuna.create_study(direction='minimize')
study.optimize(objective_xgb, n_trials=50)

print('Number of finished trials:', len(study.trials))
print('Best trial: score {}, params {}'.format(study.best_value, study.best_params))

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold

In [None]:

# Final XGBoost configuration: the best sampled hyperparameters plus the
# constant learning rate and tree method used during tuning.
xgb_params = study.best_trial.params
xgb_params.update(eta=0.01, tree_method='gpu_hist')

In [None]:
n_splits = 10
test_preds = None  # running sum of per-fold predictions for the test set
kf_rmse = []       # out-of-fold RMSE of each fold

# The splitter is seeded so the folds, and therefore the reported RMSE and the
# averaged predictions, are reproducible between runs.
for fold, (train_idx, valid_idx) in enumerate(
        KFold(n_splits=n_splits, shuffle=True, random_state=42).split(X, y)):
    # `X` is a NumPy array (output of the scaler) while `y` is a pandas Series,
    # so the Series is indexed positionally with `.iloc`; plain `y[...]` is a
    # label lookup that only matches positions for a default RangeIndex.
    X_train, y_train = X[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X[valid_idx], y.iloc[valid_idx]

    # `eval_metric` is given to the constructor: passing it to `fit()` is
    # deprecated in XGBoost and rejected by newer releases.
    model = XGBRegressor(eval_metric='rmse', **xgb_params)
    model.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False)

    valid_pred = model.predict(X_valid)
    # Explicit square root instead of `squared=False`, which newer
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(y_valid, valid_pred))
    print(f'Fold {fold+1}/{n_splits} RMSE: {rmse:.4f}')
    kf_rmse.append(rmse)

    if test_preds is None:
        test_preds = model.predict(valid_data)
    else:
        test_preds += model.predict(valid_data)

# Average the summed test predictions over the folds.
test_preds /= n_splits
print(f'Average KFold RMSE: {np.mean(kf_rmse):.5f}')

In [None]:
sample_submission = pd.read_csv("../input/tabular-playground-series-aug-2021/sample_submission.csv")

In [None]:
sample_submission['loss'] = test_preds
sample_submission.to_csv('submission.csv', index=False)