# Tabular Playground Series Competition - Feb 2021 


### It's a multivariate regression problem!

In this kernel, I explore statistical regression models that predict the target from the given features (both categorical and continuous variables), using the following libraries/modules:

> ### Benchmark models:
- Artificial Neural Network
- Scikit-Learn (Python ML Library)
- Tabular Fastai

> ### With Hyperparameter Tuning

- CatBoost and XGBoost algorithms with Optuna Hyperparameter Tuning
- PyCaret's Regression module


__Bonus: AutoViML library__

Our evaluation metric is "Root Mean Squared Error" (RMSE). The lower the RMSE, the better the model fits.

### Let's Start!! DO CARE TO UPVOTE😁

In [None]:
# general libraries
import os
import gc
import numpy as np
from numpy import mean
from numpy import std
import pandas as pd
from pathlib import Path

# plotting
import matplotlib.pyplot as plt
from matplotlib import pyplot
import seaborn as sns
from IPython.display import display
%matplotlib inline

# sklearn - metric, train test split
from sklearn.model_selection import KFold,train_test_split,cross_val_score, RepeatedKFold
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

# Read in the data files

In [None]:
def load_data(data_dir="../input/tabular-playground-series-feb-2021"):
    """Read the competition CSV files.

    Parameters
    ----------
    data_dir : str or Path, optional
        Directory holding ``train.csv``, ``test.csv`` and
        ``sample_submission.csv``. Defaults to the Kaggle input folder, so
        existing ``load_data()`` calls behave as before.

    Returns
    -------
    tuple of DataFrame
        ``(train, test, submission)``. ``train`` and ``test`` are indexed by
        the ``id`` column; ``submission`` keeps ``id`` as a regular column.
    """
    base = Path(data_dir)
    train      = pd.read_csv(base / "train.csv", index_col='id')
    test       = pd.read_csv(base / "test.csv", index_col='id')
    submission = pd.read_csv(base / "sample_submission.csv")
    return train,test,submission

train,_,_ = load_data()
display(train)

# Regression on Deep Artificial Neural Networks


[READ: Linear Regression Deep Neural Network](https://machinelearningmastery.com/regression-tutorial-keras-deep-learning-library-python/)

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras import layers
from keras.layers import Dense
from keras.models import Sequential, Model
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import Normalization
print(tf.__version__)

In [None]:
train.values.shape

In [None]:
# Label encoding: replace every string-valued column by integer codes.
target = train.pop('target')

for c in train.columns:
    if train[c].dtype=='object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values))
        train[c] = lbl.transform(train[c].values)

# Split BEFORE computing normalisation statistics so that the hold-out rows
# in X_test do not influence the preprocessing applied to the network input.
X_train, X_test, y_train, y_test = train_test_split(train, target, train_size=0.80)

# Normalisation of the data (mean/variance are learned from X_train only)
normalizer = preprocessing.Normalization()
normalizer.adapt(np.array(X_train))

# define the model
def ann_model(norm):
    """Build and compile the fully connected regression network.

    `norm` is used as the first layer of the stack; it is followed by Dense
    layers of 24 and 10 units (ReLU), then 5 and 1 units (no activation).
    The model is compiled with mean-squared-error loss and Adam(0.01).
    """
    stack = [
        norm,
        layers.Dense(24, input_dim=24, kernel_initializer='normal', activation='relu'),
        layers.Dense(10, activation='relu'),
        layers.Dense(5),
        layers.Dense(1),
    ]
    network = keras.Sequential(stack)
    network.compile(optimizer=tf.keras.optimizers.Adam(0.01), loss='mean_squared_error')
    return network

dnn_model = ann_model(normalizer)
dnn_model.summary()

In [None]:
history = dnn_model.fit(X_train,y_train, validation_split =0.2,batch_size=128, epochs=50,verbose=0)

In [None]:
# generic function to visualise the fit of the model and MSE
def plot_results(name, y, yhat, num_to_plot=25000, lims=(0,15), figsize=(10,7)):
    """Scatter predictions against true values and show the RMSE in the title.

    Parameters
    ----------
    name : str
        Label placed in front of the score in the plot title.
    y, yhat : array-like
        True values and predictions; they are subtracted element-wise.
    num_to_plot : int
        Only the first `num_to_plot` points are drawn.
    lims : tuple
        Common (min, max) limits for both axes and for the diagonal line.
    figsize : tuple
        Figure size handed to matplotlib.
    """
    # Root of the mean squared error, taken exactly once (the earlier version
    # applied a square root twice and so displayed the fourth root of the MSE).
    RMSE = float(np.sqrt(((yhat-y)**2).mean()))
    plt.figure(figsize=figsize)
    plt.axes(aspect='equal')
    plt.scatter(y[:num_to_plot], yhat[:num_to_plot])
    plt.ylim(lims)
    plt.xlim(lims)
    # diagonal = perfect prediction
    _ = plt.plot(lims, lims)
    plt.title(f'{name}: {RMSE:0.6f}', fontsize=16)
    plt.show()

In [None]:
predictions= dnn_model.predict(X_test).flatten()
test_labels = y_test

a = plt.axes(aspect='equal')
plt.scatter(test_labels, predictions)
plt.xlabel('True Values')
plt.ylabel('Predictions')
lims = [0, 14]
plt.xlim(lims)
plt.ylim(lims)
_ = plt.plot(lims, lims)

In [None]:
# error distributiion

error = predictions - y_test
plt.hist(error, bins=2000)
plt.xlabel('Prediction Error')
_ = plt.ylabel('Count')

In [None]:
# summarizing the fit of the model
# MSE is the plain mean squared error; RMSE is its square root.
dnn_MSE      = round(mean_squared_error(y_test, predictions),6)
dnn_RMSE     = round(np.sqrt(mean_squared_error(y_test, predictions)),6)
dnn_R2       = round(metrics.r2_score(y_test, predictions),6)

In [None]:
models =['Neural Network','Decision Tree','Random Forest','LightGBM','XGBoost','CatBoost','Decision Tree-fastai','Random Forest-fastai',
         'CatBoost-fastai','XGBoost-Optuna','CatBoost-Optuna-gpu','CatBoost-Optuna-cpu']
results = pd.DataFrame(index=models,columns=['MSE','RMSE','R2'])

In [None]:
results.iloc[0:1,0:1] = dnn_MSE
results.iloc[0:1,1:2] = dnn_RMSE
results.iloc[0:1,2:3] = dnn_R2

In [None]:
del train, target, history
gc.collect()

# Scikit-Learn Library of Python

## Encode the categoricals

There are different strategies to accomplish this, and different approaches will have different performance when using different algorithms.

In [None]:
train,test,_ = load_data()

# Encode each string column with ONE encoder fitted on train and re-used for
# test, so a category gets the same integer code in both frames.
# LabelEncoder.transform raises ValueError if test holds a label unseen in train.
for c in train.columns:
    if train[c].dtype=='object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values))
        train[c] = lbl.transform(train[c].values)
        if c in test.columns:
            test[c] = lbl.transform(test[c].values)

# Any string column that exists only in test is still encoded on its own.
for c in test.columns:
    if test[c].dtype=='object':
        lbl = LabelEncoder()
        lbl.fit(list(test[c].values))
        test[c] = lbl.transform(test[c].values)

In [None]:
# split the data       
target = train.pop('target')

X_train,X_test,y_train,y_test = train_test_split(train,target,test_size=0.2)
X_train.shape,y_train.shape,X_test.shape,y_test.shape

## Decision Tree

In [None]:
## Decision Tree

from sklearn.tree import DecisionTreeRegressor
import math


# NOTE(review): min_samples_split=1.0 is a float; scikit-learn documents float
# values as a fraction of the samples, which would allow little more than a
# single split. The hyper-parameters are left as they were - confirm intent.
model_tree = DecisionTreeRegressor(
    criterion='mse',splitter='best',min_samples_split=1.0,min_samples_leaf=5, max_features=1.0,random_state=42,max_leaf_nodes=20
)

model_tree.fit(X_train, y_train)
y_tree = model_tree.predict(X_test)

# summarizing the fit of the model
# MSE is the plain mean squared error; RMSE is its square root.
tree_MSE      = round(mean_squared_error(y_test, y_tree),6)
tree_RMSE     = round(np.sqrt(mean_squared_error(y_test, y_tree)),6)
tree_R2       = round(metrics.r2_score(y_test, y_tree),6)

In [None]:
results.iloc[1:2,0:1] = tree_MSE
results.iloc[1:2,1:2] = tree_RMSE
results.iloc[1:2,2:3] = tree_R2

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

def rf(xs,y,n_estimators=100, n_jobs=-1,max_samples = 240000,max_features = 0.5,min_samples_leaf=0.5,**kwargs):
    """Fit and return a RandomForestRegressor on (xs, y).

    All named arguments are forwarded to the estimator (previously
    `n_estimators`, `n_jobs` and `**kwargs` were accepted but ignored).
    `oob_score` defaults to True unless the caller overrides it via kwargs.
    """
    # NOTE(review): min_samples_leaf=0.5 is a float default; scikit-learn
    # documents floats as a fraction of the samples - confirm this is intended.
    kwargs.setdefault('oob_score', True)
    return RandomForestRegressor(n_jobs=n_jobs,n_estimators=n_estimators,max_samples=max_samples,
                                 max_features=max_features,min_samples_leaf=min_samples_leaf,**kwargs).fit(xs,y)

                              
modelRF = rf(X_train, y_train,n_estimators=100)
y_pred_rf = modelRF.predict(X_test)

# summarizing the fit of the model
# MSE is the plain mean squared error; RMSE is its square root.
rf_MSE      = round(mean_squared_error(y_test, y_pred_rf),6)
rf_RMSE     = round(np.sqrt(mean_squared_error(y_test, y_pred_rf)),6)
rf_R2       = round(metrics.r2_score(y_test, y_pred_rf),6)

# Row 2 is 'Random Forest'; every assignment targets that single row only
# (the R2 write previously spanned rows 2-3 and spilled into 'LightGBM').
results.iloc[2:3,0:1] = rf_MSE
results.iloc[2:3,1:2] = rf_RMSE
results.iloc[2:3,2:3] = rf_R2

## Light GBM Regressor

> ### The parameters need to tune to get good results on a leaf-wise tree algorithm:

1. num_leaves      : the number of leaves should be smaller than 2^(max_depth)
2. min_data_in_leaf: For a large dataset, it can be set to hundreds or thousands
3. max_depth       : limit the depth of the tree

> ### Faster speeds on the algorithm can be obtained by using:

1. small max_bin
2. save_binary to speed up data loading in future learning
3. optimal bagging_freq and bagging_fraction
4. feature_fraction for feature sub-sampling
5. Use a small learning rate with large num_iterations

> ### Avoid Overfitting

1. Trying lambda_l1, lambda_l2, and min_gain_to_split for regularization
2. Avoid growing a very deep tree

In [None]:
import lightgbm as ltb

model_ltb= ltb.LGBMRegressor(boosting_type='gbdt',num_leaves=50,min_data_in_leaf=1000,max_depth=7,learning_rate=0.009,n_estimators=500)
model_ltb.fit(X_train, y_train)
y_pred_lg = model_ltb.predict(X_test)

# summarizing the fit of the model
# MSE is the plain mean squared error; RMSE is its square root.
lg_MSE      = round(mean_squared_error(y_test, y_pred_lg),6)
lg_RMSE     = round(np.sqrt(mean_squared_error(y_test, y_pred_lg)),6)
lg_R2       = round(metrics.r2_score(y_test, y_pred_lg),6)

# Row 3 is 'LightGBM'
results.iloc[3:4,0:1] = lg_MSE
results.iloc[3:4,1:2] = lg_RMSE
results.iloc[3:4,2:3] = lg_R2

## XGBoost

In [None]:
import xgboost as xgb

xgbmodel = xgb.XGBRegressor(objective='reg:squarederror')

# Fitting the model
xgbmodel.fit(X_train, y_train)

# Predict on the hold-out split
y_pred_xgb = xgbmodel.predict(X_test)

# summarizing the fit of the model
# MSE is the plain mean squared error; RMSE is its square root.
xg_MSE      = round(mean_squared_error(y_test, y_pred_xgb),6)
xg_RMSE     = round(np.sqrt(mean_squared_error(y_test, y_pred_xgb)),6)
xg_R2       = round(metrics.r2_score(y_test, y_pred_xgb),6)

# Row 4 is 'XGBoost'
results.iloc[4:5,0:1] = xg_MSE
results.iloc[4:5,1:2] = xg_RMSE
results.iloc[4:5,2:3] = xg_R2

## CatBoost

In [None]:
# fit the model on the training split
catmodel = CatBoostRegressor(verbose=0, n_estimators=1000)
# Fitting the model
catmodel.fit(X_train, y_train)

# Predict on the hold-out split
y_pred_cat = catmodel.predict(X_test)

# summarizing the fit of the model
# MSE is the plain mean squared error; RMSE is its square root.
cat_MSE      = round(mean_squared_error(y_test, y_pred_cat),6)
cat_RMSE     = round(np.sqrt(mean_squared_error(y_test, y_pred_cat)),6)
# r2_score takes the true values first, then the predictions
cat_R2       = round(metrics.r2_score(y_test, y_pred_cat),6)

# Row 5 is 'CatBoost'; R2 lives in column 2 (the table has columns 0..2 only,
# so the former 3:4 slice selected nothing and the value was never stored).
results.iloc[5:6,0:1] = cat_MSE
results.iloc[5:6,1:2] = cat_RMSE
results.iloc[5:6,2:3] = cat_R2

In [None]:
plot_results("Cat Boost Regressor", y_test, y_pred_cat)

In [None]:
del train,target,X_train,X_test,y_train,y_test
gc.collect()

# PyCaret - Statistical Models

#### Read [**Tutorial on PyCaret Library**](https://github.com/pycaret/)

In [None]:
!pip install pycaret
from pycaret.regression import *

In [None]:
train,test,_ = load_data()

# Encode each string column with ONE encoder fitted on train and re-used for
# test, so a category gets the same integer code in both frames.
# LabelEncoder.transform raises ValueError if test holds a label unseen in train.
for c in train.columns:
    if train[c].dtype=='object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values))
        train[c] = lbl.transform(train[c].values)
        if c in test.columns:
            test[c] = lbl.transform(test[c].values)

# Any string column that exists only in test is still encoded on its own.
for c in test.columns:
    if test[c].dtype=='object':
        lbl = LabelEncoder()
        lbl.fit(list(test[c].values))
        test[c] = lbl.transform(test[c].values)

### Unseen Data for Predictions

In [None]:
# 90% of the rows are kept for modelling; the remaining 10% are held back.
data = train.sample(frac=0.9, random_state=42)
data_unseen = train.drop(data.index)

# Reset the index of the two derived frames (not of `train`, which is still
# needed with its original index to compute `data_unseen` above).
data.reset_index(drop=True, inplace=True)
data_unseen.reset_index(drop=True, inplace=True)

print('Data for Modeling          : ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

## Pull out the target, and make a validation split

In [None]:
target = train['target']

In [None]:
# Model on the 90% sample only: `train` still contains the rows of
# `data_unseen`, so using it here would leak them into training.
clf = setup(data,target='target',session_id=42)

In [None]:
best = compare_models(exclude = ['rf','xgboost','lightgbm','br','ransac','lr','dt','lar','huber','par','omp','knn','ridge','et','ada','en'])

In [None]:
%%time

cat = create_model('catboost',verbose=False)

In [None]:
# NOTE(review): the original note read "untuned model is efficient tha untuned";
# presumably the untuned model scored better than the tuned one, which is why
# the tuning call below is left disabled - confirm.
#tuned_model = tune_model(cat)

In [None]:
plot_model(cat, plot = 'error')

### Predict on Test / Hold-out Sample

In [None]:
predict_model(cat)

### Finalize Model

In [None]:
final_model = finalize_model(cat)
predict_model(final_model)

### Predict on Unseen Data

In [None]:
unseen_predictions = predict_model(final_model,data=data_unseen)
unseen_predictions.head()

In [None]:
from pycaret.utils import check_metric
check_metric(unseen_predictions.target, unseen_predictions.Label, 'RMSE')

### Saving the Model

In [None]:
save_model(final_model,'CatBoost_Model')

In [None]:
gc.collect()

### Hyper parameters

In [None]:
best_params = final_model.get_all_params()
print(best_params)

### PyCaret - CatBoost Model - Submission

In [None]:
sample_submission = pd.read_csv("../input/tabular-playground-series-feb-2021/sample_submission.csv")

In [None]:
pred_catboost = predict_model(final_model,data=test)

# `submission_ids` rather than `id`, so the builtin id() is not shadowed
submission_ids = sample_submission['id'].values
label = pred_catboost.Label.values


out_df=pd.DataFrame({'id':submission_ids,'target':label})

# round the predictions to 6 decimal values (float64 columns only)
for c in out_df.columns:
    if out_df[c].dtype=='float64':
        out_df[c]= round(out_df[c],6)

display(out_df.head())
out_df.to_csv('submission_catboost_pycaret.csv',index=False)

In [None]:
gc.collect()

# Statistical Models with Fastai

In [None]:
# This file contains all the main external libs we'll use
import fastai
from fastai.imports  import *
from fastai.tabular.all import *

In [None]:
input_path = Path('/kaggle/input/tabular-playground-series-feb-2021/')
train = pd.read_csv(input_path / 'train.csv', index_col='id')
train.columns

In [None]:
print("Number of Samples              :",len(train))
print("Number of Categorical variables:",10)
print("Number of Continuos variables  :",14)
print("Max Features                   :",24)

In [None]:
# Cast cat0 .. cat9 to pandas' categorical dtype, one column at a time.
for _cat_col in [f'cat{i}' for i in range(10)]:
    train[_cat_col] = train[_cat_col].astype('category')

In [None]:
cat_names = ['cat0','cat1','cat2','cat3','cat4','cat5','cat6','cat7','cat8','cat9']
cont_names = ['cont0','cont1','cont2','cont3','cont4','cont5','cont6','cont7','cont8','cont9','cont10','cont11','cont12','cont13']

In [None]:
sizes ='A','B','C','D','E','F','G','H','I','J','K','L','M','N','O'

# Give all ten categorical columns the same level set. The result is assigned
# back instead of using `inplace=True`, which newer pandas releases no longer
# accept for set_categories.
for _cat_col in [f'cat{i}' for i in range(10)]:
    train[_cat_col] = train[_cat_col].cat.set_categories(sizes, ordered=False)

In [None]:
splits = RandomSplitter(valid_pct=0.2)(range_of(train))

tp = TabularPandas(train,cat_names=cat_names,cont_names=cont_names,procs=[Categorify,FillMissing, Normalize],y_names='target',splits=splits)
len(tp.train),len(tp.valid)

In [None]:
tp.show(3)

In [None]:
tp.items.head(3)

### Test data

In [None]:
test= pd.read_csv(input_path / 'test.csv', index_col='id')

# Build the test frame from the training TabularPandas so that the processing
# state learned on train is re-applied, instead of fitting new Categorify /
# Normalize state on test (which would produce codes and scales that differ
# from what the models were trained on).
to = tp.new(test)
to.process()

## Decision Tree

In [None]:
# defining independent and dependent variables
xs,y = tp.train.xs,tp.train.y

In [None]:
from fastai.imports import *
from sklearn.tree import DecisionTreeRegressor

m = DecisionTreeRegressor(max_features=24,max_leaf_nodes=25,max_depth=10)
m.fit(xs,y)

In [None]:
from sklearn import tree
import graphviz
feature_names = xs.columns.values
dot_data = tree.export_graphviz(m, out_file=None,feature_names=feature_names,class_names=y,filled=True, rounded=True) 
graph = graphviz.Source(dot_data) 
graph

In [None]:
# defining independent and dependent variables
valid_xs,valid_y = tp.valid.xs,tp.valid.y
y_pred_dt   = m.predict(valid_xs)

In [None]:
def cal_rmse(pred,y):
    """Root mean squared error between `pred` and `y`, rounded to 6 decimals."""
    return round(math.sqrt(((pred-y)**2).mean()),6)

def model_rmse(m,xs,y):
    """RMSE of model `m`'s predictions on `xs` against `y`."""
    return cal_rmse(m.predict(xs),y)

# MSE is the plain mean squared error; RMSE is its square root.
dt_MSE      = round(mean_squared_error(valid_y, y_pred_dt),6)
dt_RMSE     = round(np.sqrt(mean_squared_error(valid_y, y_pred_dt)),6)
dt_R2       = round(metrics.r2_score(valid_y, y_pred_dt),6)

# Row 6 is 'Decision Tree-fastai'
results.iloc[6:7,0:1] = dt_MSE
results.iloc[6:7,1:2] = dt_RMSE
results.iloc[6:7,2:3] = dt_R2

In [None]:
gc.collect()

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

def rf(xs,y,n_estimators=250, n_jobs=-1,max_samples = 240000,max_features =1.0,min_samples_leaf=0.5,**kwargs):
    """Fit and return a RandomForestRegressor on (xs, y).

    All named arguments, and any extra estimator options given through
    `**kwargs` (e.g. max_depth, max_leaf_nodes), are forwarded to the
    estimator; previously `n_estimators`, `n_jobs` and `**kwargs` were
    accepted but ignored. `oob_score` defaults to True unless overridden.
    """
    kwargs.setdefault('oob_score', True)
    return RandomForestRegressor(n_jobs=n_jobs,n_estimators=n_estimators,max_samples=max_samples,
                                 max_features=max_features,min_samples_leaf=min_samples_leaf,**kwargs).fit(xs,y)

m_rf = rf(xs,y,min_samples_leaf=4,max_leaf_nodes=250,max_depth=10)

In [None]:
def cal_rmse(pred,y):
    """Root mean squared error between `pred` and `y`, rounded to 6 decimals."""
    return round(math.sqrt(((pred-y)**2).mean()),6)

def model_rmse(m,xs,y):
    """RMSE of model `m`'s predictions on `xs` against `y`."""
    return cal_rmse(m.predict(xs),y)

# validation features / target
valid_xs,valid_y = tp.valid.xs,tp.valid.y

y_pred_rf = m_rf.predict(valid_xs)

# model_rmse already returns the RMSE; the MSE is computed separately.
rfa_RMSE     = model_rmse(m_rf,valid_xs,valid_y)
rfa_MSE      = round(mean_squared_error(valid_y, y_pred_rf),6)
rfa_R2       = round(metrics.r2_score(valid_y,y_pred_rf),6)

# Row 7 is 'Random Forest-fastai'
results.iloc[7:8,0:1] = rfa_MSE
results.iloc[7:8,1:2] = rfa_RMSE
results.iloc[7:8,2:3] = rfa_R2

## CatBoost with fastai

In [None]:
# fit the model on the whole dataset

def cat(xs,y,n_estimators=1000, n_jobs=-1,**kwargs):
    """Fit a CatBoostRegressor(verbose=0, n_estimators=1000) on (xs, y).

    NOTE(review): `n_estimators`, `n_jobs` and `**kwargs` are accepted but not
    used - the estimator is always built with the fixed settings below, so
    options passed by the caller have no effect. Confirm whether that is
    intended before forwarding them.
    """
    regressor = CatBoostRegressor(verbose=0, n_estimators=1000)
    return regressor.fit(xs,y)

params = {'nan_mode': 'Min','eval_metric': 'RMSE','iterations': 2500,'sampling_frequency': 'PerTree',
          'leaf_estimation_method': 'Newton','grow_policy': 'SymmetricTree','penalties_coefficient': 1,'boosting_type': 'Plain',
          'model_shrink_mode': 'Constant','feature_border_type': 'GreedyLogSum','bayesian_matrix_reg': 0.10000000149011612,'l2_leaf_reg': 3,
          'random_strength': 1,'rsm':1,'boost_from_average': True,'model_size_reg': 0.5,'subsample': 0.800000011920929,'use_best_model': False,
          'random_seed':14,'depth': 10,'posterior_sampling': False,'border_count': 254,'classes_count': 0,'auto_class_weights': 'None',
          'sparse_features_conflict_fraction': 0,'leaf_estimation_backtracking': 'AnyImprovement','best_model_min_trees': 1,'model_shrink_rate': 0,
          'min_data_in_leaf': 300,'loss_function': 'RMSE','learning_rate': 0.010290546311954876,'score_function': 'Cosine','task_type': 'CPU',
          'leaf_estimation_iterations': 1,'bootstrap_type': 'MVS','max_leaves': 64}

# Fitting the model 
m_cat = cat(xs,y,**params)

def cal_rmse(pred,y):
    """Root mean squared error between `pred` and `y`, rounded to 6 decimals."""
    return round(math.sqrt(((pred-y)**2).mean()),6)

def model_rmse(m,xs,y):
    """RMSE of model `m`'s predictions on `xs` against `y`."""
    return cal_rmse(m.predict(xs),y)

# validation features / target
valid_xs,valid_y = tp.valid.xs,tp.valid.y

y_pred_cat = m_cat.predict(valid_xs)

# model_rmse already returns the RMSE; the MSE is computed separately.
cata_RMSE     = model_rmse(m_cat,valid_xs,valid_y)
cata_MSE      = round(mean_squared_error(valid_y, y_pred_cat),6)
cata_R2       = round(metrics.r2_score(valid_y,y_pred_cat),6)

# Row 8 is 'CatBoost-fastai'
results.iloc[8:9,0:1] = cata_MSE
results.iloc[8:9,1:2] = cata_RMSE
results.iloc[8:9,2:3] = cata_R2

## Feature Importance

In [None]:
def rf_imp_features(m,df):
    """Return a frame of `df`'s column names and `m.feature_importances_`,
    sorted from most to least important."""
    importance_table = pd.DataFrame({'cols': df.columns, 'imp_features': m.feature_importances_})
    return importance_table.sort_values(by='imp_features', ascending=False)

In [None]:
ffig = rf_imp_features(m,xs)
ffig[:15]

In [None]:
def plot_fig(ffig):
    """Draw a horizontal bar chart of 'imp_features' per 'cols' and return the axes."""
    chart_options = dict(figsize=(12,8), legend=False)
    return ffig.plot(x='cols', y='imp_features', kind='barh', **chart_options)

plot_fig(ffig)
plt.show()

In [None]:
gc.collect()

# Models with Optuna Hyperparameter Tuning
Optuna is a black-box optimizer that needs an objective function. It returns a numerical value to evaluate the performance of the hyperparameters.

In [None]:
!pip install optuna 
import optuna

## XGBoost with Optuna

In [None]:
train = pd.read_csv(input_path / 'train.csv', index_col='id')

for c in train.columns:
    if train[c].dtype=='object': 
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values))
        train[c] = lbl.transform(train[c].values)
        
target = train.pop('target')
X_train,X_test,y_train,y_test = train_test_split(train,target,test_size=0.2)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

In [None]:
import xgboost as xgb

def objective(trial):
    """Optuna objective for XGBoost: train with sampled hyper-parameters and
    return the RMSE on (X_test, y_test), rounded to 6 decimals."""
    # Values are requested from the trial in a fixed order, one per line.
    # 'gpu_hist' trains on the GPU to speed up each trial.
    param = {'tree_method': 'gpu_hist'}
    param['lambda'] = trial.suggest_loguniform('lambda', 1e-3, 10.0)
    param['alpha'] = trial.suggest_loguniform('alpha', 1e-3, 10.0)
    param['colsample_bytree'] = trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0])
    param['subsample'] = trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0])
    param['learning_rate'] = trial.suggest_categorical('learning_rate', [0.001,0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02])
    param['n_estimators'] = 4000
    param['max_depth'] = trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20])
    param['random_state'] = trial.suggest_categorical('random_state', [24, 48,2020])
    param['min_child_weight'] = trial.suggest_int('min_child_weight',1,500)

    booster = xgb.XGBRegressor(**param)
    booster.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=100,verbose=False)
    holdout_pred = booster.predict(X_test)
    return round(np.sqrt(mean_squared_error(y_test,holdout_pred)),6)

In [None]:
%%time

#study = optuna.create_study(direction='minimize')
#study.optimize(objective, n_trials=25)
#print('Number of finished trials:', len(study.trials))
#print('Best trial:', study.best_trial.params)

In [None]:
# plot_optimization_history: shows the scores from all trials as well as the best score so far at each point

#optuna.visualization.plot_optimization_history(study)

In [None]:
# Refit XGBoost on the training split with the best parameters found earlier.
# NOTE(review): this dict omits 'n_estimators' and 'tree_method', which the
# tuning objective fixed at 4000 / 'gpu_hist' - confirm the refit should use
# the library defaults for them.
best_trial =  {'lambda': 0.07768755871021779, 'alpha': 9.52276768372669,
             'colsample_bytree': 0.3, 'subsample': 0.7, 'learning_rate': 0.02, 'max_depth': 7, 'random_state': 24, 'min_child_weight': 117}

model = xgb.XGBRegressor(**best_trial)
model.fit(X_train,y_train,eval_set=[(X_test,y_test)],verbose=False)

# Predict on the hold-out split
preds = model.predict(X_test)
# MSE is the plain mean squared error; RMSE is its square root.
mse = round(mean_squared_error(y_test, preds),6)

xgopt_MSE      = mse
xgopt_RMSE     = round(np.sqrt(mean_squared_error(y_test, preds)),6)
xgopt_R2       = round(metrics.r2_score(y_test, preds),6)

# Row 9 is 'XGBoost-Optuna'
results.iloc[9:10,0:1] = xgopt_MSE
results.iloc[9:10,1:2] = xgopt_RMSE
results.iloc[9:10,2:3] = xgopt_R2

In [None]:
gc.collect()

## CatBoost with Optuna

In [None]:
def objective(trial):
    """Optuna objective for CatBoost: train with sampled hyper-parameters and
    return the RMSE on (X_test, y_test).

    Every sampled value is registered under the same name as the CatBoost
    argument it feeds, so `study.best_trial.params` can be passed straight
    back to CatBoostRegressor.
    """
    param = {
        'loss_function': 'RMSE',
        'task_type': 'GPU',
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'max_bin': trial.suggest_int('max_bin', 200, 400),
        #'rsm': trial.suggest_uniform('rsm', 0.3, 1.0),
        # registered as 'subsample' (was 'bagging_fraction') to match the key it is passed under
        'subsample': trial.suggest_uniform('subsample', 0.4, 1.0),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.006, 0.018),
        'n_estimators': 25000,
        'max_depth': trial.suggest_categorical('max_depth', [7,10,14,16]),
        'random_state': trial.suggest_categorical('random_state', [24, 48,2020]),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 300)
    }
    model = CatBoostRegressor(**param)
    model.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=200,verbose=False)
    preds = model.predict(X_test)
    # RMSE = square root of the mean squared error
    rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
    return rmse

In [None]:
import optuna


class StopWhenTrialKeepBeingPrunedCallback:
    """Study callback that stops the study after `threshold` pruned trials in a row."""

    def __init__(self, threshold: int):
        self.threshold = threshold
        # length of the current run of back-to-back pruned trials
        self._pruned_streak = 0

    def __call__(self, study: optuna.study.Study, trial: optuna.trial.FrozenTrial) -> None:
        was_pruned = trial.state == optuna.trial.TrialState.PRUNED
        # a trial in any other state resets the streak
        self._pruned_streak = self._pruned_streak + 1 if was_pruned else 0

        if self._pruned_streak < self.threshold:
            return
        study.stop()

In [None]:
%%time

import logging
import sys

# Add stream handler of stdout to show the messages
optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
study = optuna.create_study(direction='minimize',pruner=optuna.pruners.MedianPruner())

study_stop_cb = StopWhenTrialKeepBeingPrunedCallback(2)
study.optimize(objective, n_trials=20,callbacks=[study_stop_cb])

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# plot_optimization_history: shows the scores from all trials as well as the best score so far at each point

optuna.visualization.plot_optimization_history(study)

### With CPU

best_trial = {'l2_leaf_reg': 0.02247766515106271, 'max_bin': 364, 'subsample': 0.6708650091202213,
             'learning_rate': 0.010290546311954876, 'max_depth': 10, 'random_state': 24, 'min_data_in_leaf': 300,
            'loss_function': 'RMSE','n_estimators':  25000,'rsm':0.5}

In [None]:
best_trial = {'l2_leaf_reg': 0.02247766515106271, 'max_bin': 364, 'subsample': 0.6708650091202213,
             'learning_rate': 0.010290546311954876, 'max_depth': 10, 'random_state': 24, 'min_data_in_leaf': 300,
            'loss_function': 'RMSE','n_estimators':  25000,'rsm':0.5}

model_catopt_cpu = CatBoostRegressor(**best_trial)
model_catopt_cpu.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=200,verbose=False)
preds = model_catopt_cpu.predict(X_test)
# MSE is the plain mean squared error; RMSE is its square root.
mse = round(mean_squared_error(y_test, preds),6)

catboost_MSE      = mse
catboost_RMSE     = round(np.sqrt(mean_squared_error(y_test, preds)),6)
catboost_R2       = round(metrics.r2_score(y_test, preds),6)


# Row 11 is 'CatBoost-Optuna-cpu'
results.iloc[11:12,0:1] = catboost_MSE
results.iloc[11:12,1:2] = catboost_RMSE
results.iloc[11:12,2:3] = catboost_R2

In [None]:
train,test,_ = load_data()

# Encode test with encoders fitted on the TRAIN column, so the integer codes
# match the ones the model saw during training. transform() raises ValueError
# if test contains a label that never occurs in train.
for c in test.columns:
    if test[c].dtype=='object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values))
        test[c] = lbl.transform(test[c].values)

test_features = test.values

submission = pd.read_csv("../input/tabular-playground-series-feb-2021/sample_submission.csv")

pred3 = model_catopt_cpu.predict(test_features).flatten()

submission['target'] = pred3

# round the predictions to 6 decimal values
for c in submission.columns:
    if submission[c].dtype=='float64':
        submission[c]= round(submission[c],6)

submission.to_csv('submission_catboost_optuna_cpu.csv',index=False)

### with GPU

best_params= {'l2_leaf_reg': 0.013856725926090555,'max_bin': 257,'bagging_fraction': 0.6788425346135741,'learning_rate': 0.010983813229740517,
'max_depth': 10,'random_state': 24,'min_data_in_leaf': 300}

In [None]:
best_trial = dict(study.best_trial.params)

# The tuning objective may have sampled the row-sampling ratio under the name
# 'bagging_fraction' while handing it to CatBoost as 'subsample'; use the key
# the estimator was actually given.
if 'bagging_fraction' in best_trial:
    best_trial['subsample'] = best_trial.pop('bagging_fraction')

# Restore the settings the objective kept fixed next to the tuned values, so
# the refit matches the configuration that was evaluated in the study.
best_trial.update({'loss_function': 'RMSE', 'task_type': 'GPU', 'n_estimators': 25000})

model = CatBoostRegressor(**best_trial)
model.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=200,verbose=False)
preds = model.predict(X_test)
# MSE is the plain mean squared error; RMSE is its square root.
mse = round(mean_squared_error(y_test, preds),6)

catboost_MSE_gpu     = mse
catboost_RMSE_gpu    = round(np.sqrt(mean_squared_error(y_test, preds)),6)
catboost_R2_gpu      = round(metrics.r2_score(y_test, preds),6)


# Row 10 is 'CatBoost-Optuna-gpu'
results.iloc[10:11,0:1] = catboost_MSE_gpu
results.iloc[10:11,1:2] = catboost_RMSE_gpu
results.iloc[10:11,2:3] = catboost_R2_gpu

In [None]:
gc.collect()

# Results and Submissions


In [None]:
results = results.sort_values(by=['MSE'], ascending=True)
display(results)

In [None]:
# model: CatBoost-fastai
# Predict on the `xs` feature view, the same view of a TabularPandas that the
# model was fitted on, rather than on the raw underlying frame.
y_pred_cat_fastai = m_cat.predict(to.xs)

submission['target'] = y_pred_cat_fastai

# round the predictions to 6 decimal values
for c in submission.columns:
    if submission[c].dtype=='float64':
        submission[c]= round(submission[c],6)

submission.to_csv('submission_catboost_fastai.csv',index=False)

In [None]:
train,test,_ = load_data()

# Encode test with encoders fitted on the TRAIN column, so the integer codes
# match the ones the models saw during training. transform() raises ValueError
# if test contains a label that never occurs in train.
for c in test.columns:
    if test[c].dtype=='object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values))
        test[c] = lbl.transform(test[c].values)

test_features = test.values

submission = pd.read_csv("../input/tabular-playground-series-feb-2021/sample_submission.csv")

# model: CatBoost-Optuna-GPU

pred2 = model.predict(test_features).flatten()

submission['target'] = pred2

# round the predictions to 6 decimal values
# (rounds the submission's own column; it used to be overwritten from out_df)
for c in submission.columns:
    if submission[c].dtype=='float64':
        submission[c]= round(submission[c],6)

submission.to_csv('submission_catboost_optuna_gpu.csv',index=False)

# model: CatBoost-Optuna-CPU

pred3 = model_catopt_cpu.predict(test_features).flatten()

submission['target'] = pred3

# round the predictions to 6 decimal values
for c in submission.columns:
    if submission[c].dtype=='float64':
        submission[c]= round(submission[c],6)

submission.to_csv('submission_catboost_optuna_cpu.csv',index=False)

# model: - CatBoost-pycaret

pred_catboost = predict_model(final_model,data=test)

# `submission_ids` rather than `id`, so the builtin id() is not shadowed
submission_ids = sample_submission['id'].values
label = pred_catboost.Label.values
out_df=pd.DataFrame({'id':submission_ids,'target':label})

# round the predictions to 6 decimal values
for c in out_df.columns:
    if out_df[c].dtype=='float64':
        out_df[c]= round(out_df[c],6)

out_df.to_csv('submission_catboost_pycaret.csv',index=False)

# Using AutoViML 

AutoVIML is an open-source python package that makes machine learning easy.

In [None]:
!pip install autoviml
from autoviml.Auto_ViML import Auto_ViML
!pip install autoviml --no-cache-dir --ignore-installed

In [None]:
!pip install --upgrade pip
!pip install SHAP
!pip3 install --upgrade Pillow
import PIL
gc.collect()

In [None]:
train,test,submission = load_data()

In [None]:
# load the data sets

from catboost import CatBoostRegressor

model, features, trainm, testm = Auto_ViML(
    train=train,
    target="target",
    test=test,
    sample_submission="",
    hyper_param="RS",
    feature_reduction=True,
    scoring_parameter="mse",
    KMeans_Featurizer=False,
    Boosting_Flag="CatBoost",
    Binning_Flag=True,
    Add_Poly=False,
    Stacking_Flag=False,
    Imbalanced_Flag=True,
    verbose=0
)

In [None]:
print(model)

In [None]:
train,test,submission = load_data()

# Encode each string column with ONE encoder fitted on train and re-used for
# test, so a category gets the same integer code in both frames.
# LabelEncoder.transform raises ValueError if test holds a label unseen in train.
for c in train.columns:
    if train[c].dtype=='object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values))
        train[c] = lbl.transform(train[c].values)
        if c in test.columns:
            test[c] = lbl.transform(test[c].values)

# Any string column that exists only in test is still encoded on its own.
for c in test.columns:
    if test[c].dtype=='object':
        lbl = LabelEncoder()
        lbl.fit(list(test[c].values))
        test[c] = lbl.transform(test[c].values)


target = train.pop('target')

X_train, X_test, y_train, y_test = train_test_split(train, target, train_size=0.80)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
automodel = model.fit(X_train,y_train,eval_set=[(X_test,y_test)],verbose=False)

preds = automodel.predict(X_test)

mse = round(mean_squared_error(y_test, preds),6)


print(mse)

In [None]:
display(results)