# CatBoost Tabular Playground Series Prediction

### Summary
In this notebook, I use a CatBoost regressor to predict `loss` for the Tabular Playground Series (August 2021) competition. I try a hyperparameter search and K-Fold cross-validation to see whether either has an impact on the test results.

# Import Packages

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# Common Functions

In [2]:
def submit(model, test_features, test_ids, filename):
    """Predict ``loss`` for the test rows and save a submission CSV.

    Args:
        model: Any object exposing ``predict(test_features)`` whose result
            supports ``reshape``.
        test_features: Feature rows passed straight to ``model.predict``.
        test_ids: Identifiers written to the ``id`` column, one per prediction.
        filename: Destination path of the CSV file.

    The file has exactly two columns, ``id`` and ``loss``, and no index column.
    """
    predictions = model.predict(test_features)
    # Flatten so that a column-shaped (n, 1) prediction array still yields one value per row.
    flat_predictions = predictions.reshape(-1)
    pd.DataFrame({"id": test_ids, "loss": flat_predictions}).to_csv(filename, index=False)

# Load Datasets

In [3]:
# Directory holding the competition CSVs. It defaults to the original local path; set the
# TPS_DATA_DIR environment variable to run the notebook on another machine without editing it.
DATA_DIR = Path(os.environ.get(
    "TPS_DATA_DIR",
    "D:/Study/Data Science Content/Datasets/tabular-playground-series-aug-2021",
))
train_data = pd.read_csv(DATA_DIR / "train.csv")

test_data = pd.read_csv(DATA_DIR / "test.csv")

# EDA

In [4]:
# Preview the first five training rows (id, features, and the loss target).
train_data.head(n=5)

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,loss
0,0,-0.00235,59,0.766739,-1.35046,42.2727,16.6857,30.3599,1.2673,0.392007,...,-42.4399,26.854,1.45751,0.696161,0.941764,1.82847,0.92409,2.29658,10.4898,15
1,1,0.784462,145,-0.463845,-0.530421,27324.9,3.47545,160.498,0.828007,3.73586,...,-184.132,7.90137,1.70644,-0.494699,-2.0583,0.819184,0.439152,2.3647,1.14383,3
2,2,0.317816,19,-0.432571,-0.382644,1383.26,19.7129,31.1026,-0.515354,34.4308,...,7.43721,37.2181,3.25339,0.337934,0.615037,2.21676,0.745268,1.69679,12.3055,6
3,3,0.210753,17,-0.616454,0.946362,-119.253,4.08235,185.257,1.38331,-47.5214,...,9.66778,0.626942,1.49425,0.517513,-10.2221,2.62731,0.61727,1.45645,10.0288,2
4,4,0.439671,20,0.968126,-0.092546,74.302,12.3065,72.186,-0.233964,24.3991,...,290.657,15.6043,1.73557,-0.476668,1.39019,2.19574,0.826987,1.78485,7.07197,1


In [5]:
# Preview the first five test rows; unlike the training frame there is no loss column.
test_data.head(n=5)

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,250000,0.812665,15,-1.23912,-0.893251,295.577,15.8712,23.0436,0.942256,29.898,...,0.446389,-422.332,-1.4463,1.69075,1.0593,-3.01057,1.94664,0.52947,1.38695,8.78767
1,250001,0.190344,131,-0.501361,0.801921,64.8866,3.09703,344.805,0.807194,38.4219,...,0.377179,10352.2,21.0627,1.84351,0.251895,4.44057,1.90309,0.248534,0.863881,11.7939
2,250002,0.919671,19,-0.057382,0.901419,11961.2,16.3965,273.24,-0.0033,37.94,...,0.99014,3224.02,-2.25287,1.551,-0.559157,17.8386,1.83385,0.931796,2.33687,9.054
3,250003,0.860985,19,-0.549509,0.471799,7501.6,2.80698,71.0817,0.792136,0.395235,...,1.39688,9689.76,14.7715,1.4139,0.329272,0.802437,2.23251,0.893348,1.35947,4.84833
4,250004,0.313229,89,0.588509,0.167705,2931.26,4.34986,1.57187,1.1183,7.75463,...,0.862502,2693.35,44.1805,1.5802,-0.191021,26.253,2.68238,0.361923,1.5328,3.7066


In [6]:
# Summarise the training frame: row range, column count, dtypes and memory usage.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Columns: 102 entries, id to loss
dtypes: float64(95), int64(7)
memory usage: 194.5 MB


In [7]:
# Summary statistics with one row per column, which is easier to scan for ~100 columns.
train_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,250000.0,124999.500000,72168.927986,0.000000,62499.750000,124999.500000,187499.250000,249999.00000
f0,250000.0,0.511213,0.307884,-0.069273,0.251287,0.514962,0.777322,1.07207
f1,250000.0,51.378476,42.396636,-17.000000,18.000000,41.000000,75.000000,273.00000
f2,250000.0,0.107155,1.322200,-7.895580,-0.611172,0.253815,0.759249,9.76859
f3,250000.0,0.050010,0.792368,-1.475560,-0.719418,0.004099,0.765456,1.68019
...,...,...,...,...,...,...,...,...
f96,250000.0,2.417556,0.892563,-1.131980,1.906718,2.340430,2.910020,5.57604
f97,250000.0,0.537484,0.226589,0.005249,0.359646,0.531348,0.709807,1.10540
f98,250000.0,1.576900,0.646306,-0.646967,1.215810,1.451285,1.901632,4.49262
f99,250000.0,8.048805,5.647368,-0.842397,3.732800,7.182205,10.998550,34.01920


In [8]:
# (rows, columns) of the training frame.
train_data.shape

(250000, 102)

In [9]:
# Pairwise Pearson correlation between every pair of columns in the training frame.
corr_score = train_data.corr(method="pearson")

In [10]:
# Correlation of each column with the target, strongest positive first.
corr_score.loc[:, "loss"].sort_values(ascending=False)

loss    1.000000
f13     0.025730
f46     0.023828
f28     0.022780
f74     0.021610
          ...   
f96    -0.020655
f66    -0.022424
f58    -0.023370
f52    -0.026566
f25    -0.030164
Name: loss, Length: 102, dtype: float64

# Data Preprocessing

### Drop Id Column

In [11]:
# Remove the row identifier from the training frame so it is not used as a model input.
# The guards make the cell safe to re-run: a second execution is a no-op instead of a KeyError.
if "id" in train_data.columns:
    train_data.pop("id")
# Keep the test ids for the submission file while removing the column from the test frame.
if "id" in test_data.columns:
    test_ids = test_data.pop("id")

In [12]:
# Per-column mean and standard deviation of the training frame (the loss column is still included here).
# NOTE(review): these are computed before the train/validation split, so they also cover
# the rows that later become the validation set.
train_mean, train_std = train_data.mean(), train_data.std()

In [13]:
# Split the target's statistics off so train_mean / train_std are left with feature columns only.
train_target_mean, train_targets_std = train_mean.pop("loss"), train_std.pop("loss")

### Train Validation Split

In [14]:
# Fraction of the training rows held out for validation (passed to train_test_split as test_size).
validation_split = 0.2

In [15]:
# Fixed seed so the train/validation partition, and every score computed on it,
# is identical on each fresh run of the notebook.
train_features, validation_features = train_test_split(
    train_data, test_size=validation_split, random_state=42
)

In [16]:
# Move the loss column out of each split; the frames keep only feature columns afterwards.
train_targets = train_features.pop("loss")
validation_targets = validation_features.pop("loss")

### Data Scaling

In [17]:
# Toggle for standardising the features. Tree ensembles do not require it, so it is off.
should_scale = False
if should_scale:
    # Standardise every split with the statistics computed from the training frame.
    train_features = (train_features - train_mean) / train_std
    validation_features = (validation_features - train_mean) / train_std
    # The test frame is named test_data; the previous reference to `test` was undefined
    # and raised a NameError as soon as scaling was switched on.
    test_features = (test_data - train_mean) / train_std
    print(test_features.head())
    print(train_features.head())
    print(validation_features.head())
else:
    test_features = test_data

### Model Development

#### Using CatBoost

In [18]:
import catboost
import time
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Wall-clock start, used to report the elapsed time at the end of this cell.
begin = time.time()
# Candidate values per hyperparameter. Insertion order matters: the search tunes the
# keys in the order they appear here.
parameters = dict(
    depth=[6, 7, 8],
    learning_rate=[0.08, 0.1],
    iterations=[300, 350],
)
def train_catboost(hyperparameters, X_train, X_val, y_train, y_val):
    """Greedy, one-hyperparameter-at-a-time search for a CatBoost regressor.

    The keys of ``hyperparameters`` are tuned in iteration order. While one key is
    being tuned, every other key is held at its best value found so far (or at its
    first candidate if it has not been tuned yet). Each configuration is scored by
    RMSE on the validation data.

    Args:
        hyperparameters: Mapping with the keys ``"iterations"``, ``"learning_rate"``
            and ``"depth"``, each holding a list of candidate values.
        X_train, y_train: Data the models are fitted on.
        X_val, y_val: Data the RMSE is computed on.

    Returns:
        ``(best_cat, best_score, best_parameters)``: the fitted model with the lowest
        RMSE seen, that RMSE, and a dict with the selected value for each of the
        three hyperparameters.
    """
    keys = hyperparameters.keys()
    best_index = {key: 0 for key in keys}
    best_cat = None
    # Infinity rather than a large finite sentinel, so any finite score is accepted.
    best_score = float("inf")
    # RMSE of every configuration trained so far, keyed by (iterations, learning_rate, depth).
    # Each stage revisits the previous stage's winning configuration; the cache avoids
    # refitting an identical model.
    evaluated = {}
    for key in keys:
        print("Find best parameter for %s" %(key))
        best_parameter = None
        temp_best = float("inf")
        for (key_index, item) in enumerate(hyperparameters[key]):
            # The key being tuned takes the candidate; the others keep their current best.
            iterations = hyperparameters["iterations"][best_index["iterations"]] if key != "iterations" else item
            learning_rate = hyperparameters["learning_rate"][best_index["learning_rate"]] if key != "learning_rate" else item
            depth = hyperparameters["depth"][best_index["depth"]] if key != "depth" else item
            print("Train with iterations: %d learning_rate: %.2f depth:%d"%(iterations, learning_rate, depth))
            config = (iterations, learning_rate, depth)
            if config in evaluated:
                # Already trained: reuse the score. It cannot beat best_score, which
                # was compared against this same value when the model was first fitted.
                score = evaluated[config]
            else:
                cat = catboost.CatBoostRegressor(
                    iterations = iterations,
                    learning_rate = learning_rate,
                    depth = depth
                )
                cat.fit(X_train, y_train, verbose=False)
                y_pred = cat.predict(X_val)
                score = np.sqrt(mean_squared_error(y_val, y_pred))
                evaluated[config] = score
                if score < best_score:
                    best_score = score
                    best_cat = cat
            print("RMSE: %.2f"%(score))
            if score < temp_best:
                temp_best = score
                best_index[key] = key_index
                best_parameter = item
        print("Best Parameter for %s: "%(key), best_parameter)
    best_parameters = {
        "iterations": hyperparameters["iterations"][best_index["iterations"]],
        "learning_rate": hyperparameters["learning_rate"][best_index["learning_rate"]],
        "depth": hyperparameters["depth"][best_index["depth"]]
    }
    return best_cat, best_score, best_parameters
# Run the greedy search on the hold-out split, report the winner and write its submission.
best_cat, best_score, best_parameters = train_catboost(
    parameters, train_features, validation_features, train_targets, validation_targets
)
print("Best CatBoost Model: ", best_cat)
# The score is the square root of mean_squared_error, i.e. an RMSE, not an MAE.
print("Best RMSE: ", best_score)
elapsed = time.time() - begin
print("Elapsed time: ", elapsed)
submit(best_cat, test_features, test_ids, "submission.csv")

Find best parameter for depth
Train with iterations: 300 learning_rate: 0.08 depth:6
RMSE: 7.84
Train with iterations: 300 learning_rate: 0.08 depth:7
RMSE: 7.84
Train with iterations: 300 learning_rate: 0.08 depth:8
RMSE: 7.85
Best Parameter for depth:  7
Find best parameter for learning_rate
Train with iterations: 300 learning_rate: 0.08 depth:7
RMSE: 7.84
Train with iterations: 300 learning_rate: 0.10 depth:7
RMSE: 7.84
Best Parameter for learning_rate:  0.1
Find best parameter for iterations
Train with iterations: 300 learning_rate: 0.10 depth:7
RMSE: 7.84
Train with iterations: 350 learning_rate: 0.10 depth:7
RMSE: 7.84
Best Parameter for iterations:  350
Best CatBoost Model:  <catboost.core.CatBoostRegressor object at 0x000001D93BC61820>
Best MAE:  7.839513469283204
Elapsed time:  202.26738667488098


In [19]:
from sklearn.model_selection import KFold
# Seeded shuffle so the fold assignment, and therefore each fold's RMSE and submission,
# is reproducible across runs.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
fold_scores = []
for fold, (train_indices, val_indices) in enumerate(kfold.split(train_data), start=1):
    print("Training with Fold %d"%(fold))
    fold_train = train_data.iloc[train_indices]
    fold_val = train_data.iloc[val_indices]
    # Separate the target from the features without mutating the fold slices.
    X_train, y_train = fold_train.drop(columns="loss"), fold_train["loss"]
    X_val, y_val = fold_val.drop(columns="loss"), fold_val["loss"]
    if should_scale:
        X_train = (X_train - train_mean) / train_std
        X_val = (X_val - train_mean) / train_std
    # Refit with the hyperparameters selected by the search on the hold-out split.
    cat = catboost.CatBoostRegressor(
        iterations = best_parameters["iterations"],
        learning_rate = best_parameters["learning_rate"],
        depth = best_parameters["depth"]
    )
    cat.fit(X_train, y_train, verbose=False)
    y_pred = cat.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    fold_scores.append(score)
    print("RMSE: %.2f"%(score))
    # One submission file per fold model.
    submit(cat, test_features, test_ids, "submission_fold%d.csv"%(fold))
# Aggregate score across folds, for comparison with the single hold-out RMSE.
print("Mean RMSE over %d folds: %.2f"%(len(fold_scores), np.mean(fold_scores)))

Training with Fold 1
RMSE: 7.86
Training with Fold 2
RMSE: 7.89
Training with Fold 3
RMSE: 7.84
Training with Fold 4
RMSE: 7.88
Training with Fold 5
RMSE: 7.82


### Conclusion

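The hyperparameter search does not affect the result much: every configuration tried scored a validation RMSE between 7.84 and 7.85. K-Fold training has an obvious impact on the validation score (RMSE ranges from 7.82 to 7.89 across folds), but it does not have an obvious impact on the test result.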