# Introduction
Greetings!👋

In this kernel you will find my data science approach to the "Tabular Playground Series - May 2021" competition, using CatBoost and Optuna.

As always, any feedback is very much appreciated! :)

# Table of contents:

1. Meeting our data

2. Creating visualizations

3. Doing a bit of preprocessing

4. Creating and evaluating models

5. Parameter tuning with Optuna

6. Creating a final model and submitting results

# 1. Meeting our data

In [None]:
import pandas as pd
import numpy as np

# Both competition files are read with the `id` column as the row index.
csv_options = {'index_col': 'id'}
train = pd.read_csv('/kaggle/input/tabular-playground-series-may-2021/train.csv', **csv_options)
test = pd.read_csv('/kaggle/input/tabular-playground-series-may-2021/test.csv', **csv_options)
train

In [None]:
# Display the test frame (a notebook cell renders its last expression).
test

In [None]:
# (rows, columns) of the training frame; the target column is still part of it here.
train.shape

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Print column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Print column dtypes, non-null counts and memory usage of the test frame.
test.info()

In [None]:
# Keep the labels in an independent Series so the column can be removed
# from `train` without losing them.
target = train['target'].copy()
target

In [None]:
# Summary statistics of the target column.
target.describe()

In [None]:
# From here on `train` holds features only; the labels live in `target`.
train = train.drop(columns = 'target')
train

In [None]:
# Sanity check: True only when train and test expose the same columns in the same order.
(train.columns).equals(test.columns)

In [None]:
# One row per feature: in-cell bars for the mean, colour gradient for the std.
feature_summary = train.describe().T
styled_summary = feature_summary.style.bar(subset = ['mean'], color = 'royalblue')
styled_summary.background_gradient(subset = ['std'], cmap = 'Blues_r')

# 2. Creating visualizations

In [None]:
# Register the pandas date/time converters with matplotlib.
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
# IPython magic: render figures inline in the notebook output.
%matplotlib inline
import seaborn as sns

# Use the white background with grid lines for every following plot.
sns.set_style('whitegrid')

In [None]:
# Number of training rows per class, with bars drawn in sorted label order.
plt.figure(figsize = (16, 6))
class_counts = target.value_counts()
target_order = sorted(target.unique())
sns.barplot(x = class_counts.index, y = class_counts, order = target_order, palette = 'Blues_r')

In [None]:
def plot_grid(data, fig_size, grid_size, plot_type, target = ''):
    """
    Draw one subplot per column of `data`, arranged on a grid.

    Parameters:
        data:      DataFrame holding the columns to plot. For 'barplot' it
                   must also contain the target column.
        fig_size:  (width, height) handed to plt.figure.
        grid_size: (rows, columns) of the subplot grid; it needs at least as
                   many cells as there are plotted columns.
        plot_type: 'histplot' or 'boxplot' (one plot per non-object column),
                   or 'barplot' (mean of every other column per target class).
        target:    name of the target column; only used by 'barplot'.

    Returns nothing; the plots are drawn on a newly created figure.

    Raises:
        ValueError: if `plot_type` is not one of the supported names, or if
                    'barplot' is requested and `target` is not a column of `data`.
    """
    supported_types = ('histplot', 'boxplot', 'barplot')
    # Validate before a figure is created so a typo does not leave an empty figure behind.
    if plot_type not in supported_types:
        raise ValueError(f"plot_type must be one of {supported_types}, got {plot_type!r}")
    if plot_type == 'barplot' and target not in data.columns:
        raise ValueError(f"target column {target!r} not found in data")

    n_rows, n_cols = grid_size[0], grid_size[1]
    fig = plt.figure(figsize = fig_size)

    if plot_type == 'barplot':
        target_order = sorted(data[target].unique())
        feature_names = data.drop(target, axis = 1).columns
        for i, column_name in enumerate(feature_names):
            fig.add_subplot(n_rows, n_cols, i + 1)
            # Mean of the feature within every target class.
            class_means = data[[column_name, target]].groupby(target).mean()
            sns.barplot(x = class_means.index, y = class_means[column_name], palette = 'Blues_r', order = target_order)
    else:
        # Histograms and box plots are drawn for every non-object column.
        plotted_columns = data.select_dtypes(exclude = 'object').columns
        for i, column_name in enumerate(plotted_columns):
            fig.add_subplot(n_rows, n_cols, i + 1)
            if plot_type == 'histplot':
                sns.histplot(data[column_name], kde = True, color = 'royalblue', stat = 'count')
            else:
                sns.boxplot(x = data[column_name], color = 'royalblue')

    plt.tight_layout()

In [None]:
# Distribution of every feature, on a 17 x 3 grid of histograms.
plot_grid(data = train, fig_size = (16, 36), grid_size = (17, 3), plot_type = 'histplot')

In [None]:
# Box plot of every feature on the same 17 x 3 grid.
plot_grid(train, fig_size = (16, 36), grid_size = (17, 3), plot_type = 'boxplot')

In [None]:
# Re-attach the labels so the per-class mean of every feature can be plotted.
train_with_target = pd.concat([train, target], axis = 1)
plot_grid(train_with_target, (16, 36), (17, 3), 'barplot', 'target')

In [None]:
plt.figure(figsize = (16, 16))
# Compute the correlation matrix once and reuse it.
feature_corr = train.corr()
# Boolean mask that hides the upper triangle, diagonal included. It is built
# from ones rather than from the correlation values themselves, so a cell
# whose correlation is exactly 0 is hidden as well.
upper_triangle = np.triu(np.ones_like(feature_corr, dtype = bool))
sns.heatmap(feature_corr,
#             annot = True,
#             fmt = '.2f',
            square = True,
            cmap = 'Blues_r',
            cbar = False,
            mask = upper_triangle)

It seems that a few features consist almost entirely of zeroes. Let's look into that.

In [None]:
# Share of rows that are exactly zero, per feature. Despite the column name
# the values are fractions between 0 and 1, not percentages.
# `train == 0` is a boolean frame and the mean of booleans is the share of
# True values; missing values compare unequal to 0 and are not counted.
zero_share = (train == 0).mean()
zeroes = pd.DataFrame({'ColumnName': zero_share.index,
                       'PercentOfZeroes': zero_share.to_numpy()})
zeroes.sort_values(by = 'PercentOfZeroes', ascending = False).style.background_gradient('Blues')

# 3. Doing a bit of preprocessing

In [None]:
# Stack train on top of test; the outer index level ('train' / 'test')
# records where each row came from so the two parts can be separated again.
train_test = pd.concat([train, test], keys = ['train', 'test'], axis = 0)
train_test

In [None]:
# Standardise every feature with statistics taken over train and test
# together, then split the combined frame back into its two parts.
feature_means = train_test.mean()
feature_stds = train_test.std()
train_test = (train_test - feature_means) / feature_stds
train = train_test.xs('train').copy()
test = train_test.xs('test').copy()
train

In [None]:
# Encode the four class labels as consecutive integers:
# 'Class_1' -> 0, 'Class_2' -> 1, 'Class_3' -> 2, 'Class_4' -> 3.
class_map = {f'Class_{number}': number - 1 for number in range(1, 5)}

target = target.map(class_map).astype('int')

target

# 4. Creating and evaluating models

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
def test_estimators(X, y, estimators, labels, cv):
    '''
    Cross-validate several estimators and collect the results in one table.

    It takes: full train data and target, list of estimators,
              list of labels or names of estimators (same length and order
              as the estimators), cross validation splitting strategy;
    And it returns: a DataFrame with one row per estimator and the columns
              'Model Name', 'Test log loss' (mean over folds, lower is
              better), 'Test Std' (std of the fold scores) and 'Fit Time'
              (mean fit time per fold), sorted by ascending log loss.
              With no estimators an empty table with these columns is returned.
    It raises: ValueError when estimators and labels differ in length.
    '''
    estimators = list(estimators)
    labels = list(labels)
    # zip() would silently drop the surplus entries and could pair a model
    # with the wrong name, so a mismatch is rejected up front.
    if len(estimators) != len(labels):
        raise ValueError(
            f'Got {len(estimators)} estimators but {len(labels)} labels; they must match one to one.'
        )

    columns = ['Model Name', 'Test log loss', 'Test Std', 'Fit Time']
    rows = []
    for est, label in zip(estimators, labels):
        cv_results = cross_validate(est,
                                    X,
                                    y,
                                    cv = cv,
                                    scoring = 'neg_log_loss',
                                    n_jobs = -1)
        fold_scores = cv_results['test_score']
        rows.append({
            'Model Name': label,
            # The scorer returns the negated log loss; flip the sign back.
            'Test log loss': -fold_scores.mean(),
            'Test Std': fold_scores.std(),
            'Fit Time': cv_results['fit_time'].mean(),
        })

    # Passing the columns explicitly keeps the table well-formed when `rows` is empty.
    result_table = pd.DataFrame(rows, columns = columns)
    result_table.sort_values(by = ['Test log loss'], ascending = True, inplace = True)

    return result_table

Taking a 10% stratified sample of the training data to save some time.

In [None]:
# Only 10% of the rows go into X_train / y_train (train_size = 0.1); the
# remaining 90% end up in X_valid / y_valid. The split is stratified on the
# target and seeded, so it is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(train, 
                                                      target, 
                                                      stratify = target,
                                                      train_size = 0.1,
                                                      random_state = 1)
y_train

In [None]:
logreg = LogisticRegression()
dt = DecisionTreeClassifier(random_state = 1)
rf = RandomForestClassifier()
xgb = XGBClassifier()
lgbm = LGBMClassifier()
cb = CatBoostClassifier(allow_writing_files = False, logging_level = 'Silent')
svc = SVC(probability = True)
gnb = GaussianNB()

# Each model sits next to its display name, so the two lists derived below
# always line up. `xgb` is created above but not part of the comparison.
named_estimators = [
    ('LogRegression', logreg),
    ('DecisionTree', dt),
    ('RandomForest', rf),
    ('LGBM', lgbm),
    ('CatBoost', cb),
    ('SVC', svc),
    ('GNB', gnb),
]
labels = [name for name, _ in named_estimators]
estimators = [model for _, model in named_estimators]

# Compare all candidates on the sample with 5-fold stratified cross-validation.
cv_strategy = StratifiedKFold(n_splits = 5)
results = test_estimators(X_train, y_train, estimators, labels, cv = cv_strategy)
results.style.background_gradient(cmap = 'Blues')

# 5. Parameter tuning with Optuna

In [None]:
import optuna
from optuna.trial import TrialState

from catboost import Pool, cv

def objective(trial, model, X_train_full, y_train_full):
    """
    Optuna objective: cross-validated multi-class loss of a CatBoost model.

    Parameters:
        trial:        Optuna trial used to sample the hyper-parameters.
        model:        model identifier; only 'cb' (CatBoost) is implemented.
        X_train_full: training features.
        y_train_full: training labels.

    Returns:
        The lowest mean test 'MultiClass' loss reached during 5-fold
        stratified cross-validation (to be minimised).

    Side effects:
        Stores the number of boosting rounds at which that loss was reached
        in the trial's user attribute 'n_estimators'.

    Raises:
        ValueError: if `model` is anything other than 'cb'. Returning None
                    here would hide the mistake from the caller.
    """
    if model != 'cb':
        raise ValueError(f"Unsupported model {model!r}; only 'cb' is implemented.")

    train_set = Pool(X_train_full, label = y_train_full)

    params = {
        "objective": 'MultiClass',
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),

        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log = True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 1e-1, log = True),
        'min_child_samples': trial.suggest_int('min_child_samples', 2, 20),
        'random_strength': trial.suggest_float('random_strength', 0.05, 1, log = True)
    }

    # These two parameters are only sampled for the bootstrap type they belong to.
    if params["bootstrap_type"] == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif params["bootstrap_type"] == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    k = 5
    cb_cv_results = cv(
        params = params,
        pool = train_set,
        num_boost_round = 4000,
        nfold = k,
        stratified = True,
        early_stopping_rounds = 100,
        verbose_eval = False,
    )

    # Use the best row, not the last one: the returned history may still
    # contain the rounds evaluated while early stopping was waiting for an
    # improvement. Taking the minimum is right whether or not that tail is
    # present.
    test_loss = cb_cv_results['test-MultiClass-mean']
    best_position = int(test_loss.values.argmin())

    # Set n_estimators as a trial attribute; Accessible via study.trials_dataframe().
    trial.set_user_attr("n_estimators", best_position + 1)
    return float(test_loss.iloc[best_position])

In [None]:
def cb_objective(trial):
    """Bind the CatBoost objective to the full training data."""
    return objective(trial, 'cb', train, target)

# Minimise the cross-validated loss; stop after 100 trials or seven hours,
# whichever comes first.
study_cb = optuna.create_study(direction = 'minimize')
study_cb.optimize(cb_objective, n_trials = 100, timeout = 7 * 3600)

In [None]:
# Summarise the study: trial count, best loss, its parameters and tree count.
print("Number of finished trials: ", len(study_cb.trials))
print("Best trial:")
trial = study_cb.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for param_name, param_value in trial.params.items():
    print(f"    {param_name}: {param_value}")

print(f"  Number of estimators: {trial.user_attrs['n_estimators']}")

# 6. Creating a final model and submitting results

In [None]:
# Read parameters and tree count from the same best trial, taken straight
# from the study, so this cell does not depend on a `trial` variable left
# over from another cell.
best_trial = study_cb.best_trial
cb = CatBoostClassifier(allow_writing_files = False, 
                        logging_level = 'Silent', 
                        n_estimators = best_trial.user_attrs["n_estimators"], 
                        **best_trial.params)
# Refit on the complete training set, then predict class probabilities for test.
cb.fit(train, target)
predictions = cb.predict_proba(test)
predictions

In [None]:
# One probability column per class, in the order of the encoded labels 0..3,
# preceded by the test row ids.
class_columns = ['Class_1', 'Class_2', 'Class_3', 'Class_4']
submission = pd.DataFrame(predictions[:, :4], columns = class_columns)
submission.insert(0, 'id', test.index)

submission.to_csv('submission.csv', index = False)