# **Baseline Models**

![High Jump](https://media.aws.iaaf.org/media/Original/827ec70e-b460-4b2d-bf97-7d9c6be10c5a.jpg)

[Source](https://www.worldathletics.org/disciplines/jumps/high-jump)

In [None]:
%%capture
# Install PyCaret with all optional extras; %%capture hides pip's output.
# NOTE(review): the setup() calls later in this notebook pass silent=,
# create_clusters= and feature_interaction=, which look like PyCaret 2.x-era
# arguments -- consider pinning the version so a fresh run stays reproducible.
!pip install pycaret[full]

# Introduction

Hey, thanks for viewing my Kernel!

If you like my work, please leave an upvote: it is really appreciated and motivates me to offer more content to the Kaggle community! 😊

In [None]:
import pandas as pd
import numpy as np

# Read the Titanic training split and preview its first rows.
train_csv_path = '../input/titanic/train.csv'
titanic_train = pd.read_csv(train_csv_path)
display(titanic_train.head())

In [None]:
# Print the dtype pandas inferred for every column of the training frame.
print(titanic_train.dtypes)

In [None]:
# Count the missing values in each column.
titanic_train.isna().sum()

# Random Baseline Models
In the real world, data is not always predictable. For such problems, the best baseline model is a dummy classifier or dummy regressor. This baseline shows you whether your ML model is actually learning or not. You can see how to use random baseline models below.

In [None]:
# Build a purely random classification problem: uniform features plus labels
# that are drawn independently of those features.
np.random.seed(0)  # fixed seed keeps the generated data reproducible

random_dim = (1000, 3)  # (rows, feature columns)
n_rows, n_features = random_dim

# The draw order below fixes which numbers each array receives from the
# seeded stream, so it must not be rearranged.
random_X = np.random.random(random_dim)
random_reg_y = np.random.random(n_rows)  # continuous target; not used by the frame below
random_clf_y = np.random.randint(n_features, size=n_rows)  # integer labels in [0, n_features)

# Feature columns are named '1'..'3', followed by the label column.
col_list = [str(position) for position in range(1, n_features + 1)] + ['target']

# Stacking next to the float features promotes the labels to float, so the
# string labels produced below read '0.0', '1.0' and '2.0'.
train_clf = pd.DataFrame(np.column_stack((random_X, random_clf_y)), columns=col_list)
train_clf['target'] = train_clf['target'].astype('str')
train_clf

In [None]:
# Import only the PyCaret helpers this section uses instead of `import *`,
# so it stays clear where each name comes from.
from pycaret.classification import compare_models, setup

# Initialise a PyCaret classification experiment on the random data.
clf = setup(
    data=train_clf,
    target='target',
    silent=True,   # do not stop for the interactive dtype-confirmation prompt
    session_id=0,  # seed for reproducible runs
)

In [None]:
# Compare the available classifiers in the current experiment, ranked by
# accuracy.
compare_models(sort='Accuracy')

# Machine Learning Baseline Models
If the data is predictable, the second step is to create an ML baseline model. This baseline shows us which features are important for prediction and which are not. Generally, ML baseline models are used together with feature engineering.

In [None]:
# Explicit imports (instead of `import *`) covering every PyCaret helper the
# Titanic sections call.
from pycaret.classification import (
    compare_models,
    create_model,
    predict_model,
    pull,
    setup,
)

# Column roles for the Titanic data.
CAT_FEATURES = ['Sex', 'Embarked']
NUM_FEATURES = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
IGN_FEATURES = ['PassengerId', 'Name', 'Ticket', 'Cabin']  # identifiers / raw text, not modelled here

# Initialise the baseline experiment on the raw Titanic columns.
clf = setup(
    data=titanic_train,
    target='Survived',
    normalize=True,                     # normalisation helps some algorithms
    normalize_method='robust',          # resilient to outliers
    transformation=True,                # reshapes the *feature* distributions (not the target)
    transformation_method='quantile',
    # Keep rows in file order when carving out the hold-out split.
    # NOTE(review): the Titanic rows have no obvious time order, so the usual
    # "don't train on the future" motivation may not apply -- confirm intent.
    data_split_shuffle=False,
    create_clusters=True,
    feature_interaction=True,
    categorical_features=CAT_FEATURES,
    numeric_features=NUM_FEATURES,
    ignore_features=IGN_FEATURES,
    session_id=42,                      # seed for reproducible runs
    use_gpu=False,
    silent=True,                        # do not stop for the dtype-confirmation prompt
    fold=10,
    n_jobs=-1,
)

In [None]:
# Compare the available classifiers in the current experiment, ranked by
# accuracy.
compare_models(sort='Accuracy')

In [None]:
# Fit the 'rf' estimator as the machine-learning baseline model.
baseline_model = create_model('rf')

# Score the baseline model; raw_score=True requests per-class scores.
# NOTE(review): with no data argument this presumably predicts on PyCaret's
# hold-out split -- confirm.
baseline_preds = predict_model(baseline_model, raw_score=True)
baseline_preds

In [None]:
# Keep PyCaret's score grid as the baseline reference for the later plot.
# NOTE(review): pull() presumably returns whichever grid was displayed last,
# so this cell has to run directly after the predict_model cell -- confirm.
baseline_scores = pull()
baseline_scores

## Feature Engineering

In [None]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

SVD_RANDOM_STATE = 42  # fixed so the SVD components are identical on every run


def _add_tfidf_svd_features(frame, column, prefix, n_components, **tfidf_kwargs):
    """Append TF-IDF + truncated-SVD components of a text column to ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame that receives the new columns (modified in place).
    column : str
        Name of the text column to vectorise.
    prefix : str
        New columns are named ``<prefix>_0``, ``<prefix>_1``, ...
    n_components : int
        Number of SVD components to keep.
    **tfidf_kwargs
        Forwarded unchanged to ``TfidfVectorizer``.

    Returns
    -------
    list of str
        Names of the added columns, in creation order.
    """
    tfidf_matrix = TfidfVectorizer(**tfidf_kwargs).fit_transform(frame[column])
    svd = TruncatedSVD(n_components=n_components, random_state=SVD_RANDOM_STATE)
    components = svd.fit_transform(tfidf_matrix)

    added_columns = []
    for i in range(components.shape[1]):
        column_name = prefix + '_' + str(i)
        frame[column_name] = components[:, i]
        added_columns.append(column_name)
    return added_columns


def _parse_cabin(cabin):
    """Turn a raw ``Cabin`` value into ``(prefix, last_number, piece_count)``.

    The value is split around every run of digits (the digit runs are kept):
    ``prefix`` is the text before the first number, ``last_number`` is the
    final digit run as an int, and ``piece_count`` is the length of the split
    result. Missing (non-string) values and cabins without any digits yield
    ``('None', 0, 0)``.
    """
    if not isinstance(cabin, str):
        # Missing cabins arrive as NaN (a float), which cannot be split.
        return 'None', 0, 0
    pieces = re.split(r'(\d+)', cabin)
    if len(pieces) <= 1:
        return 'None', 0, 0
    # With a capturing group the pieces alternate text/digits/text, so the
    # second-to-last piece is always the final digit run.
    return pieces[0], int(pieces[-2]), len(pieces)


titanic_train_FeaEng = titanic_train.copy()

# Name
# Names look like "Surname, Title. Given names". Take the text between the
# comma and the first period so that multi-word surnames ("Vander Planke, Mrs.
# ...") still produce the title rather than a fragment of the surname. Names
# that do not match the pattern are left missing.
titanic_train_FeaEng['Title'] = (
    titanic_train_FeaEng['Name']
    .str.extract(r',\s*([^.]+\.)', expand=False)
    .str.strip()
)
titanic_train_FeaEng['Name_len'] = titanic_train_FeaEng['Name'].str.len()

name_component_cols = _add_tfidf_svd_features(
    titanic_train_FeaEng, 'Name', 'Name', n_components=10,
    max_features=15, token_pattern=r"\w+",
)

# Cabin
cabin_parts = [_parse_cabin(cabin) for cabin in titanic_train_FeaEng['Cabin']]
titanic_train_FeaEng['Cabin_First'] = [parts[0] for parts in cabin_parts]
titanic_train_FeaEng['Cabin_Last'] = [parts[1] for parts in cabin_parts]
titanic_train_FeaEng['Cabin_Len'] = [parts[2] for parts in cabin_parts]

# Ticket
ticket_component_cols = _add_tfidf_svd_features(
    titanic_train_FeaEng, 'Ticket', 'Ticket', n_components=3,
    max_features=5, analyzer="char",
)

# Engineered columns, in the order they are evaluated later, and the original
# columns (including the target) that every evaluation keeps.
new_features = (
    ['Title', 'Name_len']
    + name_component_cols
    + ['Cabin_First', 'Cabin_Last', 'Cabin_Len']
    + ticket_component_cols
)
old_features = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp',
                'Parch', 'Fare', 'Embarked']

## Feature Importance

In [None]:
# Score each engineered feature on its own: add it to the original columns,
# rebuild the experiment, cross-validate a random forest and keep the
# resulting score grid under the feature's name.
# NOTE(review): unlike the baseline setup, no categorical_features /
# numeric_features are passed here, so PyCaret infers the column types itself;
# confirm that the comparison against the baseline is still like-for-like.
feature_score_dict = {}

for feature in new_features:
    candidate_columns = old_features + [feature]
    candidate_data = titanic_train_FeaEng[candidate_columns].copy()

    setup(
        data=candidate_data,
        target='Survived',
        normalize=True,                     # normalisation helps some algorithms
        normalize_method='robust',          # resilient to outliers
        transformation=True,                # reshapes the *feature* distributions (not the target)
        transformation_method='quantile',
        data_split_shuffle=False,           # keep rows in file order for the hold-out split
        create_clusters=True,
        feature_interaction=True,
        session_id=42,
        use_gpu=False,
        silent=True,
        fold=10,
        n_jobs=-1,
    )

    # The fitted model itself is not needed, only the score grid it produces.
    # Not assigning it also leaves the notebook-level `baseline_model` intact.
    create_model('rf')
    feature_score_dict[feature] = pull()

In [None]:
# Flatten the per-feature score grids into one long table with a row per
# (feature, metric) pair, using the 'Mean' row of every grid.
metric_list, feature_list, score_list = [], [], []

for feature_name, score_grid in feature_score_dict.items():
    metric_names = list(score_grid.columns)
    metric_list += metric_names
    score_list += list(score_grid.loc['Mean', :])
    feature_list += [feature_name] * len(metric_names)

all_scores_pd = pd.DataFrame({
    'Metric': metric_list,
    'Feature': feature_list,
    'Score': score_list,
})

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Metrics to draw, each with a fixed colour so a feature curve and its
# baseline line match.
col_list = ['Accuracy', 'AUC', 'Recall', 'Prec.', 'F1', 'Kappa']
score_color = {'Accuracy':'C0', 'AUC':'C1', 'Recall':'C2', 'Prec.':'C3', 'F1':'C4', 'Kappa':'C5'}

# Select the plotted metrics into a new frame: `all_scores_pd` stays untouched
# and any metric without a palette entry (e.g. 'MCC') cannot reach seaborn.
plot_scores = all_scores_pd.loc[all_scores_pd['Metric'].isin(col_list), :]
n_plotted_features = plot_scores['Feature'].nunique()

fig, ax = plt.subplots(figsize=(24, 8))
sns.lineplot(data=plot_scores, x='Feature', y='Score', hue='Metric', ax=ax, palette=score_color)
ax.legend(loc="lower left")
ax.set_title('Scores after adding one engineered feature vs. baseline')
ax.set_xlabel('Added feature')
ax.set_ylabel('Score')

# Horizontal reference line per metric, taken from the first row of the
# baseline score grid.
# NOTE(review): the feature curves use the 'Mean' row of create_model grids,
# while baseline_scores is pulled after predict_model -- confirm both are
# measured on the same basis.
for base_col in col_list:
    base_score = baseline_scores.loc[:, base_col].values[0]
    ax.plot([0, n_plotted_features], [base_score, base_score], color=score_color[base_col])
    ax.text(n_plotted_features, base_score, 'Base ' + base_col)

# Automated Machine Learning Baseline Models
The final baseline model is the automated ML (AutoML) baseline model. It is a very good benchmark for your own ML model: if your model beats the automated baseline, that is a very strong sign that it can become a product.

## LightAutoML

In [None]:
%%capture
# Install (or upgrade to) the latest LightAutoML; %%capture hides pip's output.
# NOTE(review): -U without a version pin means reruns may pick up a different
# release -- consider pinning for reproducibility.
!pip install -U lightautoml

In [None]:
# Imports from our package
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.dataset.roles import DatetimeRole
from lightautoml.tasks import Task

import torch

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Run configuration for the AutoML experiments.
RANDOM_STATE = 42  # fixed random state
N_THREADS = 4      # thread count for LightGBM and linear models
N_FOLDS = 5        # fold count intended for AutoML
TEST_SIZE = 0.2    # test fraction intended for metric checks
TIMEOUT = 300      # AutoML time budget, in seconds

# Apply the global settings.
torch.set_num_threads(N_THREADS)
np.random.seed(RANDOM_STATE)

def acc_score(y_true, y_pred, **kwargs):
    """Accuracy of scores after thresholding them at 0.5.

    ``y_pred`` must support ``>`` and ``.astype`` (e.g. a NumPy array);
    values above 0.5 become class 1, everything else class 0. Extra keyword
    arguments are forwarded to ``accuracy_score``.
    """
    hard_labels = (y_pred > 0.5).astype(int)
    return accuracy_score(y_true, hard_labels, **kwargs)

def f1_metric(y_true, y_pred, **kwargs):
    """F1 score of scores after thresholding them at 0.5.

    ``y_pred`` must support ``>`` and ``.astype`` (e.g. a NumPy array);
    values above 0.5 become class 1, everything else class 0. Extra keyword
    arguments are forwarded to ``f1_score``.
    """
    hard_labels = (y_pred > 0.5).astype(int)
    return f1_score(y_true, hard_labels, **kwargs)

# Binary classification task scored with the thresholded accuracy metric.
task = Task('binary', metric = acc_score)

# Column roles handed to LightAutoML.
roles = {
    'target': 'Survived',
    # Identifier / raw-text columns excluded from modelling. The id column is
    # spelled 'PassengerId' in the Titanic data; the previous 'Passengerid'
    # did not match it, so the id was never dropped.
    'drop': ['PassengerId', 'Name', 'Ticket'],
}

In [None]:
%%time 
from sklearn.model_selection import StratifiedKFold

# Stratified 3-fold evaluation of LightAutoML. `n_fold` and `skf` are also
# used by the H2O section further down.
n_fold = 3
skf = StratifiedKFold(n_splits=n_fold)

acc_list = []
for train_index, test_index in skf.split(titanic_train, titanic_train['Survived']):
    # split() yields positional indices, so rows are selected with iloc.
    X_train = titanic_train.iloc[train_index]
    fold_test = titanic_train.iloc[test_index]
    y = fold_test['Survived']
    # Build the test features as a new frame instead of dropping in place on
    # a slice of titanic_train (which triggers SettingWithCopyWarning).
    X_test = fold_test.drop(columns=['Survived'])

    automl = TabularUtilizedAutoML(task = task,
                       timeout = TIMEOUT,
                       cpu_limit = N_THREADS,
                       general_params = {'use_algos': [['linear_l2', 'lgb', 'lgb_tuned']]},
                       reader_params = {'n_jobs': N_THREADS})
    # The returned training-fold predictions are not used here.
    automl.fit_predict(X_train, roles = roles)

    # Threshold the first prediction column at 0.5 to get 0/1 labels.
    test_pred = automl.predict(X_test)
    test_pred = (test_pred.data[:, 0] > 0.5).astype(int)
    acc_list.append(acc_score(y, test_pred))

# Mean accuracy over the folds that were actually evaluated.
lightautoml_acc_score = sum(acc_list) / len(acc_list)
print('lightautoml_acc_score: ', lightautoml_acc_score)

In [None]:
# Show the mean LightAutoML fold accuracy again in its own cell.
print('lightautoml_acc_score: ', lightautoml_acc_score)

## H2O AutoML

In [None]:
import h2o
from h2o.automl import H2OAutoML

In [None]:
# Initialise H2O with its default settings before any H2OFrame is created.
h2o.init()

In [None]:
%%time
# Evaluate H2O AutoML on the same stratified folds as LightAutoML; `skf` is
# the StratifiedKFold object created in the LightAutoML section.
# NOTE(review): 'Survived' is passed as a plain integer column and the
# 'predict' output is thresholded at 0.5 below, which suggests H2O treats this
# as regression -- confirm whether the target should be converted to a factor.
acc_list = []
for train_index, test_index in skf.split(titanic_train, titanic_train['Survived']):
    # split() yields positional indices, so rows are selected with iloc.
    X_train = titanic_train.iloc[train_index]
    fold_test = titanic_train.iloc[test_index]
    y = fold_test['Survived'].astype(int)
    # New frame instead of an in-place drop on a slice of titanic_train.
    X_test = fold_test.drop(columns=['Survived'])

    train_hf = h2o.H2OFrame(X_train)
    test_hf = h2o.H2OFrame(X_test)
    # Every column except the target and the passenger id is a predictor.
    feature_columns = X_train.drop(columns=['Survived', 'PassengerId']).columns

    aml = H2OAutoML(
        seed=2022,
        max_runtime_secs=100,
        nfolds = 3,
        exclude_algos = ["DeepLearning"]
    )

    aml.train(
        x=list(feature_columns),
        y='Survived',
        training_frame=train_hf
    )

    # Threshold the 'predict' column (as a Series) at 0.5 to get 0/1 labels.
    test_pred = aml.predict(test_hf).as_data_frame()
    y_pred = (test_pred['predict'] > 0.5).astype(int)
    h2o_acc_score = accuracy_score(y, y_pred)
    acc_list.append(h2o_acc_score)

# Mean accuracy over the folds that were actually evaluated.
h2o_tautoml_acc_score = sum(acc_list) / len(acc_list)
print('h2o_tautoml_acc_score: ', h2o_tautoml_acc_score)

In [None]:
# Show the mean H2O AutoML fold accuracy again in its own cell.
print('h2o_tautoml_acc_score: ', h2o_tautoml_acc_score)

##  AutoML Scores

In [None]:
# Draw each AutoML baseline accuracy as a labelled horizontal line. The lines
# get different colours so the two baselines can be told apart.
automl_baselines = [
    ('Base_H2O', h2o_tautoml_acc_score, 'r'),
    ('Base_LightAutoMl', lightautoml_acc_score, 'b'),
]

fig, ax = plt.subplots(figsize=(24, 8))
for baseline_label, baseline_score, line_color in automl_baselines:
    ax.plot([0, 10], [baseline_score, baseline_score], color=line_color, label=baseline_label)
    ax.text(10, baseline_score, baseline_label)

ax.set_title('AutoML baseline accuracy')
ax.set_ylabel('Accuracy')
ax.legend(loc='lower left');