# Pointwise Methods - Baselines

In this notebook, we compare the pointwise methods using a baseline preprocessing that simply one-hot encodes every feature. First, we load the required dependencies and the data.

In [1]:
import os
import sys
# Put the parent of the current working directory on the import path (once),
# so that the `src` package imported below can be resolved.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
from category_encoders.binary import BinaryEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from datetime import timedelta
from sklearn.compose import ColumnTransformer
from time import time

from src import configuration as config
from src.features.encoder_utils import NoY
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType
from src.pipeline.pipeline_transformers import *


# Load the training data for the pointwise experiments through the project's configuration module.
train_df = config.load_traindata_for_pointwise()
# One factory instance; every experiment cell below creates its pipeline from it.
pipelineFactory = PipelineFactory()

In [2]:
def set_baseline_steps(pipeline):
    """Reset ``pipeline`` to the baseline preprocessing used in this notebook.

    All existing steps are cleared, then two steps are registered in order:

    * ``'keeper'``   -- a ``ColumnKeeper`` for the four baseline feature columns.
    * ``'baseline'`` -- a ``OneHotEncoder`` over those same columns, wrapped in ``NoY``.

    Parameters
    ----------
    pipeline
        Pipeline object as returned by ``PipelineFactory.create_pipeline``.
        It is modified in place through ``clear_steps`` and ``add_new_step``.
    """
    baseline_columns = ['dataset', 'model', 'tuning', 'scoring']

    pipeline.clear_steps()
    # Each step receives its own copy of the column list, as in the original code
    # where two separate list literals were used.
    pipeline.add_new_step(ColumnKeeper(list(baseline_columns)), 'keeper')
    # `cols` has to be passed by keyword: the first positional parameter of
    # category_encoders' OneHotEncoder is `verbose`, so a positional list would
    # leave `cols=None` and let the encoder choose the columns by dtype instead
    # of encoding exactly the four baseline features.
    pipeline.add_new_step(NoY(OneHotEncoder(cols=list(baseline_columns))), 'baseline')

## Pointwise Regression

In [3]:
# Baseline run: pointwise regression, evaluated with 5-fold cross-validation.
start = time()

pipeline = pipelineFactory.create_pipeline(
    train_df, ModelType.POINTWISE_REGRESSION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.CROSS_VALIDATION, n_folds=5,
    workers=1, verbose_level=1,
)
set_baseline_steps(pipeline)
print(pipeline.get_pipeline().steps)
pipeline.run()

# Wall-clock duration of this cell, truncated to whole seconds.
runtime = int(time() - start)
print(f'\nruntime: {timedelta(seconds=runtime)} [{runtime}s]')

Creating pipeline ...
[('keeper', ColumnKeeper(columns=['dataset', 'model', 'tuning', 'scoring'])), ('baseline', <src.features.encoder_utils.NoY object at 0x0000020D959964F0>), ('estimator', RandomForestRegressor())]
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:14<00:00,  2.85s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.5633
    validation_average_spearman_fold_1: 0.5459
    validation_average_spearman_fold_2: 0.5594
    validation_average_spearman_fold_3: 0.5653
    validation_average_spearman_fold_4: 0.58
    average of all folds: 0.5628 [std=0.011]

runtime: 0:00:17 [17s]





In [4]:
# Baseline run: pointwise normalized regression, evaluated with 5-fold cross-validation.
start = time()

pipeline = pipelineFactory.create_pipeline(
    train_df, ModelType.POINTWISE_NORMALIZED_REGRESSION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.CROSS_VALIDATION, n_folds=5,
    workers=1, verbose_level=1,
)
set_baseline_steps(pipeline)
print(pipeline.get_pipeline().steps)
pipeline.run()

# Wall-clock duration of this cell, truncated to whole seconds.
runtime = int(time() - start)
print(f'\nruntime: {timedelta(seconds=runtime)} [{runtime}s]')

Creating pipeline ...
[('keeper', ColumnKeeper(columns=['dataset', 'model', 'tuning', 'scoring'])), ('baseline', <src.features.encoder_utils.NoY object at 0x0000020D95968E50>), ('estimator', RandomForestRegressor())]
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:19<00:00,  3.82s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.5887
    validation_average_spearman_fold_1: 0.5899
    validation_average_spearman_fold_2: 0.5934
    validation_average_spearman_fold_3: 0.6011
    validation_average_spearman_fold_4: 0.6055
    average of all folds: 0.5957 [std=0.0065]

runtime: 0:00:27 [27s]





## Pointwise Classification

In [4]:
# Baseline run: pointwise classification, evaluated with 5-fold cross-validation.
# Unlike the other experiment cells, this one requests 16 workers.
start = time()

pipeline = pipelineFactory.create_pipeline(
    train_df, ModelType.POINTWISE_CLASSIFICATION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.CROSS_VALIDATION, n_folds=5,
    workers=16, verbose_level=1,
)
set_baseline_steps(pipeline)
print(pipeline.get_pipeline().steps)
pipeline.run()

# Wall-clock duration of this cell, truncated to whole seconds.
runtime = int(time() - start)
print(f'\nruntime: {timedelta(seconds=runtime)} [{runtime}s]')

Creating pipeline ...
[('keeper', ColumnKeeper(columns=['dataset', 'model', 'tuning', 'scoring'])), ('baseline', <src.features.encoder_utils.NoY object at 0x0000023F745CEBE0>), ('estimator', DecisionTreeClassifier())]
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.60it/s]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.3308
    validation_average_spearman_fold_1: 0.2648
    validation_average_spearman_fold_2: 0.3079
    validation_average_spearman_fold_3: 0.2905
    validation_average_spearman_fold_4: 0.3364
    average of all folds: 0.3061 [std=0.0264]

runtime: 0:00:03 [3s]





## Pointwise Ordinal Regression

In [3]:
# Baseline run: pointwise ordinal regression, evaluated with 5-fold cross-validation.
start = time()

pipeline = pipelineFactory.create_pipeline(
    train_df, ModelType.POINTWISE_ORDINAL_REGRESSION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.CROSS_VALIDATION, n_folds=5,
    workers=1, verbose_level=1,
)
set_baseline_steps(pipeline)
print(pipeline.get_pipeline().steps)
pipeline.run()

# Wall-clock duration of this cell, truncated to whole seconds.
runtime = int(time() - start)
print(f'\nruntime: {timedelta(seconds=runtime)} [{runtime}s]')

Creating pipeline ...
[('keeper', ColumnKeeper(columns=['dataset', 'model', 'tuning', 'scoring'])), ('baseline', <src.features.encoder_utils.NoY object at 0x0000023F60F7B670>), ('estimator', DecisionTreeClassifier())]
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:11<00:00,  2.38s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.5433
    validation_average_spearman_fold_1: 0.5142
    validation_average_spearman_fold_2: 0.5455
    validation_average_spearman_fold_3: 0.5606
    validation_average_spearman_fold_4: 0.5446
    average of all folds: 0.5416 [std=0.0151]

runtime: 0:00:12 [12s]



