# Pointwise Regression

In this notebook, we examine the pointwise regression approach, including tuning. First, we load the required dependencies and the data.

In [1]:
import os
import sys
# Resolve the directory two levels above the current working directory and
# append it to the module search path (only if it is not already there), so
# that the `src` imports further down can be resolved.
# NOTE(review): presumably '../..' is the repository root — confirm against the project layout.
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
from category_encoders.binary import BinaryEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from datetime import timedelta
from sklearn.compose import ColumnTransformer
from time import time

from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType
from src.pipeline.pipeline_transformers import *


# Load the training data for the pointwise experiments.
# NOTE(review): the exact schema is determined by `config.load_traindata_for_pointwise` — confirm there.
train_df = config.load_traindata_for_pointwise()
# Single factory instance; the experiment cells in this notebook build their pipelines from it.
pipelineFactory = PipelineFactory()

## Pointwise Regression without Target Normalization

In [2]:
# Baseline experiment: POINTWISE_REGRESSION_NO_SEARCH model type on the
# "rank" target, evaluated via cross-validation with n_folds=5 and workers=1.
start = time()

pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_REGRESSION_NO_SEARCH,
    verbose_level=1,
    evaluation=EvaluationType.CROSS_VALIDATION,
    n_folds=5,
    workers=1,
    target="rank",
)
pipeline.run()

# Report the elapsed wall-clock time as H:MM:SS and as whole seconds.
runtime = int(time() - start)
print(f'\nruntime: {timedelta(seconds=runtime)} [{runtime}s]')

Creating pipeline ...
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:51<00:00, 22.38s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.841
    validation_average_spearman_fold_1: 0.8287
    validation_average_spearman_fold_2: 0.8276
    validation_average_spearman_fold_3: 0.8505
    validation_average_spearman_fold_4: 0.8412
    average of all folds: 0.8378 [std=0.0086]

runtime: 0:02:18 [138s]





The performance is quite good, with an average Spearman score of about 0.84 (0.8378) across the five folds. Now, let us try this approach with target normalization, which is supposed to improve performance.

## Pointwise Regression with Target Normalization

In [5]:
# Same cross-validation setup (n_folds=5, workers=1, target "rank"), but using
# the POINTWISE_NORMALIZED_REGRESSION_NO_SEARCH model type.
start = time()

pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_NORMALIZED_REGRESSION_NO_SEARCH,
    verbose_level=1,
    evaluation=EvaluationType.CROSS_VALIDATION,
    n_folds=5,
    workers=1,
    target="rank",
)
pipeline.run()

# Report the elapsed wall-clock time as H:MM:SS and as whole seconds.
runtime = int(time() - start)
print(f'\nruntime: {timedelta(seconds=runtime)} [{runtime}s]')

Creating pipeline ...
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [02:02<00:00, 24.53s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.8825
    validation_average_spearman_fold_1: 0.8695
    validation_average_spearman_fold_2: 0.8601
    validation_average_spearman_fold_3: 0.8819
    validation_average_spearman_fold_4: 0.8696
    average of all folds: 0.8727 [std=0.0084]

runtime: 0:02:34 [154s]





In [6]:
# Grid search on the normalized-regression pipeline: the only parameter varied
# is `encoder_transformer__batch_size`, and each candidate is scored with n_folds=2.
start = time()

param_grid = {"encoder_transformer__batch_size": [50, 500]}

pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_NORMALIZED_REGRESSION_NO_SEARCH,
    verbose_level=1,
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=param_grid,
    n_folds=2,
    workers=1,
    target="rank",
)
pipeline.run()

# Report the elapsed wall-clock time as H:MM:SS and as whole seconds.
runtime = int(time() - start)
print(f'\nruntime: {timedelta(seconds=runtime)} [{runtime}s]')

Creating pipeline ...
Starting pipeline using method: EvaluationType.GRID_SEARCH
Performing grid search


  0%|                                                                                                                                  | 0/2 [00:00<?, ?it/s]
  0%|                                                                                                                                  | 0/2 [00:00<?, ?it/s][A
 50%|█████████████████████████████████████████████████████████████                                                             | 1/2 [00:00<00:00,  8.43it/s][A
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  6.89it/s][A
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
 50%|█████████████████████████████████████████████████████████████                                                             | 1/2 [00:48<00:48, 48.23s/it]
  0%|                                                                              

Finished running the pipeline
Evaluation metrics:
    best_score: 0.8769
    best_params: {'encoder_transformer__batch_size': 500}

runtime: 0:01:40 [100s]





### Tuning with Bayes Search

In [7]:
start = time()

# Search budget. The number of optimization rounds is n_iter / n_points,
# i.e. 200 / 4 = 50 rounds with the values below.
n_iter = 200   # unique parameter settings to examine in total (our default: 200)
n_points = 4   # unique parameter combinations proposed per round (our default: 4)
cv = 4         # fits per unique parameter combination (our default: 4)
n_jobs = -1    # fits run in parallel; only parallelizable within a round (our default: -1)

pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_NORMALIZED_REGRESSION_BAYES_SEARCH,
    verbose_level=1,
    target="rank",
    bayes_n_iter=n_iter,
    bayes_n_points=n_points,
    bayes_cv=cv,
    bayes_n_jobs=n_jobs,
)
pipeline.run()

# Report the elapsed wall-clock time as H:MM:SS and as whole seconds.
runtime = int(time() - start)
print(f'\nruntime: {timedelta(seconds=runtime)} [{runtime}s]')

Creating pipeline ...
Starting pipeline using method: EvaluationType.BAYES_SEARCH
Performing bayes search
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16



Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits




Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits




Fitting 4 folds for each of 4 candidates, totalling 16 fits




Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits




Fitting 4 folds for each of 4 candidates, totalling 16 fits
Fitting 4 folds for each of 4 candidates, totalling 16 fits
best score: 0.8747597359723542
best params:
    dataset_transformer__encoder: OneHotEncoder()
    dataset_transformer__expected_pca_variance: 1.0
    dataset_transformer__nan_ratio_feature_drop_threshold: 0.5
    encoder_transformer__encoder: None
    estimator__max_depth: 50
    estimator__max_features: None
    estimator__min_samples_leaf: 1
    estimator__n_estimators: 200
    general_transformer__model_encoder: OneHotEncoder()
    general_transformer__scoring_encoder: OneHotEncoder()
    general_transformer__tuning_encoder: OneHotEncoder()
Training pipeline with best parameters...


  self._final_estimator.fit(Xt, y, **fit_params_last_step)


Evaluating pipeline with best parameters...


  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  self._final_estimator.fit(Xt, y, **fit_params_last_step)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [11:19<00:00, 135.88s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.8867 [std=0.]
    validation_average_spearman_fold_1: 0.8822 [std=0.]
    validation_average_spearman_fold_2: 0.8688 [std=0.]
    validation_average_spearman_fold_3: 0.8848 [std=0.]
    validation_average_spearman_fold_4: 0.8811 [std=0.]
    average_spearman (5-fold): 0.8807 [std=0.0063]

runtime: 4:51:24 [17484s]



