# Pointwise Methods

In this notebook, we compare pointwise methods and apply search methods to find the best performing one. First, we load the required dependencies and the data.

In [1]:
import os
import sys
# Put the parent of the current working directory on the import path
# (presumably the repository root that contains the `src` package imported
# below -- confirm against the project layout). The membership check keeps
# re-running this cell from appending the same entry twice.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
from category_encoders.binary import BinaryEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType
from src.pipeline.pipeline_transformers import *
from sklearn.compose import ColumnTransformer


# load the data
# `train_df` is the training frame handed to every `create_pipeline` call in
# the cells below.
train_df = config.load_traindata_for_pointwise()
# One factory instance is shared by all cells below; each cell asks it for a
# fresh pipeline.
pipelineFactory = PipelineFactory()

## Pointwise Regression

In [9]:
# Baseline run for the regression model type: default pipeline steps,
# evaluated with cross-validation over 5 folds on the "rank" target.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_REGRESSION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.CROSS_VALIDATION,
    n_folds=5,
    workers=1,
    verbose_level=1,
)

# Print the assembled steps so the configuration that produced the scores
# below is recorded next to them.
print(pipeline.get_pipeline().named_steps)

pipeline.run()

Creating pipeline ...
{'keeper': ColumnKeeper(columns=['dataset', 'model', 'tuning', 'scoring', 'encoder']), 'encoder_transformer': PoincareEmbedding(batch_size=50, encoder=OneHotEncoder(), epochs=500,
                  graph=<networkx.classes.graph.Graph object at 0x00000286AC3EF610>,
                  size=3), 'dataset_transformer': OpenMLMetaFeatureTransformer(encoder=None, expected_pca_variance=0.6,
                             nan_ratio_feature_drop_threshold=0.25), 'general_transformer': GeneralPurposeEncoderTransformer(model_encoder=OneHotEncoder(),
                                 scoring_encoder=TargetEncoder(),
                                 tuning_encoder=TargetEncoder()), 'estimator': DecisionTreeRegressor()}
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


100%|██████████| 5/5 [00:30<00:00,  6.14s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.7653
    validation_average_spearman_fold_1: 0.7621
    validation_average_spearman_fold_2: 0.7324
    validation_average_spearman_fold_3: 0.7901
    validation_average_spearman_fold_4: 0.7757
    average of all folds: 0.7651 [std=0.0191]





In [None]:
# Only the batch size of the `encoder_transformer` step is varied here.
param_grid = {"encoder_transformer__batch_size": [50, 500]}

# Same regression model type as the baseline, but evaluated through a grid
# search over `param_grid` with 2 folds.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_REGRESSION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=param_grid,
    n_folds=2,
    workers=1,
    verbose_level=1,
)

pipeline.run()

## Pointwise Classification

In [8]:
# Baseline run for the classification model type: the plain pipeline without
# parameter tuning, evaluated with cross-validation over 5 folds.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_CLASSIFICATION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.CROSS_VALIDATION,
    n_folds=5,
    workers=16,
    verbose_level=1,
)

# Print the assembled steps so the configuration that produced the scores
# below is recorded next to them.
print(pipeline.get_pipeline().named_steps)

pipeline.run()

Creating pipeline ...
{'keeper': ColumnKeeper(columns=['dataset', 'model', 'tuning', 'scoring', 'encoder']), 'encoder_transformer': PoincareEmbedding(batch_size=50, encoder=OneHotEncoder(), epochs=500,
                  graph=<networkx.classes.graph.Graph object at 0x00000286A934FA90>,
                  size=3), 'dataset_transformer': OpenMLMetaFeatureTransformer(encoder=None, expected_pca_variance=0.6,
                             nan_ratio_feature_drop_threshold=0.25), 'general_transformer': GeneralPurposeEncoderTransformer(model_encoder=OneHotEncoder(),
                                 scoring_encoder=TargetEncoder(),
                                 tuning_encoder=TargetEncoder()), 'estimator': DecisionTreeClassifier()}
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


100%|██████████| 5/5 [00:30<00:00,  6.11s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.7669
    validation_average_spearman_fold_1: 0.7061
    validation_average_spearman_fold_2: 0.7036
    validation_average_spearman_fold_3: 0.7566
    validation_average_spearman_fold_4: 0.7458
    average of all folds: 0.7358 [std=0.0262]





We get a good average Spearman score above 0.7 (0.7358 averaged over the five folds). Let's try to optimize it with a grid search.

In [12]:
# Search over the embedding batch size and, for each of the three categorical
# columns handled by `general_transformer`, over four candidate encoders.
# The comprehension builds a separate list of fresh encoder instances per key.
# Grid size: 2 * 4 * 4 * 4 = 128 combinations.
param_grid = {
    "encoder_transformer__batch_size": [50, 500],
    **{
        encoder_param: [BinaryEncoder(), OneHotEncoder(), OrdinalEncoder(), TargetEncoder()]
        for encoder_param in (
            "general_transformer__model_encoder",
            "general_transformer__tuning_encoder",
            "general_transformer__scoring_encoder",
        )
    },
}

pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_CLASSIFICATION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=param_grid,
    n_folds=2,
    workers=1,
    verbose_level=1,
)

pipeline.run()

Creating pipeline ...
Starting pipeline using method: EvaluationType.GRID_SEARCH
Performing grid search


100%|██████████| 128/128 [25:14<00:00, 11.83s/it]

Finished running the pipeline
Evaluation metrics:
    best_score: 0.7573
    best_params: {'encoder_transformer__batch_size': 500, 'general_transformer__model_encoder': OrdinalEncoder(cols=['model'],
               mapping=[{'col': 'model', 'data_type': dtype('O'),
                         'mapping': LR       1
SVC      2
LGBMC    3
DTC      4
KNC      5
NaN     -2
dtype: int64}]), 'general_transformer__tuning_encoder': OrdinalEncoder(cols=['tuning'],
               mapping=[{'col': 'tuning', 'data_type': dtype('O'),
                         'mapping': model    1
no       2
full     3
NaN     -2
dtype: int64}]), 'general_transformer__scoring_encoder': OrdinalEncoder(cols=['scoring'],
               mapping=[{'col': 'scoring', 'data_type': dtype('O'),
                         'mapping': ACC    1
AUC    2
F1     3
NaN   -2
dtype: int64}])}





The best parameters for this run are:  
batch_size: 500  
model_encoder: OrdinalEncoder  
tuning_encoder: OrdinalEncoder  
scoring_encoder: OrdinalEncoder

In [15]:
# Follow-up to the previous search: all three encoders are pinned to
# OrdinalEncoder (the previous winner) and only larger batch sizes are tried.
# Grid size: 3 combinations.
param_grid = {
    "encoder_transformer__batch_size": [700, 800, 900],
    **{
        encoder_param: [OrdinalEncoder()]
        for encoder_param in (
            "general_transformer__model_encoder",
            "general_transformer__tuning_encoder",
            "general_transformer__scoring_encoder",
        )
    },
}

pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_CLASSIFICATION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=param_grid,
    n_folds=2,
    workers=1,
    verbose_level=1,
)

pipeline.run()

Creating pipeline ...
Starting pipeline using method: EvaluationType.GRID_SEARCH
Performing grid search


100%|██████████| 3/3 [00:35<00:00, 11.96s/it]

Finished running the pipeline
Evaluation metrics:
    best_score: 0.7609
    best_params: {'encoder_transformer__batch_size': 800, 'general_transformer__model_encoder': OrdinalEncoder(cols=['model'],
               mapping=[{'col': 'model', 'data_type': dtype('O'),
                         'mapping': LR       1
SVC      2
LGBMC    3
DTC      4
KNC      5
NaN     -2
dtype: int64}]), 'general_transformer__tuning_encoder': OrdinalEncoder(cols=['tuning'],
               mapping=[{'col': 'tuning', 'data_type': dtype('O'),
                         'mapping': model    1
no       2
full     3
NaN     -2
dtype: int64}]), 'general_transformer__scoring_encoder': OrdinalEncoder(cols=['scoring'],
               mapping=[{'col': 'scoring', 'data_type': dtype('O'),
                         'mapping': ACC    1
AUC    2
F1     3
NaN   -2
dtype: int64}])}





In [18]:
# Joint search over the batch size and the decision-tree hyperparameters,
# with all three encoders pinned to OrdinalEncoder.
# Grid size: 3 * 2 * 2 * 1 * 3 * 3 * 3 * 2 = 648 combinations, so this cell
# is expensive to re-run.
param_grid = {
    "encoder_transformer__batch_size": [700, 800, 900],
    **{
        encoder_param: [OrdinalEncoder()]
        for encoder_param in (
            "general_transformer__model_encoder",
            "general_transformer__tuning_encoder",
            "general_transformer__scoring_encoder",
        )
    },
    # Parameters of the final `estimator` step.
    "estimator__criterion": ["gini", "entropy"],
    "estimator__splitter": ["best", "random"],
    "estimator__max_depth": [None],
    "estimator__min_samples_split": [3, 20, 40],
    "estimator__min_samples_leaf": [1, 10, 20],
    "estimator__min_weight_fraction_leaf": [0.0, 0.1, 0.2],
    "estimator__class_weight": [None, "balanced"],
}

pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_CLASSIFICATION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=param_grid,
    n_folds=2,
    workers=1,
    verbose_level=1,
)

pipeline.run()

Creating pipeline ...
Starting pipeline using method: EvaluationType.GRID_SEARCH
Performing grid search


  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return n

Finished running the pipeline
Evaluation metrics:
    best_score: 0.7461
    best_params: {'encoder_transformer__batch_size': 800, 'general_transformer__model_encoder': OrdinalEncoder(cols=['model'],
               mapping=[{'col': 'model', 'data_type': dtype('O'),
                         'mapping': LR       1
SVC      2
LGBMC    3
DTC      4
KNC      5
NaN     -2
dtype: int64}]), 'general_transformer__tuning_encoder': OrdinalEncoder(cols=['tuning'],
               mapping=[{'col': 'tuning', 'data_type': dtype('O'),
                         'mapping': model    1
no       2
full     3
NaN     -2
dtype: int64}]), 'general_transformer__scoring_encoder': OrdinalEncoder(cols=['scoring'],
               mapping=[{'col': 'scoring', 'data_type': dtype('O'),
                         'mapping': ACC    1
AUC    2
F1     3
NaN   -2
dtype: int64}]), 'estimator__criterion': 'entropy', 'estimator__splitter': 'best', 'estimator__max_depth': None, 'estimator__min_samples_split': 3, 'estimator__min_sam




Running the grid search took 127 minutes.  
The best performing parameters are:  
'encoder_transformer__batch_size': 800  
'estimator__criterion': 'entropy'  
'estimator__splitter': 'best'  
'estimator__max_depth': None  
'estimator__min_samples_split': 3  
'estimator__min_samples_leaf': 1  
'estimator__min_weight_fraction_leaf': 0.0  
'estimator__class_weight': None  

In [19]:
# Narrowed search around the previous best parameters: batch size and
# encoders are fixed, and only the split criterion, `min_samples_split` and
# `min_weight_fraction_leaf` still vary.
# Grid size: 2 * 2 * 2 = 8 combinations.
param_grid = {
    "encoder_transformer__batch_size": [800],
    **{
        encoder_param: [OrdinalEncoder()]
        for encoder_param in (
            "general_transformer__model_encoder",
            "general_transformer__tuning_encoder",
            "general_transformer__scoring_encoder",
        )
    },
    # Parameters of the final `estimator` step.
    "estimator__criterion": ["gini", "entropy"],
    "estimator__splitter": ["best"],
    "estimator__max_depth": [None],
    "estimator__min_samples_split": [2, 3],
    "estimator__min_samples_leaf": [1],
    "estimator__min_weight_fraction_leaf": [0.0, 0.5],
    "estimator__class_weight": [None],
}

pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_CLASSIFICATION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=param_grid,
    n_folds=2,
    workers=1,
    verbose_level=1,
)

pipeline.run()

Creating pipeline ...
Starting pipeline using method: EvaluationType.GRID_SEARCH
Performing grid search


  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
100%|██████████| 8/8 [01:33<00:00, 11.72s/it]

Finished running the pipeline
Evaluation metrics:
    best_score: 0.7614
    best_params: {'encoder_transformer__batch_size': 800, 'general_transformer__model_encoder': OrdinalEncoder(cols=['model'],
               mapping=[{'col': 'model', 'data_type': dtype('O'),
                         'mapping': LR       1
SVC      2
LGBMC    3
DTC      4
KNC      5
NaN     -2
dtype: int64}]), 'general_transformer__tuning_encoder': OrdinalEncoder(cols=['tuning'],
               mapping=[{'col': 'tuning', 'data_type': dtype('O'),
                         'mapping': model    1
no       2
full     3
NaN     -2
dtype: int64}]), 'general_transformer__scoring_encoder': OrdinalEncoder(cols=['scoring'],
               mapping=[{'col': 'scoring', 'data_type': dtype('O'),
                         'mapping': ACC    1
AUC    2
F1     3
NaN   -2
dtype: int64}]), 'estimator__criterion': 'entropy', 'estimator__splitter': 'best', 'estimator__max_depth': None, 'estimator__min_samples_split': 2, 'estimator__min_sam




In [21]:
# trying out boosted decision trees
from sklearn.ensemble import AdaBoostClassifier

# The `estimator__*` entries name AdaBoostClassifier parameters; they apply
# to the estimator installed by `change_estimator` below, not to the
# estimator the factory creates first.
# Grid size: 3 * 3 * 2 = 18 combinations.
# NOTE(review): newer scikit-learn releases deprecate the "SAMME.R" option --
# confirm it is still accepted by the installed version before re-running.
param_grid = {
    "encoder_transformer__batch_size": [800],
    **{
        encoder_param: [OrdinalEncoder()]
        for encoder_param in (
            "general_transformer__model_encoder",
            "general_transformer__tuning_encoder",
            "general_transformer__scoring_encoder",
        )
    },
    "estimator__n_estimators": [1, 5, 50],
    "estimator__learning_rate": [0.1, 0.5, 1.0],
    "estimator__algorithm": ["SAMME", "SAMME.R"],
}

pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_CLASSIFICATION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=param_grid,
    n_folds=2,
    workers=1,
    verbose_level=1,
)

# Swap the final estimator for AdaBoost before the search is run.
pipeline.change_estimator(AdaBoostClassifier())

pipeline.run()

Creating pipeline ...
Starting pipeline using method: EvaluationType.GRID_SEARCH
Performing grid search


  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return np.nanmean(list_spearman(rf1, rf2))
  return n

Finished running the pipeline
Evaluation metrics:
    best_score: 0.1499
    best_params: {'encoder_transformer__batch_size': 800, 'general_transformer__model_encoder': OrdinalEncoder(cols=['model'],
               mapping=[{'col': 'model', 'data_type': dtype('O'),
                         'mapping': LR       1
SVC      2
LGBMC    3
DTC      4
KNC      5
NaN     -2
dtype: int64}]), 'general_transformer__tuning_encoder': OrdinalEncoder(cols=['tuning'],
               mapping=[{'col': 'tuning', 'data_type': dtype('O'),
                         'mapping': model    1
no       2
full     3
NaN     -2
dtype: int64}]), 'general_transformer__scoring_encoder': OrdinalEncoder(cols=['scoring'],
               mapping=[{'col': 'scoring', 'data_type': dtype('O'),
                         'mapping': ACC    1
AUC    2
F1     3
NaN   -2
dtype: int64}]), 'estimator__n_estimators': 50, 'estimator__learning_rate': 1.0, 'estimator__algorithm': 'SAMME.R'}





The best score is considerably lower than the one we got with the plain decision tree.

In [25]:
# trying out bagged decision trees
from sklearn.ensemble import RandomForestClassifier

# The `estimator__*` entries name RandomForestClassifier parameters; they
# apply to the estimator installed by `change_estimator` below.
# Grid size: 3 * 2 = 6 combinations.
param_grid = {
    "encoder_transformer__batch_size": [800],
    **{
        encoder_param: [OrdinalEncoder()]
        for encoder_param in (
            "general_transformer__model_encoder",
            "general_transformer__tuning_encoder",
            "general_transformer__scoring_encoder",
        )
    },
    "estimator__n_estimators": [200, 500, 1000],
    "estimator__criterion": ["gini", "entropy"],
}

pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_CLASSIFICATION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=param_grid,
    n_folds=2,
    workers=1,
    verbose_level=1,
)

# Swap the final estimator for a random forest before the search is run.
pipeline.change_estimator(RandomForestClassifier())

pipeline.run()

Creating pipeline ...
Starting pipeline using method: EvaluationType.GRID_SEARCH
Performing grid search


100%|██████████| 6/6 [14:51<00:00, 148.53s/it]

Finished running the pipeline
Evaluation metrics:
    best_score: 0.7113
    best_params: {'encoder_transformer__batch_size': 800, 'general_transformer__model_encoder': OrdinalEncoder(cols=['model'],
               mapping=[{'col': 'model', 'data_type': dtype('O'),
                         'mapping': LR       1
SVC      2
LGBMC    3
DTC      4
KNC      5
NaN     -2
dtype: int64}]), 'general_transformer__tuning_encoder': OrdinalEncoder(cols=['tuning'],
               mapping=[{'col': 'tuning', 'data_type': dtype('O'),
                         'mapping': model    1
no       2
full     3
NaN     -2
dtype: int64}]), 'general_transformer__scoring_encoder': OrdinalEncoder(cols=['scoring'],
               mapping=[{'col': 'scoring', 'data_type': dtype('O'),
                         'mapping': ACC    1
AUC    2
F1     3
NaN   -2
dtype: int64}]), 'estimator__n_estimators': 500, 'estimator__criterion': 'entropy'}





The best score (0.7113) is also lower than the one we got with the plain decision tree (0.7614).

## Pointwise Ordinal Regression

In [None]:
# Baseline run for the ordinal-regression model type.
# `evaluation` is passed explicitly so this cell matches the other plain
# (non-search) runs in this notebook, which pair `n_folds=5` with
# EvaluationType.CROSS_VALIDATION instead of relying on the factory default.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_ORDINAL_REGRESSION_NO_SEARCH,
    verbose_level=1,
    evaluation=EvaluationType.CROSS_VALIDATION,
    n_folds=5,
    workers=1,
    target="rank"
)

pipeline.run()

In [None]:
# Only the batch size of the `encoder_transformer` step is varied here.
param_grid = {"encoder_transformer__batch_size": [50, 500]}

# Ordinal-regression model type, evaluated through a grid search over
# `param_grid` with 2 folds.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.POINTWISE_ORDINAL_REGRESSION_NO_SEARCH,
    target="rank",
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=param_grid,
    n_folds=2,
    workers=1,
    verbose_level=1,
)

pipeline.run()

## Comparison