# Pairwise Methods - Baselines
In this notebook, we compare pairwise methods regarding a baseline preprocessing of just one-hot encoding every feature.

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
from category_encoders.binary import BinaryEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from datetime import timedelta
from sklearn.compose import ColumnTransformer
from time import time

from src import configuration as config
from src.features.encoder_utils import NoY
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType
from src.pipeline.pipeline_transformers import *


# load the data
df_train = config.load_traindata_for_pointwise()
pipelineFactory = PipelineFactory()

In [2]:
def set_baseline_steps(pipeline):
    pipeline.clear_steps()
    #pipeline.add_new_step(NoY(OneHotEncoder(['dataset', 'model', 'tuning', 'scoring'])), 'baseline')
    pipeline.add_new_step(GeneralPurposeEncoderTransformer(
                    OneHotEncoder(),
                    OneHotEncoder(),
                    OneHotEncoder()
                ), "general_transformer")

# Decision Tree

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier

start = time()
df_train = config.load_traindata_for_pointwise()
pipeline = pipelineFactory.create_pipeline(
    train_df=df_train,
    model_type=ModelType.PAIRWISE_CLASSIFICATION_NO_SEARCH,
    verbose_level=1,
    evaluation=EvaluationType.CROSS_VALIDATION,
    n_folds=5,
    workers=1,
    target="rank",
    as_pairwise=True,
)
set_baseline_steps(pipeline)

# create a decision tree classifier
dtc = DecisionTreeClassifier()
# create a multi-output classifier using the decision tree
multi_dtc = MultiOutputClassifier(dtc)
pipeline.change_estimator(multi_dtc)

print(pipeline.get_pipeline().steps)
pipeline.run()

runtime = int(time() - start)
print('\nruntime: ' + str(timedelta(seconds=runtime)) + ' [' + str(runtime) + 's]')

Creating pipeline ...
[('general_transformer', GeneralPurposeEncoderTransformer(model_encoder=OneHotEncoder(),
                                 scoring_encoder=OneHotEncoder(),
                                 tuning_encoder=OneHotEncoder())), ('estimator', MultiOutputClassifier(estimator=DecisionTreeClassifier()))]
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


  0%|          | 0/5 [00:00<?, ?it/s]100%|██████████| 5/5 [01:28<00:00, 17.64s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.644
    validation_average_spearman_fold_1: 0.6427
    validation_average_spearman_fold_2: 0.6412
    validation_average_spearman_fold_3: 0.6432
    validation_average_spearman_fold_4: 0.6413
    average of all folds: 0.6425 [std=0.0011]

runtime: 0:01:28 [88s]





# Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

start = time()
df_train = config.load_traindata_for_pointwise()
pipeline = pipelineFactory.create_pipeline(
    train_df=df_train,
    model_type=ModelType.PAIRWISE_CLASSIFICATION_NO_SEARCH,
    verbose_level=1,
    evaluation=EvaluationType.CROSS_VALIDATION,
    n_folds=5,
    workers=1,
    target="rank",
    as_pairwise=True,
)
set_baseline_steps(pipeline)

# create a decision tree classifier
classifier = RandomForestClassifier()
# create a multi-output classifier using the decision tree
multi_classifier = MultiOutputClassifier(classifier)
pipeline.change_estimator(multi_classifier)

print(pipeline.get_pipeline().steps)
pipeline.run()

runtime = int(time() - start)
print('\nruntime: ' + str(timedelta(seconds=runtime)) + ' [' + str(runtime) + 's]')

Creating pipeline ...
[('general_transformer', GeneralPurposeEncoderTransformer(model_encoder=OneHotEncoder(),
                                 scoring_encoder=OneHotEncoder(),
                                 tuning_encoder=OneHotEncoder())), ('estimator', MultiOutputClassifier(estimator=RandomForestClassifier()))]
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


100%|██████████| 5/5 [31:20<00:00, 376.15s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.1355
    validation_average_spearman_fold_1: 0.133
    validation_average_spearman_fold_2: 0.1359
    validation_average_spearman_fold_3: 0.1363
    validation_average_spearman_fold_4: 0.1346
    average of all folds: 0.1351 [std=0.0012]

runtime: 0:31:20 [1880s]





# Gradient Boosting Classifier

In [6]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier

start = time()
df_train = config.load_traindata_for_pointwise()
pipeline = pipelineFactory.create_pipeline(
    train_df=df_train,
    model_type=ModelType.PAIRWISE_CLASSIFICATION_NO_SEARCH,
    verbose_level=1,
    evaluation=EvaluationType.CROSS_VALIDATION,
    n_folds=5,
    workers=1,
    target="rank",
    as_pairwise=True,
)
set_baseline_steps(pipeline)

# create a decision tree classifier
classifier = GradientBoostingClassifier()
# create a multi-output classifier using the decision tree
multi_classifier = MultiOutputClassifier(classifier)
pipeline.change_estimator(multi_classifier)

print(pipeline.get_pipeline().steps)
pipeline.run()

runtime = int(time() - start)
print('\nruntime: ' + str(timedelta(seconds=runtime)) + ' [' + str(runtime) + 's]')

Creating pipeline ...
[('general_transformer', GeneralPurposeEncoderTransformer(model_encoder=OneHotEncoder(),
                                 scoring_encoder=OneHotEncoder(),
                                 tuning_encoder=OneHotEncoder())), ('estimator', MultiOutputClassifier(estimator=GradientBoostingClassifier()))]
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


100%|██████████| 5/5 [52:53<00:00, 634.68s/it] 

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.5596
    validation_average_spearman_fold_1: 0.5593
    validation_average_spearman_fold_2: 0.559
    validation_average_spearman_fold_3: 0.5588
    validation_average_spearman_fold_4: 0.5594
    average of all folds: 0.5592 [std=0.0003]

runtime: 0:53:27 [3207s]





# Support Vector Machine

In [7]:
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier

start = time()
df_train = config.load_traindata_for_pointwise()
pipeline = pipelineFactory.create_pipeline(
    train_df=df_train,
    model_type=ModelType.PAIRWISE_CLASSIFICATION_NO_SEARCH,
    verbose_level=1,
    evaluation=EvaluationType.CROSS_VALIDATION,
    n_folds=5,
    workers=1,
    target="rank",
    as_pairwise=True,
)
set_baseline_steps(pipeline)

# create a decision tree classifier
classifier = SVC()
# create a multi-output classifier using the decision tree
multi_classifier = MultiOutputClassifier(classifier)
pipeline.change_estimator(multi_classifier)

print(pipeline.get_pipeline().steps)
pipeline.run()

runtime = int(time() - start)
print('\nruntime: ' + str(timedelta(seconds=runtime)) + ' [' + str(runtime) + 's]')

Creating pipeline ...
[('general_transformer', GeneralPurposeEncoderTransformer(model_encoder=OneHotEncoder(),
                                 scoring_encoder=OneHotEncoder(),
                                 tuning_encoder=OneHotEncoder())), ('estimator', MultiOutputClassifier(estimator=SVC()))]
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


100%|██████████| 5/5 [08:08<00:00, 97.63s/it] 

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.1827
    validation_average_spearman_fold_1: 0.1827
    validation_average_spearman_fold_2: 0.1827
    validation_average_spearman_fold_3: 0.1827
    validation_average_spearman_fold_4: 0.1827
    average of all folds: 0.1827 [std=0.0]

runtime: 0:08:08 [488s]





# Neural Network

In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier

start = time()
df_train = config.load_traindata_for_pointwise()
pipeline = pipelineFactory.create_pipeline(
    train_df=df_train,
    model_type=ModelType.PAIRWISE_CLASSIFICATION_NO_SEARCH,
    verbose_level=1,
    evaluation=EvaluationType.CROSS_VALIDATION,
    n_folds=5,
    workers=1,
    target="rank",
    as_pairwise=True,
)
set_baseline_steps(pipeline)

# create a decision tree classifier
classifier = MLPClassifier()
# create a multi-output classifier using the decision tree
multi_classifier = MultiOutputClassifier(classifier)
pipeline.change_estimator(multi_classifier)

print(pipeline.get_pipeline().steps)
pipeline.run()

runtime = int(time() - start)
print('\nruntime: ' + str(timedelta(seconds=runtime)) + ' [' + str(runtime) + 's]')

Creating pipeline ...
[('general_transformer', GeneralPurposeEncoderTransformer(model_encoder=OneHotEncoder(),
                                 scoring_encoder=OneHotEncoder(),
                                 tuning_encoder=OneHotEncoder())), ('estimator', MultiOutputClassifier(estimator=MLPClassifier()))]
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


100%|██████████| 5/5 [1:12:35<00:00, 871.05s/it] 

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: 0.1605
    validation_average_spearman_fold_1: 0.1797
    validation_average_spearman_fold_2: 0.1613
    validation_average_spearman_fold_3: 0.1376
    validation_average_spearman_fold_4: 0.1626
    average of all folds: 0.1603 [std=0.0134]

runtime: 1:12:35 [4355s]



