# Pairwise Methods - Baselines
In this notebook, we evaluate pairwise methods using a baseline preprocessing that simply one-hot encodes every feature.

In [1]:
import os
import sys

# Put the parent of the current working directory on the import path
# (presumably so the project-local `src` package imported below resolves).
# The membership check keeps re-runs of this cell from adding duplicates.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
from category_encoders.binary import BinaryEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from datetime import timedelta
from sklearn.compose import ColumnTransformer
from time import time

from src import configuration as config
from src.features.encoder_utils import NoY
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType
from src.pipeline.pipeline_transformers import *


# Load the training data; `df_train` is later handed to the pipeline factory
# as `train_df`.
# NOTE(review): this uses the *pointwise* loader although the notebook evaluates
# pairwise methods -- presumably the pairwise conversion happens inside the
# pipeline (`as_pairwise=True`); confirm.
df_train = config.load_traindata_for_pointwise()
# Factory instance used by the experiment cell below to build its pipeline.
pipelineFactory = PipelineFactory()

In [2]:
def set_baseline_steps(pipeline):
    """Replace all steps of ``pipeline`` with the one-hot baseline step.

    Existing steps are cleared first; afterwards a single step named
    ``"general_transformer"`` is added, holding a
    ``GeneralPurposeEncoderTransformer`` built from three separate
    ``OneHotEncoder`` instances.

    Args:
        pipeline: object exposing ``clear_steps()`` and
            ``add_new_step(transformer, name)``; it is modified in place.
    """
    pipeline.clear_steps()

    # One fresh encoder per positional slot of the transformer.
    encoders = [OneHotEncoder() for _ in range(3)]
    baseline_transformer = GeneralPurposeEncoderTransformer(*encoders)

    pipeline.add_new_step(baseline_transformer, "general_transformer")

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier

# Wall-clock timer: covers data loading, pipeline construction and the run.
start = time()

df_train = config.load_traindata_for_pointwise()
pipeline = pipelineFactory.create_pipeline(
    train_df=df_train,
    model_type=ModelType.PAIRWISE_CLASSIFICATION_NO_SEARCH,
    evaluation=EvaluationType.CROSS_VALIDATION,
    n_folds=5,
    workers=1,
    target="rank",
    as_pairwise=True,
    verbose_level=1,
)
# set_baseline_steps(pipeline) is currently disabled, so the pipeline keeps
# whatever steps the factory configured (see the printed step list).

# Use a decision tree wrapped in a multi-output classifier as the estimator.
# NOTE(review): the tree is created without a random_state, so scores may
# differ slightly between runs -- consider seeding it for reproducibility.
pipeline.change_estimator(MultiOutputClassifier(DecisionTreeClassifier()))

print(pipeline.get_pipeline().steps)
pipeline.run()

# Report the elapsed time both as H:MM:SS and as whole seconds.
runtime = int(time() - start)
elapsed = timedelta(seconds=runtime)
print('\nruntime: ' + str(elapsed) + ' [' + str(runtime) + 's]')

Creating pipeline ...
[('dataset_transformer', OpenMLMetaFeatureTransformer(encoder=None, expected_pca_variance=0.6,
                             nan_ratio_feature_drop_threshold=0.25)), ('general_transformer', GeneralPurposeEncoderTransformer(model_encoder=OneHotEncoder(),
                                 scoring_encoder=OneHotEncoder(),
                                 tuning_encoder=OneHotEncoder())), ('estimator', MultiOutputClassifier(estimator=DecisionTreeClassifier()))]
Starting pipeline using method: EvaluationType.CROSS_VALIDATION


  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [01:27<00:00, 17.59s/it]

Finished running the pipeline
Evaluation metrics:
    validation_average_spearman_fold_0: -0.0007
    validation_average_spearman_fold_1: 0.0
    validation_average_spearman_fold_2: 0.0002
    validation_average_spearman_fold_3: -0.0001
    validation_average_spearman_fold_4: 0.0011
    average of all folds: 0.0001 [std=0.0006]

runtime: 0:01:28 [88s]



