# Pairwise Methods - Baselines
In this notebook, we compare pairwise methods regarding a baseline preprocessing of just one-hot encoding every feature.

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
from category_encoders.binary import BinaryEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from datetime import timedelta
from sklearn.compose import ColumnTransformer
from time import time

from src import configuration as config
from src.features.encoder_utils import NoY
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType
from src.pipeline.pipeline_transformers import *


# load the data
X_train, y_train = config.load_traindata_for_pairwise()
pipelineFactory = PipelineFactory()

print(f"X_train.shape: {X_train.shape}")
print(f"y_train.shape: {y_train.shape}")
print(f"X_train.columns: {X_train.columns}")
print(f"y_train.columns: {y_train.columns}")

X_train.shape: (1161, 4)
y_train.shape: (1161, 992)
X_train.columns: Index(['dataset', 'model', 'tuning', 'scoring'], dtype='object')
y_train.columns: Index([   ('BUCV2RGLMME', 'BUCV2TE'),    ('BUCV2TE', 'BUCV2RGLMME'),
              ('BUCV2RGLMME', 'CBE'),        ('CBE', 'BUCV2RGLMME'),
               ('BUCV2RGLMME', 'CE'),         ('CE', 'BUCV2RGLMME'),
       ('BUCV2RGLMME', 'CV10RGLMME'), ('CV10RGLMME', 'BUCV2RGLMME'),
           ('BUCV2RGLMME', 'CV10TE'),     ('CV10TE', 'BUCV2RGLMME'),
       ...
                       ('OHE', 'TE'),                 ('TE', 'OHE'),
                     ('OHE', 'WOEE'),               ('WOEE', 'OHE'),
                       ('OHE', 'SE'),                 ('SE', 'OHE'),
                        ('SE', 'TE'),                  ('TE', 'SE'),
                      ('SE', 'WOEE'),                ('WOEE', 'SE')],
      dtype='object', length=992)


In [2]:
def set_baseline_steps(pipeline):
    pipeline.clear_steps()
    pipeline.add_new_step(NoY(OneHotEncoder(['dataset', 'model', 'tuning', 'scoring'])), 'baseline')

In [3]:
# check types of X_train
print(f"X_train.dtypes: \n {X_train.dtypes}")
print(f"Xtrain head: \n {X_train.head(2)}")

# transform all columns to string
X_train = X_train.astype(str)

print(f"X_train.dtypes: \n {X_train.dtypes}")
print(f"Xtrain head: \n {X_train.head(2)}")

X_train.dtypes: 
 dataset     int64
model      object
tuning     object
scoring    object
dtype: object
Xtrain head: 
    dataset model tuning scoring
0        3   DTC   full     ACC
1        3   DTC   full     AUC
X_train.dtypes: 
 dataset    object
model      object
tuning     object
scoring    object
dtype: object
Xtrain head: 
   dataset model tuning scoring
0       3   DTC   full     ACC
1       3   DTC   full     AUC


In [4]:
# check the types of y_train
print(f"y_train.dtypes: \n {y_train.dtypes}")
print(f"y_train head: \n {y_train.head(2)}")

# transform all columns to string
y_train = y_train.astype(str)

print(f"y_train.dtypes: \n {y_train.dtypes}")
print(f"y_train head: \n {y_train.head(2)}")

y_train.dtypes: 
 (BUCV2RGLMME, BUCV2TE)    float64
(BUCV2TE, BUCV2RGLMME)    float64
(BUCV2RGLMME, CBE)        float64
(CBE, BUCV2RGLMME)        float64
(BUCV2RGLMME, CE)         float64
                           ...   
(SE, OHE)                 float64
(SE, TE)                  float64
(TE, SE)                  float64
(SE, WOEE)                float64
(WOEE, SE)                float64
Length: 992, dtype: object
y_train head: 
    (BUCV2RGLMME, BUCV2TE)  (BUCV2TE, BUCV2RGLMME)  (BUCV2RGLMME, CBE)  \
0                     0.0                     0.0                 1.0   
1                     0.0                     0.0                 1.0   

   (CBE, BUCV2RGLMME)  (BUCV2RGLMME, CE)  (CE, BUCV2RGLMME)  \
0                 0.0                0.0                0.0   
1                 0.0                0.0                0.0   

   (BUCV2RGLMME, CV10RGLMME)  (CV10RGLMME, BUCV2RGLMME)  \
0                        0.0                        0.0   
1                        0.0         

In [6]:
start = time()
df_train = config.load_traindata_for_pointwise()
pipeline = pipelineFactory.create_pipeline(
    train_df=X_train,
    model_type=ModelType.PAIRWISE_CLASSIFICATION_NO_SEARCH,
    verbose_level=1,
    evaluation=EvaluationType.BASIC,
    n_folds=5,
    workers=1,
    target="rank",
    split_factors=[]
)
set_baseline_steps(pipeline)

print(pipeline.get_pipeline().steps)
pipeline.run()

runtime = int(time() - start)
print('\nruntime: ' + str(timedelta(seconds=runtime)) + ' [' + str(runtime) + 's]')

Creating pipeline ...
[('baseline', <src.features.encoder_utils.NoY object at 0x000001411CFB0FD0>), ('estimator', DecisionTreeClassifier())]
Starting pipeline using method: EvaluationType.BASIC


KeyError: "['rank'] not found in axis"