In [1]:
import os
import sys
# Put the parent of the current working directory on the import path so the
# `src` package imported further down can be resolved.
module_path = os.path.abspath(os.pardir)
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)
    
import pandas as pd
from category_encoders.binary import BinaryEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.target_encoder import TargetEncoder
from datetime import timedelta
from sklearn.compose import ColumnTransformer
from time import time

from src import configuration as config
from src.features.encoder_utils import NoY
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType
from src.pipeline.pipeline_transformers import *


# Load the training data through the project's configuration module.
df_train = config.load_traindata_for_pointwise()
# Factory instance the following cell uses to build its pipeline.
pipelineFactory = PipelineFactory()

PLEASE DELETE  
This is only an example of how Optuna can be used.

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier

# Search space handed to the pipeline factory for the Optuna run.
# Each encoder entry lists the candidate encoders to choose between.
param_grid = {
    "general_transformer__model_encoder": [BinaryEncoder(), OneHotEncoder()],
    "general_transformer__tuning_encoder": [BinaryEncoder()],
    "general_transformer__scoring_encoder": [BinaryEncoder(), OneHotEncoder()],
    # IMPORTANT: give exactly two values here. They are the bounds of the range
    # Optuna searches, i.e. every value between 2 and 10 in this case.
    "estimator__estimator__max_depth": [2, 10],
}

# Seed for the decision trees. scikit-learn permutes the features randomly at
# every split, so an unseeded tree can give different scores on each run.
# NOTE(review): any seeding inside the pipeline factory (Optuna sampler, fold
# splits) is not visible from this cell -- confirm it separately.
RANDOM_STATE = 42

start = time()
# Reloads the training data (it is also loaded in the first cell), so the
# measured runtime includes the data loading.
df_train = config.load_traindata_for_pointwise()
pipeline = pipelineFactory.create_pipeline(
    train_df=df_train,
    model_type=ModelType.PAIRWISE_CLASSIFICATION_NO_SEARCH,
    verbose_level=1,
    evaluation=EvaluationType.OPTUNA,
    n_folds=2,
    workers=1,
    target="rank",
    as_pairwise=True,
    param_grid=param_grid,
    opt_iterations=4,  # number of iterations Optuna uses to find the best hyperparameters
)

# Swap the pipeline's estimator for a multi-output classifier that wraps a
# seeded decision tree.
dtc = DecisionTreeClassifier(random_state=RANDOM_STATE)
multi_dtc = MultiOutputClassifier(dtc)
pipeline.change_estimator(multi_dtc)

# Show the assembled steps before the (slow) optimisation starts.
print(pipeline.get_pipeline().steps)
pipeline.run()

# Report the elapsed wall-clock time as h:mm:ss and as whole seconds.
runtime = int(time() - start)
print(f'\nruntime: {timedelta(seconds=runtime)} [{runtime}s]')

[I 2023-07-14 23:41:55,089] A new study created in memory with name: no-name-3d6f47b7-0e6f-4e3d-aad8-fa2b52ed367b


Creating pipeline ...
[('dataset_transformer', OpenMLMetaFeatureTransformer(encoder=None, expected_pca_variance=0.6,
                             nan_ratio_feature_drop_threshold=0.25)), ('general_transformer', GeneralPurposeEncoderTransformer(model_encoder=OneHotEncoder(),
                                 scoring_encoder=OneHotEncoder(),
                                 tuning_encoder=OneHotEncoder())), ('estimator', MultiOutputClassifier(estimator=DecisionTreeClassifier()))]
Starting pipeline using method: EvaluationType.OPTUNA


  0%|          | 0/4 [00:00<?, ?it/s]


[A

[A[A


[A[A[A


[A[A[A

[A[A
[A


100%|██████████| 2/2 [03:08<00:00, 94.47s/it]



[A[A[A                                    
[A                                          

[A[A                                        

[A[A
[A

[I 2023-07-14 23:45:04,188] Trial 3 finished with value: 0.3638 and parameters: {'general_transformer__model_encoder': 1, 'general_transformer__tuning_encoder': 0, 'general_transformer__scoring_encoder': 0, 'estimator__estimator__max_depth': 3}. Best is trial 3 with value: 0.3638.




100%|██████████| 2/2 [03:15<00:00, 97.76s/it]

[A                                          

[A[A                                        
[A

[I 2023-07-14 23:45:10,744] Trial 2 finished with value: 0.3637 and parameters: {'general_transformer__model_encoder': 0, 'general_transformer__tuning_encoder': 0, 'general_transformer__scoring_encoder': 0, 'estimator__estimator__max_depth': 7}. Best is trial 3 with value: 0.3638.


100%|██████████| 2/2 [03:16<00:00, 98.22s/it]


[A[A                                        
[A

[I 2023-07-14 23:45:11,599] Trial 0 finished with value: 0.3637 and parameters: {'general_transformer__model_encoder': 1, 'general_transformer__tuning_encoder': 0, 'general_transformer__scoring_encoder': 1, 'estimator__estimator__max_depth': 7}. Best is trial 3 with value: 0.3638.



100%|██████████| 2/2 [03:19<00:00, 99.93s/it]


[I 2023-07-14 23:45:15,034] Trial 1 finished with value: 0.3638 and parameters: {'general_transformer__model_encoder': 1, 'general_transformer__tuning_encoder': 0, 'general_transformer__scoring_encoder': 0, 'estimator__estimator__max_depth': 8}. Best is trial 3 with value: 0.3638.
Finished running the pipeline
Evaluation metrics:
  Best Score: 0.3638
  Best Params: {'general_transformer__model_encoder': BinaryEncoder(cols=['model'],
              mapping=[{'col': 'model',
                        'mapping':     model_0  model_1  model_2
 1        0        0        1
 2        0        1        0
 3        0        1        1
 4        1        0        0
 5        1        0        1
-1        0        0        0
-2        0        0        0}]), 'general_transformer__tuning_encoder': BinaryEncoder(cols=['tuning'],
              mapping=[{'col': 'tuning',
                        'mapping':     tuning_0  tuning_1
 1         0         1
 2         1         0
 3         1         1
-1    