In [4]:
# imports
import os
import sys
# Resolve the directory two levels above the current working directory and
# register it on the import path (only once), ahead of the `src` imports below.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer

from category_encoders.one_hot import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.binary import BinaryEncoder
from category_encoders.ordinal import OrdinalEncoder

from lightgbm import LGBMRegressor

from src import configuration as config
from src.features.encoder_utils import load_graph
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType
from src.pipeline.pipeline_transformers import ColumnKeeper, PrintDataframe, GeneralPurposeEncoderTransformer
from src.pipeline.pipeline_transformers import PoincareEmbedding, OpenMLMetaFeatureTransformer

In [5]:
# Keep wide DataFrames on a single line instead of wrapping columns across several blocks.
pd.options.display.expand_frame_repr = False

In [6]:
# Fetch the training data through the project configuration helper.
train_df = config.load_traindata_for_regression()
pipelineFactory = PipelineFactory()

# Build the baseline regression pipeline on that data and evaluate it as-is.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.REGRE_BASELINE,
    verbose_level=1,
    evaluation=EvaluationType.BASIC,
)
pipeline.run()

# Later cells reuse this same pipeline object, now with LightGBM as estimator.
pipeline.change_estimator(LGBMRegressor())

# Last expression of the cell: preview the first rows of the training data.
train_df.head()

Starting pipeline using method: EvaluationType.BASIC
Finished running the pipeline
Evaluation metrics:
    validation_rmse: 0.2206 [std=0.]
    validation_mae: 0.1783 [std=0.]
    validation_r2: -0.005 [std=0.]
    validation_average_spearman: 0. [std=0.]


Unnamed: 0,dataset,model,tuning,scoring,encoder,cv_score
0,1169,KNC,model,ACC,BUCV2RGLMME,0.650562
1,1169,KNC,model,ACC,BUCV2TE,0.650712
2,1169,KNC,model,ACC,CBE,0.626492
3,1169,KNC,model,ACC,CE,0.625839
4,1169,KNC,model,ACC,CV10RGLMME,0.652093


Encoding the `model`, `tuning` and `scoring` columns with `OneHotEncoder`:

In [9]:
# Extend the pipeline with the feature-engineering steps and evaluate it.
# `load_graph`, `PoincareEmbedding` and `OpenMLMetaFeatureTransformer` are
# imported in the import cell at the top of the notebook.
#
# NOTE(review): this cell mutates `pipeline` in place; running it a second time
# calls add_new_step again with the same step names. Re-create the pipeline
# first if the cell has to be re-run.
graph = load_graph(config.DATA_DIR / "external/graphs/encodings_graph.adjlist")

pipeline.add_new_step(PoincareEmbedding(graph), "poincare")
pipeline.add_new_step(OpenMLMetaFeatureTransformer(), "openml_meta")

# One separate OneHotEncoder instance per encoder slot of the transformer.
pipeline.add_new_step(
    GeneralPurposeEncoderTransformer(OneHotEncoder(), OneHotEncoder(), OneHotEncoder()),
    'general',
)

# NOTE(review): reads the private attribute `_verbose_level`; prefer a public
# accessor if the pipeline class offers one.
pipeline.add_new_step(PrintDataframe(verbose=pipeline._verbose_level), "print_df_1")

# Set the estimator before printing, so the listing below shows exactly the
# steps that run() is about to execute.
pipeline.change_estimator(LGBMRegressor())
print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'poincare': PoincareEmbedding(graph=<networkx.classes.graph.Graph object at 0x0000025F8DF42B90>), 'openml_meta': OpenMLMetaFeatureTransformer(), 'general': GeneralPurposeEncoderTransformer(model_encoder=OneHotEncoder(),
                                 scoring_encoder=OneHotEncoder(),
                                 tuning_encoder=OneHotEncoder()), 'print_df_1': PrintDataframe(verbose=1), 'estimator': LGBMRegressor()}
Starting pipeline using method: EvaluationType.BASIC
----------------------------------------
Printing dataframe:
   poincare_embedding_dim1  poincare_embedding_dim2  dataset_metafeature_0  dataset_metafeature_1  dataset_metafeature_2  dataset_metafeature_3  dataset_metafeature_4  dataset_metafeature_5  dataset_metafeature_6  dataset_metafeature_7  ...  model_2  model_3  model_4  model_5  tuning_1  tuning_2  tuning_3  scoring_1  scoring_2  scoring_3
0                 0.007151                 0.085302                -1.3669                0.78024               0.031194  

Average spearman: 0

Encoding the `model`, `tuning` and `scoring` columns with `BinaryEncoder`:

In [11]:
# Replace the current 'general' encoding step with a BinaryEncoder-based one.
pipeline.remove_step("general")

# Three independent BinaryEncoder instances, one per encoder slot.
binary_general = GeneralPurposeEncoderTransformer(
    BinaryEncoder(),
    BinaryEncoder(),
    BinaryEncoder(),
)
# Index 2 is the slot the step occupied before: the printed listing shows it
# right after 'poincare' and 'openml_meta'.
pipeline.add_new_step_at_position(binary_general, 'general', 2)

print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'poincare': PoincareEmbedding(graph=<networkx.classes.graph.Graph object at 0x0000025F8DF42B90>), 'openml_meta': OpenMLMetaFeatureTransformer(), 'general': GeneralPurposeEncoderTransformer(model_encoder=BinaryEncoder(),
                                 scoring_encoder=BinaryEncoder(),
                                 tuning_encoder=BinaryEncoder()), 'print_df_1': PrintDataframe(verbose=1), 'estimator': LGBMRegressor()}
Starting pipeline using method: EvaluationType.BASIC
----------------------------------------
Printing dataframe:
   poincare_embedding_dim1  poincare_embedding_dim2  dataset_metafeature_0  dataset_metafeature_1  dataset_metafeature_2  dataset_metafeature_3  dataset_metafeature_4  dataset_metafeature_5  dataset_metafeature_6  dataset_metafeature_7  ...  dataset_metafeature_36  dataset_metafeature_37   dataset  model_0  model_1  model_2  tuning_0  tuning_1  scoring_0  scoring_1
0                 0.007151                 0.085302                -1.3669                0.78

Average spearman: 0

Encoding the `model`, `tuning` and `scoring` columns with `OrdinalEncoder`:

In [12]:
# Replace the current 'general' encoding step with an OrdinalEncoder-based one.
pipeline.remove_step("general")

# Three independent OrdinalEncoder instances, one per encoder slot.
ordinal_general = GeneralPurposeEncoderTransformer(
    OrdinalEncoder(),
    OrdinalEncoder(),
    OrdinalEncoder(),
)
# Index 2 is the slot the step occupied before: the printed listing shows it
# right after 'poincare' and 'openml_meta'.
pipeline.add_new_step_at_position(ordinal_general, 'general', 2)

print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'poincare': PoincareEmbedding(graph=<networkx.classes.graph.Graph object at 0x0000025F8DF42B90>), 'openml_meta': OpenMLMetaFeatureTransformer(), 'general': GeneralPurposeEncoderTransformer(model_encoder=OrdinalEncoder(),
                                 scoring_encoder=OrdinalEncoder(),
                                 tuning_encoder=OrdinalEncoder()), 'print_df_1': PrintDataframe(verbose=1), 'estimator': LGBMRegressor()}
Starting pipeline using method: EvaluationType.BASIC
----------------------------------------
Printing dataframe:
   poincare_embedding_dim1  poincare_embedding_dim2  dataset_metafeature_0  dataset_metafeature_1  dataset_metafeature_2  dataset_metafeature_3  dataset_metafeature_4  dataset_metafeature_5  dataset_metafeature_6  dataset_metafeature_7  ...  dataset_metafeature_32  dataset_metafeature_33  dataset_metafeature_34  dataset_metafeature_35  dataset_metafeature_36  dataset_metafeature_37   dataset  model  tuning  scoring
0                 0.007151              