In [28]:
# imports
import os
import sys
# Put the directory two levels above the current working directory on sys.path
# (presumably the repository root, so the `src` imports below resolve).
# The membership check keeps re-running this cell from adding duplicates.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)
import pandas as pd
from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType

from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer

from category_encoders.one_hot import OneHotEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.binary import BinaryEncoder
from category_encoders.ordinal import OrdinalEncoder

from src.pipeline.pipeline_transformers import ColumnKeeper, PrintDataframe, GeneralPurposeEncoderTransformer

from lightgbm import LGBMRegressor 

Create a pipeline object

In [29]:
# Load the regression training data and create the factory used to build pipelines.
train_df = config.load_traindata_for_regression()
pipelineFactory = PipelineFactory()

# Build the regression baseline pipeline with BASIC evaluation and run it.
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.REGRE_BASELINE,
    verbose_level=1,
    evaluation=EvaluationType.BASIC,
)
pipeline.run()

# Starting from the baseline pipeline, switch the estimator to LightGBM.
# The run above was executed before this change, so its score belongs to the
# baseline estimator; later cells run with LGBMRegressor.
pipeline.change_estimator(LGBMRegressor())

# Show the first rows of the training data as the cell output.
train_df.head()

Starting pipeline using method: EvaluationType.BASIC
0.0
Finished running the pipeline


Unnamed: 0,dataset,model,tuning,scoring,encoder,cv_score
0,1169,KNC,model,ACC,BUCV2RGLMME,0.650562
1,1169,KNC,model,ACC,BUCV2TE,0.650712
2,1169,KNC,model,ACC,CBE,0.626492
3,1169,KNC,model,ACC,CE,0.625839
4,1169,KNC,model,ACC,CV10RGLMME,0.652093


One-Hot-Encoder for model, tuning, scoring:

In [31]:
# Reset the pipeline's steps before configuring this experiment.
pipeline.clear_steps()

# Step 1: keep only the three categorical columns under study.
pipeline.add_new_step(ColumnKeeper(['model', 'tuning', 'scoring']), 'column_keeper')

# Step 2: one-hot encode each of the three columns.
# (For debugging, a PrintDataframe step can be added before/after this one.)
pipeline.add_new_step(
    GeneralPurposeEncoderTransformer(
        model_encoder=OneHotEncoder(),
        tuning_encoder=OneHotEncoder(),
        scoring_encoder=OneHotEncoder(),
    ),
    'general',
)

# Show the assembled steps, then evaluate.
print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'column_keeper': ColumnKeeper(columns=['model', 'tuning', 'scoring']), 'general': GeneralPurposeEncoderTransformer(model_encoder=OneHotEncoder(),
                                 scoring_encoder=OneHotEncoder(),
                                 tuning_encoder=OneHotEncoder()), 'estimator': LGBMRegressor()}
Starting pipeline using method: EvaluationType.BASIC


0.0
Finished running the pipeline


The score is 0.0 with the one-hot encoder.

Binary-Encoder for model, tuning, scoring:

In [32]:
# Reset the pipeline's steps before configuring this experiment.
pipeline.clear_steps()

# Step 1: keep only the three categorical columns under study.
pipeline.add_new_step(ColumnKeeper(['model', 'tuning', 'scoring']), 'column_keeper')

# Step 2: binary-encode each of the three columns.
# NOTE(review): the printed repr of this step shows no arguments, which suggests
# BinaryEncoder is already the transformer's default -- confirm in
# src.pipeline.pipeline_transformers.
# (For debugging, a PrintDataframe step can be added before/after this one.)
pipeline.add_new_step(
    GeneralPurposeEncoderTransformer(
        model_encoder=BinaryEncoder(),
        tuning_encoder=BinaryEncoder(),
        scoring_encoder=BinaryEncoder(),
    ),
    'general',
)

# Show the assembled steps, then evaluate.
print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'column_keeper': ColumnKeeper(columns=['model', 'tuning', 'scoring']), 'general': GeneralPurposeEncoderTransformer(), 'estimator': LGBMRegressor()}
Starting pipeline using method: EvaluationType.BASIC
0.0
Finished running the pipeline


The score is 0.0 with the binary encoder.

Ordinal Encoder for model, tuning, scoring:

In [33]:
# Reset the pipeline's steps before configuring this experiment.
pipeline.clear_steps()

# Step 1: keep only the three categorical columns under study.
pipeline.add_new_step(ColumnKeeper(['model', 'tuning', 'scoring']), 'column_keeper')

# Step 2: ordinal-encode each of the three columns.
# (For debugging, a PrintDataframe step can be added before/after this one.)
pipeline.add_new_step(
    GeneralPurposeEncoderTransformer(
        model_encoder=OrdinalEncoder(),
        tuning_encoder=OrdinalEncoder(),
        scoring_encoder=OrdinalEncoder(),
    ),
    'general',
)

# Show the assembled steps, then evaluate.
print(pipeline.get_pipeline().named_steps)

pipeline.run()

{'column_keeper': ColumnKeeper(columns=['model', 'tuning', 'scoring']), 'general': GeneralPurposeEncoderTransformer(model_encoder=OrdinalEncoder(),
                                 scoring_encoder=OrdinalEncoder(),
                                 tuning_encoder=OrdinalEncoder()), 'estimator': LGBMRegressor()}
Starting pipeline using method: EvaluationType.BASIC
0.0
Finished running the pipeline


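Let's try something different: a plain scikit-learn pipeline (one-hot encoding of model, tuning and scoring, followed by linear regression), scored on a held-out 20% test split: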

In [50]:
from sklearn.model_selection import train_test_split

# Load the regression training data again for the plain scikit-learn experiment.
df = config.load_traindata_for_regression()

# Positional split: the last column is the target, all preceding columns are features.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [52]:
from sklearn.pipeline import Pipeline

# NOTE: `OneHotEncoder` below is the category_encoders class imported at the top
# of the notebook, not sklearn.preprocessing.OneHotEncoder, so it takes
# category_encoders option values rather than scikit-learn's.

# Candidate categorical columns noted so far: tuning, scoring, model, dataset.
# Only the three listed here are encoded; `dataset` is not included.
categorical_features = ['model', 'tuning', 'scoring']

# One-hot encode the categorical columns.
# category_encoders accepts handle_unknown in {'error', 'return_nan', 'value',
# 'indicator'}; scikit-learn's "ignore" is not one of them. 'value' encodes a
# category that was not seen during fit as an all-zero row, which is the
# behaviour "ignore" has in scikit-learn and keeps NaNs out of the regressor.
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="value")),
    ]
)

# Apply the encoder to the categorical columns only. ColumnTransformer's default
# remainder="drop" discards every column that is not listed.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [56]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Full model: categorical preprocessing followed by ordinary least squares.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

# Fit on the training split and report the regressor's score (R^2) on the test split.
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.092
