In [2]:
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType

from sklearn.tree import DecisionTreeRegressor
from category_encoders.one_hot import OneHotEncoder
from lightgbm import LGBMRegressor


Grid Search for LGBM-Regressor:

In [6]:
# LGBM-Regressor
param_grid = {
    'estimator__learning_rate': [0.1, 0.01, 0.001],
    'estimator__n_estimators': [100, 200, 500],
    'estimator__max_depth': [3, 5, 7],
    'estimator__num_leaves': [31, 50, 100],
    'estimator__subsample': [0.8, 1.0],
    'estimator__colsample_bytree': [0.8, 1.0]
}

In [7]:
# load the data
train_df = config.load_traindata_for_regression()
pipelineFactory = PipelineFactory()

# create the baseline pipeline
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.REGRE_PREPROCESSED,
    verbose_level=1,
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=param_grid
)
pipeline.clear_steps()
pipeline.add_new_step(OneHotEncoder(), 'onehot')
pipeline.change_estimator(LGBMRegressor())

pipeline.run()
scores = {}

Starting pipeline using method: EvaluationType.GRID_SEARCH
Performing grid search


100%|██████████| 324/324 [1:09:00<00:00, 12.78s/it]

Finished running the pipeline
Evaluation metrics:
    best_score: 0.3675
    best_params: {'estimator__learning_rate': 0.1, 'estimator__n_estimators': 500, 'estimator__max_depth': 7, 'estimator__num_leaves': 100, 'estimator__subsample': 0.8, 'estimator__colsample_bytree': 1.0}





Grid Search for DecisionTree Regressor

In [9]:
parameters={"estimator__splitter":["best","random"],
            "estimator__max_depth" : [1,3,5,7,9],
           "estimator__min_samples_leaf":[1,2,3,4,],
           "estimator__max_features":["auto","log2","sqrt",None],
           "estimator__max_leaf_nodes":[None,10,20,30,40] }

In [10]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# load the data
train_df = config.load_traindata_for_regression()
pipelineFactory = PipelineFactory()

# create the baseline pipeline
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.REGRE_PREPROCESSED,
    verbose_level=1,
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=parameters
)
pipeline.clear_steps()
pipeline.add_new_step(OneHotEncoder(), 'onehot')
pipeline.change_estimator(DecisionTreeRegressor())

pipeline.run()
scores = {}

Starting pipeline using method: EvaluationType.GRID_SEARCH
Performing grid search


100%|██████████| 800/800 [1:59:44<00:00,  8.98s/it]  

Finished running the pipeline
Evaluation metrics:
    best_score: 0.2319
    best_params: {'estimator__splitter': 'best', 'estimator__max_depth': 9, 'estimator__min_samples_leaf': 1, 'estimator__max_features': 'auto', 'estimator__max_leaf_nodes': None}



