In [16]:
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src import configuration as config
from src.pipeline.pipeline_factory import PipelineFactory, ModelType, EvaluationType

from sklearn.tree import DecisionTreeRegressor
from category_encoders.one_hot import OneHotEncoder
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor


Grid Search for LGBM-Regressor:

In [6]:
# LGBM-Regressor
param_grid = {
    'estimator__learning_rate': [0.1, 0.01, 0.001],
    'estimator__n_estimators': [100, 200, 500],
    'estimator__max_depth': [3, 5, 7],
    'estimator__num_leaves': [31, 50, 100],
    'estimator__subsample': [0.8, 1.0],
    'estimator__colsample_bytree': [0.8, 1.0]
}

In [7]:
# load the data
train_df = config.load_traindata_for_regression()
pipelineFactory = PipelineFactory()

# create the baseline pipeline
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.REGRE_PREPROCESSED,
    verbose_level=1,
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=param_grid
)
pipeline.clear_steps()
pipeline.add_new_step(OneHotEncoder(), 'onehot')
pipeline.change_estimator(LGBMRegressor())

pipeline.run()
scores = {}

Starting pipeline using method: EvaluationType.GRID_SEARCH
Performing grid search


100%|██████████| 324/324 [1:09:00<00:00, 12.78s/it]

Finished running the pipeline
Evaluation metrics:
    best_score: 0.3675
    best_params: {'estimator__learning_rate': 0.1, 'estimator__n_estimators': 500, 'estimator__max_depth': 7, 'estimator__num_leaves': 100, 'estimator__subsample': 0.8, 'estimator__colsample_bytree': 1.0}





Grid Search for DecisionTree Regressor

In [9]:
parameters={"estimator__splitter":["best","random"],
            "estimator__max_depth" : [1,3,5,7,9],
           "estimator__min_samples_leaf":[1,2,3,4,],
           "estimator__max_features":["auto","log2","sqrt",None],
           "estimator__max_leaf_nodes":[None,10,20,30,40] }

In [10]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# load the data
train_df = config.load_traindata_for_regression()
pipelineFactory = PipelineFactory()

# create the baseline pipeline
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.REGRE_PREPROCESSED,
    verbose_level=1,
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=parameters
)
pipeline.clear_steps()
pipeline.add_new_step(OneHotEncoder(), 'onehot')
pipeline.change_estimator(DecisionTreeRegressor())

pipeline.run()
scores = {}

Starting pipeline using method: EvaluationType.GRID_SEARCH
Performing grid search


100%|██████████| 800/800 [1:59:44<00:00,  8.98s/it]  

Finished running the pipeline
Evaluation metrics:
    best_score: 0.2319
    best_params: {'estimator__splitter': 'best', 'estimator__max_depth': 9, 'estimator__min_samples_leaf': 1, 'estimator__max_features': 'auto', 'estimator__max_leaf_nodes': None}





Grid Search for Random Forest Classifier

In [17]:
param_grid = {
    'estimator__n_estimators': [100, 200],
    'estimator__max_depth': [None, 5, 10],  
    'estimator__min_samples_leaf': [1, 2, 4],
    'estimator__max_features': ['auto', 'sqrt'], 
}

In [18]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# load the data
train_df = config.load_traindata_for_regression()
pipelineFactory = PipelineFactory()

# create the baseline pipeline
pipeline = pipelineFactory.create_pipeline(
    train_df,
    ModelType.REGRE_PREPROCESSED,
    verbose_level=1,
    evaluation=EvaluationType.GRID_SEARCH,
    param_grid=parameters
)
pipeline.clear_steps()
pipeline.add_new_step(OneHotEncoder(), 'onehot')
pipeline.change_estimator(RandomForestRegressor())

pipeline.run()
scores = {}

Starting pipeline using method: EvaluationType.GRID_SEARCH
Performing grid search


  0%|          | 0/800 [00:00<?, ?it/s]

  0%|          | 1/800 [00:00<11:57,  1.11it/s]

Invalid parameter 'splitter' for estimator RandomForestRegressor(). Valid parameters are: ['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].


  0%|          | 2/800 [00:02<13:34,  1.02s/it]

Invalid parameter 'splitter' for estimator RandomForestRegressor(). Valid parameters are: ['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].


  0%|          | 3/800 [00:02<13:16,  1.00it/s]

Invalid parameter 'splitter' for estimator RandomForestRegressor(). Valid parameters are: ['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].


  0%|          | 4/800 [00:03<13:11,  1.01it/s]

Invalid parameter 'splitter' for estimator RandomForestRegressor(). Valid parameters are: ['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].


  1%|          | 5/800 [00:05<14:06,  1.06s/it]

Invalid parameter 'splitter' for estimator RandomForestRegressor(). Valid parameters are: ['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].


  1%|          | 6/800 [00:06<15:29,  1.17s/it]

Invalid parameter 'splitter' for estimator RandomForestRegressor(). Valid parameters are: ['bootstrap', 'ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'max_samples', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_jobs', 'oob_score', 'random_state', 'verbose', 'warm_start'].


  1%|          | 6/800 [00:06<15:21,  1.16s/it]

CTRL+C pressed. Stopping the execution...
Finished running the pipeline
Evaluation metrics:
    best_score: -9223372036854775807
    best_params: {}



