In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
# from mlxtend.feature_selection import ColumnSelector
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer,  OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from joblib import Memory


## Load Data

In [11]:
def fetch_adult_data() -> pd.DataFrame:
    """Download OpenML dataset 179 and return it as a single DataFrame.

    The dataset is requested with ``as_frame=True`` and the pandas parser, and
    the ``'frame'`` entry of the returned bunch is handed back, so feature
    columns and the target column arrive together in one table (the caller
    separates the target afterwards).

    Returns:
        The ``'frame'`` object from ``fetch_openml``.

    Note:
        This performs a download (or reads scikit-learn's local data cache),
        so it needs network access on first use.
    """
    # Kept as a function-local import, as in the original cell; the duplicate
    # import statement that used to follow it was redundant and is removed.
    from sklearn.datasets import fetch_openml

    openml_ds = fetch_openml(data_id=179, as_frame=True, parser='pandas')
    return openml_ds['frame']

# Load the full frame returned by the loader above.
dataset = fetch_adult_data()

# Split off the label column: `pop` removes 'class' from `dataset` in place,
# so `dataset` holds only the remaining columns from here on.
target = dataset.pop('class')

# Sanity check of the shapes, then preview the first rows as the cell output.
print(dataset.shape, target.shape)
dataset.head()

(48842, 14) (48842,)


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


In [12]:
# Show dtypes and non-null counts; the preprocessor built below routes each
# column to a branch according to its dtype.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             48842 non-null  category
 1   workclass       46043 non-null  category
 2   fnlwgt          48842 non-null  int64   
 3   education       48842 non-null  category
 4   education-num   48842 non-null  int64   
 5   marital-status  48842 non-null  category
 6   occupation      46033 non-null  category
 7   relationship    48842 non-null  category
 8   race            48842 non-null  category
 9   sex             48842 non-null  category
 10  capitalgain     48842 non-null  category
 11  capitalloss     48842 non-null  category
 12  hoursperweek    48842 non-null  category
 13  native-country  47985 non-null  category
dtypes: category(12), int64(2)
memory usage: 1.3 MB


## Preprocessor Builder

In [13]:
def bool_to_number(x: np.ndarray) -> np.ndarray:
    """Convert boolean values to numbers by multiplying element-wise by 1.

    Args:
        x: Array-like input; boolean entries become 1 (True) or 0 (False).

    Returns:
        The result of ``np.multiply(x, 1)``, with the same shape as ``x``.
    """
    as_numbers = np.multiply(x, 1)
    return as_numbers

# Transformer wrapping `bool_to_number`; feature_names_out='one-to-one' makes
# the output feature names equal to the input feature names.
BooleanTransformer = FunctionTransformer(bool_to_number, feature_names_out = 'one-to-one')

def build_preprocessor_pipeline(dataset: pd.DataFrame, n_jobs_: int = -1, verbose_: bool = False) -> ColumnTransformer:
    """Assemble an unfitted ColumnTransformer based on the dtypes found in ``dataset``.

    Columns are grouped by dtype and each non-empty group gets its own branch:

    * numeric columns  -> mean imputation ("transformer_n")
    * category columns -> most-frequent imputation, then ordinal encoding where
      unknown and missing values are both encoded as -1 ("transformer_c")
    * bool columns     -> ``BooleanTransformer``, then most-frequent imputation
      ("transformer_b")

    Columns matching none of the groups are dropped (``remainder='drop'``), and
    the transformer is configured to return pandas output.

    Args:
        dataset: Frame used only to resolve which column names belong to each
            dtype group; it is not fitted or modified here.
        n_jobs_: Forwarded to ``ColumnTransformer(n_jobs=...)``.
        verbose_: Forwarded to ``ColumnTransformer(verbose=...)``.

    Returns:
        The configured, unfitted ``ColumnTransformer``.
    """
    def columns_of(dtypes):
        # Resolve the selector immediately against `dataset` to get a plain list of names.
        return make_column_selector(dtype_include=dtypes)(dataset)

    num_cols = columns_of([np.number])
    cat_cols = columns_of(['category'])
    bool_cols = columns_of(['bool'])

    branches = []

    if num_cols:
        mean_imputer = SimpleImputer(strategy="mean")
        branches.append(("transformer_n", mean_imputer, num_cols))

    if cat_cols:
        # NOTE(review): int8 codes top out at 127 — confirm no categorical
        # column has more distinct values than that.
        ordinal_encoder = OrdinalEncoder(
            handle_unknown="use_encoded_value",
            dtype=np.int8,
            encoded_missing_value=-1,
            unknown_value=-1,
        )
        categorical_steps = [
            ("imputer_c", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
            ("encoder_c", ordinal_encoder),
        ]
        categorical_branch = Pipeline(steps=categorical_steps, verbose=False, memory=None)
        branches.append(("transformer_c", categorical_branch, cat_cols))

    if bool_cols:
        boolean_steps = [
            ("to_int", BooleanTransformer),
            ("imputer_c", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ]
        boolean_branch = Pipeline(steps=boolean_steps, verbose=False, memory=None)
        branches.append(("transformer_b", boolean_branch, bool_cols))

    preprocessor = ColumnTransformer(
        transformers=branches,
        remainder='drop',
        n_jobs=n_jobs_,
        verbose=verbose_,
        verbose_feature_names_out=False,  # keep the original column names, no branch prefix
    )
    # set_output configures the transformer in place (and returns it).
    preprocessor.set_output(transform='pandas')
    return preprocessor

In [14]:
# Build the unfitted preprocessor from the dtypes of the feature frame;
# the bare name on the last line displays the estimator as the cell output.
preprocessor = build_preprocessor_pipeline(dataset)
preprocessor

In [15]:
# Fit the preprocessor on the feature frame and preview the transformed output
# (a DataFrame, because the preprocessor was built with set_output(transform='pandas')).
dataset_trf = preprocessor.fit_transform(dataset)

dataset_trf.head()

Unnamed: 0,fnlwgt,education-num,age,workclass,education,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,77516.0,13.0,2,6,9,4,0,1,4,1,1,0,2,38
1,83311.0,13.0,3,5,9,2,3,0,4,1,0,0,0,38
2,215646.0,9.0,2,3,11,0,5,1,4,1,0,0,2,38
3,234721.0,7.0,3,3,1,2,5,0,2,1,0,0,2,38
4,338409.0,13.0,1,3,9,2,9,5,2,0,0,0,2,4


## Adding the Estimator to the Pipeline

In [16]:
# Keyword arguments for the random forest: fixed seed, all cores.
args = dict(random_state=10, n_jobs=-1)

# Full model: the preprocessor built earlier followed by the classifier.
# Caching (`memory`) stays off here; the tuning cell enables it on a clone.
model = Pipeline(
    steps=[
        ("transformer", preprocessor),
        ("estimator", RandomForestClassifier(**args)),
    ],
    memory=None,
    verbose=False,
)

## Fitting Data

In [18]:
import tempfile
from sklearn.base import clone
import joblib
import  time

from optuna import samplers, create_study
from optuna.distributions import FloatDistribution, IntDistribution, CategoricalDistribution, IntUniformDistribution
from optuna.integration import OptunaSearchCV

# Search space for the tuner. Keys use scikit-learn's `<step>__<param>` form,
# so each one addresses a parameter of the pipeline's "estimator" step.
param_distributions = {
    "estimator__n_estimators": IntDistribution(10, 300, step=10),
    "estimator__max_depth": IntDistribution(1, 11),
    "estimator__min_impurity_decrease": FloatDistribution(0.000000001, 0.5, log=True),
    # This key used to appear twice in the literal; a dict keeps only the last
    # value for a repeated key, so the earlier FloatDistribution(0.4, 1) entry
    # never took effect. Only the effective (categorical) definition is kept.
    "estimator__max_features": CategoricalDistribution([1.0, "sqrt", "log2"]),
    "estimator__bootstrap": CategoricalDistribution([True, False]),
}



# Persist trials in a local SQLite file so the study survives kernel restarts.
storage_string_ = "sqlite:///./test_2.db" #  optional
sampler_ = samplers.TPESampler(seed=10)
# load_if_exists=True: the storage is persistent, so without it re-running this
# cell fails because a study with this name already exists in the database.
# Trials from earlier runs are then reused and new ones are appended; delete
# the .db file (or change study_name) to start a fresh search.
study_ = create_study(storage=storage_string_,
                      study_name='Randomforest Tuner',
                      direction="maximize",
                      sampler=sampler_,
                      load_if_exists=True)



# Results default to None so later cells can tell that the search failed.
cv_result, best_params, best_model, best_score = None, None, None, None

# These names are bound before the `try` so the `finally` clause can always
# reference them, even when an early step (temp dir creation, clone, ...)
# raises; previously that path hit a NameError or reused stale kernel state.
st_time = time.time()
tempdir, memory_ = None, None
try:
    tempdir = tempfile.TemporaryDirectory()
    # Tune a clone so the module-level `model` is left untouched.
    model_ = clone(model)
    # Cache fitted pipeline transformers on disk during hyperparameter tuning.
    memory_ = Memory(tempdir.name, verbose=0)
    model_.memory = memory_
    model_.verbose = False
    optuna_search = OptunaSearchCV(model_,
                   param_distributions,
                   cv=5,
                   n_trials = 5,
                   n_jobs=-1,
                   random_state=10,
                   refit=True,
                   verbose = 10,
                   timeout = 60*60,
                   study=study_
                   )
    optuna_search.fit(dataset,target)
except Exception as err:
    # Best effort: report the failure and leave the result variables as None.
    print(err)
else:
    cv_result = pd.DataFrame.from_dict(optuna_search.cv_results_)
    best_score = optuna_search.best_score_
    best_model = optuna_search.best_estimator_
    best_params = optuna_search.best_params_
    # The cache directory is removed below, so detach it from the refit model.
    best_model.memory =  None
finally:
    # Clean up only what was actually created.
    if memory_ is not None:
        memory_.clear()
    if tempdir is not None:
        tempdir.cleanup()
    print(f'End 2 End  Time - {time.time() - st_time} secs')

[I 2024-01-15 22:04:51,540] A new study created in RDB with name: Randomforest Tuner
  optuna_search = OptunaSearchCV(model_,
[I 2024-01-15 22:04:51,579] Searching the best hyperparameters using 48842 samples...
[I 2024-01-15 22:05:09,718] Trial 1 finished with value: 0.7607182362198229 and parameters: {'estimator__n_estimators': 100, 'estimator__max_depth': 10, 'estimator__min_impurity_decrease': 0.3810441382224819, 'estimator__max_features': 'log2', 'estimator__bootstrap': False}. Best is trial 1 with value: 0.7607182362198229.
[I 2024-01-15 22:05:11,399] Trial 3 finished with value: 0.8387248805305925 and parameters: {'estimator__n_estimators': 170, 'estimator__max_depth': 6, 'estimator__min_impurity_decrease': 0.0006576437138114963, 'estimator__max_features': 'sqrt', 'estimator__bootstrap': True}. Best is trial 3 with value: 0.8387248805305925.
[I 2024-01-15 22:05:12,046] Trial 0 finished with value: 0.8497195138074449 and parameters: {'estimator__n_estimators': 150, 'estimator__ma

End 2 End  Time - 22.852312326431274 secs
