In [12]:
import numpy as np
np.mean([0.9811991983356031, 0.9775999181916318, 0.9819993984396399, 0.9807997584156638, 0.9851998387837438, 0.9818009987438286, 0.9837987984635966, 0.9854000388397711, 0.9836002787437711, 0.9806006385757661, 0.9862001189197934, 0.9812001585277184])

0.9824499285817105

In [8]:
np.mean([0.46279279558254016, 0.4274040750073275, 0.37198526681218375, 0.4565850335335094, 0.40799655195011414, 0.45617911231679237, 0.4838104407929939, 0.4703862348778743, 0.43080723591176123, 0.5305794783420267, 0.43219963450287135, 0.39239475034164834])

0.4435933841643036

In [11]:
np.median([0.46279279558254016, 0.4274040750073275, 0.37198526681218375, 0.4565850335335094])

0.44199455427041845

In [22]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

# Load a sample dataset (in this case, the Iris dataset)
iris = load_iris()
X, y = iris.data, iris.target

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    # Initialize the classifier variable
    classifier = None

    # Choose a classifier from the available options
    classifier_name = trial.suggest_categorical('classifier', ['LogisticRegression', 'SGDClassifier', 'RandomForest', 
                                                               'AdaBoost', 'GradientBoost', 'BaggingClassifier', 
                                                               'ExtraTreesClassifier', 'HistGradientBoostingClassifier', 
                                                               'DecisionTreeClassifier', 'XGBClassifier', 'KNeighborsClassifier'])

    # Instantiate the selected classifier with hyperparameters suggested by Optuna
    if classifier_name == 'LogisticRegression':
        classifier = LogisticRegression(C=trial.suggest_loguniform('C', 1e-5, 1e5))
    elif classifier_name == 'SGDClassifier':
        classifier = SGDClassifier(alpha=trial.suggest_loguniform('alpha', 1e-5, 1e-1))
    elif classifier_name == 'RandomForest':
        classifier = RandomForestClassifier(n_estimators=trial.suggest_int('n_estimators', 10, 100))
    elif classifier_name == 'AdaBoost':
        classifier = AdaBoostClassifier(n_estimators=trial.suggest_int('n_estimators', 10, 100))
    elif classifier_name == 'GradientBoost':
        classifier = GradientBoostingClassifier(n_estimators=trial.suggest_int('n_estimators', 10, 100))
    elif classifier_name == 'BaggingClassifier':
        classifier = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                       n_estimators=trial.suggest_int('n_estimators', 10, 100))
    elif classifier_name == 'ExtraTreesClassifier':
        classifier = ExtraTreesClassifier(n_estimators=trial.suggest_int('n_estimators', 10, 100))
    elif classifier_name == 'HistGradientBoostingClassifier':
        classifier = HistGradientBoostingClassifier(max_iter=trial.suggest_int('max_iter', 10, 100))
    elif classifier_name == 'DecisionTreeClassifier':
        classifier = DecisionTreeClassifier()
    elif classifier_name == 'XGBClassifier':
        classifier = XGBClassifier(n_estimators=trial.suggest_int('n_estimators', 10, 100))
    elif classifier_name == 'KNeighborsClassifier':
        classifier = KNeighborsClassifier(n_neighbors=trial.suggest_int('n_neighbors', 1, 10))

    if classifier is None:
        raise ValueError(f"Invalid classifier name: {classifier_name}")

    # Training the classifier
    classifier.fit(X_train, y_train)

    # Pruning callback to stop the trial if it's unpromising
    trial.report(accuracy_score(y_val, classifier.predict(X_val)), step=trial.number)
    if trial.should_prune():
        raise optuna.TrialPruned()

    return accuracy_score(y_val, classifier.predict(X_val))

# Set up the optimization study
# Set up the optimization study with SuccessiveHalvingPruner
study = optuna.create_study(direction='maximize', pruner=optuna.pruners.SuccessiveHalvingPruner())
study.optimize(objective, n_trials=100)


# Print the best parameters and their corresponding accuracy
print("Best trial:")
trial = study.best_trial
print("Classifier:", trial.params['classifier'])
print("Parameters:", trial.params)
print("Accuracy:", trial.value)

study.trials_dataframe()[study.trials_dataframe()['state']!='PRUNED']

[I 2024-01-27 09:58:41,597] A new study created in memory with name: no-name-d0421ffb-4dab-4854-9df1-515f576732b3


[I 2024-01-27 09:58:41,788] Trial 0 finished with value: 1.0 and parameters: {'classifier': 'RandomForest', 'n_estimators': 93}. Best is trial 0 with value: 1.0.
[I 2024-01-27 09:58:41,887] Trial 1 finished with value: 1.0 and parameters: {'classifier': 'RandomForest', 'n_estimators': 31}. Best is trial 0 with value: 1.0.
[I 2024-01-27 09:58:42,013] Trial 2 finished with value: 1.0 and parameters: {'classifier': 'AdaBoost', 'n_estimators': 33}. Best is trial 0 with value: 1.0.
[I 2024-01-27 09:58:42,061] Trial 3 finished with value: 1.0 and parameters: {'classifier': 'ExtraTreesClassifier', 'n_estimators': 34}. Best is trial 0 with value: 1.0.
[I 2024-01-27 09:58:42,155] Trial 4 finished with value: 1.0 and parameters: {'classifier': 'RandomForest', 'n_estimators': 49}. Best is trial 0 with value: 1.0.
[I 2024-01-27 09:58:42,281] Trial 5 finished with value: 1.0 and parameters: {'classifier': 'ExtraTreesClassifier', 'n_estimators': 56}. Best is trial 0 with value: 1.0.
  classifier = S

Best trial:
Classifier: RandomForest
Parameters: {'classifier': 'RandomForest', 'n_estimators': 93}
Accuracy: 1.0


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_alpha,params_classifier,params_max_iter,params_n_estimators,params_n_neighbors,system_attrs_completed_rung_0,system_attrs_completed_rung_1,system_attrs_completed_rung_2,system_attrs_completed_rung_3,state
0,0,1.0,2024-01-27 09:58:41.599115,2024-01-27 09:58:41.787290,0 days 00:00:00.188175,,,RandomForest,,93.0,,,,,,COMPLETE
1,1,1.0,2024-01-27 09:58:41.790297,2024-01-27 09:58:41.887513,0 days 00:00:00.097216,,,RandomForest,,31.0,,1.0,,,,COMPLETE
2,2,1.0,2024-01-27 09:58:41.889842,2024-01-27 09:58:42.013166,0 days 00:00:00.123324,,,AdaBoost,,33.0,,1.0,,,,COMPLETE
3,3,1.0,2024-01-27 09:58:42.014188,2024-01-27 09:58:42.061273,0 days 00:00:00.047085,,,ExtraTreesClassifier,,34.0,,1.0,,,,COMPLETE
4,4,1.0,2024-01-27 09:58:42.063248,2024-01-27 09:58:42.155646,0 days 00:00:00.092398,,,RandomForest,,49.0,,1.0,1.0,,,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,1.0,2024-01-27 09:58:50.612026,2024-01-27 09:58:50.763600,0 days 00:00:00.151574,,,GradientBoost,,41.0,,1.0,1.0,1.0,1.0,COMPLETE
96,96,1.0,2024-01-27 09:58:50.765597,2024-01-27 09:58:51.004163,0 days 00:00:00.238566,,,RandomForest,,96.0,,1.0,1.0,1.0,1.0,COMPLETE
97,97,1.0,2024-01-27 09:58:51.005211,2024-01-27 09:58:51.162212,0 days 00:00:00.157001,,,RandomForest,,91.0,,1.0,1.0,1.0,1.0,COMPLETE
98,98,1.0,2024-01-27 09:58:51.163211,2024-01-27 09:58:51.279006,0 days 00:00:00.115795,,,AdaBoost,,30.0,,1.0,1.0,1.0,1.0,COMPLETE


In [23]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier

# Load a sample dataset (in this case, the Iris dataset)
iris = load_iris()
X, y = iris.data, iris.target

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    # Choose a classifier from the available options
    classifier_name = trial.suggest_categorical('classifier', ['LogisticRegression', 'SGDClassifier', 'RandomForest', 
                                                               'AdaBoost', 'GradientBoost', 'BaggingClassifier', 
                                                               'ExtraTreesClassifier', 'HistGradientBoostingClassifier', 
                                                               'DecisionTreeClassifier', 'XGBClassifier', 'KNeighborsClassifier'])

    # Initialize the classifier variable
    classifier = None

    # Instantiate the selected classifier with hyperparameters suggested by Optuna
    if classifier_name == 'LogisticRegression':
        classifier = LogisticRegression(C=trial.suggest_loguniform('C', 1e-5, 1e5))
    elif classifier_name == 'SGDClassifier':
        classifier = SGDClassifier(alpha=trial.suggest_loguniform('alpha', 1e-5, 1e-1))
    elif classifier_name == 'RandomForest':
        classifier = RandomForestClassifier(n_estimators=trial.suggest_int('n_estimators', 10, 100))
    elif classifier_name == 'AdaBoost':
        classifier = AdaBoostClassifier(n_estimators=trial.suggest_int('n_estimators', 10, 100))
    elif classifier_name == 'GradientBoost':
        classifier = GradientBoostingClassifier(n_estimators=trial.suggest_int('n_estimators', 10, 100))
    elif classifier_name == 'BaggingClassifier':
        classifier = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                       n_estimators=trial.suggest_int('n_estimators', 10, 100))
    elif classifier_name == 'ExtraTreesClassifier':
        classifier = ExtraTreesClassifier(n_estimators=trial.suggest_int('n_estimators', 10, 100))
    elif classifier_name == 'HistGradientBoostingClassifier':
        classifier = HistGradientBoostingClassifier(max_iter=trial.suggest_int('max_iter', 10, 100))
    elif classifier_name == 'DecisionTreeClassifier':
        classifier = DecisionTreeClassifier()
    elif classifier_name == 'XGBClassifier':
        classifier = XGBClassifier(n_estimators=trial.suggest_int('n_estimators', 10, 100))
    elif classifier_name == 'KNeighborsClassifier':
        classifier = KNeighborsClassifier(n_neighbors=trial.suggest_int('n_neighbors', 1, 10))

    if classifier is None:
        raise ValueError(f"Invalid classifier name: {classifier_name}")

    # Training the classifier
    classifier.fit(X_train, y_train)

    # Pruning callback to stop the trial if it's unpromising
    trial.report(accuracy_score(y_val, classifier.predict(X_val)), step=trial.number)
    if trial.should_prune():
        raise optuna.TrialPruned()

    return accuracy_score(y_val, classifier.predict(X_val))

# Set up the optimization study for each classifier
study_lr = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study_sgd = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study_rf = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
# ... Create separate studies for other classifiers ...

# Optimize each study independently
study_lr.optimize(objective, n_trials=100)
study_sgd.optimize(objective, n_trials=100)
study_rf.optimize(objective, n_trials=100)
# ... Optimize other studies for other classifiers ...

# Print the best parameters and their corresponding accuracy for each classifier
print("Best trial for Logistic Regression:")
print(study_lr.best_trial.params)
print("Accuracy:", study_lr.best_trial.value)

print("\nBest trial for SGD Classifier:")
print(study_sgd.best_trial.params)
print("Accuracy:", study_sgd.best_trial.value)

print("\nBest trial for Random Forest:")
print(study_rf.best_trial.params)
print("Accuracy:", study_rf.best_trial.value)
# ... Print results for other classifiers ...

study.trials_dataframe()[study.trials_dataframe()['state']=='PRUNED']

[I 2024-01-27 09:59:05,912] A new study created in memory with name: no-name-de6337b4-b08d-4211-b8cf-6e9e7c507129
[I 2024-01-27 09:59:05,914] A new study created in memory with name: no-name-503b9f9d-d982-40ee-826f-fde865a510e9
[I 2024-01-27 09:59:05,918] A new study created in memory with name: no-name-5a9abe3a-b079-4370-80c8-e26acf4246a9
  classifier = SGDClassifier(alpha=trial.suggest_loguniform('alpha', 1e-5, 1e-1))
[I 2024-01-27 09:59:05,923] Trial 0 finished with value: 0.9333333333333333 and parameters: {'classifier': 'SGDClassifier', 'alpha': 0.0001046372968062504}. Best is trial 0 with value: 0.9333333333333333.
  classifier = SGDClassifier(alpha=trial.suggest_loguniform('alpha', 1e-5, 1e-1))
[I 2024-01-27 09:59:05,929] Trial 1 finished with value: 1.0 and parameters: {'classifier': 'SGDClassifier', 'alpha': 0.006237915040117193}. Best is trial 1 with value: 1.0.
[I 2024-01-27 09:59:05,953] Trial 2 finished with value: 1.0 and parameters: {'classifier': 'XGBClassifier', 'n_est

Best trial for Logistic Regression:
{'classifier': 'SGDClassifier', 'alpha': 0.006237915040117193}
Accuracy: 1.0

Best trial for SGD Classifier:
{'classifier': 'KNeighborsClassifier', 'n_neighbors': 5}
Accuracy: 1.0

Best trial for Random Forest:
{'classifier': 'AdaBoost', 'n_estimators': 21}
Accuracy: 1.0


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_alpha,params_classifier,params_max_iter,params_n_estimators,params_n_neighbors,system_attrs_completed_rung_0,system_attrs_completed_rung_1,system_attrs_completed_rung_2,system_attrs_completed_rung_3,state
6,6,0.8,2024-01-27 09:58:42.284169,2024-01-27 09:58:42.295514,0 days 00:00:00.011345,,0.000492,SGDClassifier,,,,0.8,,,,PRUNED
12,12,0.966667,2024-01-27 09:58:42.827190,2024-01-27 09:58:42.847765,0 days 00:00:00.020575,0.049516,,LogisticRegression,,,,0.966667,,,,PRUNED
24,24,0.9,2024-01-27 09:58:44.083941,2024-01-27 09:58:44.101490,0 days 00:00:00.017549,,0.086437,SGDClassifier,,,,0.9,,,,PRUNED
37,37,0.733333,2024-01-27 09:58:45.559953,2024-01-27 09:58:45.570953,0 days 00:00:00.011000,,1.2e-05,SGDClassifier,,,,0.733333,,,,PRUNED
46,46,0.3,2024-01-27 09:58:46.174582,2024-01-27 09:58:46.198597,0 days 00:00:00.024015,1.5e-05,,LogisticRegression,,,,0.3,,,,PRUNED
57,57,0.966667,2024-01-27 09:58:47.073593,2024-01-27 09:58:47.091676,0 days 00:00:00.018083,,0.093272,SGDClassifier,,,,0.966667,,,,PRUNED
78,78,0.966667,2024-01-27 09:58:49.342817,2024-01-27 09:58:49.354818,0 days 00:00:00.012001,,0.001109,SGDClassifier,,,,0.966667,,,,PRUNED


In [2]:
pwd

'f:\\iNeuron\\Projects\\scania_failures_2\\notebooks'

In [25]:
import os
os.chdir("f:\\iNeuron\\Projects\\scania_failures_2")

from src.utils import (load_yaml,save_yaml,save_binary,
                       eval_metrics, parameter_tuning, best_model_finder, 
                       stacking_clf_trainer, voting_clf_trainer, model_trainer, mlflow_logger)
from src.constants import *
from src.components.stage_3_data_split import data_splitting_component
from src.components.stage_4_final_preprocessing import stage_4_final_processing_component
from src.config.configuration_manager import ConfigurationManager
from src.entity.entity_config import (Stage2ProcessingConf,
                                      ModelMetricsConf, 
                                      ModelTrainerConf, 
                                      PreprocessorConf, 
                                      DataSplitConf,
                                      Stage1ProcessingConf)
from src import logger

import optuna
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold

from src.utils import eval_metrics
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.impute import KNNImputer
from imblearn.combine import SMOTETomek


In [16]:
conf_obj = ConfigurationManager()
stage_2_obj = conf_obj.get_stage2_processing_config()
model_metrics_obj = conf_obj.get_metric_config()
model_config_obj = conf_obj.get_model_config()
data_split_obj = conf_obj.get_data_split_config()
preprocessor_obj = conf_obj.get_preprocessor_config()
stage_1_obj = conf_obj.get_stage1_processing_config()

size = 2000
stage_3_data_split_obj = data_splitting_component(data_split_conf = data_split_obj,
                                                          stage1_processor_conf = stage_1_obj)
pre_train_df, pre_test_df = stage_3_data_split_obj.data_splitting(size)

stage_4_final_processing_obj = stage_4_final_processing_component(data_split_conf = data_split_obj,
                                                                    stage_2_processor_conf = stage_2_obj,
                                                                    preprocessor_conf = preprocessor_obj)
train_df, test_df = stage_4_final_processing_obj.final_processing(pre_train_df, pre_test_df)


print ("Train data's shape: ", train_df.shape)
print ("Test data's shape: ", test_df.shape)

x_train = train_df.drop(columns = 'class')
y_train = train_df['class']

x_test = test_df.drop(columns = 'class')
y_test = test_df['class']

print(f"\nx_train shape: {x_train.shape}, y_train shape: {y_train.shape}")
print(f"x_test shape: {x_test.shape}, y_test shape: {y_test.shape}")
print(f"\nNA values in x_train: {x_train.isna().sum().unique()}")
print(f"NA values in x_test: {x_test.isna().sum().unique()}")
print(f"\nTarget value counts in y_train: {y_train.value_counts()}")
print(f"\nTarget value counts in y_test: {y_test.value_counts()}")

[2024-01-27 14:04:05,103: INFO: utils: config.yaml yaml_file is loaded]
[2024-01-27 14:04:05,113: INFO: utils: params.yaml yaml_file is loaded]
[2024-01-27 14:04:05,117: INFO: utils: schema.yaml yaml_file is loaded]
Size:  2000
Pre_train_data shape:  (1500, 171) 
Pre_test_data shape:  (500, 171)
[2024-01-27 14:04:06,291: INFO: utils: config.yaml yaml_file is loaded]
[2024-01-27 14:04:06,293: INFO: utils: params.yaml yaml_file is loaded]
[2024-01-27 14:04:06,295: INFO: utils: schema.yaml yaml_file is loaded]
[2024-01-27 14:04:06,297: INFO: utils: schema.yaml yaml_file is loaded]
[2024-01-27 14:04:06,299: INFO: utils: Stage 2 Processing Commencing]
[2024-01-27 14:04:06,306: INFO: utils: Pipeline created with KnnImputer, RobustScaler]
[2024-01-27 14:04:06,307: INFO: utils: SmoteTomek obj created]
[2024-01-27 14:04:06,315: INFO: utils: Commencing pipeline transformation]
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.7s
[Pipeline] ..... (step 2 of 2) Processing Robust_

In [14]:
params = {"learning_rate":optuna.distributions.CategoricalDistribution(['constant', 'invscaling', 'adaptive']),
          'hidden_layer_sizes': optuna.distributions.CategoricalDistribution([(500, 300, 200, 150, 50), (700, 500, 300, 100, ), (1500, 800, 400, 200, )])}
optuna_search = optuna.integration.OptunaSearchCV(estimator = MLPClassifier(),
                                  param_distributions = params,
                                  cv = 5,
                                  enable_pruning = True,
                                  n_jobs = -1,
                                  max_iter = 5,
                                  n_trials = 10,
                                  refit = True,
                                  scoring = 'accuracy')
optuna_search.fit(x_train,y_train)

[I 2024-01-27 13:14:59,193] A new study created in memory with name: no-name-eb4071f9-4ec7-421e-9524-1cadc2659d00
[I 2024-01-27 13:16:01,763] Trial 0 finished with value: 0.8616340389904122 and parameters: {'learning_rate': 'invscaling', 'hidden_layer_sizes': (500, 300, 200, 150, 50)}. Best is trial 0 with value: 0.8616340389904122.
[I 2024-01-27 13:16:01,904] Trial 6 finished with value: 0.8313144292433906 and parameters: {'learning_rate': 'adaptive', 'hidden_layer_sizes': (500, 300, 200, 150, 50)}. Best is trial 0 with value: 0.8616340389904122.
[I 2024-01-27 13:16:01,965] Trial 2 finished with value: 0.8660621776720875 and parameters: {'learning_rate': 'constant', 'hidden_layer_sizes': (500, 300, 200, 150, 50)}. Best is trial 2 with value: 0.8660621776720875.
[I 2024-01-27 13:16:02,091] Trial 5 finished with value: 0.8234663441691715 and parameters: {'learning_rate': 'adaptive', 'hidden_layer_sizes': (500, 300, 200, 150, 50)}. Best is trial 2 with value: 0.8660621776720875.
[I 2024-

In [2]:
import pandas as pd
data_path = "F:\iNeuron\Projects\scania_failures_2\\artifacts\data\processed\stage_1_processing\preprocessed_train_data.csv"
data = pd.read_csv(data_path).iloc[:2000,:]

In [21]:
data['class'].value_counts()

class
0    1962
1      38
Name: count, dtype: int64

In [22]:
data.shape

(2000, 171)

In [23]:
data.isna().sum().unique()

array([   0, 1544,  115,  483,   78,   23,   21,   18,  147,   22,   17,
         85,   24,  750,  891, 1297, 1461, 1557, 1608, 1638, 1657,   26,
          4,  103,   11,  145,   25,   13,  303,  319,  447,  135,  321],
      dtype=int64)

In [9]:
import numpy as np

skf = StratifiedKFold(n_splits=5,shuffle= False, random_state=None)

def objective(trial, data = data):
  # train_x,test_x,y_train,y_test=train_test_split(data,target,test_size=0.20,random_state=25)
  # param= {list(config['optuna']['ExtraTreesClassifier'].keys())[0] : eval(config['optuna']['ExtraTreesClassifier']['penalty'])}

  # preprocessor_config = obj.get_preprocessor_config()
  # schema = load_yaml(obj.schema)
  # target = list(schema.Target.keys())[0]
  pipeline = Pipeline(steps=[('Knn_imputer',KNNImputer()),
                             ('Robust_Scaler',RobustScaler())],
                            verbose=True)
  smote = SMOTETomek(n_jobs=-1,sampling_strategy='minority',random_state=8)
  X = data.drop(columns='class')
  y = data['class']
  # space = {}
  score = []
  space = {
    'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
    'criterion': trial.suggest_categorical('criterion', ['log_loss', 'entropy', 'gini']),
    'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    'class_weight': trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample'])}
  # obj = ConfigurationManager()
  # for key,value in obj.config_path['optuna']['ExtraTreesClassifier'].items():
  #   space[key] = eval(value)
  for fold, (train_indices, test_indices) in enumerate(skf.split(X, y)):
    print ("Fold: ",fold)
    x_train_ = data.drop(columns = 'class').iloc[train_indices]
    y_train_ = data['class'].iloc[train_indices]
    x_test_  = data.drop(columns = 'class').iloc[test_indices]
    y_test_  = data['class'].iloc[test_indices]

    print(x_train_.shape, y_train_.shape, x_test_.shape, y_test_.shape)
    print("Starting pipeline transformation of Xtrain")
    X_train_transformed = pipeline.fit_transform(X = x_train_, y = y_train_)
    print("Starting SMOTE transformation of Xtrain,Ytrain")
    X_train_smote,y_train_smote = smote.fit_resample(X = X_train_transformed,y = y_train_)

    print("Starting pipeline transformation of Xtest")
    X_test_transformed = pipeline.transform(X = x_test_)
    print("Starting SMOTE transformation of Xtest,Ytest")
    X_test_smote,y_test_smote = smote.fit_resample(X = X_test_transformed,y = y_test_)


    print(X_train_smote.shape, y_train_smote.shape, X_test_smote.shape, y_test_smote.shape)

    print (y_train_smote.value_counts())
    print (y_test_smote.value_counts())
    log_reg=ExtraTreesClassifier(**space)
    print("Fitting model")
    log_reg.fit(X_train_smote,y_train_smote)
    y_predict=log_reg.predict(X_test_smote)
    cost = eval_metrics(y_true = y_test_smote , y_pred = y_predict)['Cost']
    print (f"Cost in fold {fold}: {cost}")
    trial.report(cost, fold)
    if trial.should_prune():
        raise optuna.TrialPruned()
    else:
       score.append(cost)
  return np.mean(score)

pruner=optuna.pruners.MedianPruner()
find_param=optuna.create_study(storage='mysql://root:qwerty12345@localhost/example',
                               load_if_exists=True,direction = "minimize",
                               pruner=pruner)
find_param.optimize(objective,n_trials=10)

Cost in fold 3: 79060.0
Fold:  4
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.9s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3118, 170) (3118,) (774, 170) (774,)
class
0    1559
1    1559
Name: count, dtype: int64
class
1    387
0    387
Name: count, dtype: int64
Fitting model


[I 2024-01-27 14:20:24,447] Trial 1 finished with value: 48446.0 and parameters: {'n_estimators': 660, 'criterion': 'log_loss', 'max_features': None, 'class_weight': 'balanced_subsample'}. Best is trial 0 with value: 34360.0.


Cost in fold 4: 31030.0
Fold:  0
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3118, 170) (3118,) (778, 170) (778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model
Cost in fold 0: 32550.0
Fold:  1
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3116, 170) (3116,) (786, 170) (786,)
class
0    1558
1    1558
Na

[I 2024-01-27 14:20:34,076] Trial 2 finished with value: 33158.0 and parameters: {'n_estimators': 278, 'criterion': 'log_loss', 'max_features': 'log2', 'class_weight': 'balanced'}. Best is trial 2 with value: 33158.0.


Cost in fold 4: 33560.0
Fold:  0
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.9s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3118, 170) (3118,) (778, 170) (778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model
Cost in fold 0: 34050.0
Fold:  1
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.9s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3116, 170) (3116,) (786, 170) (786,)
class
0    1558
1    1558
Na

[I 2024-01-27 14:20:46,191] Trial 3 finished with value: 34656.0 and parameters: {'n_estimators': 428, 'criterion': 'gini', 'max_features': 'log2', 'class_weight': 'balanced'}. Best is trial 2 with value: 33158.0.


Cost in fold 4: 33540.0
Fold:  0
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3118, 170) (3118,) (778, 170) (778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model
Cost in fold 0: 33540.0
Fold:  1
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3116, 170) (3116,) (786, 170) (786,)
class
0    1558
1    1558
Na

[I 2024-01-27 14:21:00,858] Trial 4 finished with value: 38350.0 and parameters: {'n_estimators': 510, 'criterion': 'gini', 'max_features': 'sqrt', 'class_weight': 'balanced_subsample'}. Best is trial 2 with value: 33158.0.


Cost in fold 4: 35530.0
Fold:  0
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   1.0s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3118, 170) (3118,) (778, 170) (778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model
Cost in fold 0: 32050.0
Fold:  1
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.9s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3116, 170) (3116,) (786, 170) (786,)
class
0    1558
1    1558
Na

[I 2024-01-27 14:21:10,766] Trial 5 finished with value: 33060.0 and parameters: {'n_estimators': 254, 'criterion': 'log_loss', 'max_features': 'log2', 'class_weight': 'balanced_subsample'}. Best is trial 5 with value: 33060.0.


Cost in fold 4: 32560.0
Fold:  0
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3118, 170) (3118,) (778, 170) (778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model


[I 2024-01-27 14:21:12,684] Trial 6 pruned. 


Cost in fold 0: 37550.0
Fold:  0
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3118, 170) (3118,) (778, 170) (778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model
Cost in fold 0: 33550.0
Fold:  1
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3116, 170) (3116,) (786, 170) (786,)
class
0    1558
1    1558
Na

[I 2024-01-27 14:21:25,792] Trial 7 finished with value: 33458.0 and parameters: {'n_estimators': 583, 'criterion': 'entropy', 'max_features': 'log2', 'class_weight': 'balanced_subsample'}. Best is trial 5 with value: 33060.0.


Cost in fold 4: 35060.0
Fold:  0
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3118, 170) (3118,) (778, 170) (778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model


[I 2024-01-27 14:21:29,007] Trial 8 pruned. 


Cost in fold 0: 34550.0
Fold:  0
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3118, 170) (3118,) (778, 170) (778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model
Cost in fold 0: 32050.0
Fold:  1
(1600, 170) (1600,) (400, 170) (400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.9s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
(3116, 170) (3116,) (786, 170) (786,)
class
0    1558
1    1558
Na

[I 2024-01-27 14:21:44,615] Trial 9 finished with value: 33756.0 and parameters: {'n_estimators': 770, 'criterion': 'log_loss', 'max_features': 'log2', 'class_weight': 'balanced'}. Best is trial 5 with value: 33060.0.


Cost in fold 4: 33060.0


In [20]:
find_param.trials_dataframe()[find_param.trials_dataframe()['state']=='COMPLETE']

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_class_weight,params_criterion,params_max_features,params_n_estimators,state
0,0,34360.0,2024-01-27 14:19:19,2024-01-27 14:19:32,0 days 00:00:13,balanced,entropy,log2,713,COMPLETE
1,1,48446.0,2024-01-27 14:19:32,2024-01-27 14:20:24,0 days 00:00:52,balanced_subsample,log_loss,,660,COMPLETE
2,2,33158.0,2024-01-27 14:20:24,2024-01-27 14:20:34,0 days 00:00:10,balanced,log_loss,log2,278,COMPLETE
3,3,34656.0,2024-01-27 14:20:34,2024-01-27 14:20:46,0 days 00:00:12,balanced,gini,log2,428,COMPLETE
4,4,38350.0,2024-01-27 14:20:46,2024-01-27 14:21:01,0 days 00:00:15,balanced_subsample,gini,sqrt,510,COMPLETE
5,5,33060.0,2024-01-27 14:21:01,2024-01-27 14:21:11,0 days 00:00:10,balanced_subsample,log_loss,log2,254,COMPLETE
7,7,33458.0,2024-01-27 14:21:13,2024-01-27 14:21:26,0 days 00:00:13,balanced_subsample,entropy,log2,583,COMPLETE
9,9,33756.0,2024-01-27 14:21:29,2024-01-27 14:21:45,0 days 00:00:16,balanced,log_loss,log2,770,COMPLETE


In [16]:
find_param.best_value

33060.0

In [26]:
import numpy as np

skf = StratifiedKFold(n_splits=5,shuffle= False, random_state=None)

def objective(trial, data = data):
  # train_x,test_x,y_train,y_test=train_test_split(data,target,test_size=0.20,random_state=25)
  # param= {list(config['optuna']['ExtraTreesClassifier'].keys())[0] : eval(config['optuna']['ExtraTreesClassifier']['penalty'])}

  # preprocessor_config = obj.get_preprocessor_config()
  # schema = load_yaml(obj.schema)
  # target = list(schema.Target.keys())[0]
  pipeline = Pipeline(steps=[('Knn_imputer',KNNImputer()),
                             ('Robust_Scaler',RobustScaler())],
                            verbose=True)
  smote = SMOTETomek(n_jobs=-1,sampling_strategy='minority',random_state=8)
  X = data.drop(columns='class').iloc[:2000,:]
  y = data['class'].iloc[:2000]
  # space = {}
  score = []
  # space = {
  #   'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
  #   'criterion': trial.suggest_categorical('criterion', ['log_loss', 'entropy', 'gini']),
  #   'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
  #   'class_weight': trial.suggest_categorical('class_weight', ['balanced', 'balanced_subsample'])}
  space = {
         'penalty': trial.suggest_categorical('penalty', ['l2', None])
  }
  # obj = ConfigurationManager()
  # for key,value in obj.config_path['optuna']['ExtraTreesClassifier'].items():
  #   space[key] = eval(value)
  for fold, (train_indices, test_indices) in enumerate(skf.split(X, y)):
    print ("\n\nFold: ",fold)
    x_train_ = data.drop(columns = 'class').iloc[train_indices]
    y_train_ = data['class'].iloc[train_indices]
    x_test_  = data.drop(columns = 'class').iloc[test_indices]
    y_test_  = data['class'].iloc[test_indices]

    print(f"x_train_.shape: {x_train_.shape}, y_train_.shape: {y_train_.shape},\nx_test_.shape: {x_test_.shape}, y_test_.shape:{y_test_.shape}")
    print("Starting pipeline transformation of Xtrain")
    X_train_transformed = pipeline.fit_transform(X = x_train_, y = y_train_)
    print("Starting SMOTE transformation of Xtrain,Ytrain")
    X_train_smote,y_train_smote = smote.fit_resample(X = X_train_transformed,y = y_train_)

    print("Starting pipeline transformation of Xtest")
    X_test_transformed = pipeline.transform(X = x_test_)
    print("Starting SMOTE transformation of Xtest,Ytest")
    X_test_smote,y_test_smote = smote.fit_resample(X = X_test_transformed,y = y_test_)

    print(f"x_train_smote.shape: {X_train_smote.shape}, y_train_smote.shape: {y_train_smote.shape},\nx_test_smote.shape: {X_test_smote.shape}, y_test_smote.shape:{y_test_smote.shape}")
    # print(X_train_smote.shape, y_train_smote.shape, X_test_smote.shape, y_test_smote.shape)

    print (y_train_smote.value_counts())
    print (y_test_smote.value_counts())
    log_reg=LogisticRegression(**space)
    print("Fitting model")
    log_reg.fit(X_train_smote,y_train_smote)
    y_predict=log_reg.predict(X_test_smote)
    cost = eval_metrics(y_true = y_test_smote , y_pred = y_predict)['Cost']
    print (f"\nCost in fold {fold}: {cost}")
    trial.report(cost, fold)
    if trial.should_prune():
        raise optuna.TrialPruned()
    else:
       score.append(cost)
  return np.mean(score)

pruner=optuna.pruners.MedianPruner()
find_param=optuna.create_study(storage='mysql://root:qwerty12345@localhost/example',
                               load_if_exists=True,direction = "minimize",
                               pruner=pruner)
find_param.optimize(objective,n_trials=10)

[I 2024-01-27 15:10:05,271] A new study created in RDB with name: no-name-b2e005f9-b0c3-4de7-b0e0-468fafe85a05




Fold:  0
x_train_.shape: (1600, 170), y_train_.shape: (1600,),
x_test_.shape: (400, 170), y_test_.shape:(400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (778, 170), y_test_smote.shape:(778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model

Cost in fold 0: 2220.0


Fold:  1
x_train_.shape: (1600, 170), y_train_.shape: (1600,),
x_test_.shape: (400, 170), y_test_.shape:(400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting

[I 2024-01-27 15:10:11,186] Trial 0 finished with value: 23910.0 and parameters: {'penalty': 'l2'}. Best is trial 0 with value: 23910.0.


Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (774, 170), y_test_smote.shape:(774,)
class
0    1559
1    1559
Name: count, dtype: int64
class
1    387
0    387
Name: count, dtype: int64
Fitting model

Cost in fold 4: 43060.0


Fold:  0
x_train_.shape: (1600, 170), y_train_.shape: (1600,),
x_test_.shape: (400, 170), y_test_.shape:(400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.7s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (778, 170), y_test_smote.shape:(778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model

Cost in fold 0: 2220.

[I 2024-01-27 15:10:16,914] Trial 1 finished with value: 23910.0 and parameters: {'penalty': 'l2'}. Best is trial 0 with value: 23910.0.


Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (774, 170), y_test_smote.shape:(774,)
class
0    1559
1    1559
Name: count, dtype: int64
class
1    387
0    387
Name: count, dtype: int64
Fitting model

Cost in fold 4: 43060.0


Fold:  0
x_train_.shape: (1600, 170), y_train_.shape: (1600,),
x_test_.shape: (400, 170), y_test_.shape:(400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.7s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (778, 170), y_test_smote.shape:(778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model

Cost in fold 0: 2200.

[I 2024-01-27 15:10:22,647] Trial 2 finished with value: 26046.0 and parameters: {'penalty': None}. Best is trial 0 with value: 23910.0.


Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (774, 170), y_test_smote.shape:(774,)
class
0    1559
1    1559
Name: count, dtype: int64
class
1    387
0    387
Name: count, dtype: int64
Fitting model

Cost in fold 4: 41130.0


Fold:  0
x_train_.shape: (1600, 170), y_train_.shape: (1600,),
x_test_.shape: (400, 170), y_test_.shape:(400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (778, 170), y_test_smote.shape:(778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model

Cost in fold 0: 2220.

[I 2024-01-27 15:10:28,339] Trial 3 finished with value: 23910.0 and parameters: {'penalty': 'l2'}. Best is trial 0 with value: 23910.0.


Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (774, 170), y_test_smote.shape:(774,)
class
0    1559
1    1559
Name: count, dtype: int64
class
1    387
0    387
Name: count, dtype: int64
Fitting model

Cost in fold 4: 43060.0


Fold:  0
x_train_.shape: (1600, 170), y_train_.shape: (1600,),
x_test_.shape: (400, 170), y_test_.shape:(400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (778, 170), y_test_smote.shape:(778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model

Cost in fold 0: 2220.

[I 2024-01-27 15:10:34,444] Trial 4 finished with value: 23910.0 and parameters: {'penalty': 'l2'}. Best is trial 0 with value: 23910.0.


Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (774, 170), y_test_smote.shape:(774,)
class
0    1559
1    1559
Name: count, dtype: int64
class
1    387
0    387
Name: count, dtype: int64
Fitting model

Cost in fold 4: 43060.0


Fold:  0
x_train_.shape: (1600, 170), y_train_.shape: (1600,),
x_test_.shape: (400, 170), y_test_.shape:(400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (778, 170), y_test_smote.shape:(778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model

Cost in fold 0: 2220.

[I 2024-01-27 15:10:40,575] Trial 5 finished with value: 23910.0 and parameters: {'penalty': 'l2'}. Best is trial 0 with value: 23910.0.


Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (774, 170), y_test_smote.shape:(774,)
class
0    1559
1    1559
Name: count, dtype: int64
class
1    387
0    387
Name: count, dtype: int64
Fitting model

Cost in fold 4: 43060.0


Fold:  0
x_train_.shape: (1600, 170), y_train_.shape: (1600,),
x_test_.shape: (400, 170), y_test_.shape:(400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (778, 170), y_test_smote.shape:(778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model

Cost in fold 0: 2220.

[I 2024-01-27 15:10:46,658] Trial 6 finished with value: 23910.0 and parameters: {'penalty': 'l2'}. Best is trial 0 with value: 23910.0.


Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (774, 170), y_test_smote.shape:(774,)
class
0    1559
1    1559
Name: count, dtype: int64
class
1    387
0    387
Name: count, dtype: int64
Fitting model

Cost in fold 4: 43060.0


Fold:  0
x_train_.shape: (1600, 170), y_train_.shape: (1600,),
x_test_.shape: (400, 170), y_test_.shape:(400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (778, 170), y_test_smote.shape:(778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model

Cost in fold 0: 2220.

[I 2024-01-27 15:10:52,863] Trial 7 finished with value: 23910.0 and parameters: {'penalty': 'l2'}. Best is trial 0 with value: 23910.0.


Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (774, 170), y_test_smote.shape:(774,)
class
0    1559
1    1559
Name: count, dtype: int64
class
1    387
0    387
Name: count, dtype: int64
Fitting model

Cost in fold 4: 43060.0


Fold:  0
x_train_.shape: (1600, 170), y_train_.shape: (1600,),
x_test_.shape: (400, 170), y_test_.shape:(400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (778, 170), y_test_smote.shape:(778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model

Cost in fold 0: 2200.

[I 2024-01-27 15:10:57,734] Trial 8 pruned. 


Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3124, 170), y_train_smote.shape: (3124,),
x_test_smote.shape: (780, 170), y_test_smote.shape:(780,)
class
0    1562
1    1562
Name: count, dtype: int64
class
1    390
0    390
Name: count, dtype: int64
Fitting model

Cost in fold 3: 17270.0


Fold:  0
x_train_.shape: (1600, 170), y_train_.shape: (1600,),
x_test_.shape: (400, 170), y_test_.shape:(400,)
Starting pipeline transformation of Xtrain
[Pipeline] ....... (step 1 of 2) Processing Knn_imputer, total=   0.8s
[Pipeline] ..... (step 2 of 2) Processing Robust_Scaler, total=   0.0s
Starting SMOTE transformation of Xtrain,Ytrain
Starting pipeline transformation of Xtest
Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3118, 170), y_train_smote.shape: (3118,),
x_test_smote.shape: (778, 170), y_test_smote.shape:(778,)
class
1    1559
0    1559
Name: count, dtype: int64
class
0    389
1    389
Name: count, dtype: int64
Fitting model

Cost in fold 0: 2200.

[I 2024-01-27 15:11:02,636] Trial 9 pruned. 


Starting SMOTE transformation of Xtest,Ytest
x_train_smote.shape: (3124, 170), y_train_smote.shape: (3124,),
x_test_smote.shape: (780, 170), y_test_smote.shape:(780,)
class
0    1562
1    1562
Name: count, dtype: int64
class
1    390
0    390
Name: count, dtype: int64
Fitting model

Cost in fold 3: 17270.0


In [27]:
find_param.best_value

23910.0

In [28]:
import optuna
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the Iris dataset
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    # Define hyperparameter search space
    C = trial.suggest_loguniform('C', 1e-5, 1e5)
    max_iter = trial.suggest_int('max_iter', 100, 1000)
    
    # Create and train Logistic Regression model
    model = LogisticRegression(C=C, max_iter=max_iter, random_state=42)
    model.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Calculate accuracy as the metric to optimize
    accuracy = accuracy_score(y_test, y_pred)
    
    # Report intermediate result for pruning
    trial.report(accuracy, step=trial.number)
    
    # Handle pruning based on the intermediate result
    if trial.should_prune():
        raise optuna.TrialPruned()

    return 1.0 - accuracy  # Optuna minimizes the objective function

    # Create Optuna study with MedianPruner and optimize hyperparameters
study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=100)

# Print the best hyperparameters found
print("Best hyperparameters:", study.best_params)

# Train the model with the best hyperparameters
best_model = LogisticRegression(**study.best_params, random_state=42)
best_model.fit(X_train, y_train)

# Evaluate the best model on the test set
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)

print(f"Accuracy of the best model: {accuracy_best:.4f}")


[I 2024-01-27 17:19:37,414] A new study created in memory with name: no-name-8d0d4a65-bfd8-4076-a102-6efd1cd92995
[I 2024-01-27 17:19:37,431] Trial 0 finished with value: 0.0 and parameters: {'C': 0.31529490967331053, 'max_iter': 973}. Best is trial 0 with value: 0.0.
[I 2024-01-27 17:19:37,456] Trial 1 finished with value: 0.0 and parameters: {'C': 4128.610575565414, 'max_iter': 221}. Best is trial 0 with value: 0.0.
[I 2024-01-27 17:19:37,473] Trial 2 finished with value: 0.0 and parameters: {'C': 12.232739556326395, 'max_iter': 905}. Best is trial 0 with value: 0.0.
[I 2024-01-27 17:19:37,504] Trial 3 finished with value: 0.0 and parameters: {'C': 466.77354273570484, 'max_iter': 788}. Best is trial 0 with value: 0.0.
[I 2024-01-27 17:19:37,519] Trial 4 finished with value: 0.0 and parameters: {'C': 20043.405694105564, 'max_iter': 537}. Best is trial 0 with value: 0.0.
[I 2024-01-27 17:19:37,564] Trial 5 finished with value: 0.0 and parameters: {'C': 461.2370687050697, 'max_iter': 30

Best hyperparameters: {'C': 4.977965509022495e-05, 'max_iter': 696}
Accuracy of the best model: 0.3000


In [31]:
study.trials_dataframe()[study.trials_dataframe()['state']=='PRUNED']

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_max_iter,state
