In [215]:
!pip install catboost optuna lightgbm



In [216]:
import numpy as np
import pandas as pd
from scipy.stats import boxcox
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, \
cross_val_score, cross_validate, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline
import optuna
from optuna.importance import get_param_importances, FanovaImportanceEvaluator
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
f1_score, roc_auc_score, make_scorer
from optuna.pruners import ThresholdPruner
import calendar
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, ClusterCentroids

In [217]:
# Column names assigned, by position, to the frame loaded in get_data_frame
# (they replace whatever header the CSV file itself carries).
updated_columns = ['customer_id', 'purchase_date', 'is_local', 'age', 'gender', 'mobile_model', 'price',
                   'is_from_facebook_page', 'is_facebook_page_follower', 'is_returning_customer',
                   'awareness_through_marketing']

In [218]:
def get_data_frame(apply_price_transformation:bool = False,
                   is_returning_customer_dependent_variable:bool = False)->pd.DataFrame:
    """
    Loads the sales CSV file and returns a model-ready DataFrame.

    Processing steps:
      * rename the columns to ``updated_columns``;
      * encode the purchase date as a point on the yearly cycle
        (``sin_day_of_year`` / ``cos_day_of_year``);
      * drop ``customer_id`` and ``purchase_date``;
      * map the binary categorical columns to 1/0;
      * optionally Box-Cox transform ``price``;
      * reorder the columns so that ``age``, ``price``, ``mobile_model`` come first and the
        dependent variable comes last.

    :param apply_price_transformation: when True, ``price`` is replaced by its Box-Cox transform
    :param is_returning_customer_dependent_variable: when True the last column is
        ``is_returning_customer``, otherwise it is ``is_from_facebook_page``
    :return: DataFrame holding the prepared data from the CSV file
    """
    dataset = pd.read_csv('TechCorner_Sales_update.csv')
    dataset.columns = updated_columns

    # Date handling (day comes first in the source file)
    purchase_date = pd.to_datetime(dataset['purchase_date'], errors='raise', dayfirst=True)

    # Periodic features: position of the purchase day within its year, using 366 days
    # for leap years and 365 otherwise.
    day_of_year = purchase_date.dt.dayofyear
    days_in_year = np.where(purchase_date.dt.is_leap_year, 366, 365)
    dataset['sin_day_of_year'] = np.sin(2 * np.pi * day_of_year / days_in_year)
    dataset['cos_day_of_year'] = np.cos(2 * np.pi * day_of_year / days_in_year)

    # The identifier and the raw date are not used as features
    dataset = dataset.drop(columns=['customer_id', 'purchase_date'])

    # Map two-valued categorical columns to 1/0 instead of performing one hot encoding.
    # Note: Series.map turns any value missing from the mapping into NaN.
    yes_no = {'Yes':1, 'No':0}
    binary_encodings = {
        'is_local': {'Rangamati Sadar':1, 'Inside Rangamati':1, 'Outside Rangamati':0},
        'gender': {'F':1, 'M':0},
        'is_from_facebook_page': yes_no,
        'is_facebook_page_follower': yes_no,
        'is_returning_customer': yes_no,
        'awareness_through_marketing': yes_no,
    }
    for column, encoding in binary_encodings.items():
        dataset[column] = dataset[column].map(encoding)

    if apply_price_transformation:
        # boxcox also returns the fitted lambda, which is not needed here
        dataset['price'], _ = boxcox(dataset['price'])

    last_column = ['is_from_facebook_page']
    if is_returning_customer_dependent_variable:
        last_column = ['is_returning_customer']

    columns_at_start = ["age", "price", "mobile_model"]
    re_ordered_columns = (columns_at_start +
                          [col for col in dataset.columns if col not in columns_at_start + last_column] +
                          last_column)

    return dataset[re_ordered_columns]

In [219]:
pd.set_option('display.max_columns', None)

# Load data set with is_returning_customer as the dependent (last) column
dataset = get_data_frame(False, True)
print(dataset.columns)

# Separate dependent and independent variables
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]


# Split data into Training and Test Set.
# random_state is fixed so the split (and everything tuned on it) is reproducible;
# 0 matches the seed used by the other train/test splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y,
                                                    random_state=0)
# print(len(y[y == 0]))
# print(len(y[y == 1]))


# Apply Standard/Robust Scaling/one hot encoding (column positions follow the
# ordering produced by get_data_frame)
standard_scaling_features = [0] # Age
robust_scaling_features = [1] # Price
categorical_features = [2] # Mobile Model

preprocessor = ColumnTransformer([
    ('standardscaler', StandardScaler(), standard_scaling_features)
    , ('robustscaler', RobustScaler(), robust_scaling_features)
    ,('onehotencoder', OneHotEncoder(handle_unknown='ignore'), categorical_features)
], remainder='passthrough')

# Fit the preprocessing on the training split only
pipeline = Pipeline([('preprocessor', preprocessor)])
X_train_transformed = pipeline.fit_transform(X_train)



Index(['age', 'price', 'mobile_model', 'is_local', 'gender',
       'is_from_facebook_page', 'is_facebook_page_follower',
       'awareness_through_marketing', 'sin_day_of_year', 'cos_day_of_year',
       'is_returning_customer'],
      dtype='object')


We will first perform hyperparameter tuning (using Optuna) on various machine learning models to identify the parameter values corresponding to the maximum F1 score. Then we will use those parameter values to get the corresponding accuracy of these models, so that we have a balance between F1 score and accuracy. As this is an imbalanced data set, we use StratifiedKFold cross-validation. The objective functions also contain commented-out code to prune trials, which you can enable in case you want to prune trials that are not reaching the targeted scores.

## Optuna Objective Functions

In [221]:
from math import log
def xgb_objective(trial):
    """
    Optuna objective for XGBClassifier.

    Samples tree depth, learning rate, number of estimators and row/column subsampling,
    then evaluates the model with 5-fold stratified cross-validation on the training data.

    :param trial: Optuna trial used to sample the hyperparameters
    :return: mean cross-validated F1 score
    """
    search_space = dict(
        max_depth=trial.suggest_int("max_depth", 2, 6),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        n_estimators=trial.suggest_int("n_estimators", 50, 500, log=True),
        subsample=trial.suggest_float("subsample", 0.6, 1, log=True),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.6, 1, log=True),
        # Fixed, not tuned. NOTE(review): presumably negative/positive class counts - confirm.
        scale_pos_weight=6677/2194,
    )
    # Params leading to max F1 score. Use them to find the corresponding accuracy
    # search_space = dict(
    #     max_depth=2,
    #     learning_rate=0.01219299570440381,
    #     n_estimators=131,
    #     subsample=0.8091079599945468,
    #     colsample_bytree=0.6333120861160056,
    #     scale_pos_weight=6677/2194,
    # )

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    classifier = XGBClassifier(**search_space)
    fold_scores = cross_val_score(classifier, X_train_transformed, y_train, cv=folds, scoring="f1")
    mean_score = fold_scores.mean()

    # Optional pruning (requires a pruner on the study): report the score at step 1
    # and stop the trial when it is below the pruner's threshold.
    # trial.report(mean_score, step=1)
    # if trial.should_prune():
    #   raise optuna.TrialPruned()
    return mean_score

In [223]:
def logistic_regression_objective(trial):
  """
  Optuna objective for LogisticRegression.

  Samples tolerance, regularisation strength, penalty, solver, iteration budget, class weighting
  and (for elastic net only) the L1 ratio, then evaluates the model with 5-fold stratified
  cross-validation on the training data.

  :param trial: Optuna trial used to sample the hyperparameters
  :return: mean cross-validated F1 score, or -inf for a penalty/solver pair that is not supported
  """
  # Stopping tolerance is searched on a log scale around scikit-learn's default (1e-4),
  # using the same range as the SVM objectives. A tolerance far above 1 ends the
  # optimisation almost immediately, so such values are excluded.
  tol = trial.suggest_float('tol', 1e-6, 1e-2, log=True)
  c = trial.suggest_float('C', 1e-3, 1e3, log=True)
  penalty = trial.suggest_categorical('penalty',['l1', 'l2', 'elasticnet'])
  solver = trial.suggest_categorical('solver',['lbfgs', 'liblinear', 'newton-cg','saga'])
  max_iterations = trial.suggest_int('max_iterations', 100, 1000)
  l1_ratio = trial.suggest_float('l1_ratio', 0, 1) if penalty == 'elasticnet' else None
  class_weight = trial.suggest_categorical('class_weight',['balanced', None])

  # Ensure solver compatibility: l1 needs liblinear or saga, elasticnet needs saga
  if penalty == 'l1' and solver not in ['liblinear','saga']:
    return float('-inf')
  if penalty == 'elasticnet' and solver != 'saga':
    return float('-inf')

  # random_state makes the data shuffling of the liblinear/saga solvers repeatable,
  # so identical parameters always produce the same score.
  model =  LogisticRegression(tol=tol, C=c, penalty=penalty, solver=solver, max_iter=max_iterations,
                              l1_ratio=l1_ratio, class_weight=class_weight, random_state=42)
  # Params that led to max F1 score in an earlier run. Use them to find the corresponding accuracy
  # {'tol': 540.1788332371493, 'C': 22.514489375753627, 'penalty': 'l1', 'solver': 'saga', 'max_iterations': 163,
  # 'class_weight': 'balanced'}
  # model =  LogisticRegression(C=22.514489375753627, tol=540.1788332371493, solver='saga', penalty='l1', max_iter=163,
  #                              class_weight='balanced')
  cv = StratifiedKFold(n_splits=5, shuffle=True,random_state=42)
  score = cross_val_score(model, X_train_transformed, y_train, cv=cv, scoring='f1').mean()
  # Optional pruning (requires a pruner on the study): report the score at step 1
  # and stop the trial when it is below the pruner's threshold.
  # trial.report(score, step=1)
  # if trial.should_prune():
  #   raise optuna.TrialPruned()
  return score


In [224]:
def knn_objective(trial):
  """
  Optuna objective for KNeighborsClassifier.

  Samples neighbour count, weighting scheme, search algorithm, leaf size and Minkowski power,
  then evaluates the model with 5-fold stratified cross-validation on the training data.

  :param trial: Optuna trial used to sample the hyperparameters
  :return: mean cross-validated F1 score
  """
  n_neighbors = trial.suggest_int('n_neighbors', 5, 100)
  weights = trial.suggest_categorical('weights', ['uniform','distance'])
  algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
  leaf_size = trial.suggest_int('leaf_size', 10, 300)
  p = trial.suggest_int('p', 1,2)
  # Params leading to max F1 score. Use them to find the corresponding accuracy
  # {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'brute', 'leaf_size': 296, 'p': 1}

  classifier = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm,
                                    leaf_size=leaf_size, p=p, n_jobs=-1)

  folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  fold_scores = cross_val_score(classifier, X_train_transformed, y_train, cv=folds, scoring='f1', n_jobs=-1)
  mean_score = fold_scores.mean()

  # Optional pruning (requires a pruner on the study)
  # trial.report(mean_score, step=1)
  # if trial.should_prune():
  #   raise optuna.TrialPruned()
  return mean_score


In [225]:
def linear_svc_objective(trial):
  """
  Optuna objective for LinearSVC.

  Samples penalty, loss, dual formulation, tolerance, C, intercept fitting, class weighting and
  the iteration budget, then evaluates the model with 5-fold stratified cross-validation on the
  training data.

  :param trial: Optuna trial used to sample the hyperparameters
  :return: mean cross-validated F1 score, or -inf for a rejected penalty/loss/dual combination
  """
  sampled = {
      'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
      'loss': trial.suggest_categorical('loss', ['hinge','squared_hinge']),
      'dual': trial.suggest_categorical('dual', [False, 'auto']),
      'tol': trial.suggest_float('tol', 1e-6, 1e-2, log=True),
      'C': trial.suggest_float('C', 1e-3, 1e3, log=True),
      'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]),
      'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
      'max_iter': trial.suggest_int('max_iter',1000, 10000),
  }

  # Rejected combinations: hinge loss with an l1 penalty, and hinge loss with an
  # l2 penalty when dual is explicitly False.
  if sampled['loss'] == 'hinge':
    l1_with_hinge = sampled['penalty'] == 'l1'
    l2_hinge_primal = sampled['penalty'] == 'l2' and sampled['dual'] is False
    if l1_with_hinge or l2_hinge_primal:
      return float('-inf')

  classifier = LinearSVC(**sampled)

  # Params leading to max F1 score. Use them to find the corresponding accuracy
  # sampled={'penalty': 'l1', 'loss': 'squared_hinge', 'dual': False, 'tol': 1.2930365756581555e-05,
  #          'C': 1.0361104265170653, 'fit_intercept': True, 'class_weight': 'balanced', 'max_iter': 6805}
  # classifier = LinearSVC(**sampled)
  folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  fold_scores = cross_val_score(classifier, X_train_transformed, y_train, cv=folds, scoring='f1', n_jobs=-1)
  mean_score = fold_scores.mean()

  # Optional pruning (requires a pruner on the study)
  # trial.report(mean_score, step=1)
  # if trial.should_prune():
  #   raise optuna.TrialPruned()
  return mean_score



In [226]:
def svc_objective(trial):
  """
  Optuna objective for SVC.

  Samples C, kernel, shrinking, tolerance and class weighting, plus the kernel-specific
  parameters (degree for poly; gamma for poly/rbf/sigmoid; coef0 for poly/sigmoid), then
  evaluates the model with 5-fold stratified cross-validation on the training data.

  :param trial: Optuna trial used to sample the hyperparameters
  :return: mean cross-validated F1 score
  """
  C = trial.suggest_float('C', 1e-3, 1e3, log=True)
  kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
  # Kernel-specific parameters are only sampled for kernels that use them; otherwise
  # scikit-learn's own defaults are passed (they are ignored by those kernels), which
  # avoids handing the estimator out-of-domain placeholders such as gamma=0.
  degree = trial.suggest_int('degree', 1, 5) if kernel == 'poly' else 3
  gamma = trial.suggest_float('gamma', 1e-4, 1e1, log=True) if kernel in ['poly', 'rbf', 'sigmoid'] else 'scale'
  coef0 = trial.suggest_float('coef0', 0, 1) if kernel in ['poly','sigmoid'] else 0.0
  shrinking = trial.suggest_categorical('shrinking', [True, False])
  tol = trial.suggest_float('tol', 1e-6, 1e-2, log=True)
  class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])

  # max_iter caps the solver so that slow configurations cannot stall the study
  model = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
              shrinking=shrinking, tol=tol, class_weight=class_weight, max_iter=2000)
  # Params leading to max F1 score. Use them to find the corresponding accuracy
  # params={'C': 0.013411234582791877, 'kernel': 'rbf', 'gamma': 1.4470227444863837, 'shrinking': True,
  #         'tol': 0.00040054839483054177, 'class_weight': 'balanced'}
  # model = SVC(**params)
  cv = StratifiedKFold(n_splits=5, shuffle=True,random_state=42)
  score = cross_val_score(model, X_train_transformed, y_train, cv=cv, n_jobs=-1, scoring='f1').mean()

  # Optional pruning (requires a pruner on the study)
  # trial.report(score, step=1)
  # if trial.should_prune():
  #   raise optuna.TrialPruned()

  return score

In [227]:
def randomforest_objective(trial):
  """
  Optuna objective for RandomForestClassifier.

  Samples forest size, depth, split/leaf minimums, feature subsampling, bootstrapping,
  class weighting and split criterion, then evaluates the model with 5-fold stratified
  cross-validation on the training data.

  :param trial: Optuna trial used to sample the hyperparameters
  :return: mean cross-validated F1 score
  """
  params = {
    'n_estimators' : trial.suggest_int('n_estimators', 50, 500),
    'max_depth' : trial.suggest_int('max_depth', 5, 50),
    'min_samples_split' : trial.suggest_int('min_samples_split', 2, 20),
    'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 1, 10),
    'max_features' : trial.suggest_categorical('max_features', ['sqrt','log2', None]),
    'bootstrap' : trial.suggest_categorical('bootstrap', [True, False]),
    'class_weight' : trial.suggest_categorical('class_weight', ['balanced','balanced_subsample', None]),
    'criterion' : trial.suggest_categorical('criterion', ['gini','entropy','log_loss'])
  }
  # Params leading to max F1 score. Use them to find the corresponding accuracy
  # params={'n_estimators': 465, 'max_depth': 7, 'min_samples_split': 12, 'min_samples_leaf': 4, 'max_features': None,
  #         'bootstrap': False, 'class_weight': 'balanced', 'criterion': 'gini'}

  # The forest is seeded so that score differences between trials come from the
  # hyperparameters rather than from random bootstrap/feature sampling.
  model = RandomForestClassifier(**params, random_state=42)
  cv = StratifiedKFold(n_splits=5, shuffle=True,random_state=42)
  score = cross_val_score(model,X_train_transformed, y_train, cv=cv, n_jobs=-1,scoring='f1').mean()

  # Optional pruning (requires a pruner on the study)
  # trial.report(score, step=1)
  # if trial.should_prune():
  #   raise optuna.TrialPruned()

  return score


In [229]:
def decission_tree_objective(trial):
  """
  Optuna objective for DecisionTreeClassifier.

  Samples split criterion, depth, split/leaf minimums, feature fraction, cost-complexity
  pruning strength and class weighting, then evaluates the model with 5-fold stratified
  cross-validation on the training data.

  :param trial: Optuna trial used to sample the hyperparameters
  :return: mean cross-validated F1 score
  """
  params = {
      'criterion' : trial.suggest_categorical('criterion', ['gini','entropy']),
      'max_depth' : trial.suggest_int('max_depth', 3, 15),
      'min_samples_split' : trial.suggest_int('min_samples_split', 2, 50),
      'min_samples_leaf' : trial.suggest_int('min_samples_leaf', 1, 50),
      'max_features' : trial.suggest_float('max_features', 0.1, 1.0),
      'ccp_alpha' : trial.suggest_float('ccp_alpha', 0.001, 0.1),
      'class_weight' : trial.suggest_categorical('class_weight', ['balanced', None])
  }
  # Params leading to max F1 score. Use them to find the corresponding accuracy
  # params={'criterion': 'gini', 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 9,
  #         'max_features': 0.45192524410340684, 'ccp_alpha': 0.033735311554373926, 'class_weight': 'balanced'}

  # With max_features below 1.0 the candidate features of each split are drawn at random;
  # seeding the tree makes the score of a given parameter set repeatable.
  model = DecisionTreeClassifier(splitter='best', random_state=42, **params)
  cv = StratifiedKFold(n_splits=5, shuffle=True,random_state=42)
  score = cross_val_score(model, X_train_transformed, y_train, cv=cv, n_jobs=-1, scoring='f1').mean()

  # Optional pruning (requires a pruner on the study)
  # trial.report(score, step=1)
  # if trial.should_prune():
  #   raise optuna.TrialPruned()

  return score

In [230]:
def lightgbm_objective(trial):
  """
  Optuna objective for LGBMClassifier (with is_unbalance=True).

  Samples leaf count, depth, minimum leaf size, learning rate, number of boosting rounds and
  histogram bin count, then evaluates the model with 5-fold stratified cross-validation on the
  training data.

  :param trial: Optuna trial used to sample the hyperparameters
  :return: mean cross-validated F1 score
  """
  params = {
      'num_leaves' : trial.suggest_int('num_leaves', 10, 50, log=True),
      'max_depth' : trial.suggest_int('max_depth', 3, 8, log=True),
      'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 10, 100, log=True),
      "learning_rate" : trial.suggest_float("learning_rate", 0.001, 0.05, log=True),
      # LightGBM documents `n_estimators` and `num_iterations` as aliases of the same
      # setting, so only one of them is tuned; sampling both left one search dimension
      # without any effect on the model.
      'n_estimators' : trial.suggest_int('n_estimators', 100, 1000, log=True),
      'max_bin' : trial.suggest_int('max_bin', 32, 128, log=True)
  }
  # Params that led to max F1 score in an earlier run (which sampled both aliases).
  # Use them to find the corresponding accuracy
  # params={'num_leaves': 41, 'max_depth': 3, 'min_data_in_leaf': 14, 'learning_rate': 0.049741212348164456,
  #         'n_estimators': 252, 'max_bin': 107, 'num_iterations': 130}
  model = LGBMClassifier(**params, is_unbalance=True)
  cv = StratifiedKFold(n_splits=5, shuffle=True,random_state=42)
  score = cross_val_score(model, X_train_transformed, y_train, cv=cv, n_jobs=-1, scoring='f1').mean()

  # Optional pruning (requires a pruner on the study)
  # trial.report(score, step=1)
  # if trial.should_prune():
  #   raise optuna.TrialPruned()

  return score

In [232]:
def catboost_objective(trial):
  """
  Optuna objective for CatBoostClassifier.

  Samples iteration count, tree depth, learning rate and the automatic class-weighting mode,
  then evaluates the model with 5-fold stratified cross-validation on the training data.

  :param trial: Optuna trial used to sample the hyperparameters
  :return: mean cross-validated F1 score
  """
  iterations = trial.suggest_int('iterations', 100, 1000, log=True)
  depth = trial.suggest_int('depth', 3, 8, log=True)
  learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
  auto_class_weights = trial.suggest_categorical('auto_class_weights', ['None', 'Balanced', 'SqrtBalanced'])
  # Params leading to max F1 score. Use them to find the corresponding accuracy
  # {'iterations': 188, 'depth': 4, 'learning_rate': 0.013800184228352417, 'auto_class_weights': 'Balanced'}

  classifier = CatBoostClassifier(iterations=iterations, depth=depth, learning_rate=learning_rate,
                                  auto_class_weights=auto_class_weights)
  folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  fold_scores = cross_val_score(classifier, X_train_transformed, y_train, cv=folds, n_jobs=-1, scoring='f1')
  mean_score = fold_scores.mean()

  # Optional pruning (requires a pruner on the study)
  # trial.report(mean_score, step=1)
  # if trial.should_prune():
  #   raise optuna.TrialPruned()

  return mean_score

### Stacking Machine Learning Models

Stacking CatBoostClassifier, LGBMClassifier and XGBClassifier together, using a LogisticRegression model as the meta model (final estimator).

In [233]:
def stacking_objective(trial):
  """
  Scores a stacked ensemble of CatBoost, LightGBM and XGBoost with a LogisticRegression
  final estimator, using fixed hyperparameters for every model.

  Nothing is sampled from ``trial``; the function only has the objective signature so that
  it can be run through an Optuna study.

  :param trial: Optuna trial (unused)
  :return: mean F1 score over 5-fold stratified cross-validation on the training data
  """
  catboost_base = CatBoostClassifier(
      iterations=188, depth=4, learning_rate=0.013800184228352417,
      auto_class_weights='Balanced', random_state=42)

  lgbm_base = LGBMClassifier(
      num_leaves=41, max_depth=3, min_data_in_leaf=14, learning_rate=0.049741212348164456,
      n_estimators=252, max_bin=107, num_iterations=130,
      is_unbalance=True, random_state=42)

  xgb_base = XGBClassifier(
      max_depth=2,
      learning_rate=0.01219299570440381,
      n_estimators=131,
      subsample=0.8091079599945468,
      colsample_bytree=0.6333120861160056,
      scale_pos_weight=6677/2194,
      random_state=42)

  final_estimator = LogisticRegression(C=22.514489375753627, tol=540.1788332371493, solver='saga',
                                       penalty='l1', max_iter=163, class_weight='balanced',
                                       random_state=42)

  base_estimators = [('catboost', catboost_base), ('lgbm', lgbm_base), ('xgb', xgb_base)]
  ensemble = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator)

  folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  fold_scores = cross_val_score(ensemble, X_train_transformed, y_train, cv=folds, n_jobs=-1, scoring='f1')
  return fold_scores.mean()



### Probability Threshold Objective Function

Varying the probability threshold between 0.5 and 0.6 to see if it can improve the F1 and accuracy metrics

In [234]:
def prob_threshold_objective(trial):
  """
  Optuna objective that tunes the decision threshold of a fixed-parameter XGBClassifier.

  For every fold of a 5-fold stratified split of the training data the model is fitted,
  positive-class probabilities are predicted for the validation part, and a prediction of 1
  is made when the probability is at least the sampled threshold.

  :param trial: Optuna trial used to sample the threshold (between 0.5 and 0.6)
  :return: mean F1 score over the folds
  """
  threshold = trial.suggest_float('threshold', 0.5, 0.6, log=True)
  params_xgb={'max_depth': 6, 'learning_rate': 0.11047765334239668, 'n_estimators': 194, 'subsample': 0.6464190923360026,
        'colsample_bytree': 0.7941654326235098,'scale_pos_weight': 6677/2194}

  model_xgb = XGBClassifier(**params_xgb, random_state=42)
  cv = StratifiedKFold(n_splits=5, shuffle=True,random_state=42)
  f1_scores = []

  # cv.split yields positional indices. y_train may be a pandas Series whose index labels
  # are the shuffled original row numbers, so it is converted to a plain array to make
  # the fold indices select rows by position rather than by label.
  y_train_values = np.asarray(y_train)

  for train_idx, val_idx in cv.split(X_train_transformed, y_train_values):
    X_cv_train, X_cv_val = X_train_transformed[train_idx], X_train_transformed[val_idx]
    y_cv_train, y_cv_val = y_train_values[train_idx], y_train_values[val_idx]
    model_xgb.fit(X_cv_train, y_cv_train)
    # Get probability predictions for the positive class
    y_proba = model_xgb.predict_proba(X_cv_val)[:, 1]

    # Apply tuned probability threshold
    y_predict = (y_proba >= threshold).astype(int)
    f1_scores.append(f1_score(y_cv_val, y_predict))

  return np.mean(f1_scores)




### SMOTE Sampling

As the data set is imbalanced, SMOTE sampling is performed to increase the number of minority-class samples, to see if it can improve the F1 and accuracy scores.

In [235]:
def smote_objective(trial):
  """
  Optuna objective that tunes the SMOTEENN sampling ratio for a fixed-parameter LGBMClassifier.

  The data is reloaded, ``mobile_model`` is one hot encoded, an 80/20 stratified split is made,
  the training part is resampled with SMOTEENN, age and price are scaled, and the model is
  fitted on the resampled training data.

  :param trial: Optuna trial used to sample ``sampling_strategy`` (between 0.4 and 1.0)
  :return: F1 score on the held-out test split
  """
  smote_sampling_ratio = trial.suggest_float("sampling_strategy",0.4, 1.0)
  # smote_tomek_sampling_ratio = trial.suggest_float("sampling_strategy",0.5, smote_sampling_ratio)
  # smote_enn_sampling_ratio = trial.suggest_float("sampling_strategy",0.4, smote_tomek_sampling_ratio)

  frame = get_data_frame(False, True)
  # Dependent variable vector (last column)
  target = frame.iloc[:, -1].values
  # Independent variables, with categorical encoding of mobile_model
  features_frame = pd.get_dummies(frame.drop(columns=['is_returning_customer']),
                                  drop_first=False, columns=['mobile_model'])
  features = features_frame.iloc[:, :].values

  # Split data into Training and Test Set
  X_train, X_test, y_train, y_test = train_test_split(
      features, target, test_size=0.2, shuffle=True, stratify=target, random_state=0)

  # Alternative resamplers:
  # resampler = SMOTE(sampling_strategy=smote_sampling_ratio, random_state=0)
  # resampler = SMOTETomek(sampling_strategy=smote_sampling_ratio, random_state=0)
  resampler = SMOTEENN(sampling_strategy=smote_sampling_ratio, random_state=0)
  X_train, y_train = resampler.fit_resample(X_train, y_train)

  # Apply Standard/Robust Scaling: column 0 is age, column 1 is price
  scaling = ColumnTransformer(
      [('standardscaler', StandardScaler(), [0]),
       ('robustscaler', RobustScaler(), [1])],
      remainder='passthrough')
  scaling_pipeline = Pipeline([('preprocessor', scaling)])
  X_train_scaled = scaling_pipeline.fit_transform(X_train)
  X_test_scaled = scaling_pipeline.transform(X_test)

  classifier = LGBMClassifier(num_leaves=14, max_depth=5, min_data_in_leaf=13,
                              learning_rate=0.012116103805631632, n_estimators=571,
                              max_bin=66, num_iterations=888, verbose=-1) #94/61
  classifier.fit(X_train_scaled, y_train)

  return f1_score(y_test, classifier.predict(X_test_scaled))



### Under Sampling

As the data set is imbalanced, under sampling is performed to decrease the number of majority-class samples, to see if it can improve the F1 and accuracy scores.

In [236]:
def under_sampling_objective(trial):
  """
  Scores a fixed-parameter LGBMClassifier trained on randomly under-sampled data.

  The data is reloaded, ``mobile_model`` is one hot encoded, an 80/20 stratified split is made,
  age and price are scaled, and the training part is under-sampled with a fixed sampling ratio
  before the model is fitted. Nothing is currently sampled from ``trial``.

  :param trial: Optuna trial (unused while the sampling ratio is hard coded)
  :return: accuracy on the held-out test split
  """
  # under_sampling_ratio = trial.suggest_float('sampling_strategy', 0.35, 1)
  frame = get_data_frame(False, True)
  # Dependent variable vector (last column)
  target = frame.iloc[:, -1].values
  # Independent variables, with categorical encoding of mobile_model
  features_frame = pd.get_dummies(frame.drop(columns=['is_returning_customer']),
                                  drop_first=False, columns=['mobile_model'])
  features = features_frame.iloc[:, :].values

  # Split data into Training and Test Set
  X_train, X_test, y_train, y_test = train_test_split(
      features, target, test_size=0.2, shuffle=True, stratify=target, random_state=0)

  # Apply Standard/Robust Scaling: column 0 is age, column 1 is price
  scaling = ColumnTransformer(
      [('standardscaler', StandardScaler(), [0]),
       ('robustscaler', RobustScaler(), [1])],
      remainder='passthrough')
  scaling_pipeline = Pipeline([('preprocessor', scaling)])
  X_train_scaled = scaling_pipeline.fit_transform(X_train)
  X_test_scaled = scaling_pipeline.transform(X_test)

  # Alternative: resampler = ClusterCentroids()
  resampler = RandomUnderSampler(random_state=0, sampling_strategy=0.9776536095920548)
  X_train_scaled, y_train = resampler.fit_resample(X_train_scaled, y_train)

  classifier = LGBMClassifier(num_leaves=14, max_depth=5, min_data_in_leaf=13,
                              learning_rate=0.012116103805631632, n_estimators=571,
                              max_bin=66, num_iterations=888, verbose=-1, random_state=0) #94/61
  classifier.fit(X_train_scaled, y_train)

  return accuracy_score(y_test, classifier.predict(X_test_scaled))



In [237]:
# Create an in-memory study that maximizes the objective's return value.
# To prune weak trials, pass the pruner shown in the trailing comment and enable
# the commented-out report/should_prune lines inside the objective function.
study = optuna.create_study(directions=['maximize']) #, pruner=ThresholdPruner(lower=0.65)
# Run 100 trials of the XGBoost objective; n_jobs=-1 asks Optuna to run trials in parallel.
study.optimize(xgb_objective, n_trials=100, n_jobs=-1)
# Show the best trial (its value and the parameters that produced it)
print(study.best_trial)
# Optional: rank the hyperparameters by their influence on the objective
# importance = get_param_importances(study,evaluator=FanovaImportanceEvaluator())
# print(importance)

[I 2025-06-13 16:45:56,648] A new study created in memory with name: no-name-c2e6f5c0-4271-455f-9173-275428627950
[I 2025-06-13 16:45:58,395] Trial 0 finished with value: 0.33425943984768713 and parameters: {'max_depth': 4, 'learning_rate': 0.019626857389982832, 'n_estimators': 71, 'subsample': 0.8831754870335521, 'colsample_bytree': 0.6233775899205083}. Best is trial 0 with value: 0.33425943984768713.
[I 2025-06-13 16:45:59,294] Trial 1 finished with value: 0.3382886900148102 and parameters: {'max_depth': 2, 'learning_rate': 0.022962242529698428, 'n_estimators': 225, 'subsample': 0.6980660824266193, 'colsample_bytree': 0.9847893692326589}. Best is trial 1 with value: 0.3382886900148102.
[I 2025-06-13 16:46:01,645] Trial 3 finished with value: 0.30392888564145626 and parameters: {'max_depth': 6, 'learning_rate': 0.07628828571592199, 'n_estimators': 60, 'subsample': 0.7210600688318589, 'colsample_bytree': 0.6860444743638795}. Best is trial 1 with value: 0.3382886900148102.
[I 2025-06-13

FrozenTrial(number=90, state=1, values=[0.35293929677112384], datetime_start=datetime.datetime(2025, 6, 13, 16, 47, 7, 672542), datetime_complete=datetime.datetime(2025, 6, 13, 16, 47, 8, 286190), params={'max_depth': 2, 'learning_rate': 0.015338655903040864, 'n_estimators': 53, 'subsample': 0.7260901466089422, 'colsample_bytree': 0.9330173253181531}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=6, log=False, low=2, step=1), 'learning_rate': FloatDistribution(high=0.3, log=True, low=0.01, step=None), 'n_estimators': IntDistribution(high=500, log=True, low=50, step=1), 'subsample': FloatDistribution(high=1.0, log=True, low=0.6, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=True, low=0.6, step=None)}, trial_id=90, value=None)
