In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from IPython.display import clear_output
import time
import catboost
import re
import optuna
import json
import sys
sys.path.append('../..')
import main

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer,
                                   OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold, StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin

# Column names for Faults.NNA, stored one per line in a side file.
with open('../data/Faults27x7_var', 'r') as f:
    col_names = [line.strip() for line in f]

train_org = pd.read_csv('../data/train.csv')
test_org = pd.read_csv('../data/test.csv')
# Raw string: '\s' in a plain literal is an invalid escape sequence
# (SyntaxWarning on Python 3.12+). The regex itself is unchanged.
org_data = pd.read_csv('../data/Faults.NNA', delimiter=r'\s', engine='python', names=col_names)

# The seven binary fault indicators in train.csv; everything else except `id` is a feature.
target_cols = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
X = train_org.drop(['id'] + target_cols, axis=1)

# One independent copy per target so later edits cannot write back into train_org.
ys = [train_org[col].copy() for col in target_cols]
pastry, z_scratch, k_scatch, stains, dirtiness, bumps, other_faults = ys
y_names = ['pastry', 'z_scratch', 'k_scatch', 'stains', 'dirtiness', 'bumps', 'other_faults']

class KMeansTransformer(BaseEstimator,TransformerMixin):
    """Append a K-Means cluster label to the feature matrix as one extra column.

    Parameters
    ----------
    n_clusters : int
        Number of clusters handed to ``KMeans``.

    Attributes
    ----------
    kmeans : KMeans
        The clustering model. It is (re)built on every ``fit`` call from the
        current value of ``n_clusters``.
    """

    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        # Still created here so code that reads `.kmeans` before fitting keeps working.
        self.kmeans = self._make_kmeans()

    def _make_kmeans(self):
        # Build an unfitted KMeans from the current hyper-parameters.
        return KMeans(n_clusters=self.n_clusters, n_init=10, random_state=0)

    def fit(self,X, y=None):
        """Fit the clustering model on ``X``; ``y`` is ignored. Returns ``self``."""
        # Rebuild rather than reuse the instance from __init__: otherwise a later
        # set_params(n_clusters=...) would be ignored and a stale cluster count fitted.
        self.kmeans = self._make_kmeans()
        self.kmeans.fit(X)
        return self

    def transform(self,X):
        """Return ``X`` with the predicted cluster label appended as the last column."""
        labels = self.kmeans.predict(X)
        return np.c_[X, labels]

In [2]:
# Best hyper-parameters found so far for the `pastry` target.
# Stays None until the tuning loop assigns study_lgb.best_params to it.
pastry_params_dict = None

In [3]:
# --- Mutable state for the tuning loop ---------------------------------------
condition = True        # loop guard read by `while condition:`
trial_count = 2000      # NOTE(review): not read by any visible cell
iteration_count = 1     # counter shown in the loop's progress header
result = 0              # best score seen so far
n_chances = 0           # consecutive batches without an improvement

# --- Search bounds: name -> [low, high] --------------------------------------
# Values are lists, not tuples, because a later cell overwrites the bounds in place.
prms = dict(
    n_estimators=[100, 500],
    max_depth=[2, 128],
    max_leaves=[2, 256],
    learning_rate=[0.001, 0.3],
    gamma=[0.001, 50],
    min_child_weight=[0.01, 50],
    subsample=[0.33, 0.9],
    colsample_bytree=[0.33, 0.9],
    reg_alpha=[0.0001, 1],
    reg_lambda=[0.0001, 1],
)

In [4]:
def objective_xgb(trial):
    """Optuna objective: mean 5-fold cross-validated ROC-AUC for the `pastry` target.

    Each hyper-parameter is sampled between ``prms[name][0]`` and ``prms[name][1]``,
    read from the module-level ``prms`` dict at call time. The model is an
    XGBoost classifier behind a StandardScaler + KMeansTransformer pipeline,
    scored on the module-level ``X`` / ``pastry``.
    """
    # Sampling order is fixed: the three integer parameters, then the floats.
    integer_names = ('n_estimators', 'max_depth', 'max_leaves')
    float_names = ('learning_rate', 'gamma', 'min_child_weight', 'subsample',
                   'colsample_bytree', 'reg_alpha', 'reg_lambda')

    sampled = {}
    for name in integer_names:
        sampled[name] = trial.suggest_int(name, prms[name][0], prms[name][1])
    for name in float_names:
        sampled[name] = trial.suggest_float(name, prms[name][0], prms[name][1])

    classifier = xgb.XGBClassifier(random_state=0, objective='binary:logistic', **sampled)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=4)),
        ('model', classifier),
    ])

    # Seeded, shuffled, stratified folds so every trial is scored on the same splits.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    fold_scores = cross_val_score(pipeline, X, pastry, scoring='roc_auc', cv=folds)
    return np.mean(fold_scores)

In [5]:
# In-memory Optuna study that maximizes the objective's return value.
# NOTE(review): despite the `_lgb` suffix, the visible cells only ever
# optimize `objective_xgb` (an XGBoost model) with this study.
study_lgb = optuna.create_study(direction='maximize')

[I 2024-03-14 05:20:19,353] A new study created in memory with name: no-name-6ce3a7bb-ef1b-43fc-bdb0-70ea0d45992e


In [6]:
# Run batches of 500 trials on the shared study until 10 consecutive batches
# fail to raise the study's best score. The stop rule sits in the loop condition
# (`<` rather than `==`) so a stale n_chances above 10 cannot cause an endless loop.
while condition and n_chances < 10:
    print(f'pastry: trial_count: {iteration_count}')
    # Count every batch, not only improving ones, so the header above always
    # identifies the batch that is actually running.
    iteration_count += 1

    study_lgb.optimize(objective_xgb, n_trials=500, n_jobs=-1, show_progress_bar=True)

    best_score = study_lgb.best_value
    # NOTE(review): the reason for this pause is not evident from the code —
    # presumably it lets log output settle before clear_output(); confirm.
    time.sleep(10)

    if best_score > result:
        # New best: remember it, reset the patience counter, wipe the log output.
        result = best_score
        pastry_params_dict = study_lgb.best_params
        n_chances = 0
        clear_output()
    else:
        n_chances += 1

pastry: trial_count: 4


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-03-14 06:30:45,820] Trial 2502 finished with value: 0.8621440016283938 and parameters: {'n_estimators': 468, 'max_depth': 111, 'max_leaves': 2, 'learning_rate': 0.03996472932389167, 'gamma': 4.9545143874728454, 'min_child_weight': 0.07415240415402152, 'subsample': 0.7134728324408304, 'colsample_bytree': 0.4231595142230105, 'reg_alpha': 0.9312255781496088, 'reg_lambda': 0.7249468903645856}. Best is trial 2482 with value: 0.8726950384345035.
[I 2024-03-14 06:30:46,029] Trial 2504 finished with value: 0.8617491507204832 and parameters: {'n_estimators': 467, 'max_depth': 111, 'max_leaves': 2, 'learning_rate': 0.038130448234705835, 'gamma': 5.326593058627414, 'min_child_weight': 0.08950261490824563, 'subsample': 0.7115894622797234, 'colsample_bytree': 0.4230211865638292, 'reg_alpha': 0.9315283117918048, 'reg_lambda': 0.7215504802029064}. Best is trial 2482 with value: 0.8726950384345035.
[I 2024-03-14 06:30:46,088] Trial 2500 finished with value: 0.8613063365681267 and parameters: {

  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-03-14 06:44:27,949] Trial 3011 finished with value: 0.86578502415246 and parameters: {'n_estimators': 494, 'max_depth': 113, 'max_leaves': 15, 'learning_rate': 0.04174806552782419, 'gamma': 10.007431872978923, 'min_child_weight': 1.0271938882790141, 'subsample': 0.658759949150968, 'colsample_bytree': 0.4101553242017549, 'reg_alpha': 0.9035040655043035, 'reg_lambda': 0.8496904255796045}. Best is trial 2482 with value: 0.8726950384345035.
[I 2024-03-14 06:44:29,755] Trial 3008 finished with value: 0.8682013676369946 and parameters: {'n_estimators': 486, 'max_depth': 114, 'max_leaves': 15, 'learning_rate': 0.04103520224174859, 'gamma': 7.386259742435013, 'min_child_weight': 0.07533061620667669, 'subsample': 0.679608899889225, 'colsample_bytree': 0.40951892870399076, 'reg_alpha': 0.9020949041107316, 'reg_lambda': 0.8517337906185564}. Best is trial 2482 with value: 0.8726950384345035.
[I 2024-03-14 06:44:29,889] Trial 3003 finished with value: 0.8682130579341667 and parameters: {'n_

  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-03-14 06:59:37,631] Trial 3510 finished with value: 0.8664256782130554 and parameters: {'n_estimators': 100, 'max_depth': 105, 'max_leaves': 6, 'learning_rate': 0.2416805138278993, 'gamma': 3.500541020293405, 'min_child_weight': 1.7555786387241838, 'subsample': 0.6791569285483342, 'colsample_bytree': 0.4633166789149773, 'reg_alpha': 0.9050622860719136, 'reg_lambda': 0.7604439450312788}. Best is trial 2482 with value: 0.8726950384345035.
[I 2024-03-14 06:59:45,229] Trial 3511 finished with value: 0.8615252422632496 and parameters: {'n_estimators': 480, 'max_depth': 105, 'max_leaves': 7, 'learning_rate': 0.022952989971193632, 'gamma': 15.442125116884768, 'min_child_weight': 1.829369376650493, 'subsample': 0.6759107952970725, 'colsample_bytree': 0.4637269308141288, 'reg_alpha': 0.9017888528490774, 'reg_lambda': 0.7623160475344034}. Best is trial 2482 with value: 0.8726950384345035.
[I 2024-03-14 06:59:48,134] Trial 3502 finished with value: 0.8711349411690128 and parameters: {'n_e

  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-03-14 07:14:57,246] Trial 4009 finished with value: 0.8525439082392486 and parameters: {'n_estimators': 287, 'max_depth': 109, 'max_leaves': 10, 'learning_rate': 0.031093295080462394, 'gamma': 34.08170378181901, 'min_child_weight': 5.197317991780663, 'subsample': 0.6650124666686744, 'colsample_bytree': 0.42142063501645166, 'reg_alpha': 0.9837329296352078, 'reg_lambda': 0.8566109110196806}. Best is trial 2482 with value: 0.8726950384345035.
[I 2024-03-14 07:14:59,896] Trial 4008 finished with value: 0.8645568966342685 and parameters: {'n_estimators': 462, 'max_depth': 108, 'max_leaves': 9, 'learning_rate': 0.09283609943096541, 'gamma': 10.786291807854615, 'min_child_weight': 4.867033376816532, 'subsample': 0.6673168764894373, 'colsample_bytree': 0.4210576639055216, 'reg_alpha': 0.8941074539501411, 'reg_lambda': 0.7116292143783044}. Best is trial 2482 with value: 0.8726950384345035.
[I 2024-03-14 07:15:01,385] Trial 4005 finished with value: 0.8679917167159108 and parameters: {'n

  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-03-14 09:09:54,635] Trial 4508 finished with value: 0.8663766908093031 and parameters: {'n_estimators': 464, 'max_depth': 105, 'max_leaves': 10, 'learning_rate': 0.19245823397015793, 'gamma': 5.40348781994923, 'min_child_weight': 0.011127671676308681, 'subsample': 0.7099241789404924, 'colsample_bytree': 0.4101703408229574, 'reg_alpha': 0.9451406190813024, 'reg_lambda': 0.9115061991183577}. Best is trial 2482 with value: 0.8726950384345035.
[I 2024-03-14 09:09:56,509] Trial 4510 finished with value: 0.8553028017356539 and parameters: {'n_estimators': 466, 'max_depth': 104, 'max_leaves': 10, 'learning_rate': 0.014982256574751024, 'gamma': 25.11671070854373, 'min_child_weight': 4.57868254981932, 'subsample': 0.7118598562117159, 'colsample_bytree': 0.4097662361988874, 'reg_alpha': 0.9520188919169317, 'reg_lambda': 0.8473521444708944}. Best is trial 2482 with value: 0.8726950384345035.
[I 2024-03-14 09:09:58,905] Trial 4502 finished with value: 0.8595558641711459 and parameters: {'n

  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-03-14 09:26:45,555] Trial 5010 finished with value: 0.8631110007021204 and parameters: {'n_estimators': 478, 'max_depth': 97, 'max_leaves': 2, 'learning_rate': 0.06927769415211378, 'gamma': 7.390582789351719, 'min_child_weight': 5.25918017577205, 'subsample': 0.6837141547390782, 'colsample_bytree': 0.4380713416224418, 'reg_alpha': 0.9344239246603329, 'reg_lambda': 0.7590203187523445}. Best is trial 2482 with value: 0.8726950384345035.
[I 2024-03-14 09:26:46,870] Trial 5005 finished with value: 0.8603737056283691 and parameters: {'n_estimators': 470, 'max_depth': 107, 'max_leaves': 2, 'learning_rate': 0.030218293905574796, 'gamma': 0.08279700850520566, 'min_child_weight': 5.022519023704667, 'subsample': 0.685746708883588, 'colsample_bytree': 0.4432118921479913, 'reg_alpha': 0.9407495850617731, 'reg_lambda': 0.7594327353864528}. Best is trial 2482 with value: 0.8726950384345035.
[I 2024-03-14 09:26:46,949] Trial 5006 finished with value: 0.8602390878489411 and parameters: {'n_est

KeyboardInterrupt: 

In [5]:
def objective_xgb(trial):
    """Score one Optuna trial: mean ROC-AUC over 5 stratified folds on `pastry`.

    Bounds for every sampled hyper-parameter come from the module-level
    ``prms`` mapping (index 0 = lower bound, index 1 = upper bound).
    """
    def low(key):
        # Lower bound configured for `key`.
        return prms[key][0]

    def high(key):
        # Upper bound configured for `key`.
        return prms[key][1]

    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    hyper = {
        'n_estimators': trial.suggest_int('n_estimators', low('n_estimators'), high('n_estimators')),
        'max_depth': trial.suggest_int('max_depth', low('max_depth'), high('max_depth')),
        'max_leaves': trial.suggest_int('max_leaves', low('max_leaves'), high('max_leaves')),
        'learning_rate': trial.suggest_float('learning_rate', low('learning_rate'), high('learning_rate')),
        'gamma': trial.suggest_float('gamma', low('gamma'), high('gamma')),
        'min_child_weight': trial.suggest_float('min_child_weight', low('min_child_weight'), high('min_child_weight')),
        'subsample': trial.suggest_float('subsample', low('subsample'), high('subsample')),
        'colsample_bytree': trial.suggest_float('colsample_bytree', low('colsample_bytree'), high('colsample_bytree')),
        'reg_alpha': trial.suggest_float('reg_alpha', low('reg_alpha'), high('reg_alpha')),
        'reg_lambda': trial.suggest_float('reg_lambda', low('reg_lambda'), high('reg_lambda')),
    }

    booster = xgb.XGBClassifier(random_state=0, objective='binary:logistic', **hyper)
    stages = [
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=4)),
        ('model', booster),
    ]

    auc_per_fold = cross_val_score(Pipeline(steps=stages), X, pastry, scoring='roc_auc', cv=splitter)
    return np.mean(auc_per_fold)
    
# Create a fresh in-memory study and run 3 trials of the objective.
# NOTE(review): this rebinds `study_lgb`; when the notebook is run top to bottom
# it replaces the study filled by the tuning loop above — confirm that is intended.
study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(objective_xgb, n_trials= 3, n_jobs=-1, show_progress_bar=True)

[I 2024-03-13 11:10:52,155] A new study created in memory with name: no-name-482268e6-883e-415d-a7b4-f49f9fd0efe9


  0%|          | 0/3 [00:00<?, ?it/s]

[I 2024-03-13 11:10:55,838] Trial 0 finished with value: 0.8508063346985892 and parameters: {'n_estimators': 100, 'max_depth': 115, 'max_leaves': 138, 'learning_rate': 0.29363111960193405, 'gamma': 29.906670579354852, 'min_child_weight': 40.487784151016285, 'subsample': 0.551523587604274, 'colsample_bytree': 0.3515826060138617, 'reg_alpha': 0.7495992344875034, 'reg_lambda': 0.44250024495795925}. Best is trial 0 with value: 0.8508063346985892.
[I 2024-03-13 11:10:55,906] Trial 2 finished with value: 0.8472237715434462 and parameters: {'n_estimators': 100, 'max_depth': 83, 'max_leaves': 16, 'learning_rate': 0.25294885177154747, 'gamma': 44.437067386588744, 'min_child_weight': 11.098040687340992, 'subsample': 0.5121547184754389, 'colsample_bytree': 0.7114813176772874, 'reg_alpha': 0.17492129491217426, 'reg_lambda': 0.6856112629698355}. Best is trial 0 with value: 0.8508063346985892.
[I 2024-03-13 11:10:56,386] Trial 1 finished with value: 0.8667161376615328 and parameters: {'n_estimators'

In [17]:
# Show the hyper-parameters sampled for the study's first trial (index 0).
study_lgb.trials[0].params

{'n_estimators': 100,
 'max_depth': 115,
 'max_leaves': 138,
 'learning_rate': 0.29363111960193405,
 'gamma': 29.906670579354852,
 'min_child_weight': 40.487784151016285,
 'subsample': 0.551523587604274,
 'colsample_bytree': 0.3515826060138617,
 'reg_alpha': 0.7495992344875034,
 'reg_lambda': 0.44250024495795925}

In [23]:
# Flatten the study into one row per trial, drop the bookkeeping columns,
# and rank the trials from best to worst objective value.
trails_df = (
    study_lgb.trials_dataframe()
    .drop(columns=['number', 'datetime_start', 'datetime_complete', 'duration', 'state'])
    .sort_values(by='value', ascending=False)
)

In [26]:
# Keep the first two rows of the ranked trials frame and display them.
trials_df_top = trails_df.head(2)
trials_df_top

Unnamed: 0,value,params_colsample_bytree,params_gamma,params_learning_rate,params_max_depth,params_max_leaves,params_min_child_weight,params_n_estimators,params_reg_alpha,params_reg_lambda,params_subsample
1,0.866716,0.864916,1.808122,0.067666,124,143,29.069754,100,0.968157,0.745992,0.545719
0,0.850806,0.351583,29.906671,0.293631,115,138,40.487784,100,0.749599,0.4425,0.551524


In [29]:
# Shrink every search range to the min/max observed among the top trials.
# NOTE(review): this overwrites `prms` in place, so the original bounds are lost
# until the cell that defines `prms` is re-run.
for key, bounds in prms.items():
    top_values = trials_df_top['params_' + key]
    bounds[0] = top_values.min()
    bounds[1] = top_values.max()

In [30]:
# Display the current search bounds (after the in-place update in the previous cell).
prms

{'n_estimators': [100, 100],
 'max_depth': [115, 124],
 'max_leaves': [138, 143],
 'learning_rate': [0.06766589891749072, 0.29363111960193405],
 'gamma': [1.8081217885080474, 29.906670579354852],
 'min_child_weight': [29.069754293706445, 40.487784151016285],
 'subsample': [0.5457190296010062, 0.551523587604274],
 'colsample_bytree': [0.3515826060138617, 0.8649163415929932],
 'reg_alpha': [0.7495992344875034, 0.9681573328050053],
 'reg_lambda': [0.44250024495795925, 0.74599217290967]}