In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from IPython.display import clear_output
import time
import catboost
import re
import optuna
import json
import sys
sys.path.append('../..')
import main

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer,
                                   OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold, StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin

# Column names for the Faults.NNA file are stored one per line in a side file.
with open('../data/Faults27x7_var', 'r') as f:
    col_names = [line.strip() for line in f]

train_org = pd.read_csv('../data/train.csv')
test_org = pd.read_csv('../data/test.csv')
# Raw string: '\s' inside a normal literal is an invalid escape sequence
# (a warning today, scheduled to become an error). The regex itself is unchanged.
org_data = pd.read_csv('../data/Faults.NNA', delimiter=r'\s', engine='python', names=col_names)

# The seven binary fault indicators that serve as targets; everything else
# (minus the row id) is a feature.
target_cols = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
X = train_org.drop(columns=['id'] + target_cols)

# One independent copy per target so later cells can use them by name.
pastry = train_org['Pastry'].copy()
z_scratch = train_org['Z_Scratch'].copy()
k_scatch = train_org['K_Scatch'].copy()
stains = train_org['Stains'].copy()
dirtiness = train_org['Dirtiness'].copy()
bumps = train_org['Bumps'].copy()
other_faults = train_org['Other_Faults'].copy()

# Parallel lists: ys[i] is the target series named y_names[i].
ys = [pastry, z_scratch, k_scatch, stains, dirtiness, bumps, other_faults]
y_names = ['pastry', 'z_scratch', 'k_scatch', 'stains', 'dirtiness', 'bumps', 'other_faults']

class KMeansTransformer(BaseEstimator,TransformerMixin):
    """Append a K-Means cluster label to the feature matrix as one extra column.

    Parameters
    ----------
    n_clusters : int
        Number of clusters passed to the underlying ``KMeans`` model.

    Attributes
    ----------
    kmeans : KMeans
        The clustering model (``n_init=10``, ``random_state=0``); fitted by ``fit``.
    """

    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=0)

    def fit(self, X, y=None):
        """Fit the clustering model on ``X``; ``y`` is accepted but ignored. Returns ``self``."""
        # set_params(n_clusters=...) only updates the attribute on this wrapper,
        # so push the current value into the inner model before fitting;
        # otherwise a value changed after construction would be silently ignored.
        self.kmeans.set_params(n_clusters=self.n_clusters)
        self.kmeans.fit(X)
        return self

    def transform(self, X):
        """Return ``X`` with the predicted cluster index appended as the last column."""
        labels = self.kmeans.predict(X)
        return np.c_[X, labels]

In [2]:
# Best hyper-parameters found for the pastry target; stays None until a search
# loop assigns the study's best_params to it.
pastry_params_dict = None

In [3]:
# State shared with the search loop:
#   condition       - loop guard; the loop runs while it is truthy
#   trial_count     - not read by any active code in the visible cells
#   iteration_count - batch counter printed at the start of every batch
#   result          - best cross-validated score seen so far
#   n_chances       - consecutive batches without improvement (loop stops at 10)
condition = True
trial_count = 2000
iteration_count = 1
result = 0
n_chances = 0
# Search space: each list is a [low, high] bound that objective_xgb passes to
# trial.suggest_int / trial.suggest_float under the same key.
# NOTE(review): 'tree_method' and 'predictor' are plain strings and are never
# read by objective_xgb in the visible code, so they do not reach XGBClassifier.
prms = {'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'n_estimators':[409,986],
        'max_depth':[4,128],
        'max_leaves': [8,276],
        'learning_rate': [0.006,0.05],
        'gamma':[0.003,5],
        'min_child_weight':[0.01,10],
        'subsample': [0.33,0.9],
        'colsample_bytree': [0.33,0.5],
        'reg_alpha': [0.0001, 1],
        'reg_lambda':[0.03,1]}

In [4]:
def objective_xgb(trial):
    """Optuna objective for the pastry target.

    Samples one XGBoost configuration from the [low, high] bounds held in the
    module-level ``prms`` dict, wraps the classifier in a
    scaler -> k-means label -> model pipeline and returns the mean ROC AUC
    over a seeded, shuffled 5-fold stratified split of ``X`` / ``pastry``.
    """
    int_names = ('n_estimators', 'max_depth', 'max_leaves')
    float_names = ('learning_rate', 'gamma', 'min_child_weight', 'subsample',
                   'colsample_bytree', 'reg_alpha', 'reg_lambda')

    # Integers are suggested first, then floats, each in the listed order.
    sampled = {}
    for name in int_names:
        bounds = prms[name]
        sampled[name] = trial.suggest_int(name, bounds[0], bounds[1])
    for name in float_names:
        bounds = prms[name]
        sampled[name] = trial.suggest_float(name, bounds[0], bounds[1])

    classifier = xgb.XGBClassifier(random_state=0, objective='binary:logistic', **sampled)
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=4)),
        ('model', classifier),
    ])

    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    fold_scores = cross_val_score(pipeline, X, pastry, scoring='roc_auc', cv=folds)
    return np.mean(fold_scores)

In [5]:
# New study that maximises whatever the objective returns
# (objective_xgb returns a mean ROC AUC).
study_xgb = optuna.create_study(direction='maximize')

[I 2024-03-17 22:36:39,268] A new study created in memory with name: no-name-769f27a1-5003-4f1c-85d0-783e8f476965


In [6]:
# Patience-based search: run 500-trial batches until ten consecutive batches
# fail to improve the study's best score (or `condition` is switched off).
try:
    while condition and n_chances < 10:
        print(f'pastry: trial_count: {iteration_count}')
        if n_chances > 0:
            print(f'n_chance = {n_chances}')

        study_xgb.optimize(objective_xgb, n_trials= 500, n_jobs=-1, show_progress_bar=True)

        best_score = study_xgb.best_value
        time.sleep(10)

        iteration_count += 1

        if best_score > result:
            # New best: remember it and reset the patience counter.
            result = best_score
            pastry_params_dict = study_xgb.best_params
            n_chances = 0
        else:
            n_chances += 1
        clear_output()
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to stop a long search.
    # Any other exception is a real error and is allowed to propagate.
    print('Search interrupted; saving the trials collected so far.')
finally:
    # Persist the best trials however the loop ended (patience exhausted,
    # interrupted, or failed) so finished work is never lost.
    if study_xgb.trials:
        hpt_df = study_xgb.trials_dataframe()
        hpt_df_main = hpt_df.drop(['number', 'datetime_start', 'datetime_complete', 'duration', 'state'], axis=1)
        hpt_df_main_sorted = hpt_df_main.sort_values(by='value', ascending=False)
        hpt_df_main_top_300 = hpt_df_main_sorted.iloc[:300]

        hpt_df_main_top_300.to_csv('hpt_trials.csv', index=False)
    

pastry: trial_count: 2


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-03-17 23:06:49,266] Trial 504 finished with value: 0.8711977483140302 and parameters: {'n_estimators': 838, 'max_depth': 96, 'max_leaves': 8, 'learning_rate': 0.012793918530841341, 'gamma': 0.7895499799336179, 'min_child_weight': 2.1044927284481987, 'subsample': 0.5416386118494145, 'colsample_bytree': 0.3506663117527833, 'reg_alpha': 0.06639139724517437, 'reg_lambda': 0.7736251773189354}. Best is trial 434 with value: 0.8725323925745391.
[I 2024-03-17 23:06:50,337] Trial 511 finished with value: 0.8712551745277848 and parameters: {'n_estimators': 839, 'max_depth': 93, 'max_leaves': 8, 'learning_rate': 0.012651375830605624, 'gamma': 1.3874308530695756, 'min_child_weight': 2.1067351155717198, 'subsample': 0.5425057439111565, 'colsample_bytree': 0.3506103648516399, 'reg_alpha': 0.07191795538585018, 'reg_lambda': 0.7758686370076066}. Best is trial 434 with value: 0.8725323925745391.
[I 2024-03-17 23:06:50,391] Trial 508 finished with value: 0.8714515369100138 and parameters: {'n_es

In [6]:
# Run 500-trial batches on study_lgb until the best score has failed to
# improve for ten batches in a row.
while condition and n_chances != 10:
    print(f'pastry: trial_count: {iteration_count}')
    if n_chances > 0:
        print(f'n_chances = {n_chances}')

    study_lgb.optimize(objective_xgb, n_trials=500, n_jobs=-1, show_progress_bar=True)
    best_score = study_lgb.best_value
    time.sleep(10)
    iteration_count += 1

    improved = best_score > result
    if improved:
        # Record the new best score and the parameters that produced it.
        result = best_score
        pastry_params_dict = study_lgb.best_params
    n_chances = 0 if improved else n_chances + 1
    clear_output()

pastry: trial_count: 9


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-03-17 21:54:13,759] Trial 12002 finished with value: 0.8659485697800633 and parameters: {'n_estimators': 442, 'max_depth': 17, 'max_leaves': 38, 'learning_rate': 0.03877621984652126, 'gamma': 10.406644717561619, 'min_child_weight': 10.710942448189078, 'subsample': 0.760297926407476, 'colsample_bytree': 0.39278099242256664, 'reg_alpha': 0.03668896905289171, 'reg_lambda': 0.8743487941269563}. Best is trial 11138 with value: 0.8723688546775923.
[I 2024-03-17 21:54:16,305] Trial 12004 finished with value: 0.8689133899299222 and parameters: {'n_estimators': 479, 'max_depth': 14, 'max_leaves': 54, 'learning_rate': 0.0537464707931119, 'gamma': 5.96962860763957, 'min_child_weight': 8.774751988137073, 'subsample': 0.7480000718422569, 'colsample_bytree': 0.3908291400736731, 'reg_alpha': 0.03476019772436192, 'reg_lambda': 0.8428829470567415}. Best is trial 11138 with value: 0.8723688546775923.
[I 2024-03-17 21:54:16,568] Trial 12011 finished with value: 0.8685838905573435 and parameters: 

KeyboardInterrupt: 

In [7]:
# Number of trials recorded in the study so far.
len(study_lgb.trials)

12364

In [8]:
# Export the study's trials as a DataFrame for offline inspection.
hpt_df = study_lgb.trials_dataframe()

In [11]:
# Strip the bookkeeping columns, leaving only the score and the sampled parameters.
hpt_df_main = hpt_df.drop(
    columns=['number', 'datetime_start', 'datetime_complete', 'duration', 'state']
)

In [15]:
# Highest objective value first.
hpt_df_main_sorted = hpt_df_main.sort_values(by='value', ascending=False)

In [16]:
# Keep the 300 best-scoring trials (fewer if the study has fewer rows).
hpt_df_main_top_300 = hpt_df_main_sorted.head(300)

In [17]:
# Sanity check: number of trials the slice actually kept.
len(hpt_df_main_top_300)

300

In [18]:
# Report the span of every column among the top trials: min line, max line,
# then a blank separator line.
for col in hpt_df_main_top_300.columns:
    print(
        f'{col}_min_value: {hpt_df_main_top_300[col].min()}',
        f'{col}_max_value: {hpt_df_main_top_300[col].max()}',
        '',
        sep='\n',
    )

value_min_value: 0.8715719005025558
value_max_value: 0.8723688546775923

params_colsample_bytree_min_value: 0.33000052991418827
params_colsample_bytree_max_value: 0.4617981045706496

params_gamma_min_value: 0.004646967844698716
params_gamma_max_value: 4.83554806770142

params_learning_rate_min_value: 0.007353454937800716
params_learning_rate_max_value: 0.041732144999748894

params_max_depth_min_value: 4
params_max_depth_max_value: 128

params_max_leaves_min_value: 8
params_max_leaves_max_value: 276

params_min_child_weight_min_value: 0.011363554392128883
params_min_child_weight_max_value: 9.340647520039548

params_n_estimators_min_value: 409
params_n_estimators_max_value: 986

params_reg_alpha_min_value: 0.00013976860949944968
params_reg_alpha_max_value: 0.9596314250625866

params_reg_lambda_min_value: 0.03789756619143175
params_reg_lambda_max_value: 0.9999448758422064

params_subsample_min_value: 0.3418312164513462
params_subsample_max_value: 0.8656926212379339



In [5]:
def objective_xgb(trial):
    """Score one sampled XGBoost configuration for Optuna.

    Every hyper-parameter is drawn between the two bounds stored under its
    name in the global ``prms`` dict. The classifier runs behind a
    StandardScaler and a 4-cluster KMeansTransformer, and the value returned
    is the mean ROC AUC from a seeded 5-fold stratified CV on ``X`` / ``pastry``.
    """
    # Integer parameters are suggested before the float ones, in this order.
    hyper_params = {
        **{
            key: trial.suggest_int(key, prms[key][0], prms[key][1])
            for key in ('n_estimators', 'max_depth', 'max_leaves')
        },
        **{
            key: trial.suggest_float(key, prms[key][0], prms[key][1])
            for key in ('learning_rate', 'gamma', 'min_child_weight', 'subsample',
                        'colsample_bytree', 'reg_alpha', 'reg_lambda')
        },
    }

    model = xgb.XGBClassifier(random_state=0, objective='binary:logistic', **hyper_params)
    stages = [
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=4)),
        ('model', model),
    ]
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    auc_per_fold = cross_val_score(Pipeline(steps=stages), X, pastry, scoring='roc_auc', cv=splitter)
    return np.mean(auc_per_fold)
    
# New maximising study. NOTE(review): despite the "lgb" in its name, it is
# optimised with objective_xgb (an XGBoost pipeline).
study_lgb = optuna.create_study(direction='maximize')
# Short 3-trial run with a progress bar; n_jobs=-1 lets Optuna run trials in parallel.
study_lgb.optimize(objective_xgb, n_trials= 3, n_jobs=-1, show_progress_bar=True)

[I 2024-03-13 11:10:52,155] A new study created in memory with name: no-name-482268e6-883e-415d-a7b4-f49f9fd0efe9


  0%|          | 0/3 [00:00<?, ?it/s]

[I 2024-03-13 11:10:55,838] Trial 0 finished with value: 0.8508063346985892 and parameters: {'n_estimators': 100, 'max_depth': 115, 'max_leaves': 138, 'learning_rate': 0.29363111960193405, 'gamma': 29.906670579354852, 'min_child_weight': 40.487784151016285, 'subsample': 0.551523587604274, 'colsample_bytree': 0.3515826060138617, 'reg_alpha': 0.7495992344875034, 'reg_lambda': 0.44250024495795925}. Best is trial 0 with value: 0.8508063346985892.
[I 2024-03-13 11:10:55,906] Trial 2 finished with value: 0.8472237715434462 and parameters: {'n_estimators': 100, 'max_depth': 83, 'max_leaves': 16, 'learning_rate': 0.25294885177154747, 'gamma': 44.437067386588744, 'min_child_weight': 11.098040687340992, 'subsample': 0.5121547184754389, 'colsample_bytree': 0.7114813176772874, 'reg_alpha': 0.17492129491217426, 'reg_lambda': 0.6856112629698355}. Best is trial 0 with value: 0.8508063346985892.
[I 2024-03-13 11:10:56,386] Trial 1 finished with value: 0.8667161376615328 and parameters: {'n_estimators'

In [17]:
# Parameters sampled for the first recorded trial.
study_lgb.trials[0].params

{'n_estimators': 100,
 'max_depth': 115,
 'max_leaves': 138,
 'learning_rate': 0.29363111960193405,
 'gamma': 29.906670579354852,
 'min_child_weight': 40.487784151016285,
 'subsample': 0.551523587604274,
 'colsample_bytree': 0.3515826060138617,
 'reg_alpha': 0.7495992344875034,
 'reg_lambda': 0.44250024495795925}

In [23]:
# One row per trial with only the score and the sampled parameters, best score first.
trails_df = (
    study_lgb.trials_dataframe()
    .drop(columns=['number', 'datetime_start', 'datetime_complete', 'duration', 'state'])
    .sort_values(by='value', ascending=False)
)

In [26]:
# Keep the two best-scoring trials and display them.
trials_df_top = trails_df.iloc[:2]; trials_df_top

Unnamed: 0,value,params_colsample_bytree,params_gamma,params_learning_rate,params_max_depth,params_max_leaves,params_min_child_weight,params_n_estimators,params_reg_alpha,params_reg_lambda,params_subsample
1,0.866716,0.864916,1.808122,0.067666,124,143,29.069754,100,0.968157,0.745992,0.545719
0,0.850806,0.351583,29.906671,0.293631,115,138,40.487784,100,0.749599,0.4425,0.551524


In [29]:
# Shrink every search range in `prms` to the span covered by the top trials.
# Entries that are not [low, high] lists (e.g. fixed string settings), or that
# have no matching 'params_<name>' column, are left untouched; indexing them
# would otherwise abort the loop part-way through.
for key, bounds in prms.items():
    mod_key = 'params_' + key
    if not isinstance(bounds, list) or mod_key not in trials_df_top.columns:
        continue
    bounds[0] = trials_df_top[mod_key].min()
    bounds[1] = trials_df_top[mod_key].max()

In [30]:
# Show the search space after narrowing.
prms

{'n_estimators': [100, 100],
 'max_depth': [115, 124],
 'max_leaves': [138, 143],
 'learning_rate': [0.06766589891749072, 0.29363111960193405],
 'gamma': [1.8081217885080474, 29.906670579354852],
 'min_child_weight': [29.069754293706445, 40.487784151016285],
 'subsample': [0.5457190296010062, 0.551523587604274],
 'colsample_bytree': [0.3515826060138617, 0.8649163415929932],
 'reg_alpha': [0.7495992344875034, 0.9681573328050053],
 'reg_lambda': [0.44250024495795925, 0.74599217290967]}