In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import catboost
import re
import optuna

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (FunctionTransformer, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer, PowerTransformer,
                                   OneHotEncoder)
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold, StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin

col_names = []
with open('../data/Faults27x7_var','r') as f:
    for line in f:
        col_names.append(line.strip())
        
train_org = pd.read_csv('../data/train.csv')
test_org = pd.read_csv('../data/test.csv')
org_data = pd.read_csv('../data/Faults.NNA', delimiter='\s', engine='python', names=col_names)

# logistic regression
X = train_org.drop(['id','Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps','Other_Faults'], axis=1)
pastry = train_org['Pastry'].copy()
z_scratch = train_org['Z_Scratch'].copy()
k_scratch = train_org['K_Scatch'].copy()
stains = train_org['Stains'].copy()
dirtiness = train_org['Dirtiness'].copy()
bumps = train_org['Bumps'].copy()
other_faults = train_org['Other_Faults'].copy()

ys = [pastry, z_scratch, k_scratch, stains, dirtiness, bumps, other_faults]

class KMeansTransformer(BaseEstimator,TransformerMixin):
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=0)
        
    def fit(self,X, y=None):
        self.kmeans.fit(X)
        return self
        
    def transform(self,X):
        labels = self.kmeans.predict(X)
        return np.c_[X, labels]

In [17]:
def objective_stack_lgb_pastry(trial):
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    lgbc = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    model = StackingClassifier(
        estimators=[
            ('logreg', LogisticRegression())
        ], 
        final_estimator= lgbc,
        stack_method='predict_proba',
        passthrough=True
    )
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,pastry, scoring='roc_auc', cv= cvo))
    return score

In [18]:
study_stack_lgb_pastry = optuna.create_study(direction='maximize')

[I 2024-03-03 17:00:30,426] A new study created in memory with name: no-name-8e6f569a-2f58-4e9e-a02b-0898add0fb49


In [19]:
study_stack_lgb_pastry.optimize(objective_stack_lgb_pastry, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-03 17:01:03,740] Trial 0 finished with value: 0.8654072617331569 and parameters: {'n_estimators': 184, 'max_depth': 12, 'num_leaves': 2, 'learning_rate': 0.2590115552320802, 'min_child_samples': 214, 'min_child_weight': 7.375727194086049, 'subsample': 0.6433618058435139, 'colsample_bylevel': 0.6218645892327825, 'reg_alpha': 0.033828981002751134, 'reg_lambda': 0.06615969071455194}. Best is trial 0 with value: 0.8654072617331569.
[I 2024-03-03 17:01:06,391] Trial 2 finished with value: 0.857775179997728 and parameters: {'n_estimators': 239, 'max_depth': 46, 'num_leaves': 7, 'learning_rate': 0.2534709444758287, 'min_child_samples': 81, 'min_child_weight': 9.01516817716258, 'subsample': 0.4196765292096054, 'colsample_bylevel': 0.451393020929864, 'reg_alpha': 0.0937353587893418, 'reg_lambda': 0.09549115873929885}. Best is trial 0 with value: 0.8654072617331569.
[I 2024-03-03 17:01:08,212] Trial 4 finished with value: 0.8664304031689941 and parameters: {'n_estimators': 180, 'max_d

In [20]:
study_stack_lgb_pastry.best_params

{'n_estimators': 280,
 'max_depth': 9,
 'num_leaves': 18,
 'learning_rate': 0.02148602951813924,
 'min_child_samples': 341,
 'min_child_weight': 6.361004402846124,
 'subsample': 0.49585801139053814,
 'colsample_bylevel': 0.35930633871573076,
 'reg_alpha': 0.06447122051496705,
 'reg_lambda': 0.016497887759421646}

In [2]:
pastry_params = {'n_estimators': 280,
 'max_depth': 9,
 'num_leaves': 18,
 'learning_rate': 0.02148602951813924,
 'min_child_samples': 341,
 'min_child_weight': 6.361004402846124,
 'subsample': 0.49585801139053814,
 'colsample_bylevel': 0.35930633871573076,
 'reg_alpha': 0.06447122051496705,
 'reg_lambda': 0.016497887759421646}

pastry_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', lgb.LGBMClassifier(random_state= 0, objective='binary', verbose = -1,**pastry_params))
    ]
)
pastry_m.fit(X,pastry)
pastry_pred = pastry_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [21]:
def objective_stack_lgb_z_scratch(trial):
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    lgbc = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    model = StackingClassifier(
        estimators=[
            ('logreg', LogisticRegression())
        ], 
        final_estimator= lgbc,
        stack_method='predict_proba',
        passthrough=True
    )
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,z_scratch, scoring='roc_auc', cv= cvo))
    return score

In [22]:
study_stack_lgb_z_scratch = optuna.create_study(direction='maximize')

[I 2024-03-03 17:38:46,716] A new study created in memory with name: no-name-79868468-4213-4946-b24b-96720afc89fb


In [23]:
study_stack_lgb_z_scratch.optimize(objective_stack_lgb_z_scratch, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-03 17:39:04,403] Trial 4 finished with value: 0.960530778881149 and parameters: {'n_estimators': 301, 'max_depth': 2, 'num_leaves': 14, 'learning_rate': 0.08768990876249501, 'min_child_samples': 313, 'min_child_weight': 0.9613742169037168, 'subsample': 0.8430834607196884, 'colsample_bylevel': 0.5450777190844704, 'reg_alpha': 0.06793539242328991, 'reg_lambda': 0.042065755777485614}. Best is trial 4 with value: 0.960530778881149.
[I 2024-03-03 17:39:05,108] Trial 7 finished with value: 0.9504913351929467 and parameters: {'n_estimators': 134, 'max_depth': 18, 'num_leaves': 14, 'learning_rate': 0.2800848594241815, 'min_child_samples': 334, 'min_child_weight': 5.794008915979099, 'subsample': 0.7359788045986346, 'colsample_bylevel': 0.4017504162361352, 'reg_alpha': 0.03476043468924543, 'reg_lambda': 0.05567300929818739}. Best is trial 4 with value: 0.960530778881149.
[I 2024-03-03 17:39:07,798] Trial 0 finished with value: 0.9585785071117492 and parameters: {'n_estimators': 165, '

In [24]:
study_stack_lgb_z_scratch.best_params

{'n_estimators': 296,
 'max_depth': 3,
 'num_leaves': 74,
 'learning_rate': 0.03879534308586422,
 'min_child_samples': 298,
 'min_child_weight': 2.4215929637347218,
 'subsample': 0.47753292687280846,
 'colsample_bylevel': 0.35906419523398436,
 'reg_alpha': 0.057670762264904175,
 'reg_lambda': 0.0427161308925542}

In [3]:
z_scratch_params = {'n_estimators': 296,
 'max_depth': 3,
 'num_leaves': 74,
 'learning_rate': 0.03879534308586422,
 'min_child_samples': 298,
 'min_child_weight': 2.4215929637347218,
 'subsample': 0.47753292687280846,
 'colsample_bylevel': 0.35906419523398436,
 'reg_alpha': 0.057670762264904175,
 'reg_lambda': 0.0427161308925542}

z_scratch_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', lgb.LGBMClassifier(random_state= 0, objective='binary', verbose = -1,**z_scratch_params))
    ]
)
z_scratch_m.fit(X,z_scratch)
z_scratch_pred = z_scratch_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [25]:
def objective_stack_lgb_k_scatch(trial):
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    lgbc = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    model = StackingClassifier(
        estimators=[
            ('logreg', LogisticRegression())
        ], 
        final_estimator= lgbc,
        stack_method='predict_proba',
        passthrough=True
    )
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,k_scratch, scoring='roc_auc', cv= cvo))
    return score

In [26]:
study_stack_lgb_k_scatch = optuna.create_study(direction='maximize')

[I 2024-03-03 18:22:26,491] A new study created in memory with name: no-name-802b527f-0582-476b-bd06-28e480dfd0e4


In [27]:
study_stack_lgb_k_scatch.optimize(objective_stack_lgb_k_scatch, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-03 18:23:17,783] Trial 0 finished with value: 0.9847393899290603 and parameters: {'n_estimators': 126, 'max_depth': 7, 'num_leaves': 5, 'learning_rate': 0.23787494163726589, 'min_child_samples': 24, 'min_child_weight': 5.661833148798491, 'subsample': 0.694739302490523, 'colsample_bylevel': 0.5691215423569622, 'reg_alpha': 0.03262671906956733, 'reg_lambda': 0.07014502073913792}. Best is trial 0 with value: 0.9847393899290603.
[I 2024-03-03 18:23:21,568] Trial 11 finished with value: 0.984235821592909 and parameters: {'n_estimators': 271, 'max_depth': 4, 'num_leaves': 116, 'learning_rate': 0.1689099203669211, 'min_child_samples': 294, 'min_child_weight': 0.48386483265418884, 'subsample': 0.5672739621895317, 'colsample_bylevel': 0.3887908261727712, 'reg_alpha': 0.07209228125642478, 'reg_lambda': 0.02409738686552075}. Best is trial 0 with value: 0.9847393899290603.
[I 2024-03-03 18:23:23,859] Trial 2 finished with value: 0.982194085673331 and parameters: {'n_estimators': 285, 'm

In [28]:
study_stack_lgb_k_scatch.best_params

{'n_estimators': 165,
 'max_depth': 25,
 'num_leaves': 82,
 'learning_rate': 0.039218761275269305,
 'min_child_samples': 311,
 'min_child_weight': 2.0950790203327543,
 'subsample': 0.6650426132547665,
 'colsample_bylevel': 0.37920052637206214,
 'reg_alpha': 0.07068585892847412,
 'reg_lambda': 0.07335542586336943}

In [4]:
k_scratch_params = {'n_estimators': 271,
 'max_depth': 29,
 'num_leaves': 118,
 'learning_rate': 0.02560845874325205,
 'min_child_samples': 449,
 'min_child_weight': 1.802873155395419,
 'subsample': 0.7902749625886126,
 'colsample_bylevel': 0.3433292920431201,
 'reg_alpha': 0.047379430530416024,
 'reg_lambda': 0.010007517965380167}

k_scratch_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', lgb.LGBMClassifier(random_state= 0, objective='binary', verbose = -1,**k_scratch_params))
    ]
)
k_scratch_m.fit(X,k_scratch)
k_scratch_pred = k_scratch_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [29]:
def objective_stack_lgb_stains(trial):
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    lgbc = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    model = StackingClassifier(
        estimators=[
            ('logreg', LogisticRegression())
        ], 
        final_estimator= lgbc,
        stack_method='predict_proba',
        passthrough=True
    )
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,stains, scoring='roc_auc', cv= cvo))
    return score

In [30]:
study_stack_lgb_stains = optuna.create_study(direction='maximize')

[I 2024-03-03 19:09:18,500] A new study created in memory with name: no-name-c34924ca-1156-46f3-8f78-8cc03292d129


In [31]:
study_stack_lgb_stains.optimize(objective_stack_lgb_stains, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-03 19:10:09,757] Trial 5 finished with value: 0.9911710565291598 and parameters: {'n_estimators': 135, 'max_depth': 57, 'num_leaves': 48, 'learning_rate': 0.20770839528589122, 'min_child_samples': 314, 'min_child_weight': 8.541897161088082, 'subsample': 0.5617020780691977, 'colsample_bylevel': 0.37158405030824376, 'reg_alpha': 0.01207476708909198, 'reg_lambda': 0.031269300806795186}. Best is trial 5 with value: 0.9911710565291598.
[I 2024-03-03 19:10:10,211] Trial 9 finished with value: 0.992371333307724 and parameters: {'n_estimators': 149, 'max_depth': 7, 'num_leaves': 56, 'learning_rate': 0.06216033762946182, 'min_child_samples': 456, 'min_child_weight': 6.952569458895605, 'subsample': 0.6833371024548366, 'colsample_bylevel': 0.6965382108174839, 'reg_alpha': 0.04362592086769989, 'reg_lambda': 0.06411505569679747}. Best is trial 9 with value: 0.992371333307724.
[I 2024-03-03 19:10:11,004] Trial 6 finished with value: 0.9918586120998627 and parameters: {'n_estimators': 152,

In [32]:
study_stack_lgb_stains.best_params

{'n_estimators': 266,
 'max_depth': 5,
 'num_leaves': 75,
 'learning_rate': 0.027151102431374634,
 'min_child_samples': 170,
 'min_child_weight': 1.024782187730875,
 'subsample': 0.5803990695479041,
 'colsample_bylevel': 0.6394813204481101,
 'reg_alpha': 0.09477440109735585,
 'reg_lambda': 0.05246070547701958}

In [5]:
stains_params = {'n_estimators': 266,
 'max_depth': 5,
 'num_leaves': 75,
 'learning_rate': 0.027151102431374634,
 'min_child_samples': 170,
 'min_child_weight': 1.024782187730875,
 'subsample': 0.5803990695479041,
 'colsample_bylevel': 0.6394813204481101,
 'reg_alpha': 0.09477440109735585,
 'reg_lambda': 0.05246070547701958}

stains_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', lgb.LGBMClassifier(random_state= 0, objective='binary', verbose = -1,**stains_params))
    ]
)
stains_m.fit(X,stains)
stains_pred = stains_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [33]:
def objective_stack_lgb_dirtiness(trial):
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    lgbc = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    model = StackingClassifier(
        estimators=[
            ('logreg', LogisticRegression())
        ], 
        final_estimator= lgbc,
        stack_method='predict_proba',
        passthrough=True
    )
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,dirtiness, scoring='roc_auc', cv= cvo))
    return score

In [34]:
study_stack_lgb_dirtiness = optuna.create_study(direction='maximize')

[I 2024-03-03 19:47:54,875] A new study created in memory with name: no-name-a6880c56-338a-4fc6-9b07-42d4ae63a709


In [35]:
study_stack_lgb_dirtiness.optimize(objective_stack_lgb_dirtiness, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-03 19:48:34,737] Trial 8 finished with value: 0.8797506479781243 and parameters: {'n_estimators': 144, 'max_depth': 7, 'num_leaves': 8, 'learning_rate': 0.11885568355814391, 'min_child_samples': 337, 'min_child_weight': 0.13496123032082194, 'subsample': 0.705696394338552, 'colsample_bylevel': 0.4446828143443458, 'reg_alpha': 0.044205601418915136, 'reg_lambda': 0.09514976517193034}. Best is trial 8 with value: 0.8797506479781243.
[I 2024-03-03 19:48:36,446] Trial 9 finished with value: 0.8889124201829709 and parameters: {'n_estimators': 115, 'max_depth': 13, 'num_leaves': 25, 'learning_rate': 0.02548542385755895, 'min_child_samples': 136, 'min_child_weight': 1.7646675143327613, 'subsample': 0.34273992249448654, 'colsample_bylevel': 0.5158610309543519, 'reg_alpha': 0.0689215729223927, 'reg_lambda': 0.07383577234563478}. Best is trial 9 with value: 0.8889124201829709.
[I 2024-03-03 19:48:37,394] Trial 0 finished with value: 0.8811087412209918 and parameters: {'n_estimators': 10

In [None]:
study_stack_lgb_dirtiness = optuna.create_study(direction='maximize')

[I 2024-03-03 19:47:54,875] A new study created in memory with name: no-name-a6880c56-338a-4fc6-9b07-42d4ae63a709


In [None]:
study_stack_lgb_dirtiness.optimize(objective_stack_lgb_dirtiness, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-03 19:48:34,737] Trial 8 finished with value: 0.8797506479781243 and parameters: {'n_estimators': 144, 'max_depth': 7, 'num_leaves': 8, 'learning_rate': 0.11885568355814391, 'min_child_samples': 337, 'min_child_weight': 0.13496123032082194, 'subsample': 0.705696394338552, 'colsample_bylevel': 0.4446828143443458, 'reg_alpha': 0.044205601418915136, 'reg_lambda': 0.09514976517193034}. Best is trial 8 with value: 0.8797506479781243.
[I 2024-03-03 19:48:36,446] Trial 9 finished with value: 0.8889124201829709 and parameters: {'n_estimators': 115, 'max_depth': 13, 'num_leaves': 25, 'learning_rate': 0.02548542385755895, 'min_child_samples': 136, 'min_child_weight': 1.7646675143327613, 'subsample': 0.34273992249448654, 'colsample_bylevel': 0.5158610309543519, 'reg_alpha': 0.0689215729223927, 'reg_lambda': 0.07383577234563478}. Best is trial 9 with value: 0.8889124201829709.
[I 2024-03-03 19:48:37,394] Trial 0 finished with value: 0.8811087412209918 and parameters: {'n_estimators': 10

In [36]:
study_stack_lgb_dirtiness.best_params

{'n_estimators': 203,
 'max_depth': 10,
 'num_leaves': 118,
 'learning_rate': 0.01713016421956772,
 'min_child_samples': 91,
 'min_child_weight': 1.180539507651263,
 'subsample': 0.8158594170623332,
 'colsample_bylevel': 0.3425634708462354,
 'reg_alpha': 0.053339818014764395,
 'reg_lambda': 0.0065811775175819804}

In [6]:
dirtiness_params = {'n_estimators': 100,
 'max_depth': 13,
 'num_leaves': 62,
 'learning_rate': 0.04095550709994935,
 'min_child_samples': 331,
 'min_child_weight': 0.1953843881250878,
 'subsample': 0.4674944721594767,
 'colsample_bylevel': 0.33072889947558426,
 'reg_alpha': 0.07067954909592045,
 'reg_lambda': 0.0017098736746246962}

dirtiness_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', lgb.LGBMClassifier(random_state= 0, objective='binary', verbose = -1,**dirtiness_params))
    ]
)
dirtiness_m.fit(X,dirtiness)
dirtiness_pred = dirtiness_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [37]:
def objective_stack_lgb_bumps(trial):
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    lgbc = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    model = StackingClassifier(
        estimators=[
            ('logreg', LogisticRegression())
        ], 
        final_estimator= lgbc,
        stack_method='predict_proba',
        passthrough=True
    )
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,bumps, scoring='roc_auc', cv= cvo))
    return score

In [38]:
study_stack_lgb_bumps = optuna.create_study(direction='maximize')

[I 2024-03-03 20:23:59,438] A new study created in memory with name: no-name-e07fd003-bebc-4ffb-a8d3-f717d2cad57f


In [39]:
study_stack_lgb_bumps.optimize(objective_stack_lgb_bumps, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-03 20:24:23,849] Trial 10 finished with value: 0.7972415162954498 and parameters: {'n_estimators': 143, 'max_depth': 16, 'num_leaves': 48, 'learning_rate': 0.14715384655684072, 'min_child_samples': 457, 'min_child_weight': 1.8557308231983398, 'subsample': 0.5577678933373795, 'colsample_bylevel': 0.5571149943900406, 'reg_alpha': 0.007600965559821776, 'reg_lambda': 0.054021452955823536}. Best is trial 10 with value: 0.7972415162954498.
[I 2024-03-03 20:24:24,265] Trial 2 finished with value: 0.7990425398779413 and parameters: {'n_estimators': 259, 'max_depth': 6, 'num_leaves': 113, 'learning_rate': 0.13115758440126915, 'min_child_samples': 427, 'min_child_weight': 6.439349551168621, 'subsample': 0.7545282253836547, 'colsample_bylevel': 0.5735516669669373, 'reg_alpha': 0.06277084428519855, 'reg_lambda': 0.029119357932858}. Best is trial 2 with value: 0.7990425398779413.
[I 2024-03-03 20:24:24,313] Trial 5 finished with value: 0.8080665936208884 and parameters: {'n_estimators': 

In [40]:
study_stack_lgb_bumps.best_params

{'n_estimators': 331,
 'max_depth': 50,
 'num_leaves': 18,
 'learning_rate': 0.02532698833869288,
 'min_child_samples': 301,
 'min_child_weight': 3.3586368002493936,
 'subsample': 0.5651245378254034,
 'colsample_bylevel': 0.49272822687245355,
 'reg_alpha': 0.025556571381612034,
 'reg_lambda': 0.033444917506411945}

In [7]:
bumps_params = {'n_estimators': 450,
 'max_depth': 8,
 'num_leaves': 44,
 'learning_rate': 0.012849208203633231,
 'min_child_samples': 322,
 'min_child_weight': 1.2745300791059089,
 'subsample': 0.5869159792528273,
 'colsample_bylevel': 0.46785554903946636,
 'reg_alpha': 0.05444067251372892,
 'reg_lambda': 0.04278704991242975}

bumps_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', lgb.LGBMClassifier(random_state= 0, objective='binary', verbose = -1,**bumps_params))
    ]
)
bumps_m.fit(X,bumps)
bumps_pred = bumps_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [41]:
def objective_stack_lgb_other_faults(trial):
    
    cvo = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    params = dict(
        n_estimators = trial.suggest_int('n_estimators',100,500),
        max_depth = trial.suggest_int('max_depth',2,64),
        num_leaves = trial.suggest_int('num_leaves',2,128),
        learning_rate = trial.suggest_float('learning_rate',0.001,0.3),
        min_child_samples = trial.suggest_int('min_child_samples',2,500),
        min_child_weight = trial.suggest_float('min_child_weight', 0.01,10),
        subsample = trial.suggest_float('subsample', 0.33,0.85),
        colsample_bytree = trial.suggest_float('colsample_bylevel',0.33,0.7),
        reg_alpha=trial.suggest_float('reg_alpha', 0.001, 0.1),
        reg_lambda = trial.suggest_float('reg_lambda', 0.001,0.1)
    )
    
    lgbc = lgb.LGBMClassifier(random_state= 0, objective='binary', verbose=-1,**params)
    model = StackingClassifier(
        estimators=[
            ('logreg', LogisticRegression())
        ], 
        final_estimator= lgbc,
        stack_method='predict_proba',
        passthrough=True
    )
    
    pipe = Pipeline(
        steps = [
            ('scaler', StandardScaler()),
            ('kmeans', KMeansTransformer(n_clusters=5)),
            ('model', model)
        ]
    )
    
    score = np.mean(cross_val_score(pipe, X,other_faults, scoring='roc_auc', cv= cvo))
    return score

In [42]:
study_stack_lgb_other_faults = optuna.create_study(direction='maximize')

[I 2024-03-03 21:17:10,949] A new study created in memory with name: no-name-fc19cc00-32ca-4d58-92fb-45ae29225dde


In [43]:
study_stack_lgb_other_faults.optimize(objective_stack_lgb_other_faults, n_trials=1000, n_jobs=-1, show_progress_bar=True)

  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-03-03 21:18:04,931] Trial 8 finished with value: 0.7024483299917061 and parameters: {'n_estimators': 178, 'max_depth': 39, 'num_leaves': 4, 'learning_rate': 0.254869760629667, 'min_child_samples': 335, 'min_child_weight': 1.746849086927732, 'subsample': 0.7761953126472041, 'colsample_bylevel': 0.6710260461103333, 'reg_alpha': 0.00631239499570064, 'reg_lambda': 0.04057320219101983}. Best is trial 8 with value: 0.7024483299917061.
[I 2024-03-03 21:18:05,666] Trial 7 finished with value: 0.6965974804753023 and parameters: {'n_estimators': 402, 'max_depth': 10, 'num_leaves': 2, 'learning_rate': 0.0501971762817943, 'min_child_samples': 32, 'min_child_weight': 6.697452674841072, 'subsample': 0.4230400938790368, 'colsample_bylevel': 0.3730535601737534, 'reg_alpha': 0.029894276101817125, 'reg_lambda': 0.015118762874917615}. Best is trial 8 with value: 0.7024483299917061.
[I 2024-03-03 21:18:06,299] Trial 4 finished with value: 0.6954284923918528 and parameters: {'n_estimators': 280, 'm

In [8]:
other_faults_params = {'n_estimators': 254,
 'max_depth': 7,
 'num_leaves': 18,
 'learning_rate': 0.020543880031226605,
 'min_child_samples': 378,
 'min_child_weight': 8.98045597340312,
 'subsample': 0.6398999645373245,
 'colsample_bylevel': 0.6110641328208201,
 'reg_alpha': 0.09445215428123344,
 'reg_lambda': 0.010077108649068555}

other_faults_m = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('kmeans', KMeansTransformer(n_clusters=5)),
        ('model', lgb.LGBMClassifier(random_state= 0, objective='binary', verbose = -1,**other_faults_params))
    ]
)
other_faults_m.fit(X,other_faults)
other_faults_pred = other_faults_m.predict_proba(test_org.drop(['id'], axis=1))[:,1]

In [9]:
sub = pd.DataFrame({'id':test_org['id'].copy(), 'Pastry':pastry_pred, 'Z_Scratch':z_scratch_pred, 'K_Scatch':k_scratch_pred,
                    'Stains':stains_pred, 'Dirtiness':dirtiness_pred, 'Bumps':bumps_pred, 'Other_Faults':other_faults_pred})

In [10]:
sub.head()

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.585735,0.001115,0.002984,7.4e-05,0.011791,0.159638,0.353399
1,19220,0.271087,0.011868,0.002667,9e-05,0.165159,0.150879,0.330532
2,19221,0.002238,0.038427,0.037689,0.000339,0.002944,0.308827,0.508441
3,19222,0.158231,0.00067,0.000375,0.001257,0.013434,0.376329,0.425821
4,19223,0.003147,0.001589,0.000338,0.001265,0.006323,0.565537,0.389942


In [11]:
(0.8721124548680992 + 0.9613490481775984 + 0.9860986294660549 + 0.9931795666268013 + 0.8967476825823397 + 0.8119434051286692 + 0.709327117443453)/7

0.8901082720418595

In [None]:
sub.to_csv('../submissions/m4.csv', index=False)