In [2]:
import numpy as np
import pandas as pd

from engineering import full_feature_engineering

In [3]:
# Read the raw training table and show summary statistics for its numeric columns.
train_csv = './data/train.csv'
df = pd.read_csv(train_csv)
df.describe()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
count,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,...,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0
mean,9609.0,709.854675,753.857641,1849756.0,1846605.0,1683.987616,95.654665,64.124096,191846.7,84.808419,...,0.102742,-0.138382,0.571902,0.076279,0.059837,0.178573,0.029554,0.025235,0.247828,0.341225
std,5548.191747,531.544189,499.836603,1903554.0,1896295.0,3730.319865,177.821382,101.054178,442024.7,28.800344,...,0.487681,0.120344,0.332219,0.26545,0.23719,0.383005,0.169358,0.156844,0.431762,0.474133
min,0.0,0.0,4.0,6712.0,6724.0,6.0,2.0,1.0,250.0,0.0,...,-0.9884,-0.885,0.119,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4804.5,49.0,214.0,657468.0,657502.0,89.0,15.0,14.0,9848.0,70.0,...,-0.2727,-0.1925,0.2532,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,9609.0,777.0,796.0,1398169.0,1398179.0,168.0,25.0,23.0,18238.0,90.0,...,0.1111,-0.1426,0.4729,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,14413.5,1152.0,1165.0,2368032.0,2362511.0,653.0,64.0,61.0,67978.0,105.0,...,0.5294,-0.084,0.9994,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,19218.0,1705.0,1713.0,12987660.0,12987690.0,152655.0,7553.0,903.0,11591410.0,196.0,...,0.9917,0.6421,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## XGBoost + Feature Importances

Iterative Elimination Rounds: 

0) 0.88260
1) 0.88273

The score already began dropping after round 1 when we tried to eliminate further low-importance features.

In [99]:
from sklearn.model_selection import GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Columns handed to full_feature_engineering as `drop_cols`
# (presumably excluded from the engineered frame -- confirm in engineering.py).
drop_cols = ['Y_Maximum', 'Y_Minimum', 'X_Maximum', 'X_Minimum',
             'Log_Outside_X_Index', 'Log_X_Perimeter', 'Log_Y_Perimeter',
             'Log_Width', 'Log_Lum', 'Log_Height', 'Height', 'pca_3',
             'pca_28', 'pca_25', 'pca_30', 'pca_33',]

train_df = full_feature_engineering(df, drop_cols=drop_cols)
y_cols = ['No Defect', 'Pastry', 'Z_Scratch', 'K_Scatch',
          'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
y = train_df[y_cols]
# Everything that is not a target is a feature; `id` is only used for grouping.
X = train_df.drop(columns=y_cols)
ids = X['id']
X = X.drop(columns=['id'])

#------------------------------------------------------------------------

n_splits = 5
estimators = 1000
stopping = 5
subsample = 0.9
lr = 0.05

xgboosts = []
gkf = GroupKFold(n_splits=n_splits)
aucs = np.zeros(len(y_cols))  # running sum of per-target AUCs over the folds

for fold, (train_index, valid_index) in enumerate(gkf.split(X, y, ids)):
    print(f'Fold: {fold}')

    # split() yields positional row numbers, so select with .iloc; .loc would
    # interpret them as index labels and is only right on a 0..n-1 index.
    train_X, train_y = X.iloc[train_index], y.iloc[train_index]
    valid_X, valid_y = X.iloc[valid_index], y.iloc[valid_index]

    # Fit Model
    # NOTE: the validation fold also drives early stopping, so the AUCs
    # computed on it below are somewhat optimistic.
    model = XGBClassifier(n_estimators=estimators,
                          early_stopping_rounds=stopping,
                          learning_rate=lr,
                          subsample=subsample,
                          n_jobs=-1)
    model.fit(train_X, train_y,
              eval_set=[(valid_X, valid_y)], verbose=0)
    xgboosts.append(model)

    # Per-target ROC AUC on the validation fold
    preds = model.predict_proba(valid_X)
    aucs += np.array(roc_auc_score(valid_y, preds,
                     multi_class='ovr', average=None))

val_aucs = pd.Series(aucs, index=y_cols) / n_splits
print('='*50)

# Mean feature importance across the fold models, printed in descending order.
importances = np.mean([fold_model.feature_importances_ for fold_model in xgboosts],
                      axis=0)
order = np.argsort(importances)[::-1]
Xcols = np.array(X.columns)[order]
sortd = importances[order]
for col, score in zip(Xcols, sortd):
    print(f'{col}: {score:.4f}')

print('='*50)
# The reported mean covers the seven fault targets only ('No Defect' is left out).
real_cols = ['Pastry', 'Z_Scratch', 'K_Scatch',
             'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
print(f'Mean AUC: {val_aucs[real_cols].mean():.5f}')

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
pca_0: 0.2480
TypeOfSteel_A300: 0.0876
LogOfAreas: 0.0845
Pixels_Areas: 0.0435
Orientation_Index: 0.0298
Steel_Plate_Thickness: 0.0292
Outside_X_Index: 0.0245
Length_of_Conveyer: 0.0239
Edges_Y_Index: 0.0187
Y_Perimeter: 0.0187
Width: 0.0181
pca_1: 0.0163
pca_2: 0.0161
Log_Y_Index: 0.0124
Outside_Global_Index: 0.0112
Abs_Orientation: 0.0111
pca_8: 0.0105
Log_Range: 0.0103
Edges_Index: 0.0102
Minimum_of_Luminosity: 0.0096
TypeOfSteel_A400: 0.0088
X_Perimeter: 0.0086
Maximum_of_Luminosity: 0.0084
pca_7: 0.0082
pca_9: 0.0081
Square_Index: 0.0078
Log_X_Index: 0.0075
pca_12: 0.0071
pca_11: 0.0070
Luminosity_Index: 0.0069
pca_6: 0.0069
pca_22: 0.0069
Sum_of_Luminosity: 0.0068
Edges_X_Index: 0.0068
pca_13: 0.0068
SigmoidOfAreas: 0.0067
Empty_Index: 0.0067
pca_21: 0.0066
pca_5: 0.0065
pca_14: 0.0063
pca_31: 0.0063
pca_34: 0.0063
pca_10: 0.0063
Log_Avg_Lum: 0.0062
pca_20: 0.0062
pca_24: 0.0062
pca_18: 0.0061
pca_29: 0.0060
pca_37: 0.0060
pca_4: 0.0060
pca

Now, we'll proceed with leave-one-out feature selection.

In [133]:
from sklearn.model_selection import GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

def leave_one_out(X, y, ids, col, n_splits=5):
    """Return the cross-validated mean fault AUC with feature `col` left out.

    Args:
        X: Feature DataFrame.
        y: Target DataFrame (one column per label); must contain the seven
            fault columns averaged below.
        ids: Group labels handed to ``GroupKFold.split``.
        col: Name of the feature column to drop before fitting, or ``None``
            to score the full feature set.
        n_splits: Number of GroupKFold folds (default 5).

    Returns:
        The mean, over the seven fault columns, of the fold-averaged ROC AUC.
        The 'No Defect' column is excluded from that mean.
    """
    estimators = 1000
    stopping = 5
    subsample = 1.0
    lr = 0.05
    real_cols = ['Pastry', 'Z_Scratch', 'K_Scatch',
                 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

    # Drop the candidate column once instead of once per fold and per split.
    features = X if col is None else X.drop(columns=[col])

    aucs = np.zeros(y.shape[1])  # running sum of per-target AUCs over the folds
    gkf = GroupKFold(n_splits=n_splits)
    for train_index, valid_index in gkf.split(X, y, ids):
        # split() yields positional row numbers, so select with .iloc; .loc
        # would interpret them as index labels.
        train_X, valid_X = features.iloc[train_index], features.iloc[valid_index]
        train_y, valid_y = y.iloc[train_index], y.iloc[valid_index]

        # Fit Model (the validation fold also drives early stopping)
        model = XGBClassifier(n_estimators=estimators,
                              early_stopping_rounds=stopping,
                              learning_rate=lr,
                              subsample=subsample,
                              n_jobs=-1)
        model.fit(train_X, train_y,
                  eval_set=[(valid_X, valid_y)], verbose=0)

        # Per-target ROC AUC on the validation fold
        preds = model.predict_proba(valid_X)
        aucs += np.array(roc_auc_score(valid_y, preds,
                         multi_class='ovr', average=None))

    # Label the averages with the columns of the `y` argument rather than a
    # notebook-level global, so the function only depends on its inputs.
    val_aucs = pd.Series(aucs, index=y.columns) / n_splits
    return val_aucs[real_cols].mean()

In [141]:
# Columns handed to full_feature_engineering as `drop_cols`; the list grows by
# one name each time the scan below finds a feature whose removal helps.
drop_cols = ['Y_Maximum', 'Y_Minimum', 'X_Maximum', 'X_Minimum',
             'Log_Outside_X_Index', 'Log_X_Perimeter', 'Log_Y_Perimeter',
             'Log_Width', 'Log_Lum', 'Log_Height', 'Height', 'pca_3',
             'pca_28', 'pca_25', 'pca_30', 'pca_33', 'pca_36', 'pca_31',
             'pca_37', 'pca_35', 'pca_26', 'pca_23', 'pca_20', 'pca_19',
             'pca_17', 'pca_16', 'pca_9', 'pca_2', 'pca_1', 'LogOfAreas',
             'Edges_X_Index', 'pca_13', 'pca_29', 'pca_22', 'pca_14', 'pca_32',
             'pca_34', 'pca_18', 'pca_27', 'pca_15', 'pca_11', 'Outside_Global_Index',
             'pca_21', 'pca_10', 'pca_8', 'pca_7', 'Log_X_Index', 'pca_24', 'Log_Y_Index',
             ]

train_df = full_feature_engineering(df, drop_cols=drop_cols)
y_cols = ['No Defect', 'Pastry', 'Z_Scratch', 'K_Scatch',
          'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
y = train_df[y_cols]
# Everything that is not a target is a feature; `id` is only used for grouping.
X = train_df.drop(columns=y_cols)
ids = X['id']
X = X.drop(columns=['id'])

# Score with every remaining feature present; each candidate is compared to this.
baseline = leave_one_out(X, y, ids, None)

print(f'Baseline: {baseline:.5f}')
print('='*50)
print('Improvement when Eliminated: ')

# Position in the reversed column list at which the scan starts. Per the
# original note, raising it lets the scan continue after a deletion instead of
# restarting from the first column. 0 scans every feature.
resume_from = 0
n_features = len(X.columns)

for i, col in enumerate(X.columns[::-1]):
    if i < resume_from:
        continue
    improvement = leave_one_out(X, y, ids, col) - baseline
    print(f'{i+1}/{n_features} : {col} : {improvement:.5f}')
    # Stop at the first feature whose removal beats the baseline.
    if improvement > 0:
        break

Baseline: 0.88480
Improvement when Eliminated: 
1/27 : pca_12 : -0.00034
2/27 : pca_6 : -0.00076
3/27 : pca_5 : -0.00067
4/27 : pca_4 : -0.00004
5/27 : pca_0 : -0.00012
6/27 : Log_Avg_Lum : -0.00014
7/27 : Log_Range : -0.00075
8/27 : Abs_Orientation : -0.00032
9/27 : Width : -0.00023
10/27 : SigmoidOfAreas : -0.00015
11/27 : Luminosity_Index : -0.00045
12/27 : Orientation_Index : -0.00087
13/27 : Edges_Y_Index : -0.00144
14/27 : Outside_X_Index : -0.00096
15/27 : Square_Index : -0.00032
16/27 : Empty_Index : -0.00027
17/27 : Edges_Index : -0.00208
18/27 : Steel_Plate_Thickness : -0.01016
19/27 : TypeOfSteel_A400 : -0.00025
20/27 : TypeOfSteel_A300 : -0.00019
21/27 : Length_of_Conveyer : -0.00397
22/27 : Maximum_of_Luminosity : -0.00006
23/27 : Minimum_of_Luminosity : -0.00113
24/27 : Sum_of_Luminosity : -0.00027
25/27 : Y_Perimeter : -0.00057
26/27 : X_Perimeter : -0.00011
27/27 : Pixels_Areas : -0.00042


Iterations:

Starting with...
```
folds = 3
estimators = 500
stopping = 3
subsample = 1.0
lr = 0.05
```

0) 0.87918
1) 0.87928
2) 0.87941
3) 0.87945
4) 0.87962
5) 0.87978
6) 0.88011
7) 0.88019
8) 0.88032
9) 0.88049
10) 0.88088
11) 0.88094
12) 0.88133
13) 0.88149
14) 0.88158
15) 0.88162
16) 0.88167
17) 0.88168
18) 0.88170
19) 0.88202
20) 0.88210
21) 0.88212
22) 0.88229
23) 0.88236
24) 0.88257
25) 0.88303
26) 0.88326

Continuing with...
```
folds = 5
estimators = 1000
stopping = 5
subsample = 1.0
lr = 0.05
```

0) 0.88415
1) 0.88416
2) 0.88462
3) 0.88478
4) 0.88476 (-0.00002)
5) 0.88479
6) 0.88480

We were able to iteratively remove **32** features.

## Comparing Selected Features to Baseline

In [142]:
# The first sixteen names are the drop list used for the feature-importance run;
# the rest were added during leave-one-out selection. Order is preserved.
initial_drops = [
    'Y_Maximum', 'Y_Minimum', 'X_Maximum', 'X_Minimum',
    'Log_Outside_X_Index', 'Log_X_Perimeter', 'Log_Y_Perimeter',
    'Log_Width', 'Log_Lum', 'Log_Height', 'Height', 'pca_3',
    'pca_28', 'pca_25', 'pca_30', 'pca_33',
]
selection_drops = [
    'pca_36', 'pca_31', 'pca_37', 'pca_35', 'pca_26', 'pca_23',
    'pca_20', 'pca_19', 'pca_17', 'pca_16', 'pca_9', 'pca_2',
    'pca_1', 'LogOfAreas', 'Edges_X_Index', 'pca_13', 'pca_29',
    'pca_22', 'pca_14', 'pca_32', 'pca_34', 'pca_18', 'pca_27',
    'pca_15', 'pca_11', 'Outside_Global_Index', 'pca_21', 'pca_10',
    'pca_8', 'pca_7', 'Log_X_Index', 'pca_24', 'Log_Y_Index',
]
drop_cols = initial_drops + selection_drops

train_df = full_feature_engineering(df, drop_cols=drop_cols)

# Targets: 'No Defect' plus the seven fault indicators.
y_cols = ['No Defect', 'Pastry', 'Z_Scratch', 'K_Scatch',
          'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
y = train_df[y_cols]

# Features are all non-target columns; `id` is split off for use as the CV group key.
X = train_df.drop(columns=y_cols)
ids = X['id']
X = X.drop(columns=['id'])

In [143]:
from sklearn.model_selection import GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

n_splits = 5
estimators = 1000
stopping = 5
lr = 0.05

xgboosts = []
gkf = GroupKFold(n_splits=n_splits)
aucs = np.zeros(len(y_cols))  # running sum of per-target AUCs over the folds

for fold, (train_index, valid_index) in enumerate(gkf.split(X, y, ids)):
    print(f'Fold: {fold}')

    # split() yields positional row numbers, so select with .iloc; .loc would
    # interpret them as index labels and is only right on a 0..n-1 index.
    train_X, train_y = X.iloc[train_index], y.iloc[train_index]
    valid_X, valid_y = X.iloc[valid_index], y.iloc[valid_index]

    # Fit Model
    # NOTE: the validation fold also drives early stopping, so the AUCs
    # computed on it below are somewhat optimistic.
    model = XGBClassifier(n_estimators=estimators,
                          early_stopping_rounds=stopping,
                          learning_rate=lr,
                          n_jobs=4)
    model.fit(train_X, train_y,
              eval_set=[(valid_X, valid_y)], verbose=0)
    xgboosts.append(model)

    # Per-target ROC AUC on the validation fold
    preds = model.predict_proba(valid_X)
    aucs += np.array(roc_auc_score(valid_y, preds,
                     multi_class='ovr', average=None))

val_aucs = pd.Series(aucs, index=y_cols) / n_splits
display(val_aucs)
# The headline number averages the seven fault targets only ('No Defect' is left out).
real_cols = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 
             'Dirtiness', 'Bumps', 'Other_Faults']
print(f'Mean AUC after Feature Selection: {val_aucs[real_cols].mean():.5f}')

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4


No Defect       0.640116
Pastry          0.865461
Z_Scratch       0.956900
K_Scatch        0.984899
Stains          0.991477
Dirtiness       0.883928
Bumps           0.806348
Other_Faults    0.704560
dtype: float64

Mean AUC after Feature Selection: 0.88480


We improved our baseline of `0.88454` to `0.88480` by feature engineering and selection.

## Hyperparameter Optimization

### First Round

In [151]:
from sklearn.model_selection import GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# The first sixteen names are the drop list used for the feature-importance run;
# the rest were added during leave-one-out selection. Order is preserved.
initial_drops = [
    'Y_Maximum', 'Y_Minimum', 'X_Maximum', 'X_Minimum',
    'Log_Outside_X_Index', 'Log_X_Perimeter', 'Log_Y_Perimeter',
    'Log_Width', 'Log_Lum', 'Log_Height', 'Height', 'pca_3',
    'pca_28', 'pca_25', 'pca_30', 'pca_33',
]
selection_drops = [
    'pca_36', 'pca_31', 'pca_37', 'pca_35', 'pca_26', 'pca_23',
    'pca_20', 'pca_19', 'pca_17', 'pca_16', 'pca_9', 'pca_2',
    'pca_1', 'LogOfAreas', 'Edges_X_Index', 'pca_13', 'pca_29',
    'pca_22', 'pca_14', 'pca_32', 'pca_34', 'pca_18', 'pca_27',
    'pca_15', 'pca_11', 'Outside_Global_Index', 'pca_21', 'pca_10',
    'pca_8', 'pca_7', 'Log_X_Index', 'pca_24', 'Log_Y_Index',
]
drop_cols = initial_drops + selection_drops

train_df = full_feature_engineering(df, drop_cols=drop_cols)

# All eight targets are modelled; only the seven fault targets are scored.
y_cols = ['No Defect', 'Pastry', 'Z_Scratch', 'K_Scatch',
          'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
real_cols = ['Pastry', 'Z_Scratch', 'K_Scatch',
             'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
y = train_df[y_cols]

# Features are all non-target columns; `id` is split off for use as the CV group key.
X = train_df.drop(columns=y_cols)
ids = X['id']
X = X.drop(columns=['id'])

def cv_score(params, n_splits=5):
    """Return the cross-validated mean fault AUC for one XGBoost configuration.

    Reads the notebook-level ``X``, ``y``, ``ids``, ``y_cols`` and ``real_cols``.

    Args:
        params: Keyword arguments forwarded unchanged to ``XGBClassifier``.
        n_splits: Number of GroupKFold folds (default 5).

    Returns:
        The mean, over ``real_cols``, of the fold-averaged per-target ROC AUC.
    """
    gkf = GroupKFold(n_splits=n_splits)
    aucs = np.zeros(len(y_cols))  # running sum of per-target AUCs over the folds

    for train_index, valid_index in gkf.split(X, y, ids):
        # split() yields positional row numbers, so select with .iloc; .loc
        # would interpret them as index labels.
        train_X, train_y = X.iloc[train_index], y.iloc[train_index]
        valid_X, valid_y = X.iloc[valid_index], y.iloc[valid_index]

        # Fit Model (the validation fold is also the early-stopping eval set)
        model = XGBClassifier(**params)
        model.fit(train_X, train_y, eval_set=[(valid_X, valid_y)], verbose=0)

        # Per-target ROC AUC on the validation fold
        preds = model.predict_proba(valid_X)
        aucs += np.array(roc_auc_score(valid_y, preds,
                         multi_class='ovr', average=None))

    val_aucs = pd.Series(aucs, index=y_cols) / n_splits
    return val_aucs[real_cols].mean()

def objective(trial):
    """Optuna objective for the first search round: score one sampled configuration.

    Samples early-stopping patience, learning rate, `lambda`, `alpha` and
    subsample from `trial`, and returns `cv_score` for the resulting parameters.
    """
    fixed = {'n_estimators': 1500, 'n_jobs': -1}

    # Sampled in the same order as the keys appear below.
    searched = {}
    searched['early_stopping_rounds'] = trial.suggest_int('early_stopping_rounds', 3, 12)
    searched['learning_rate'] = trial.suggest_float('learning_rate', 5e-3, 1e-1, log=True)
    searched['lambda'] = trial.suggest_float('lambda', 1e-8, 1e-2, log=True)
    searched['alpha'] = trial.suggest_float('alpha', 1e-4, 1e2, log=True)
    searched['subsample'] = trial.suggest_float('subsample', 0.2, 1.0, log=True)

    return cv_score({**fixed, **searched})

In [152]:
import optuna
# Seed the sampler so the sequence of suggested parameters is repeatable on a
# fresh kernel; an unseeded study proposes different trials on every run.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=25)

[I 2024-04-11 03:04:20,001] A new study created in memory with name: no-name-9360f822-b233-4ed0-bd63-338edbee3ebf
[I 2024-04-11 03:04:35,555] Trial 0 finished with value: 0.8578463728273172 and parameters: {'early_stopping_rounds': 4, 'learning_rate': 0.09497020979311749, 'lambda': 3.976099936470505e-07, 'alpha': 89.1087038520795, 'subsample': 0.43538637426602994}. Best is trial 0 with value: 0.8578463728273172.
[I 2024-04-11 03:05:50,281] Trial 1 finished with value: 0.8851219894065755 and parameters: {'early_stopping_rounds': 10, 'learning_rate': 0.014141523812489641, 'lambda': 2.665718261390501e-06, 'alpha': 0.8762382914632539, 'subsample': 0.9804919536437807}. Best is trial 1 with value: 0.8851219894065755.
[I 2024-04-11 03:06:06,651] Trial 2 finished with value: 0.8842067942495789 and parameters: {'early_stopping_rounds': 6, 'learning_rate': 0.060344687492343285, 'lambda': 0.0014232491257852688, 'alpha': 0.009869270586528289, 'subsample': 0.6295005495784566}. Best is trial 1 with 

In [153]:
# Slice plot of objective value against each hyperparameter searched in round one.
round_one_params = ['alpha', 'lambda', 'subsample',
                    'early_stopping_rounds', 'learning_rate']
optuna.visualization.plot_slice(study, params=round_one_params)

In [154]:
# Hyperparameter importances estimated from the first study's trials.
round_one_importance_fig = optuna.visualization.plot_param_importances(study)
round_one_importance_fig

### Second Round

In [157]:
def objective2(trial):
    """Optuna objective for the second search round.

    Holds early-stopping patience and `lambda` fixed, samples learning rate,
    `alpha` and subsample over narrower ranges, and returns `cv_score` for the
    resulting parameters.
    """
    fixed = {
        'n_estimators': 1500,
        'n_jobs': -1,
        'early_stopping_rounds': 9,
        'lambda': 1e-7,
    }

    # Sampled in the same order as the keys appear below.
    searched = {}
    searched['learning_rate'] = trial.suggest_float('learning_rate', 5e-3, 5e-2, log=True)
    searched['alpha'] = trial.suggest_float('alpha', 1e-2, 1e1, log=True)
    searched['subsample'] = trial.suggest_float('subsample', 0.3, 0.9, log=True)

    return cv_score({**fixed, **searched})

In [158]:
# Seed the sampler so the sequence of suggested parameters is repeatable on a
# fresh kernel; an unseeded study proposes different trials on every run.
sampler2 = optuna.samplers.TPESampler(seed=42)
study2 = optuna.create_study(direction='maximize', sampler=sampler2)
study2.optimize(objective2, n_trials=25)

[I 2024-04-11 03:50:14,963] A new study created in memory with name: no-name-3b73dba5-d4f7-4494-b4cc-eac44e2f27ad
[I 2024-04-11 03:51:55,318] Trial 0 finished with value: 0.8853190556272964 and parameters: {'learning_rate': 0.007944309617823268, 'alpha': 0.017076139350772904, 'subsample': 0.33683922598727045}. Best is trial 0 with value: 0.8853190556272964.
[I 2024-04-11 03:52:49,295] Trial 1 finished with value: 0.8857129730392764 and parameters: {'learning_rate': 0.020873602602648752, 'alpha': 6.37037780346313, 'subsample': 0.6585042566802036}. Best is trial 1 with value: 0.8857129730392764.
[I 2024-04-11 03:54:36,035] Trial 2 finished with value: 0.8853876987926836 and parameters: {'learning_rate': 0.0076742184319203745, 'alpha': 0.027794027539996845, 'subsample': 0.3386138594087756}. Best is trial 1 with value: 0.8857129730392764.
[I 2024-04-11 03:56:31,640] Trial 3 finished with value: 0.8864037910339342 and parameters: {'learning_rate': 0.008327076897233509, 'alpha': 2.5322713810

In [159]:
# Slice plot of objective value against each hyperparameter searched in round two.
round_two_params = ['alpha', 'subsample', 'learning_rate']
optuna.visualization.plot_slice(study2, params=round_two_params)

In [160]:
# Hyperparameter importances estimated from the second study's trials.
round_two_importance_fig = optuna.visualization.plot_param_importances(study2)
round_two_importance_fig

In [161]:
# Show the sampled parameter values of the best trial in the second study.
best_params = study2.best_params
print(best_params)

{'learning_rate': 0.01232843252761889, 'alpha': 3.0588301198483765, 'subsample': 0.7117189306161321}
