In [2]:
import numpy as np
import pandas as pd
from joblib import delayed, Parallel

In [3]:
# Load the training table from a path relative to the notebook's working directory.
df = pd.read_csv('./data/train.csv')
# Last expression of the cell: display per-column summary statistics.
df.describe()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
count,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,...,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0
mean,9609.0,709.854675,753.857641,1849756.0,1846605.0,1683.987616,95.654665,64.124096,191846.7,84.808419,...,0.102742,-0.138382,0.571902,0.076279,0.059837,0.178573,0.029554,0.025235,0.247828,0.341225
std,5548.191747,531.544189,499.836603,1903554.0,1896295.0,3730.319865,177.821382,101.054178,442024.7,28.800344,...,0.487681,0.120344,0.332219,0.26545,0.23719,0.383005,0.169358,0.156844,0.431762,0.474133
min,0.0,0.0,4.0,6712.0,6724.0,6.0,2.0,1.0,250.0,0.0,...,-0.9884,-0.885,0.119,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4804.5,49.0,214.0,657468.0,657502.0,89.0,15.0,14.0,9848.0,70.0,...,-0.2727,-0.1925,0.2532,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,9609.0,777.0,796.0,1398169.0,1398179.0,168.0,25.0,23.0,18238.0,90.0,...,0.1111,-0.1426,0.4729,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,14413.5,1152.0,1165.0,2368032.0,2362511.0,653.0,64.0,61.0,67978.0,105.0,...,0.5294,-0.084,0.9994,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,19218.0,1705.0,1713.0,12987660.0,12987690.0,152655.0,7553.0,903.0,11591410.0,196.0,...,0.9917,0.6421,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Feature Engineering

In [4]:
from sklearn.decomposition import PCA

def full_feature_engineering(df, drop_cols=None):
    """Return a copy of ``df`` extended with engineered and PCA features.

    Added columns:
      * ``Height`` / ``Width``: absolute extent of the Y and X bounds.
      * ``Log_Outside_X_Index``, ``Log_X_Perimeter``, ``Log_Y_Perimeter``:
        natural log of the source column plus ``1e-6``.
      * ``Log_Width`` / ``Log_Height``: ``log(1 + value)``.
      * ``Abs_Orientation``, ``Log_Range``, ``Log_Lum``, ``Log_Avg_Lum``.
      * ``pca_<i>``: scores of a full-rank PCA fitted on the standardised
        feature columns (everything except ``id`` and the label columns).
      * ``No Defect``: 1 when every label column is 0. Only added when all
        label columns are present, so unlabeled frames can be processed too.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    drop_cols : list of str, optional
        Columns removed from the result. Defaults to the four raw bound
        columns ``Y_Maximum``, ``Y_Minimum``, ``X_Maximum``, ``X_Minimum``.

    Returns
    -------
    pandas.DataFrame
        The engineered frame with ``drop_cols`` removed.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if drop_cols is None:
        drop_cols = ['Y_Maximum', 'Y_Minimum', 'X_Maximum', 'X_Minimum']

    new_df = df.copy()
    new_df['Height'] = np.abs(new_df['Y_Maximum'] - new_df['Y_Minimum'])
    new_df['Width'] = np.abs(new_df['X_Maximum'] - new_df['X_Minimum'])

    # The small offset keeps the log finite when a source value is exactly 0.
    for src in ('Outside_X_Index', 'X_Perimeter', 'Y_Perimeter'):
        new_df[f'Log_{src}'] = np.log(new_df[src] + 1e-6)
    new_df['Log_Width'] = np.log(new_df['Width'] + 1)
    new_df['Log_Height'] = np.log(new_df['Height'] + 1)

    new_df['Abs_Orientation'] = np.abs(new_df['Orientation_Index'])
    new_df['Log_Range'] = (np.log(1 + new_df['Maximum_of_Luminosity'])
                           - np.log(1 + new_df['Minimum_of_Luminosity']))
    new_df['Log_Lum'] = np.log(new_df['Sum_of_Luminosity'])
    # NOTE(review): this mixes a natural log with LogOfAreas, whose base is not
    # visible here; the factor 2 is kept exactly as in the original.
    new_df['Log_Avg_Lum'] = new_df['Log_Lum'] - 2 * new_df['LogOfAreas']

    y_cols = ['Pastry', 'Z_Scratch', 'K_Scatch',
              'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
    non_cols = ['id'] + y_cols

    # Only drop the non-feature columns that exist, so a frame without labels
    # (e.g. a test set) goes through the same pipeline.
    X = new_df.drop(columns=[c for c in non_cols if c in new_df.columns])

    # Standardise with the sample mean/std. A constant column has std == 0 and
    # would become NaN (which PCA cannot fit), so its scale is replaced by 1.
    X_stats = X.describe().T
    scale = X_stats['std'].mask(X_stats['std'] == 0, 1.0)
    X = (X - X_stats['mean']) / scale

    # All components are kept; callers prune them through drop_cols.
    # NOTE(review): the PCA is fitted on the whole frame passed in, i.e. before
    # any cross-validation split performed by the caller.
    pca = PCA().fit(X)
    pca_cols = [f'pca_{i}' for i in range(len(pca.components_))]
    new_df[pca_cols] = pca.transform(X)

    if all(c in new_df.columns for c in y_cols):
        new_df['No Defect'] = (new_df[y_cols].sum(axis=1) == 0).astype('int')

    return new_df.drop(columns=drop_cols)

## RF + Feature Importances

Iterative Elimination Rounds: 

0) 0.87054
1) 0.87193

Dropping the lowest-importance feature improved AUROC only in the first elimination round (0.87054 → 0.87193).

In [21]:
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Columns removed after feature engineering: the raw bound columns plus the
# PCA component eliminated in the previous importance round.
drop_cols = ['Y_Maximum', 'Y_Minimum', 'X_Maximum', 'X_Minimum',
             'pca_37',]

train_df = full_feature_engineering(df, drop_cols=drop_cols)
y_cols = ['No Defect', 'Pastry', 'Z_Scratch', 'K_Scatch',
          'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
y = train_df[y_cols]
X = train_df.drop(columns=y_cols)
ids = X['id']
X = X.drop(columns=['id'])

#------------------------------------------------------------------------

n_splits = 5
rfs = []
# NOTE(review): `id` looks unique per row in the describe() output (0..19218
# over 19219 rows); if so, grouping by it is just a plain K-fold. Confirm.
gkf = GroupKFold(n_splits=n_splits)
aucs = np.zeros(len(y_cols))

for fold, (train_index, valid_index) in enumerate(gkf.split(X, y, ids)):
    print(f'Fold: {fold}')

    # split() yields positional indices, so rows are selected with .iloc;
    # .loc only gives the same rows while the frame keeps a default RangeIndex.
    train_X = X.iloc[train_index]
    train_y = y.iloc[train_index]
    valid_X = X.iloc[valid_index]
    valid_y = y.iloc[valid_index]

    # Fit Model (multi-output: one forest predicts all label columns)
    model = RandomForestClassifier(n_estimators=100,
                                   max_features=10,
                                   random_state=0,
                                   n_jobs=-1)
    model.fit(train_X, train_y)
    rfs.append(model)

    # Calculate ROCS
    # predict_proba returns one (n_samples, 2) array per label; keep the
    # positive-class column and transpose to (n_samples, n_labels).
    preds = np.array(model.predict_proba(valid_X))[:,:,1].T
    aucs += np.array(roc_auc_score(valid_y, preds,
                     multi_class='ovr', average=None))

val_aucs = pd.Series(aucs, index=y_cols) / n_splits
print('='*50)

# Average the importances over the fold models without accumulating in place
# on an array returned by a model attribute.
importances = np.mean([rf.feature_importances_ for rf in rfs], axis=0)
order = np.argsort(importances)[::-1]
Xcols = np.array(X.columns)[order]
sortd = importances[order]
for i, col in enumerate(Xcols):
    print(f'{col}: {sortd[i]:.4f}')

print('='*50)
# The derived 'No Defect' label is excluded from the reported mean.
real_cols = ['Pastry', 'Z_Scratch', 'K_Scatch',
             'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
print(f'Mean AUC: {val_aucs[real_cols].mean():.5f}')

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
pca_0: 0.0304
Log_Outside_X_Index: 0.0284
Outside_X_Index: 0.0254
Log_Width: 0.0246
Pixels_Areas: 0.0244
Width: 0.0237
LogOfAreas: 0.0222
Log_X_Perimeter: 0.0218
Length_of_Conveyer: 0.0206
Log_X_Index: 0.0199
pca_9: 0.0198
Steel_Plate_Thickness: 0.0184
Log_Lum: 0.0179
Minimum_of_Luminosity: 0.0179
X_Perimeter: 0.0170
Log_Range: 0.0169
pca_1: 0.0167
pca_2: 0.0166
pca_7: 0.0165
Edges_Index: 0.0154
pca_8: 0.0153
Sum_of_Luminosity: 0.0152
pca_12: 0.0146
pca_11: 0.0141
pca_6: 0.0140
Orientation_Index: 0.0140
Luminosity_Index: 0.0135
pca_5: 0.0132
pca_13: 0.0131
Empty_Index: 0.0130
pca_31: 0.0127
pca_35: 0.0126
pca_14: 0.0125
pca_25: 0.0124
pca_10: 0.0124
pca_33: 0.0124
pca_3: 0.0122
pca_4: 0.0122
pca_30: 0.0122
pca_28: 0.0120
pca_24: 0.0120
pca_16: 0.0120
pca_22: 0.0119
pca_17: 0.0119
pca_34: 0.0118
pca_23: 0.0118
pca_15: 0.0118
pca_32: 0.0118
Y_Perimeter: 0.0118
pca_29: 0.0118
SigmoidOfAreas: 0.0118
pca_36: 0.0118
pca_26: 0.0117
pca_18: 0.0117
pca_27

Now, we'll proceed with leave-one-out feature selection.

In [22]:
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

def leave_one_out(X, y, ids, col, n_splits=5):
    """Cross-validated mean ROC AUC of a random forest with one feature removed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame.
    y : pandas.DataFrame
        Label frame; it must contain the seven fault columns listed in
        ``real_cols`` below (other columns, such as 'No Defect', are used for
        fitting but left out of the returned mean).
    ids : array-like
        Group labels passed to ``GroupKFold.split``.
    col : str or None
        Feature column to leave out; ``None`` evaluates the full feature set.
    n_splits : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    float
        Mean over the fault columns of the fold-averaged per-label ROC AUC.
    """
    real_cols = ['Pastry', 'Z_Scratch', 'K_Scatch',
                 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

    # Remove the candidate column once, up front, instead of once per fold.
    features = X if col is None else X.drop(columns=[col])

    aucs = np.zeros(y.shape[1])
    gkf = GroupKFold(n_splits=n_splits)
    for train_index, valid_index in gkf.split(X, y, ids):
        # split() yields positional indices, hence .iloc rather than .loc.
        train_X = features.iloc[train_index]
        valid_X = features.iloc[valid_index]
        train_y = y.iloc[train_index]
        valid_y = y.iloc[valid_index]

        # Fit Model
        # The fitted forest is deliberately not stored anywhere: keeping every
        # model from every call in a global list only grows memory.
        model = RandomForestClassifier(n_estimators=100,
                                       max_features=10,
                                       random_state=0,
                                       n_jobs=-1)
        model.fit(train_X, train_y)

        # Calculate ROCS
        # One (n_samples, 2) array per label -> (n_samples, n_labels) matrix of
        # positive-class probabilities.
        preds = np.array(model.predict_proba(valid_X))[:, :, 1].T
        aucs += np.array(roc_auc_score(valid_y, preds,
                         multi_class='ovr', average=None))

    # Label names come from y itself rather than from a notebook-level global.
    val_aucs = pd.Series(aucs, index=y.columns) / n_splits
    return val_aucs[real_cols].mean()

In [24]:
# Same feature set as the importance cell: raw bounds and pca_37 removed.
drop_cols = ['Y_Maximum', 'Y_Minimum', 'X_Maximum', 'X_Minimum', 'pca_37']

train_df = full_feature_engineering(df, drop_cols=drop_cols)

y_cols = ['No Defect', 'Pastry', 'Z_Scratch', 'K_Scatch',
          'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Split the engineered frame into labels, group ids and features.
y = train_df[y_cols]
ids = train_df['id']
X = train_df.drop(columns=y_cols + ['id'])

# Score with every feature present; each candidate is compared against it.
baseline = leave_one_out(X, y, ids, None)

print(f'Baseline: {baseline:.5f}')
print('='*50)
print('Improvement when Eliminated: ')

# Walk the features from last to first. The first nine candidates were already
# evaluated in an earlier pass, so the scan resumes at the tenth.
n_features = len(X.columns)
for position, col in enumerate(reversed(list(X.columns)), start=1):
    if position <= 9:
        continue  # looping after deleting instead of restarting
    improvement = leave_one_out(X, y, ids, col) - baseline
    print(f'{position}/{n_features} : {col} : {improvement:.5f}')
    # Stop at the first feature whose removal beats the baseline.
    if improvement > 0:
        break

Baseline: 0.87193
Improvement when Eliminated: 
10/71 : pca_27 : -0.00110
11/71 : pca_26 : -0.00118
12/71 : pca_25 : -0.00077
13/71 : pca_24 : -0.00176
14/71 : pca_23 : -0.00148
15/71 : pca_22 : -0.00138
16/71 : pca_21 : -0.00084
17/71 : pca_20 : -0.00124
18/71 : pca_19 : -0.00081
19/71 : pca_18 : -0.00172
20/71 : pca_17 : -0.00101
21/71 : pca_16 : -0.00084
22/71 : pca_15 : -0.00125
23/71 : pca_14 : -0.00077
24/71 : pca_13 : -0.00203
25/71 : pca_12 : -0.00129
26/71 : pca_11 : -0.00306
27/71 : pca_10 : -0.00208
28/71 : pca_9 : -0.00199
29/71 : pca_8 : -0.00097
30/71 : pca_7 : -0.00070
31/71 : pca_6 : -0.00054
32/71 : pca_5 : -0.00083
33/71 : pca_4 : -0.00180
34/71 : pca_3 : -0.00117
35/71 : pca_2 : -0.00133
36/71 : pca_1 : -0.00036
37/71 : pca_0 : 0.00002


Iterations:

0) 0.87193
1) 
2) 
3) 

## Hyperparameter Optimization

In [5]:
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Feature set used for tuning: raw bound columns and pca_37 removed.
drop_cols = ['Y_Maximum', 'Y_Minimum', 'X_Maximum', 'X_Minimum', 'pca_37']
train_df = full_feature_engineering(df, drop_cols=drop_cols)

# All labels the forest is fitted on; 'No Defect' is the derived one.
y_cols = ['No Defect', 'Pastry', 'Z_Scratch', 'K_Scatch',
          'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
# Labels that count towards the reported score (everything but 'No Defect').
real_cols = [label for label in y_cols if label != 'No Defect']

# Notebook-level inputs read by cv_score().
y = train_df[y_cols]
ids = train_df['id']
X = train_df.drop(columns=y_cols + ['id'])

def cv_score(params, n_splits=5):
    """Cross-validated mean ROC AUC of a random forest built from ``params``.

    Reads the notebook-level ``X``, ``y``, ``ids`` and ``real_cols``.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``RandomForestClassifier``.
    n_splits : int, default 5
        Number of ``GroupKFold`` folds.

    Returns
    -------
    float
        Mean over ``real_cols`` of the fold-averaged per-label ROC AUC.
    """
    gkf = GroupKFold(n_splits=n_splits)
    aucs = np.zeros(y.shape[1])

    for train_index, valid_index in gkf.split(X, y, ids):
        # split() yields positional indices, so rows are selected with .iloc;
        # .loc only matches while the frames keep a default RangeIndex.
        train_X = X.iloc[train_index]
        train_y = y.iloc[train_index]
        valid_X = X.iloc[valid_index]
        valid_y = y.iloc[valid_index]

        # Fit Model
        model = RandomForestClassifier(**params)
        model.fit(train_X, train_y)

        # Calculate ROCS
        # One (n_samples, 2) array per label -> (n_samples, n_labels) matrix of
        # positive-class probabilities.
        preds = np.array(model.predict_proba(valid_X))[:, :, 1].T
        aucs += np.array(roc_auc_score(valid_y, preds,
                         multi_class='ovr', average=None))

    val_aucs = pd.Series(aucs, index=y.columns) / n_splits
    return val_aucs[real_cols].mean()

def objective(trial):
    """Optuna objective for the first search round.

    Samples ``n_estimators``, ``max_features`` and ``max_samples`` from the
    trial, combines them with the fixed forest settings and returns the
    cross-validated score from ``cv_score``.
    """
    # Sample in a fixed order: n_estimators, max_features, max_samples.
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_features = trial.suggest_int('max_features', 5, 50)
    max_samples = trial.suggest_float('max_samples', 1e-3, 1e-1, log=True)

    # Settings held constant across trials.
    fixed = dict(n_jobs=-1, random_state=0, class_weight='balanced')

    return cv_score(dict(n_estimators=n_estimators,
                         max_features=max_features,
                         max_samples=max_samples,
                         **fixed))

In [8]:
import optuna
# First search round: maximise the cross-validated score returned by objective().
# NOTE(review): no sampler or seed is passed, so the suggested trials presumably
# differ between runs; consider a seeded sampler for reproducibility.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)

[I 2024-04-12 04:14:37,780] A new study created in memory with name: no-name-d9432e15-815a-47ca-aa7b-d109c515fdbb
[I 2024-04-12 04:14:43,069] Trial 0 finished with value: 0.8402754847845538 and parameters: {'n_estimators': 210, 'max_features': 15, 'max_samples': 0.002801782930887835}. Best is trial 0 with value: 0.8402754847845538.
[I 2024-04-12 04:14:51,057] Trial 1 finished with value: 0.8293468001268446 and parameters: {'n_estimators': 327, 'max_features': 26, 'max_samples': 0.0015294230694544906}. Best is trial 0 with value: 0.8402754847845538.
[I 2024-04-12 04:15:01,037] Trial 2 finished with value: 0.8579387604056397 and parameters: {'n_estimators': 408, 'max_features': 25, 'max_samples': 0.005630503334280253}. Best is trial 2 with value: 0.8579387604056397.
[I 2024-04-12 04:15:14,960] Trial 3 finished with value: 0.8538395619494487 and parameters: {'n_estimators': 554, 'max_features': 39, 'max_samples': 0.003499978940449209}. Best is trial 2 with value: 0.8579387604056397.
[I 20

In [9]:
# Slice plot of the first study for each of the three tuned parameters.
optuna.visualization.plot_slice(study, params=['n_estimators',
                                               'max_features',
                                               'max_samples'])

In [10]:
# Parameter-importance plot for the first study.
optuna.visualization.plot_param_importances(study)

### Second Round

In [11]:
def objective2(trial):
    """Optuna objective for the second search round.

    Only ``n_estimators`` and ``max_samples`` are sampled; ``max_features`` is
    held at 33 together with the other fixed forest settings. Returns the
    cross-validated score from ``cv_score``.
    """
    # Sample in a fixed order: n_estimators, then max_samples.
    n_estimators = trial.suggest_int('n_estimators', 600, 1200)
    max_samples = trial.suggest_float('max_samples', 2e-2, 2e-1, log=True)

    # Settings held constant across trials.
    fixed = dict(max_features=33, n_jobs=-1, random_state=0,
                 class_weight='balanced')

    return cv_score(dict(n_estimators=n_estimators,
                         max_samples=max_samples,
                         **fixed))

In [12]:
# Second search round: maximise the score returned by objective2().
# NOTE(review): as with the first study, no sampler or seed is passed.
study2 = optuna.create_study(direction='maximize')
study2.optimize(objective2, n_trials=25)

[I 2024-04-12 04:39:16,988] A new study created in memory with name: no-name-bd74f019-cf8e-422e-be4f-58f88b9c1b45
[I 2024-04-12 04:41:59,831] Trial 0 finished with value: 0.8773071505843656 and parameters: {'n_estimators': 1155, 'max_samples': 0.11772601093714034}. Best is trial 0 with value: 0.8773071505843656.
[I 2024-04-12 04:43:53,691] Trial 1 finished with value: 0.8767565608770971 and parameters: {'n_estimators': 840, 'max_samples': 0.10991516439419435}. Best is trial 0 with value: 0.8773071505843656.
[I 2024-04-12 04:44:44,034] Trial 2 finished with value: 0.8743859647968535 and parameters: {'n_estimators': 960, 'max_samples': 0.03824251448180383}. Best is trial 0 with value: 0.8773071505843656.
[I 2024-04-12 04:48:51,750] Trial 3 finished with value: 0.8780368486690415 and parameters: {'n_estimators': 1168, 'max_samples': 0.1694265738195301}. Best is trial 3 with value: 0.8780368486690415.
[I 2024-04-12 04:50:25,141] Trial 4 finished with value: 0.8761712966659083 and parameter

In [14]:
# Slice plot of the second study for the two parameters it tuned.
optuna.visualization.plot_slice(study2, params=['n_estimators', 'max_samples'])

In [15]:
# Parameter-importance plot for the second study.
optuna.visualization.plot_param_importances(study2)

In [16]:
# Parameters of the best trial found in the second study.
print(study2.best_params)

{'n_estimators': 1034, 'max_samples': 0.14561473602974484}


In [17]:
# Manual evaluation outside the search range: 2000 trees with max_samples 0.17
# and the same fixed settings as objective2.
params = dict(
    n_estimators=2000,
    max_samples=0.17,
    max_features=33,
    n_jobs=-1,
    random_state=0,
    class_weight='balanced',
)
print(cv_score(params))

0.8782391243824988


It seems that adding more estimators simply keeps improving the score.

In [6]:
# Manual evaluation with an even larger forest: 5000 trees, max_samples 0.15,
# other settings unchanged.
params = dict(
    n_estimators=5000,
    max_samples=0.15,
    max_features=33,
    n_jobs=-1,
    random_state=0,
    class_weight='balanced',
)
print(cv_score(params))

0.8784478872403902
