In [1]:
import numpy as np
import pandas as pd

from sklearn.decomposition import PCA
from engineering import full_feature_engineering

import gc

In [2]:
# Load the raw training table; later cells pass `df` to full_feature_engineering.
df = pd.read_csv('./data/train.csv')
# Summary statistics of the numeric columns, shown as the cell output.
df.describe()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
count,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,...,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0
mean,9609.0,709.854675,753.857641,1849756.0,1846605.0,1683.987616,95.654665,64.124096,191846.7,84.808419,...,0.102742,-0.138382,0.571902,0.076279,0.059837,0.178573,0.029554,0.025235,0.247828,0.341225
std,5548.191747,531.544189,499.836603,1903554.0,1896295.0,3730.319865,177.821382,101.054178,442024.7,28.800344,...,0.487681,0.120344,0.332219,0.26545,0.23719,0.383005,0.169358,0.156844,0.431762,0.474133
min,0.0,0.0,4.0,6712.0,6724.0,6.0,2.0,1.0,250.0,0.0,...,-0.9884,-0.885,0.119,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4804.5,49.0,214.0,657468.0,657502.0,89.0,15.0,14.0,9848.0,70.0,...,-0.2727,-0.1925,0.2532,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,9609.0,777.0,796.0,1398169.0,1398179.0,168.0,25.0,23.0,18238.0,90.0,...,0.1111,-0.1426,0.4729,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,14413.5,1152.0,1165.0,2368032.0,2362511.0,653.0,64.0,61.0,67978.0,105.0,...,0.5294,-0.084,0.9994,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,19218.0,1705.0,1713.0,12987660.0,12987690.0,152655.0,7553.0,903.0,11591410.0,196.0,...,0.9917,0.6421,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## RF + Feature Importances

Iterative Elimination Rounds: 

0) 0.87389
1) 0.87496

In each round I add the lowest-importance feature to `drop_cols` and re-run the cross-validation. I stop as soon as the score stops improving (here, after 1 round).

In [3]:
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Columns handed to full_feature_engineering to be dropped from the feature set.
drop_cols = ['Y_Maximum', 'Y_Minimum', 'X_Maximum', 'X_Minimum',
             'pca_37',]

# Number of cross-validation folds; also used to average AUCs and importances.
N_SPLITS = 5

train_df = full_feature_engineering(df, drop_cols=drop_cols)
y_cols = ['No Defect', 'Pastry', 'Z_Scratch', 'K_Scatch',
          'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
y = train_df[y_cols]
X = train_df.drop(columns=y_cols)
# 'id' is only used as the grouping key for GroupKFold, never as a feature.
ids = X['id']
X = X.drop(columns=['id'])

#------------------------------------------------------------------------

rfs = []
gkf = GroupKFold(n_splits=N_SPLITS)
aucs = np.zeros(len(y_cols))

for fold, (train_index, valid_index) in enumerate(gkf.split(X, y, ids)):
    print(f'Fold: {fold}')

    # GroupKFold.split yields *positional* row indices, so rows must be
    # selected with .iloc; .loc is only equivalent on a default RangeIndex.
    train_X = X.iloc[train_index]
    train_y = y.iloc[train_index]
    valid_X = X.iloc[valid_index]
    valid_y = y.iloc[valid_index]

    # Fit Model
    model = RandomForestClassifier(n_estimators=150,
                                   max_features=10,
                                   random_state=0,
                                   n_jobs=-1)
    model.fit(train_X, train_y)
    rfs.append(model)

    # Calculate AUROCs
    # predict_proba gives one array per target column; keep column 1 of each
    # and transpose so the result is laid out as (rows, targets).
    # Assumes every target has both classes in the training fold.
    preds = np.array(model.predict_proba(valid_X))[:, :, 1].T
    aucs += np.array(roc_auc_score(valid_y, preds,
                     multi_class='ovr', average=None))

val_aucs = pd.Series(aucs, index=y_cols) / N_SPLITS
print('='*50)

# Average the per-fold importances and list features from most to least important.
importances = np.sum([rf.feature_importances_ for rf in rfs], axis=0) / N_SPLITS
order = np.argsort(importances)[::-1]
Xcols = np.array(X.columns)[order]
sortd = importances[order]
for col, importance in zip(Xcols, sortd):
    print(f'{col}: {importance:.4f}')

print('='*50)
# 'No Defect' is fitted as a target but excluded from the reported mean AUC.
real_cols = ['Pastry', 'Z_Scratch', 'K_Scatch',
             'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
print(f'Mean AUC: {val_aucs[real_cols].mean():.5f}')
gc.collect()

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
pca_0: 0.0334
Outside_X_Index: 0.0301
Pixels_Areas: 0.0299
Log_Outside_X_Index: 0.0294
Log_X_Index: 0.0256
LogOfAreas: 0.0227
Log_Width: 0.0226
Width: 0.0222
Length_of_Conveyer: 0.0204
pca_9: 0.0198
Log_Range: 0.0190
Steel_Plate_Thickness: 0.0168
pca_7: 0.0167
pca_2: 0.0163
pca_1: 0.0162
Sum_of_Luminosity: 0.0159
Minimum_of_Luminosity: 0.0159
Edges_Index: 0.0157
pca_8: 0.0155
X_Perimeter: 0.0151
pca_12: 0.0143
Log_Avg_Lum: 0.0143
pca_6: 0.0142
pca_11: 0.0141
pca_5: 0.0134
Luminosity_Index: 0.0133
Orientation_Index: 0.0132
pca_13: 0.0131
Empty_Index: 0.0130
SigmoidOfAreas: 0.0130
Log_X_Perimeter: 0.0129
pca_31: 0.0127
pca_10: 0.0125
pca_14: 0.0125
pca_4: 0.0124
pca_25: 0.0124
pca_3: 0.0123
pca_33: 0.0123
pca_30: 0.0122
Y_Perimeter: 0.0122
pca_35: 0.0122
pca_28: 0.0120
pca_34: 0.0119
pca_23: 0.0118
pca_17: 0.0118
pca_24: 0.0118
pca_36: 0.0118
pca_22: 0.0117
pca_15: 0.0117
pca_16: 0.0117
pca_32: 0.0117
Log_Lum: 0.0117
pca_29: 0.0117
pca_27: 0.0116
p

69

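Now, we'll proceed with leave-one-feature-out selection: each feature is dropped in turn, the model is re-validated, and the change in mean AUC relative to the baseline is recorded.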

In [4]:
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

def leave_one_out(X, y, ids, col, n_splits=5):
    """Cross-validated mean AUC of a random forest trained without `col`.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : pandas.DataFrame
        One column per target; must contain the seven defect columns
        listed in `real_cols` below.
    ids : array-like
        Group labels handed to GroupKFold.
    col : str or None
        Feature to leave out. ``None`` keeps every feature (baseline).
    n_splits : int, default 5
        Number of GroupKFold folds.

    Returns
    -------
    float
        Fold-averaged AUC, averaged over the seven defect columns.
    """
    # Dropping the column once is equivalent to dropping it inside every fold.
    features = X if col is None else X.drop(columns=[col])
    target_names = list(y.columns)

    aucs = np.zeros(len(target_names))
    gkf = GroupKFold(n_splits=n_splits)
    for train_index, valid_index in gkf.split(features, y, ids):
        # GroupKFold.split yields positional indices, hence .iloc rather than .loc.
        train_X = features.iloc[train_index]
        valid_X = features.iloc[valid_index]
        train_y = y.iloc[train_index]
        valid_y = y.iloc[valid_index]

        # Fit Model
        model = RandomForestClassifier(n_estimators=150,
                                       max_features=10,
                                       random_state=0,
                                       n_jobs=-1)
        model.fit(train_X, train_y)
        # The fitted forest is deliberately not stored anywhere: appending it to
        # a notebook-level list kept every model alive across calls and made the
        # function fail on a fresh kernel where that list does not exist.

        # Calculate ROCS
        # One probability array per target; keep column 1 of each and transpose
        # to (rows, targets). Assumes both classes occur in the training fold.
        preds = np.array(model.predict_proba(valid_X))[:, :, 1].T
        aucs += np.array(roc_auc_score(valid_y, preds,
                         multi_class='ovr', average=None))

    val_aucs = pd.Series(aucs, index=target_names) / n_splits
    # 'No Defect' is fitted as a target but excluded from the returned mean.
    real_cols = ['Pastry', 'Z_Scratch', 'K_Scatch',
                 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
    return val_aucs[real_cols].mean()

In [5]:
# Columns handed to full_feature_engineering to be dropped from the feature set.
drop_cols = [
    'Y_Maximum', 'Y_Minimum', 'X_Maximum', 'X_Minimum',
    'pca_37', 'pca_7',
]

train_df = full_feature_engineering(df, drop_cols=drop_cols)

y_cols = [
    'No Defect', 'Pastry', 'Z_Scratch', 'K_Scatch',
    'Stains', 'Dirtiness', 'Bumps', 'Other_Faults',
]

y = train_df[y_cols]
X = train_df.drop(columns=y_cols)
# Split the grouping key off the feature matrix.
ids = X.pop('id')

# Score with every feature present; each elimination is compared against this.
baseline = leave_one_out(X, y, ids, None)

print(f'Baseline: {baseline:.5f}')
print('='*50)
print('Improvement when Eliminated: ')

n_features = len(X.columns)
# Walk the columns from last to first, counting from 1.
for position, col in enumerate(X.columns[::-1], start=1):
    # NOTE(review): the first 65 columns are skipped unconditionally,
    # presumably because they were scored in an earlier run; confirm before re-running.
    if position <= 65:
        continue
    if position % 5 == 0:
        gc.collect()  # free some memory
    improvement = leave_one_out(X, y, ids, col) - baseline
    print(f'{position}/{n_features} : {col} : {improvement:.5f}')
    # Stop at the first feature whose removal beats the baseline.
    if improvement > 0:
        break

Baseline: 0.87554
Improvement when Eliminated: 
66/70 : Minimum_of_Luminosity : -0.00123
67/70 : Sum_of_Luminosity : -0.00239
68/70 : Y_Perimeter : -0.00141
69/70 : X_Perimeter : -0.00160
70/70 : Pixels_Areas : -0.00197


Iterations:

0) 0.87496
1) 0.87554

## Hyperparameter Optimization

### First Round

In [7]:
from sklearn.model_selection import GroupKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Columns handed to full_feature_engineering to be dropped from the feature set.
drop_cols = [
    'Y_Maximum', 'Y_Minimum', 'X_Maximum', 'X_Minimum',
    'pca_37', 'pca_7',
]

train_df = full_feature_engineering(df, drop_cols=drop_cols)

# The seven defect classes whose AUCs are averaged into the reported score.
real_cols = [
    'Pastry', 'Z_Scratch', 'K_Scatch',
    'Stains', 'Dirtiness', 'Bumps', 'Other_Faults',
]
# Every fitted target: 'No Defect' first, then the seven defect classes.
y_cols = ['No Defect'] + real_cols

y = train_df[y_cols]
X = train_df.drop(columns=y_cols)
# Split the grouping key off the feature matrix.
ids = X.pop('id')

def cv_score(params, n_splits=5):
    """Cross-validated mean AUC of a random forest built from `params`.

    Reads the notebook-level `X`, `y`, `ids`, `y_cols` and `real_cols`
    defined in this cell.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to RandomForestClassifier.
    n_splits : int, default 5
        Number of GroupKFold folds.

    Returns
    -------
    float
        Fold-averaged AUC, averaged over the columns in `real_cols`.
    """
    gkf = GroupKFold(n_splits=n_splits)
    aucs = np.zeros(len(y_cols))

    for train_index, valid_index in gkf.split(X, y, ids):
        # GroupKFold.split yields positional indices, hence .iloc rather than .loc.
        train_X = X.iloc[train_index]
        train_y = y.iloc[train_index]
        valid_X = X.iloc[valid_index]
        valid_y = y.iloc[valid_index]

        # Fit Model
        model = RandomForestClassifier(**params)
        model.fit(train_X, train_y)

        # Calculate ROCS
        # One probability array per target; keep column 1 of each and transpose
        # to (rows, targets). Assumes both classes occur in the training fold.
        preds = np.array(model.predict_proba(valid_X))[:, :, 1].T
        aucs += np.array(roc_auc_score(valid_y, preds,
                         multi_class='ovr', average=None))

    val_aucs = pd.Series(aucs, index=y_cols) / n_splits
    return val_aucs[real_cols].mean()

def objective(trial):
    """Optuna objective (round 1): tune `max_features` and `max_samples`.

    Returns the value of cv_score for a 500-tree forest with balanced
    class weights and the two sampled hyperparameters.
    """
    # Sampled in this order: max_features first, then max_samples.
    searched = {
        'max_features': trial.suggest_int('max_features', 5, 50),
        'max_samples': trial.suggest_float('max_samples', 3e-3, 3e-1, log=True),
    }
    fixed = dict(
        n_estimators=500,
        n_jobs=-1,
        random_state=0,
        class_weight='balanced',
    )
    return cv_score({**searched, **fixed})

In [9]:
import optuna
# TPE is Optuna's default sampler; it is created explicitly here only so it can
# be seeded. Without a seed each run proposes different trials and the reported
# best parameters cannot be reproduced, even though the forests use random_state=0.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=0))
study.optimize(objective, n_trials=25)

In [10]:
# Slice plot of the first-round study for the two searched hyperparameters.
optuna.visualization.plot_slice(study, params=['max_features',
                                               'max_samples'])

In [11]:
# Hyperparameter-importance plot for the first-round study.
optuna.visualization.plot_param_importances(study)

In [13]:
# Best hyperparameters found by the first-round study.
print(study.best_params)

{'max_features': 31, 'max_samples': 0.24634257357543676}


### Second Round

In [15]:
def objective2(trial):
    """Optuna objective (round 2): tune `max_samples` only.

    Returns the value of cv_score for a 1000-tree forest with
    `max_features` fixed at 20 and balanced class weights.
    """
    sampled_fraction = trial.suggest_float('max_samples', 5e-2, 3e-1, log=True)
    fixed = dict(
        n_estimators=1000,
        max_features=20,
        n_jobs=-1,
        random_state=0,
        class_weight='balanced',
    )
    return cv_score({'max_samples': sampled_fraction, **fixed})

In [17]:
# TPE is Optuna's default sampler; it is created explicitly here only so it can
# be seeded, which makes the second-round search reproducible.
study2 = optuna.create_study(direction='maximize',
                             sampler=optuna.samplers.TPESampler(seed=0))
study2.optimize(objective2, n_trials=15)

In [18]:
# Slice plot of the second-round study for `max_samples`.
optuna.visualization.plot_slice(study2, params=['max_samples'])

In [19]:
# Best `max_samples` found by the second-round study.
print(study2.best_params)

{'max_samples': 0.23723057360270555}


In [20]:
# Re-score the second-round settings with a larger forest (2000 trees).
params = dict(
    n_estimators=2000,
    max_samples=0.24,
    max_features=20,
    n_jobs=-1,
    random_state=0,
    class_weight='balanced',
)
print(cv_score(params))

0.8789895158730356


In [21]:
# Same evaluation with 5000 trees and a smaller max_features.
params = dict(
    n_estimators=5000,
    max_samples=0.24,
    max_features=15,  # reduced: good tradeoff for time reduction
    n_jobs=-1,
    random_state=0,
    class_weight='balanced',
)
print(cv_score(params))

0.879466291815855


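More estimators are simply better: going from 2000 to 5000 trees raised the CV AUC from 0.87899 to 0.87947, even with `max_features` reduced from 20 to 15.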

### Third Round

Let's run a small study with 2000 estimators, since the larger forest should drive the variance down. The best `max_features` and `max_samples` may shift as a result.

In [22]:
def objective3(trial):
    """Optuna objective (round 3): tune `max_samples` and `max_features`.

    Returns the value of cv_score for a 2000-tree forest with balanced
    class weights and the two sampled hyperparameters.
    """
    # Sampled in this order: max_samples first, then max_features.
    searched = {
        'max_samples': trial.suggest_float('max_samples', 1e-1, 4e-1, log=True),
        'max_features': trial.suggest_int('max_features', 10, 20),
    }
    fixed = dict(
        n_estimators=2000,
        n_jobs=-1,
        random_state=0,
        class_weight='balanced',
    )
    return cv_score({**searched, **fixed})

In [23]:
# TPE is Optuna's default sampler; it is created explicitly here only so it can
# be seeded. With only 5 trials the outcome is especially sensitive to the
# random draws, so an unseeded run is not reproducible.
study3 = optuna.create_study(direction='maximize',
                             sampler=optuna.samplers.TPESampler(seed=0))
study3.optimize(objective3, n_trials=5)

In [24]:
# Parallel-coordinate plot of the third-round trials.
optuna.visualization.plot_parallel_coordinate(study3)

In [25]:
# Best hyperparameters found by the third-round study.
print(study3.best_params)

{'max_samples': 0.2007991722816782, 'max_features': 16}


## Scale Up

In [26]:
# Final scale-up: 5000 trees with max_samples=0.20 and max_features=15.
params = dict(
    n_estimators=5000,
    max_samples=0.20,
    max_features=15,
    n_jobs=-1,
    random_state=0,
    class_weight='balanced',
)
print(cv_score(params))

0.8796481034681278
