In [11]:
import numpy as np
import pandas as pd
from joblib import delayed, Parallel

# Base Scores

In [12]:
# Read the training table from the local data directory.
train_df = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [13]:
# Target columns: one column per fault type.
y_cols = [
    'Pastry', 'Z_Scratch', 'K_Scatch',
    'Stains', 'Dirtiness', 'Bumps', 'Other_Faults',
]

# Row identifiers are kept aside; they are used as group labels when splitting.
ids = train_df['id']
y = train_df[y_cols]

# Every column that is neither a target nor the id is a feature.
X = train_df.drop(columns=[*y_cols, 'id'])

In [14]:
from sklearn.model_selection import GroupKFold

# Five folds, with each row's `id` passed as its group label.
gkf = GroupKFold(n_splits=5)

# Per-fold (features, targets) pairs, reused by the modelling cells below.
trains = []
valids = []
for train_index, valid_index in gkf.split(X, y, ids):
    # split() yields positional row numbers, so rows are selected with .iloc.
    # .loc is label-based and only agrees while the frame still carries its
    # default 0..n-1 index.
    trains.append((X.iloc[train_index], y.iloc[train_index]))
    valids.append((X.iloc[valid_index], y.iloc[valid_index]))

In [15]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Boosting settings for the baseline model.
estimators = 1000
stopping = 5
lr = 0.05

# Fold and target counts come from the data rather than repeating 5 and 7,
# so the averages stay correct if the split or the target list changes.
n_folds = len(trains)

xgboosts = []
aucs = np.zeros(len(y_cols))

for fold in range(n_folds):
    print(f'Fold: {fold}')

    X_val, y_val = valids[fold]

    # Fit Model: the validation fold doubles as the early-stopping eval set.
    model = XGBClassifier(n_estimators=estimators,
                          early_stopping_rounds=stopping,
                          learning_rate=lr,
                          n_jobs=4)
    model.fit(*trains[fold],
              eval_set=[(X_val, y_val)], verbose=0)
    xgboosts.append(model)

    # Calculate ROCS: average=None keeps one AUC per target column.
    preds = model.predict_proba(X_val)
    aucs += np.array(roc_auc_score(y_val, preds, multi_class='ovr', average=None))

# Per-target AUC averaged over the folds.
val_aucs = pd.Series(aucs, index=y_cols) / n_folds
display(val_aucs)
print(f'Base Model Mean AUC: {val_aucs.mean():.5f}')

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4


Pastry          0.863051
Z_Scratch       0.958140
K_Scatch        0.985536
Stains          0.990906
Dirtiness       0.883273
Bumps           0.807605
Other_Faults    0.702566
dtype: float64

Base Model Mean AUC: 0.88444


In [16]:
# Average the feature importances of the fitted fold models. The divisor is the
# number of models actually present instead of a hard-coded 5.
# sum() adds the arrays in fold order and returns a new array.
importances = sum(fitted.feature_importances_ for fitted in xgboosts) / len(xgboosts)

# A single argsort gives the descending order for both names and scores.
order = np.argsort(importances)[::-1]
feature_names = np.array(X.columns)
for col, score in zip(feature_names[order], importances[order]):
    print(f'{col}: {score:.2f}')

LogOfAreas: 0.19
Log_X_Index: 0.13
Outside_X_Index: 0.09
TypeOfSteel_A300: 0.08
Pixels_Areas: 0.06
X_Perimeter: 0.06
Steel_Plate_Thickness: 0.05
Orientation_Index: 0.04
Length_of_Conveyer: 0.03
TypeOfSteel_A400: 0.03
Edges_Y_Index: 0.02
Outside_Global_Index: 0.02
Y_Perimeter: 0.02
Minimum_of_Luminosity: 0.02
Log_Y_Index: 0.02
Square_Index: 0.01
X_Maximum: 0.01
X_Minimum: 0.01
Edges_Index: 0.01
Maximum_of_Luminosity: 0.01
Sum_of_Luminosity: 0.01
Luminosity_Index: 0.01
Empty_Index: 0.01
SigmoidOfAreas: 0.01
Edges_X_Index: 0.01
Y_Maximum: 0.01
Y_Minimum: 0.01


# Class-Based Importances

In [17]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Boosting settings, same values as the baseline cell.
estimators = 1000
stopping = 5
lr = 0.05

# Number of folds taken from the prepared splits rather than a literal 5.
n_folds = len(trains)

# Explicit dtype so the empty Series is float from the start.
aucs = pd.Series(dtype=float)
for col in y_cols:
    print('='*30)
    print(col)
    print('='*30)
    xgboosts = []
    aucs_sum = 0
    for fold in range(n_folds):
        # Fold-local names: the notebook-level X and y must not be overwritten
        # with a single fold's training subset.
        X_trn, y_trn = trains[fold]
        X_val, y_val = valids[fold]
        y_trn_col = np.array(y_trn[col])
        y_val_col = np.array(y_val[col])

        # Fit Model: one binary classifier for this target column only.
        model = XGBClassifier(n_estimators=estimators,
                              early_stopping_rounds=stopping,
                              learning_rate=lr,
                              n_jobs=4,
                              eval_metric='auc')

        model.fit(X_trn, y_trn_col,
                  eval_set=[(X_val, y_val_col)], verbose=0)
        xgboosts.append(model)

        # Calculate ROCS: column 1 of predict_proba is the positive class.
        preds = model.predict_proba(X_val)[:, 1]
        aucs_sum += roc_auc_score(y_val_col, preds)

    aucs[col] = aucs_sum / n_folds

    # Importances: mean over this column's fold models, printed high to low.
    importances = sum(fitted.feature_importances_ for fitted in xgboosts) / len(xgboosts)
    order = np.argsort(importances)[::-1]
    feature_names = np.array(trains[0][0].columns)
    for clm, score in zip(feature_names[order], importances[order]):
        print(f'{clm}: {score:.2f}')

display(aucs)
print(f'Mean AUC: {round(aucs.mean(), 4)}')

Pastry
Orientation_Index: 0.40
Edges_Y_Index: 0.11
Square_Index: 0.04
Length_of_Conveyer: 0.04
Y_Perimeter: 0.04
Minimum_of_Luminosity: 0.03
TypeOfSteel_A300: 0.03
Outside_X_Index: 0.02
Log_Y_Index: 0.02
Steel_Plate_Thickness: 0.02
Maximum_of_Luminosity: 0.02
Edges_Index: 0.02
Outside_Global_Index: 0.02
SigmoidOfAreas: 0.02
Sum_of_Luminosity: 0.02
Empty_Index: 0.02
Luminosity_Index: 0.02
Log_X_Index: 0.02
Pixels_Areas: 0.01
X_Perimeter: 0.01
Edges_X_Index: 0.01
LogOfAreas: 0.01
Y_Minimum: 0.01
X_Maximum: 0.01
X_Minimum: 0.01
Y_Maximum: 0.01
TypeOfSteel_A400: 0.00
Z_Scratch
TypeOfSteel_A300: 0.43
Length_of_Conveyer: 0.19
Steel_Plate_Thickness: 0.09
X_Maximum: 0.03
X_Minimum: 0.02
LogOfAreas: 0.02
Pixels_Areas: 0.02
Outside_X_Index: 0.02
Maximum_of_Luminosity: 0.01
SigmoidOfAreas: 0.01
Empty_Index: 0.01
Outside_Global_Index: 0.01
Log_X_Index: 0.01
Minimum_of_Luminosity: 0.01
Y_Perimeter: 0.01
Edges_Y_Index: 0.01
Orientation_Index: 0.01
Edges_Index: 0.01
Y_Maximum: 0.01
Edges_X_Index: 0.0

Pastry          0.853370
Z_Scratch       0.948755
K_Scatch        0.983470
Stains          0.979529
Dirtiness       0.874876
Bumps           0.806725
Other_Faults    0.702790
dtype: float64

Mean AUC: 0.8785


## Base Score with New Target

From our EDA, we know that many datapoints have no defect, and a few have more than one. We could handle this by running seven "one-versus-rest" classification tasks. However, the baseline mean AUC in that one-versus-rest setup was only `0.8785`, compared to a mean AUC of `0.8844` when predicting all seven classes simultaneously with a single model.

Because there is likely useful information given in the defect-free data, we create an artificial eighth class, `No Defect`. Our mean AUC score will not include this class.

In [18]:
# Re-read the raw CSV before adding the extra target column.
train_df = pd.read_csv(filepath_or_buffer='./data/train.csv')

y_cols = [
    'Pastry', 'Z_Scratch', 'K_Scatch',
    'Stains', 'Dirtiness', 'Bumps', 'Other_Faults',
]

# A row whose fault columns sum to zero gets the artificial `No Defect` label.
fault_count = train_df[y_cols].sum(axis=1)
train_df['No Defect'] = fault_count.eq(0).astype('int')
y_cols = y_cols + ['No Defect']

# Split into ids, targets (now eight columns) and features.
ids = train_df['id']
y = train_df[y_cols]
X = train_df.drop(columns=[*y_cols, 'id'])

In [21]:
from sklearn.model_selection import GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Boosting settings, same values as the baseline cell.
estimators = 1000
stopping = 5
lr = 0.05

# One constant drives both the splitter and the final average.
n_folds = 5

xgboosts = []
gkf = GroupKFold(n_splits=n_folds)
aucs = np.zeros(len(y_cols))

for fold, (train_index, valid_index) in enumerate(gkf.split(X, y, ids)):
    print(f'Fold: {fold}')

    # split() yields positional row numbers, so rows are selected with .iloc;
    # .loc is label-based and only matches on a default 0..n-1 index.
    train_X = X.iloc[train_index]
    train_y = y.iloc[train_index]
    valid_X = X.iloc[valid_index]
    valid_y = y.iloc[valid_index]

    # Fit Model: the validation fold doubles as the early-stopping eval set.
    model = XGBClassifier(n_estimators=estimators,
                          early_stopping_rounds=stopping,
                          learning_rate=lr,
                          n_jobs=4)
    model.fit(train_X, train_y,
              eval_set=[(valid_X, valid_y)], verbose=0)
    xgboosts.append(model)

    # Calculate ROCS: average=None keeps one AUC per target column.
    preds = model.predict_proba(valid_X)
    aucs += np.array(roc_auc_score(valid_y, preds,
                     multi_class='ovr', average=None))

val_aucs = pd.Series(aucs, index=y_cols) / n_folds
display(val_aucs)
# The artificial class is excluded from the headline mean by label, so the
# result does not depend on where `No Defect` sits in y_cols.
print(f"Base Model Mean AUC: {val_aucs.drop('No Defect').mean():.5f}")

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4


Pastry          0.863279
Z_Scratch       0.957976
K_Scatch        0.985535
Stains          0.990929
Dirtiness       0.883198
Bumps           0.807419
Other_Faults    0.703478
No Defect       0.634252
dtype: float64

Base Model Mean AUC: 0.88454



| Fault | Original | w/ `No Defect` | Change |
| :- | -: | :-: | -: |
| Pastry       |   0.86305 | 0.86328| +0.00023 |
| Z_Scratch    |   0.95814 | 0.95798| -0.00016 |
| K_Scatch     |   0.98554 | 0.98554| +0.00000 |
| Stains       |   0.99091 | 0.99093| +0.00002 |
| Dirtiness    |   0.88327 | 0.88320| -0.00007 |
| Bumps        |   0.80761 | 0.80742| -0.00019 |
| Other_Faults |   0.70257 | 0.70348| +0.00091 |
| **Mean**     | **0.88444** | **0.88454**| **+0.00010**|