In [3]:
import numpy as np
import pandas as pd
from joblib import delayed, Parallel

# Base Scores

In [4]:
# Read the training table from the local data directory into a DataFrame.
train_df = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [5]:
# Names of the seven target columns; every other column except `id` is a feature.
y_cols = ['Pastry', 'Z_Scratch', 'K_Scatch',
          'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

# Targets, row identifiers and features are all taken straight from train_df.
y = train_df[y_cols]
ids = train_df['id']
# Drop by explicit column labels (targets + id) instead of passing the target
# DataFrame itself as the `labels` argument.
X = train_df.drop(columns=y_cols + ['id'])

In [7]:
from sklearn.model_selection import GroupKFold
gkf = GroupKFold(n_splits=5)

# Build one (features, targets) pair per fold for training and for validation.
# NOTE(review): the groups are the `id` column; if every id is unique this
# grouping adds no constraint beyond a plain K-fold split - confirm intent.
trains = []
valids = []
for train_index, valid_index in gkf.split(X, y, ids):
    # split() yields positional row indices, so select rows with .iloc;
    # .loc would only be correct while the frames keep a default RangeIndex.
    train_X = X.iloc[train_index]
    train_y = y.iloc[train_index]
    valid_X = X.iloc[valid_index]
    valid_y = y.iloc[valid_index]
    trains.append((train_X, train_y))
    valids.append((valid_X, valid_y))

In [8]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Hyperparameters shared by every fold's model.
estimators = 1000
stopping = 5
lr = 0.05

# Derive the loop bounds from the data prepared above rather than repeating
# the literals 5 (folds) and 7 (targets), so changing n_splits or y_cols
# cannot silently skip folds or mis-scale the average.
n_folds = len(trains)

xgboosts = []                    # one fitted model per fold
aucs = np.zeros(len(y_cols))     # running sum of per-target AUCs over folds

for fold in range(n_folds):
    print(f'Fold: {fold}')

    # Fit one model on all targets at once.
    # NOTE: the fold's validation set is used both for early stopping here
    # and for the AUC computed below.
    model = XGBClassifier(n_estimators=estimators,
                          early_stopping_rounds=stopping,
                          learning_rate=lr,
                          n_jobs=4)
    model.fit(*trains[fold],
              eval_set=[valids[fold]], verbose=0)
    xgboosts.append(model)

    # Per-target ROC AUC on this fold's validation set (average=None keeps
    # one score per target column).
    X_val, y_val = valids[fold]
    preds = model.predict_proba(X_val)
    aucs += np.array(roc_auc_score(y_val, preds, multi_class='ovr', average=None))

# Mean AUC per target across folds, then the mean over targets.
val_aucs = pd.Series(aucs, index=y_cols) / n_folds
display(val_aucs)
print(f'Base Model Mean AUC: {round(val_aucs.mean(), 4)}')

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4


Pastry          0.863051
Z_Scratch       0.958140
K_Scatch        0.985536
Stains          0.990906
Dirtiness       0.883273
Bumps           0.807605
Other_Faults    0.702566
dtype: float64

Base Model Mean AUC: 0.8844


In [9]:
# Average the feature importances of the per-fold base models and list the
# features from most to least important.
# np.stack raises if `xgboosts` is empty, and the mean adapts to however many
# fold models were fitted instead of assuming exactly 5.
importances = np.stack(
    [fold_model.feature_importances_ for fold_model in xgboosts]
).mean(axis=0)

# A single descending ordering is applied to both names and scores so the
# two always stay aligned.
order = np.argsort(importances)[::-1]
Xcols = np.array(X.columns)[order]
sortd = importances[order]
for col, score in zip(Xcols, sortd):
    # Use a format spec rather than round(): rounding a float32 still prints
    # its full binary expansion (e.g. 0.1899999976158142).
    print(f'{col}: {score:.2f}')

LogOfAreas: 0.1899999976158142
Log_X_Index: 0.12999999523162842
Outside_X_Index: 0.09000000357627869
TypeOfSteel_A300: 0.07999999821186066
Pixels_Areas: 0.05999999865889549
X_Perimeter: 0.05999999865889549
Steel_Plate_Thickness: 0.05000000074505806
Orientation_Index: 0.03999999910593033
Length_of_Conveyer: 0.029999999329447746
TypeOfSteel_A400: 0.029999999329447746
Edges_Y_Index: 0.019999999552965164
Outside_Global_Index: 0.019999999552965164
Y_Perimeter: 0.019999999552965164
Minimum_of_Luminosity: 0.019999999552965164
Log_Y_Index: 0.019999999552965164
Square_Index: 0.009999999776482582
X_Maximum: 0.009999999776482582
X_Minimum: 0.009999999776482582
Edges_Index: 0.009999999776482582
Maximum_of_Luminosity: 0.009999999776482582
Sum_of_Luminosity: 0.009999999776482582
Luminosity_Index: 0.009999999776482582
Empty_Index: 0.009999999776482582
SigmoidOfAreas: 0.009999999776482582
Edges_X_Index: 0.009999999776482582
Y_Maximum: 0.009999999776482582
Y_Minimum: 0.009999999776482582


# Class-Based Importances

In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Hyperparameters shared by every per-target, per-fold model.
estimators = 1000
stopping = 5
lr = 0.05

# For each target column: fit one binary model per fold, record the mean
# validation AUC, and print that target's fold-averaged feature importances.
n_folds = len(trains)   # follow the folds actually built, not a literal 5
class_aucs = {}         # target name -> mean validation AUC across folds
for col in y_cols:
    print('='*30)
    print(col)
    print('='*30)

    # Kept separate from `xgboosts` so the base models stay available.
    class_models = []
    aucs_sum = 0
    for fold in range(n_folds):
        # Fold-local names: do not rebind the notebook-level X / y, which
        # hold the full feature and target frames.
        X_train, y_train = trains[fold]
        X_val, y_val = valids[fold]
        y_train_col = np.array(y_train[col])
        y_val_col = np.array(y_val[col])

        # Fit a binary model for this target only.
        # NOTE: the fold's validation set is used both for early stopping
        # here and for the AUC computed below.
        model = XGBClassifier(n_estimators=estimators,
                              early_stopping_rounds=stopping,
                              learning_rate=lr,
                              n_jobs=4,
                              eval_metric='auc')
        model.fit(X_train, y_train_col,
                  eval_set=[(X_val, y_val_col)], verbose=0)
        class_models.append(model)

        # ROC AUC from the predicted probability of the positive class.
        preds = model.predict_proba(X_val)[:, 1]
        aucs_sum += roc_auc_score(y_val_col, preds)

    class_aucs[col] = aucs_sum / n_folds

    # Fold-averaged importances for this target, most important first.
    importances = np.stack(
        [fold_model.feature_importances_ for fold_model in class_models]
    ).mean(axis=0)
    order = np.argsort(importances)[::-1]
    for clm, score in zip(np.array(X_train.columns)[order], importances[order]):
        # Format spec instead of round(): rounding a float32 still prints
        # its full binary expansion.
        print(f'{clm}: {score:.2f}')

# Built once with an explicit dtype instead of growing an untyped empty Series.
aucs = pd.Series(class_aucs, dtype=float)
display(aucs)
print(f'Mean AUC: {round(aucs.mean(), 3)}')

Pastry
Orientation_Index: 0.4000000059604645
Edges_Y_Index: 0.10999999940395355
Square_Index: 0.03999999910593033
Length_of_Conveyer: 0.03999999910593033
Y_Perimeter: 0.03999999910593033
Minimum_of_Luminosity: 0.029999999329447746
TypeOfSteel_A300: 0.029999999329447746
Outside_X_Index: 0.019999999552965164
Log_Y_Index: 0.019999999552965164
Steel_Plate_Thickness: 0.019999999552965164
Maximum_of_Luminosity: 0.019999999552965164
Edges_Index: 0.019999999552965164
Outside_Global_Index: 0.019999999552965164
SigmoidOfAreas: 0.019999999552965164
Sum_of_Luminosity: 0.019999999552965164
Empty_Index: 0.019999999552965164
Luminosity_Index: 0.019999999552965164
Log_X_Index: 0.019999999552965164
Pixels_Areas: 0.009999999776482582
X_Perimeter: 0.009999999776482582
Edges_X_Index: 0.009999999776482582
LogOfAreas: 0.009999999776482582
Y_Minimum: 0.009999999776482582
X_Maximum: 0.009999999776482582
X_Minimum: 0.009999999776482582
Y_Maximum: 0.009999999776482582
TypeOfSteel_A400: 0.0
Z_Scratch
TypeOfSteel

Pastry          0.853370
Z_Scratch       0.948755
K_Scatch        0.983470
Stains          0.979529
Dirtiness       0.874876
Bumps           0.806725
Other_Faults    0.702790
dtype: float64

Mean AUC: 0.879
