In [1]:
import numpy as np
import pandas as pd
from joblib import delayed, Parallel

In [2]:
class paths:
    """Namespace for the relative locations of the input CSV files."""

    TEST = './data/test.csv'
    TRAIN = './data/train.csv'

In [3]:
# Read the CSV located at `paths.TRAIN` into a DataFrame.
df = pd.read_csv(paths.TRAIN)

## Pastry

Baseline mean AUC: 0.853

In [148]:
def pastry_engineering(df):
    """Return a feature-engineered copy of ``df``.

    The input frame is not modified. Derived columns are appended after the
    existing ones, in the order listed in ``derived`` below, and a fixed set
    of raw and helper columns is then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column read below as well as every column named
        in ``obsolete``; a missing column raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns added and ``obsolete`` dropped.

    Notes
    -----
    NOTE(review): only ``Feature5`` adds a small epsilon to its denominator.
    ``Log_Y/X``, ``Feature1``, ``Feature7`` and ``F9`` divide directly, so a
    zero denominator would propagate inf/NaN -- confirm the data rules it out.
    """
    out = df.copy()

    # Raw inputs that appear in several formulas.
    lum_min = out['Minimum_of_Luminosity']
    lum_max = out['Maximum_of_Luminosity']
    lum_sum = out['Sum_of_Luminosity']
    empty = out['Empty_Index']
    outside_global = out['Outside_Global_Index']
    log_y = out['Log_Y_Index']

    # Shared intermediates.
    shifted_lum = 1 + out['Luminosity_Index']
    log_y_over_x = log_y / out['Log_X_Index']

    # Insertion order defines the order of the appended columns.
    derived = {
        'Abs_Orientation': out['Orientation_Index'].abs(),
        'New_Lum': shifted_lum,
        # 'Height' and 'Log_Y/X' are helper columns; both are dropped below.
        'Height': (out['Y_Maximum'] - out['Y_Minimum']).abs(),
        'Lum_Range': (lum_max - lum_min).abs(),
        'Log_Y/X': log_y_over_x,
        'Feature1': lum_sum / shifted_lum,
        'Feature2': out['Steel_Plate_Thickness'] * out['Length_of_Conveyer'],
        'Feature3': outside_global * lum_min,
        'Feature4': lum_min * empty,
        'Feature5': shifted_lum * empty / (1e-6 + log_y),
        'Feature6': out['Edges_Y_Index']**2 * log_y_over_x,
        'Feature7': lum_max * lum_min / (shifted_lum + outside_global),
        'Feature8': outside_global * shifted_lum,
        'F9': outside_global * empty / lum_sum,
    }
    for column_name, values in derived.items():
        out[column_name] = values

    # Columns removed from the returned frame.
    obsolete = [
        'Y_Maximum', 'Y_Minimum', 'X_Maximum', 'X_Minimum',
        'SigmoidOfAreas', 'Pixels_Areas', 'Outside_X_Index',
        'X_Perimeter', 'Y_Perimeter', 'Square_Index',
        'Orientation_Index', 'Luminosity_Index', 'Height', 'Edges_X_Index',
        'Log_X_Index', 'Log_Y/X', 'Outside_Global_Index', 'TypeOfSteel_A400',
    ]
    return out.drop(columns=obsolete)

In [149]:
# Apply the feature engineering to the loaded frame.
train_df = pastry_engineering(df)

# Target columns; every other column except `id` is used as a feature.
# NOTE(review): 'K_Scatch' looks like a typo but has to match the column
# name present in the data, so it is kept as-is.
y_cols = ['Pastry', 'Z_Scratch', 'K_Scatch',
          'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
y = train_df[y_cols]
ids = train_df['id']

# Drop by explicit column names. The previous `train_df.drop(y, axis=1)`
# passed the target DataFrame itself as the label list, which only worked
# because iterating a DataFrame yields its column names.
X = train_df.drop(columns=y_cols + ['id'])

In [150]:
from sklearn.model_selection import GroupKFold

# 5-fold split grouped by `ids`: rows sharing an id value never end up on
# both sides of the same fold.
gkf = GroupKFold(n_splits=5)

# trains[k] / valids[k] hold the (features, targets) pair of fold k.
trains = []
valids = []
for fold, (train_index, valid_index) in enumerate(gkf.split(X, y, ids)):
    print(f'Fold: {fold}')
    # split() yields positional row numbers, so select with .iloc; .loc would
    # only give the same rows while the index is the default 0..n-1 range.
    train_X = X.iloc[train_index]
    train_y = y.iloc[train_index]
    valid_X = X.iloc[valid_index]
    valid_y = y.iloc[valid_index]
    trains.append((train_X, train_y))
    valids.append((valid_X, valid_y))

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4


In [151]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Target column modelled in this section.
col = 'Pastry'

# Training hyper-parameters.
estimators = 1000
stopping = 5
lr = 0.05

xgboosts = []
aucs_sum = 0
# Fold-local names are used on purpose: rebinding the notebook-level `X`/`y`
# here would leave them pointing at the last fold's subset and break a re-run
# of the split cell.
for (X_train, y_train), (X_val, y_val) in zip(trains, valids):
    y_train_col = np.array(y_train[col])
    y_val_col = np.array(y_val[col])

    # Fit Model
    model = XGBClassifier(n_estimators=estimators,
                          early_stopping_rounds=stopping,
                          learning_rate=lr,
                          n_jobs=4,
                          eval_metric='auc')

    # NOTE(review): the validation fold drives early stopping and is also the
    # fold scored below, so the reported AUC may be slightly optimistic.
    model.fit(X_train, y_train_col, eval_set=[(X_val, y_val_col)], verbose=0)
    xgboosts.append(model)

    # Calculate ROCS
    preds = model.predict_proba(X_val)[:, 1]
    aucs_sum += roc_auc_score(y_val_col, preds)

# Average over however many folds were actually trained.
n_folds = len(xgboosts)
aucs = aucs_sum / n_folds

# Importances: mean across the fold models, built from a fresh array so no
# model's own importance array is accumulated into in place.
importances = np.mean([fitted.feature_importances_ for fitted in xgboosts],
                      axis=0)
order = np.argsort(importances)[::-1]
Xcols = np.array(X_train.columns)[order]
sortd = importances[order]
for clm, score in zip(Xcols, sortd):
    # Format instead of round(): rounding a float32 prints binary noise
    # such as 0.33000001311302185.
    print(f'{clm}: {score:.2f}')

print(f'\nMean AUC: {aucs:.3f}')

Feature6: 0.33000001311302185
Lum_Range: 0.05000000074505806
Length_of_Conveyer: 0.03999999910593033
Edges_Y_Index: 0.03999999910593033
TypeOfSteel_A300: 0.03999999910593033
Minimum_of_Luminosity: 0.03999999910593033
Steel_Plate_Thickness: 0.03999999910593033
Feature7: 0.03999999910593033
Feature5: 0.029999999329447746
Abs_Orientation: 0.029999999329447746
F9: 0.029999999329447746
Feature4: 0.029999999329447746
Maximum_of_Luminosity: 0.029999999329447746
Feature1: 0.029999999329447746
Feature2: 0.029999999329447746
Sum_of_Luminosity: 0.029999999329447746
Edges_Index: 0.029999999329447746
Feature3: 0.029999999329447746
LogOfAreas: 0.019999999552965164
Empty_Index: 0.019999999552965164
Feature8: 0.019999999552965164
Log_Y_Index: 0.019999999552965164
New_Lum: 0.019999999552965164

Mean AUC: 0.863
