In [3]:
import numpy as np
import pandas as pd
from joblib import delayed, Parallel

# Run This Ensemble When Submission Available
# (Try Squaring Results and Renormalizing Next)

In [4]:
class paths:
    """Relative locations of the CSV files read by this notebook."""
    # Training data; loaded with pd.read_csv into `df`.
    TRAIN = './data/train.csv'
    # Test data; loaded with pd.read_csv into `test`.
    TEST = './data/test.csv'

In [5]:
# Load the training CSV; the trailing expression renders summary statistics as the cell output.
df = pd.read_csv(paths.TRAIN)
df.describe()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
count,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,...,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0
mean,9609.0,709.854675,753.857641,1849756.0,1846605.0,1683.987616,95.654665,64.124096,191846.7,84.808419,...,0.102742,-0.138382,0.571902,0.076279,0.059837,0.178573,0.029554,0.025235,0.247828,0.341225
std,5548.191747,531.544189,499.836603,1903554.0,1896295.0,3730.319865,177.821382,101.054178,442024.7,28.800344,...,0.487681,0.120344,0.332219,0.26545,0.23719,0.383005,0.169358,0.156844,0.431762,0.474133
min,0.0,0.0,4.0,6712.0,6724.0,6.0,2.0,1.0,250.0,0.0,...,-0.9884,-0.885,0.119,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4804.5,49.0,214.0,657468.0,657502.0,89.0,15.0,14.0,9848.0,70.0,...,-0.2727,-0.1925,0.2532,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,9609.0,777.0,796.0,1398169.0,1398179.0,168.0,25.0,23.0,18238.0,90.0,...,0.1111,-0.1426,0.4729,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,14413.5,1152.0,1165.0,2368032.0,2362511.0,653.0,64.0,61.0,67978.0,105.0,...,0.5294,-0.084,0.9994,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,19218.0,1705.0,1713.0,12987660.0,12987690.0,152655.0,7553.0,903.0,11591410.0,196.0,...,0.9917,0.6421,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Feature Engineering

In [6]:
def feature_engineering(df):
    """Return a copy of ``df`` with derived features added and raw coordinates removed.

    Added columns (in this order): ``Sine``, ``Fixed_Area``, ``Fixed_LumSum``,
    ``Norm_SumLum``, ``Norm_MinLum``, ``Norm_MaxLum``, ``X_Size``, ``X_Size_abs``.
    Removed columns: ``Y_Maximum``, ``Y_Minimum``, ``X_Maximum``, ``X_Minimum``.

    ``Sine`` is ``sqrt(1 - Orientation_Index**2)`` and is exactly 0 when
    ``Orientation_Index`` is +/-1. Dividing by it would produce +/-inf (and a
    misleading 0 for ``Norm_SumLum``), so zero divisors are masked and the
    affected ratios come out as NaN instead. Rows with non-zero divisors are
    computed exactly as before.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Orientation_Index``, ``Pixels_Areas``,
        ``Sum_of_Luminosity``, ``Minimum_of_Luminosity``,
        ``Maximum_of_Luminosity`` and the four ``X_/Y_ Minimum/Maximum``
        columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame; all other input columns are passed through unchanged.
    """
    def _nonzero(values):
        # Exact zeros become NaN so that `x / _nonzero(v)` is NaN, never +/-inf.
        return values.where(values != 0)

    new_df = df.copy()
    new_df['Sine'] = np.sqrt(1 - new_df['Orientation_Index']**2)
    sine_divisor = _nonzero(new_df['Sine'])

    new_df['Fixed_Area'] = new_df['Pixels_Areas'] / sine_divisor
    new_df['Fixed_LumSum'] = new_df['Sum_of_Luminosity'] / sine_divisor
    new_df['Norm_SumLum'] = new_df['Sum_of_Luminosity'] / _nonzero(new_df['Fixed_Area'])
    new_df['Norm_MinLum'] = new_df['Minimum_of_Luminosity'] / sine_divisor
    new_df['Norm_MaxLum'] = new_df['Maximum_of_Luminosity'] / sine_divisor

    # Signed and unsigned horizontal extent; they differ only where X_Maximum < X_Minimum.
    new_df['X_Size'] = new_df['X_Maximum'] - new_df['X_Minimum']
    new_df['X_Size_abs'] = new_df['X_Size'].abs()

    drop_cols = ['Y_Maximum', 'Y_Minimum', 'X_Maximum', 'X_Minimum']
    # Currently disabled entries of the drop list:
    #            'Luminosity_Index','Edges_X_Index',
    #            'Edges_Index', 'Maximum_of_Luminosity', 'Sum_of_Luminosity', 'Norm_MaxLum',
    #            'Log_Y_Index', 'Square_Index', 'Empty_Index', 'SigmoidOfAreas']

    return new_df.drop(columns=drop_cols)

In [7]:
# Add the engineered features to the training frame; the trailing expression
# renders transposed summary statistics (one row per column) as the cell output.
train_df = feature_engineering(df)
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,19219.0,9609.0,5548.191747,0.0,4804.5,9609.0,14413.5,19218.0
Pixels_Areas,19219.0,1683.987616,3730.319865,6.0,89.0,168.0,653.0,152655.0
X_Perimeter,19219.0,95.654665,177.821382,2.0,15.0,25.0,64.0,7553.0
Y_Perimeter,19219.0,64.124096,101.054178,1.0,14.0,23.0,61.0,903.0
Sum_of_Luminosity,19219.0,191846.678235,442024.694057,250.0,9848.0,18238.0,67978.0,11591410.0
Minimum_of_Luminosity,19219.0,84.808419,28.800344,0.0,70.0,90.0,105.0,196.0
Maximum_of_Luminosity,19219.0,128.64738,14.196976,39.0,124.0,127.0,135.0,253.0
Length_of_Conveyer,19219.0,1459.350747,145.568687,1227.0,1358.0,1364.0,1652.0,1794.0
TypeOfSteel_A300,19219.0,0.402674,0.490449,0.0,0.0,0.0,1.0,1.0
TypeOfSteel_A400,19219.0,0.596337,0.490644,0.0,0.0,1.0,1.0,1.0


In [8]:
# Indicator columns used to partition the rows; they are removed from the
# per-steel frames built below.
steel_type_cols = ['TypeOfSteel_A300', 'TypeOfSteel_A400']

# Rows flagged as A300, re-indexed from 0.
is_A300 = train_df['TypeOfSteel_A300'] == 1
train_df_A300 = train_df[is_A300].drop(columns=steel_type_cols).reset_index(drop=True)

# Rows flagged as A400, re-indexed from 0.
is_A400 = train_df['TypeOfSteel_A400'] == 1
train_df_A400 = train_df[is_A400].drop(columns=steel_type_cols).reset_index(drop=True)

## Splitting

In [9]:
# Target columns (one binary indicator per fault type).
y_cols = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']


def _split_features_targets(frame):
    """Split ``frame`` into ``(X, y, ids)``.

    ``y`` holds the ``y_cols`` columns, ``ids`` is the ``id`` column, and ``X``
    is every remaining column. The column labels are always taken from
    ``y_cols`` (never from a DataFrame object), so all three data sets are
    split the same way.
    """
    targets = frame[y_cols]
    ids = frame['id']
    features = frame.drop(columns=y_cols + ['id'])
    return features, targets, ids


# Full training set ("classic"), then the A300-only and A400-only subsets.
X_cl, y_cl, ids_cl = _split_features_targets(train_df)
X_3, y_3, ids_3 = _split_features_targets(train_df_A300)
X_4, y_4, ids_4 = _split_features_targets(train_df_A400)

## Group KFold

In [10]:
from sklearn.model_selection import GroupKFold
# Three independent 5-fold GroupKFold splitters: one each for the full
# training set, the A300 subset and the A400 subset.
gkf_cl, gkf_3, gkf_4 = (GroupKFold(n_splits=5) for _ in range(3))

In [11]:
def _collect_folds(splitter, X, y, groups):
    """Materialise every train/validation fold produced by ``splitter``.

    Parameters
    ----------
    splitter : cross-validator exposing ``split(X, y, groups)``
    X, y : pandas objects with the same number of rows
    groups : group label per row, forwarded to ``splitter.split``

    Returns
    -------
    (train_folds, valid_folds) : two lists of ``(X_part, y_part)`` tuples,
        one entry per fold, in fold order.
    """
    train_folds, valid_folds = [], []
    for fold, (train_index, valid_index) in enumerate(splitter.split(X, y, groups)):
        print(f'Fold: {fold}')
        # split() yields positional row numbers, so select with .iloc; .loc
        # would only agree while the index happens to be a fresh 0..n-1 range.
        train_folds.append((X.iloc[train_index], y.iloc[train_index]))
        valid_folds.append((X.iloc[valid_index], y.iloc[valid_index]))
    return train_folds, valid_folds


# Full training set, then the A300-only and A400-only subsets.
trains_cl, valids_cl = _collect_folds(gkf_cl, X_cl, y_cl, ids_cl)
trains_3, valids_3 = _collect_folds(gkf_3, X_3, y_3, ids_3)
trains_4, valids_4 = _collect_folds(gkf_4, X_4, y_4, ids_4)

Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4
Fold: 0
Fold: 1
Fold: 2
Fold: 3
Fold: 4


## XGBoost

In [12]:
from xgboost import XGBClassifier

In [17]:
# Shared settings for every XGBClassifier built in fitter().
# Passed as n_estimators.
estimators = 1000
# Passed as early_stopping_rounds.
stopping = 10
# Passed as learning_rate.
lr = 0.05

def fitter(i: int):
    """Train one XGBClassifier per fold for the data set selected by ``i``.

    ``i == 0`` uses ``trains_cl`` / ``valids_cl`` (no progress output),
    ``i == 1`` uses ``trains_3`` / ``valids_3`` and ``i == 2`` uses
    ``trains_4`` / ``valids_4`` (both print ``Fold: <n>`` before each fit).
    Any other value trains nothing.

    Each model is built with the module-level ``estimators``, ``stopping`` and
    ``lr`` settings and is fitted with that fold's validation pair as its only
    ``eval_set``.

    Returns
    -------
    list
        The five fitted models in fold order, or an empty list when ``i`` is
        not 0, 1 or 2.
    """
    # Pick the fold collections first; the training loop is then shared.
    if i == 0:
        train_folds, valid_folds, announce = trains_cl, valids_cl, False
    elif i == 1:
        train_folds, valid_folds, announce = trains_3, valids_3, True
    elif i == 2:
        train_folds, valid_folds, announce = trains_4, valids_4, True
    else:
        return []

    fitted = []
    for fold_no in range(5):
        if announce:
            print(f'Fold: {fold_no}')
        booster = XGBClassifier(n_estimators=estimators,
                                early_stopping_rounds=stopping,
                                learning_rate=lr)
        features, targets = train_folds[fold_no]
        booster.fit(features, targets,
                    eval_set=[valid_folds[fold_no]], verbose=0)
        fitted.append(booster)
    return fitted

In [18]:
# Run fitter(0), fitter(1) and fitter(2) through joblib with n_jobs=3.
# fitter(0) trains on the full-data folds, fitter(1) on the A300 folds and
# fitter(2) on the A400 folds; each returns its list of fitted models.
xgboosts_cl, xgboosts_3, xgboosts_4 = Parallel(n_jobs=3)(delayed(fitter)(i) for i in range(3))

## Feature Importances

In [19]:
def _print_mean_importances(title, models, feature_names):
    """Print the fold-averaged feature importances, largest first.

    Parameters
    ----------
    title : str
        Heading printed above the report.
    models : sequence of fitted models exposing ``feature_importances_``
    feature_names : sequence of column labels, in the order the models saw them
    """
    print(title)
    print('='*50)
    # Average over however many fold models were supplied (no hard-coded count).
    mean_importances = np.mean([model.feature_importances_ for model in models], axis=0)
    order = np.argsort(mean_importances)[::-1]
    print('')
    for col, score in zip(np.array(feature_names)[order], mean_importances[order]):
        # Fixed-point formatting: round() on a float32 prints values such as
        # 0.28999999165534973 instead of 0.29.
        print(f'{col}: {score:.2f}')


_print_mean_importances('Classic', xgboosts_cl, X_cl.columns)

print(' ')

_print_mean_importances('A300', xgboosts_3, X_3.columns)

print(' ')

_print_mean_importances('A400', xgboosts_4, X_4.columns)

Classic

X_Size_abs: 0.28999999165534973
LogOfAreas: 0.10999999940395355
Log_X_Index: 0.05000000074505806
TypeOfSteel_A300: 0.05000000074505806
Outside_X_Index: 0.05000000074505806
Pixels_Areas: 0.03999999910593033
X_Size: 0.03999999910593033
Steel_Plate_Thickness: 0.03999999910593033
Orientation_Index: 0.029999999329447746
Fixed_Area: 0.029999999329447746
Length_of_Conveyer: 0.029999999329447746
TypeOfSteel_A400: 0.019999999552965164
Sine: 0.019999999552965164
Edges_Y_Index: 0.019999999552965164
X_Perimeter: 0.019999999552965164
Norm_MinLum: 0.019999999552965164
Outside_Global_Index: 0.009999999776482582
Minimum_of_Luminosity: 0.009999999776482582
Y_Perimeter: 0.009999999776482582
Edges_Index: 0.009999999776482582
Sum_of_Luminosity: 0.009999999776482582
Norm_MaxLum: 0.009999999776482582
Maximum_of_Luminosity: 0.009999999776482582
Log_Y_Index: 0.009999999776482582
Fixed_LumSum: 0.009999999776482582
Square_Index: 0.009999999776482582
Luminosity_Index: 0.009999999776482582
Empty_Index: 0

# Classifying

In [20]:
# Load the test CSV and apply the same feature_engineering() used on the training data.
test = pd.read_csv(paths.TEST)
test_df = feature_engineering(test)

In [21]:
# Indicator columns used to partition the test rows; they are removed from the
# per-steel frames built below.
test_steel_type_cols = ['TypeOfSteel_A300', 'TypeOfSteel_A400']

# Test rows flagged as A300, re-indexed from 0.
test_is_A300 = test_df['TypeOfSteel_A300'] == 1
test_df_A300 = test_df[test_is_A300].drop(columns=test_steel_type_cols).reset_index(drop=True)

# Test rows flagged as A400, re-indexed from 0.
test_is_A400 = test_df['TypeOfSteel_A400'] == 1
test_df_A400 = test_df[test_is_A400].drop(columns=test_steel_type_cols).reset_index(drop=True)

In [22]:
# For each test frame keep the id column aside and use every other column as model input.
ids_test_cl, X_test_cl = test_df['id'], test_df.drop(columns=['id'])
ids_test_3, X_test_3 = test_df_A300['id'], test_df_A300.drop(columns=['id'])
ids_test_4, X_test_4 = test_df_A400['id'], test_df_A400.drop(columns=['id'])

In [23]:
def _ensemble_predict(models, X, ids):
    """Combine the fold models' probabilities for ``X`` into one prediction table.

    The ``predict_proba`` outputs of all ``models`` are summed and each row is
    then rescaled to sum to 1.

    Parameters
    ----------
    models : sequence of fitted models exposing ``predict_proba``
    X : feature frame to score
    ids : pandas Series of ids, one per row of ``X`` in the same order

    Returns
    -------
    (probas, frame) : the normalised probability array, and a DataFrame with
        one column per entry of ``y_cols`` plus an ``id`` column.
    """
    # Sum over however many fold models were supplied (no hard-coded count).
    probas = np.sum([model.predict_proba(X) for model in models], axis=0)
    probas = probas / probas.sum(axis=1, keepdims=True)
    frame = pd.DataFrame(probas, columns=y_cols)
    # Attach ids by position so the result does not depend on index alignment.
    frame['id'] = ids.to_numpy()
    return probas, frame


# Classic (models trained on all steel types)
preds_classic, result_classic = _ensemble_predict(xgboosts_cl, X_test_cl, ids_test_cl)

# A300
preds_A300, result_A300 = _ensemble_predict(xgboosts_3, X_test_3, ids_test_3)

# A400
preds_A400, result_A400 = _ensemble_predict(xgboosts_4, X_test_4, ids_test_4)

# Every id gets the classic prediction; ids in the A300/A400 subsets also get
# a steel-specific one. Average whatever is available per id.
result_df = pd.concat([result_classic, result_A300, result_A400])
result_df = result_df.groupby('id', as_index=False).agg('mean')
display(result_df)

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.493833,0.002158,0.002752,0.001003,0.011071,0.153135,0.336048
1,19220,0.244078,0.016594,0.014261,0.001139,0.190776,0.177375,0.355778
2,19221,0.002218,0.064435,0.057306,0.001585,0.007183,0.340473,0.526801
3,19222,0.172182,0.002179,0.001675,0.002680,0.008585,0.406280,0.406420
4,19223,0.004290,0.002552,0.001765,0.002142,0.007863,0.620225,0.361164
...,...,...,...,...,...,...,...,...
12809,32028,0.087867,0.110921,0.003734,0.001307,0.035062,0.286698,0.474411
12810,32029,0.174633,0.005516,0.029003,0.007783,0.193803,0.153055,0.436207
12811,32030,0.001265,0.001355,0.934514,0.001097,0.001231,0.001413,0.059126
12812,32031,0.366505,0.011610,0.024353,0.001218,0.047436,0.181600,0.367278
