# Steel Plate Defect Prediction

In [37]:
#from google.colab import drive
#drive.mount('/content/drive')

In [38]:
#!pip install optuna
#!pip install catboost

In [1]:
%%time

import joblib

import numpy as np
np.random.seed(42)

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedShuffleSplit
import optuna

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


import warnings
warnings.filterwarnings("ignore")

sns.set_theme()
sns.set_context("paper")

CPU times: total: 2.5 s
Wall time: 5.29 s


In [2]:
#original_data = pd.read_csv('data/train.csv',index_col=0)
train = pd.read_csv('data/train.csv',index_col=0)
test = pd.read_csv('data/test.csv',index_col=0)
sub = pd.read_csv('data/sample_submission.csv',index_col=0)
df_org = pd.read_csv('data/original_data.csv')

In [3]:
# Print the first few rows of each dataset
print("Train data:\n", train.shape, "\n\nTest data:\n", test.shape, "\n\nSubmission data:\n", sub.shape, "\n\nOriginal data: \n", df_org.shape)

Train data:
 (19219, 34) 

Test data:
 (12814, 27) 

Submission data:
 (12814, 7) 

Original data: 
 (1941, 34)


In [4]:
targets = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains','Dirtiness', 'Bumps', 'Other_Faults']

In [5]:
train.head()

Unnamed: 0_level_0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,584,590,909972,909977,16,8,5,2274,113,140,...,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,808,816,728350,728372,433,20,54,44478,70,111,...,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,39,192,2212076,2212144,11388,705,420,1311391,29,141,...,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,781,789,3353146,3353173,210,16,29,3202,114,134,...,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,1540,1560,618457,618502,521,72,67,48231,82,111,...,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


In [6]:
train.columns

Index(['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas',
       'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
       'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
       'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
       'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
       'SigmoidOfAreas', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
       'Dirtiness', 'Bumps', 'Other_Faults'],
      dtype='object')

In [7]:
class Preprocess:

    def fit(self, X, y=None):
        return

    def fit_transform(self, X):
        epsilon = 1e-6

        # Location Features
        X['X_Distance'] = X['X_Maximum'] - X['X_Minimum']
        X['Y_Distance'] = X['Y_Maximum'] - X['Y_Minimum']

        # Density Feature
        X['Density'] = X['Pixels_Areas'] / (X['X_Perimeter'] + X['Y_Perimeter'])

        # Relative Perimeter Feature
        X['Relative_Perimeter'] = X['X_Perimeter'] / (X['X_Perimeter'] + X['Y_Perimeter'] + epsilon)

        # Circularity Feature
        X['Circularity'] = X['Pixels_Areas'] / (X['X_Perimeter'] ** 2)

        # Symmetry Index Feature
        X['Symmetry_Index'] = np.abs(X['X_Distance'] - X['Y_Distance']) / (X['X_Distance'] + X['Y_Distance'] + epsilon)

        # Color Contrast Feature
        X['Color_Contrast'] = X['Maximum_of_Luminosity'] - X['Minimum_of_Luminosity']

        # Combined Geometric Index Feature
        X['Combined_Geometric_Index'] = X['Edges_Index'] * X['Square_Index']

        # Interaction Term Feature
        X['X_Distance*Pixels_Areas'] = X['X_Distance'] * X['Pixels_Areas']

        # Additional Features
        X['sin_orientation'] = np.sin(X['Orientation_Index'])
        X['Edges_Index2'] = np.exp(X['Edges_Index'] + epsilon)
        X['X_Maximum2'] = np.sin(X['X_Maximum'])
        X['Y_Minimum2'] = np.sin(X['Y_Minimum'])
        X['Aspect_Ratio_Pixels'] = np.where(X['Y_Perimeter'] == 0, 0, X['X_Perimeter'] / X['Y_Perimeter'])
        X['Aspect_Ratio'] = np.where(X['Y_Distance'] == 0, 0, X['X_Distance'] / X['Y_Distance'])

        # Average Luminosity Feature
        X['Average_Luminosity'] = (X['Sum_of_Luminosity'] + X['Minimum_of_Luminosity']) / 2

        # Normalized Steel Thickness Feature
        X['Normalized_Steel_Thickness'] = (X['Steel_Plate_Thickness'] - X['Steel_Plate_Thickness'].min()) / (X['Steel_Plate_Thickness'].max() - X['Steel_Plate_Thickness'].min())

        # Logarithmic Features
        X['Log_Perimeter'] = np.log(X['X_Perimeter'] + X['Y_Perimeter'] + epsilon)
        X['Log_Luminosity'] = np.log(X['Sum_of_Luminosity'] + epsilon)
        X['Log_Aspect_Ratio'] = np.log(X['Aspect_Ratio'] ** 2 + epsilon)

        # Statistical Features
        X['Combined_Index'] = X['Orientation_Index'] * X['Luminosity_Index']
        X['Sigmoid_Areas'] = 1 / (1 + np.exp(-X['LogOfAreas'] + epsilon))

        return X


In [8]:
train = Preprocess().fit_transform(train)
train.head()

Unnamed: 0_level_0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Y_Minimum2,Aspect_Ratio_Pixels,Aspect_Ratio,Average_Luminosity,Normalized_Steel_Thickness,Log_Perimeter,Log_Luminosity,Log_Aspect_Ratio,Combined_Index,Sigmoid_Areas
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,584,590,909972,909977,16,8,5,2274,113,140,...,-0.260085,1.6,1.2,1193.5,0.038462,2.564949,7.729296,0.364644,0.0052,0.769253
1,808,816,728350,728372,433,20,54,44478,70,111,...,-0.017598,0.37037,0.363636,22274.0,0.153846,4.304065,10.70275,-2.023194,-0.222347,0.933174
2,39,192,2212076,2212144,11388,705,420,1311391,29,141,...,-0.876624,1.678571,2.25,655710.0,0.0,7.025538,14.086599,1.621861,0.000991,0.982983
3,781,789,3353146,3353173,210,16,29,3202,114,134,...,-0.997695,0.551724,0.296296,1658.0,0.0,3.806663,8.071531,-2.432779,-0.026801,0.910699
4,1540,1560,618457,618502,521,72,67,48231,82,111,...,0.071318,1.074627,0.444444,24156.5,1.0,4.934474,10.783757,-1.621855,-0.224829,0.941


In [9]:
df_org = Preprocess().fit_transform(df_org)
df_org.head()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Y_Minimum2,Aspect_Ratio_Pixels,Aspect_Ratio,Average_Luminosity,Normalized_Steel_Thickness,Log_Perimeter,Log_Luminosity,Log_Aspect_Ratio,Combined_Index,Sigmoid_Areas
0,42,50,270900,270944,267,17,44,24220,76,108,...,0.448853,0.386364,0.181818,12148.0,0.153846,4.110874,10.094934,-3.409466,-0.238342,0.918826
1,645,651,2538079,2538108,108,10,30,11397,84,123,...,-0.907991,0.333333,0.206897,5740.5,0.153846,3.688879,9.341105,-3.151049,-0.139268,0.884259
2,829,835,1553913,1553931,71,8,19,7972,99,125,...,-0.396659,0.421053,0.333333,4035.5,0.230769,3.295837,8.983691,-2.197216,-0.081871,0.86428
3,853,860,369370,369415,176,13,45,18996,99,126,...,0.375881,0.288889,0.155556,9547.5,0.961538,4.060443,9.851984,-3.721463,-0.132402,0.904262
4,1289,1306,498078,498335,2409,60,260,246930,37,126,...,-0.458158,0.230769,0.066148,123483.5,0.557692,5.768321,12.41686,-5.431497,-0.186013,0.967131


In [10]:
X, y = train.drop(targets, axis=1), np.argmax(train[targets].values, axis=1)

X_org, y_org  = df_org.drop(targets, axis=1), np.argmax(df_org[targets].values, axis=1)

X = pd.concat([X, X_org], axis=0)
y = np.concatenate([y, y_org])

In [11]:
def auc_score(estimator, X, y):
    y_prob = estimator.predict_proba(X)
    return roc_auc_score(y, y_prob, multi_class="ovr")

In [12]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

model_base = XGBClassifier(
    objective="multi:softprob",
    random_state=42,
    verbosity=0,
    n_jobs=-1,
)

scores = cross_val_score(model_base, X, y, cv=5, scoring=auc_score)
print('Accuracy:', np.round(scores, 2))
print('Accuracy mean:', np.round(scores.mean(), 2))


Accuracy: [0.86 0.87 0.86 0.87 0.89]
Accuracy mean: 0.87


In [13]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

model_skf = XGBClassifier(
    objective="multi:softprob",
    random_state=42,
    verbosity=0,
    n_jobs=-1,
)

scores_2 = cross_val_score(model_skf, X, y, cv=kfold, scoring=auc_score)
print('Accuracy:', np.round(scores_2, 2))
print('Accuracy mean:', np.round(scores_2.mean(), 2))

Accuracy: [0.87 0.87 0.88 0.87 0.88]
Accuracy mean: 0.87


In [14]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

kfold = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

def grid_search(params, random=False):
    xgb = XGBClassifier(
        objective="multi:softprob",
        random_state=42,
        verbosity=0,
        n_jobs=-1,
    )
    
    if random:
        grid = RandomizedSearchCV(xgb, params, n_iter=20, cv=kfold, n_jobs=-1)
    else:
        grid = GridSearchCV(xgb, params, cv=kfold, n_jobs=-1)
        
    grid.fit(X, y)
    
    best_params = grid.best_params_
    print('Best params:', best_params)
    
    best_score = grid.best_score_
    print("Training score: {:.3f}".format(best_score))
    
    return best_params

grid_search(params={'n_estimators':[100, 200, 400, 800]})
    

Best params: {'n_estimators': 100}
Training score: 0.582


({'n_estimators': 100}, 0.5820888468809073)

In [16]:
%%time

grid_search(params={'learning_rate':[0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]})

Best params: {'learning_rate': 0.05}
Training score: 0.600
CPU times: total: 36.5 s
Wall time: 53.2 s


({'learning_rate': 0.05}, 0.5999999999999999)

In [17]:
%%time

grid_search(params={'max_depth':[2, 3, 5, 6, 8]})

Best params: {'max_depth': 2}
Training score: 0.601
CPU times: total: 11.1 s
Wall time: 32.1 s


({'max_depth': 2}, 0.6007561436672969)

In [18]:
%%time

grid_search(params={'gamma':[0, 0.1, 0.5, 1, 2, 5]})

Best params: {'gamma': 5}
Training score: 0.598
CPU times: total: 10.3 s
Wall time: 26.5 s


({'gamma': 5}, 0.5976370510396976)

In [19]:
%%time

grid_search(params={'min_child_weight':[1, 2, 3, 4, 5]})

Best params: {'min_child_weight': 4}
Training score: 0.585
CPU times: total: 29.5 s
Wall time: 35.7 s


({'min_child_weight': 4}, 0.5850661625708884)

In [20]:
%%time

grid_search(params={'subsample':[0.5, 0.7, 0.8, 0.9, 1]})

Best params: {'subsample': 0.9}
Training score: 0.584
CPU times: total: 35.2 s
Wall time: 40.6 s


({'subsample': 0.9}, 0.583601134215501)

In [21]:
%%time

grid_search(params={'colsample_bytree':[0.5, 0.7, 0.8, 0.9, 1]})

Best params: {'colsample_bytree': 0.9}
Training score: 0.589
CPU times: total: 31.6 s
Wall time: 36.4 s


({'colsample_bytree': 0.9}, 0.5894612476370511)

In [33]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [34]:
eval_set = [(X_test, y_test)]

In [40]:
eval_metric = ["auc"]

In [44]:
y_pred

array([5, 5, 5, ..., 6, 2, 5], dtype=int64)

In [45]:
from sklearn.metrics import roc_auc_score

model = XGBClassifier(random_state=2, n_estimators=5000)

model.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set, early_stopping_rounds=100)

# Use predict_proba instead of predict to get probabilities
y_pred_proba = model.predict_proba(X_test)

# Assuming y_test is one-hot encoded, use argmax to get predicted class labels
y_pred = y_pred_proba.argmax(axis=1)

accuracy = roc_auc_score(y_test, y_pred_proba, multi_class="ovr")

print("Accuracy: %.2f%%" % (accuracy * 100.0))

[0]	validation_0-auc:0.80401
[1]	validation_0-auc:0.81039
[2]	validation_0-auc:0.81478
[3]	validation_0-auc:0.81634
[4]	validation_0-auc:0.81836
[5]	validation_0-auc:0.81966
[6]	validation_0-auc:0.82126
[7]	validation_0-auc:0.82211
[8]	validation_0-auc:0.82263
[9]	validation_0-auc:0.82297
[10]	validation_0-auc:0.82360
[11]	validation_0-auc:0.82437
[12]	validation_0-auc:0.82488
[13]	validation_0-auc:0.82475
[14]	validation_0-auc:0.82477
[15]	validation_0-auc:0.82459
[16]	validation_0-auc:0.82467
[17]	validation_0-auc:0.82471
[18]	validation_0-auc:0.82445
[19]	validation_0-auc:0.82420
[20]	validation_0-auc:0.82376
[21]	validation_0-auc:0.82340
[22]	validation_0-auc:0.82298
[23]	validation_0-auc:0.82277
[24]	validation_0-auc:0.82252
[25]	validation_0-auc:0.82240
[26]	validation_0-auc:0.82211
[27]	validation_0-auc:0.82215
[28]	validation_0-auc:0.82186
[29]	validation_0-auc:0.82136
[30]	validation_0-auc:0.82093
[31]	validation_0-auc:0.82089
[32]	validation_0-auc:0.82108
[33]	validation_0-au

In [46]:
grid_search(params={'n_estimators':[2, 25, 50, 75, 100]})

Best params: {'n_estimators': 25}
Training score: 0.595


({'n_estimators': 25}, 0.5948487712665407)

In [47]:
grid_search(params={'max_depth':[1, 2, 3, 4, 5, 6, 7, 8], 'n_estimators':[25]})

Best params: {'max_depth': 5, 'n_estimators': 25}
Training score: 0.597


({'max_depth': 5, 'n_estimators': 25}, 0.5969281663516067)

In [48]:
grid_search(params={'max_depth':[1, 2, 3, 4, 6, 7, 8], 'n_estimators':[2, 50, 100]})

Best params: {'max_depth': 2, 'n_estimators': 100}
Training score: 0.601


({'max_depth': 2, 'n_estimators': 100}, 0.6007561436672969)

In [49]:
%%time

grid_search(params={'subsample':[0.5, 0.6, 0.7, 0.8, 0.9, 1],

                    'min_child_weight':[1, 2, 3, 4, 5],

                    'learning_rate':[0.1, 0.2, 0.3, 0.4, 0.5],

                    'max_depth':[1, 2, 3, 4, 5, None],

                    'n_estimators':[2, 25, 50, 75, 100]},

                    random=True)

Best params: {'subsample': 0.7, 'n_estimators': 75, 'min_child_weight': 4, 'max_depth': None, 'learning_rate': 0.1}
Training score: 0.600
CPU times: total: 25.4 s
Wall time: 39 s


({'subsample': 0.7,
  'n_estimators': 75,
  'min_child_weight': 4,
  'max_depth': None,
  'learning_rate': 0.1},
 0.6003780718336483)

In [50]:
best_params = {
    'subsample': 0.7,
    'n_estimators': 75,
    'min_child_weight': 4,
    'max_depth': None,
    'learning_rate': 0.1}

In [54]:
model = XGBClassifier(random_state=2, **best_params)

model.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set, early_stopping_rounds=100)

y_pred_proba = model.predict_proba(X_test)


y_pred = y_pred_proba.argmax(axis=1)

accuracy = roc_auc_score(y_test, y_pred_proba, multi_class="ovr")

print("ROC AUC: %.2f%%" % (accuracy * 100.0))



[0]	validation_0-auc:0.79959
[1]	validation_0-auc:0.80822
[2]	validation_0-auc:0.81119
[3]	validation_0-auc:0.81339
[4]	validation_0-auc:0.81501
[5]	validation_0-auc:0.81729
[6]	validation_0-auc:0.81739
[7]	validation_0-auc:0.81806
[8]	validation_0-auc:0.81872
[9]	validation_0-auc:0.81936
[10]	validation_0-auc:0.81956
[11]	validation_0-auc:0.82008
[12]	validation_0-auc:0.82029
[13]	validation_0-auc:0.82090
[14]	validation_0-auc:0.82140
[15]	validation_0-auc:0.82146
[16]	validation_0-auc:0.82184
[17]	validation_0-auc:0.82235
[18]	validation_0-auc:0.82291
[19]	validation_0-auc:0.82320
[20]	validation_0-auc:0.82322
[21]	validation_0-auc:0.82319
[22]	validation_0-auc:0.82368
[23]	validation_0-auc:0.82372
[24]	validation_0-auc:0.82389
[25]	validation_0-auc:0.82388
[26]	validation_0-auc:0.82404
[27]	validation_0-auc:0.82454
[28]	validation_0-auc:0.82476
[29]	validation_0-auc:0.82479
[30]	validation_0-auc:0.82478
[31]	validation_0-auc:0.82509
[32]	validation_0-auc:0.82503
[33]	validation_0-au

In [55]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

cm

array([[ 119,   24,   20,    4,    3,  119,  320],
       [   2,  269,    2,    1,    1,   19,   72],
       [   1,    3,  832,    2,    0,    8,   64],
       [   1,    0,    0,  134,    0,    7,   19],
       [  14,    0,    0,    1,   28,   18,   74],
       [  26,   33,    7,    4,    1,  647,  534],
       [  67,  101,   68,   32,   10,  477, 1102]], dtype=int64)

In [56]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.52      0.20      0.28       609
           1       0.63      0.73      0.68       366
           2       0.90      0.91      0.90       910
           3       0.75      0.83      0.79       161
           4       0.65      0.21      0.31       135
           5       0.50      0.52      0.51      1252
           6       0.50      0.59      0.55      1857

    accuracy                           0.59      5290
   macro avg       0.64      0.57      0.57      5290
weighted avg       0.59      0.59      0.58      5290

