In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by later cells
# (e.g. from model fitting) will not be shown — consider narrowing it.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Do not truncate wide DataFrames: show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Description of columns 

- **X_Minimum**: Represents the minimum x-coordinate of the defect.
- **X_Maximum**: Indicates the maximum x-coordinate of the defect.
- **Y_Minimum**: Denotes the minimum y-coordinate of the defect.
- **Y_Maximum**: Specifies the maximum y-coordinate of the defect.
- **Pixels_Areas**: Reflects the area of the defect in pixels.
- **X_Perimeter**: Describes the perimeter of the defect in the x-direction.
- **Y_Perimeter**: Illustrates the perimeter of the defect in the y-direction.
- **Sum_of_Luminosity**: Represents the sum of luminosity within the defect area.
- **Minimum_of_Luminosity**: Represents the minimum luminosity within the defect area.
- **Maximum_of_Luminosity**: Represents the maximum luminosity within the defect area.
- **Length_of_Conveyer**: Indicates the length of the conveyor at the time of defect detection.
- **TypeOfSteel_A300**: Binary indicator for type A300 steel.
- **TypeOfSteel_A400**: Binary indicator for type A400 steel.
- **Steel_Plate_Thickness**: Represents the thickness of the steel plate.
- **Edges_Index**: Describes the ratio of the perimeter to the area of the defect.
- **Empty_Index**: Illustrates the ratio of non-defect area to defect area.
- **Square_Index**: Describes the squareness of the defect.
- **Outside_X_Index**: Describes the ratio of the defect outside X to the entire image.
- **Edges_X_Index**: Describes the ratio of the defect edges in the X direction to the entire defect perimeter.
- **Edges_Y_Index**: Describes the ratio of the defect edges in the Y direction to the entire defect perimeter.
- **Outside_Global_Index**: Describes the ratio of the defect outside to the entire image.
- **LogOfAreas**: Represents the base-10 logarithm of the defect's pixel area (`Pixels_Areas`).
- **Log_X_Index**: Log-scaled measure of the defect's extent along the X axis (presumably the base-10 logarithm of its X length, not a ratio).
- **Log_Y_Index**: Log-scaled measure of the defect's extent along the Y axis (presumably the base-10 logarithm of its Y length, not a ratio).
- **Orientation_Index**: Describes the orientation of the defect.
- **Luminosity_Index**: Describes the luminosity of the defect.
- **SigmoidOfAreas**: Represents the sigmoid of the area of the defect.
- **Pastry**: Indicates the presence of pastry on the surface (yes/no).
- **Z_Scratch**: Represents the presence of Z scratches on the surface (yes/no).
- **K_Scatch**: Indicates the presence of K scratches on the surface (yes/no).
- **Stains**: Indicates the presence of stains on the surface (yes/no).
- **Dirtiness**: Indicates the presence of dirtiness on the surface (yes/no).
- **Bumps**: Indicates the presence of bumps on the surface (yes/no).
- **Other_Faults**: Represents other types of faults on the surface.

In [51]:
# Read the competition files. The 'id' column is removed from the training
# frame only; the test frame keeps it.
df_train = pd.read_csv('train.csv').drop(columns='id')
df_test = pd.read_csv("test.csv")

# Preview the first rows and the (rows, columns) shape of each frame, in order.
display(df_train.head(), df_train.shape, df_test.head(), df_test.shape)

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,584,590,909972,909977,16,8,5,2274,113,140,1358,0,1,50,0.7393,0.4,0.5,0.0059,1.0,1.0,0.0,1.2041,0.9031,0.699,-0.5,-0.0104,0.1417,0,0,0,1,0,0,0
1,808,816,728350,728372,433,20,54,44478,70,111,1687,1,0,80,0.7772,0.2878,0.2581,0.0044,0.25,1.0,1.0,2.6365,0.7782,1.7324,0.7419,-0.2997,0.9491,0,0,0,0,0,0,1
2,39,192,2212076,2212144,11388,705,420,1311391,29,141,1400,0,1,40,0.0557,0.5282,0.9895,0.1077,0.2363,0.3857,0.0,4.0564,2.179,2.2095,-0.0105,-0.0944,1.0,0,0,1,0,0,0,0
3,781,789,3353146,3353173,210,16,29,3202,114,134,1387,0,1,40,0.7202,0.3333,0.3333,0.0044,0.375,0.931,1.0,2.3222,0.7782,1.4314,0.6667,-0.0402,0.4025,0,0,1,0,0,0,0
4,1540,1560,618457,618502,521,72,67,48231,82,111,1692,0,1,300,0.1211,0.5347,0.0842,0.0192,0.2105,0.9861,1.0,2.7694,1.415,1.8808,0.9158,-0.2455,0.9998,0,0,0,0,0,0,1


(19219, 34)

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,19219,1015,1033,3826564,3826588,659,23,46,62357,67,127,1656,0,1,150,0.3877,0.4896,0.3273,0.0095,0.5652,1.0,1.0,2.841,1.1139,1.6628,0.6727,-0.2261,0.9172
1,19220,1257,1271,419960,419973,370,26,28,39293,92,132,1354,0,1,40,0.1629,0.4136,0.0938,0.0047,0.2414,1.0,1.0,2.5682,0.9031,1.4472,0.9063,-0.1453,0.9104
2,19221,1358,1372,117715,117724,289,36,32,29386,101,134,1360,0,1,40,0.0609,0.6234,0.4762,0.0155,0.6,0.75,0.0,2.4609,1.3222,1.3222,-0.5238,-0.0435,0.6514
3,19222,158,168,232415,232440,80,10,11,8586,107,140,1690,1,0,100,0.4439,0.3333,0.8182,0.0037,0.8,1.0,1.0,1.9031,0.699,1.0414,0.1818,-0.0738,0.2051
4,19223,559,592,544375,544389,140,19,15,15524,103,134,1688,1,0,60,0.8191,0.2619,0.4286,0.0158,0.8421,0.5333,0.0,2.1461,1.3222,1.1461,-0.5714,-0.0894,0.417


(12814, 28)

In [27]:
# List the column labels of the training frame (features followed by the defect labels).
df_train.columns

Index(['X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum', 'Pixels_Areas',
       'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
       'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
       'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
       'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
       'SigmoidOfAreas', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
       'Dirtiness', 'Bumps', 'Other_Faults'],
      dtype='object')

In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of the training frame.
df_train.describe()

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
count,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0,19219.0
mean,709.854675,753.857641,1849756.0,1846605.0,1683.987616,95.654665,64.124096,191846.7,84.808419,128.64738,1459.350747,0.402674,0.596337,76.213122,0.352939,0.409309,0.57452,0.030609,0.614749,0.831652,0.591899,2.473475,1.312667,1.389737,0.102742,-0.138382,0.571902,0.076279,0.059837,0.178573,0.029554,0.025235,0.247828,0.341225
std,531.544189,499.836603,1903554.0,1896295.0,3730.319865,177.821382,101.054178,442024.7,28.800344,14.196976,145.568687,0.490449,0.490644,53.93196,0.318976,0.124143,0.259436,0.047302,0.222391,0.220966,0.48205,0.760575,0.467848,0.405549,0.487681,0.120344,0.332219,0.26545,0.23719,0.383005,0.169358,0.156844,0.431762,0.474133
min,0.0,4.0,6712.0,6724.0,6.0,2.0,1.0,250.0,0.0,39.0,1227.0,0.0,0.0,40.0,0.0,0.0,0.0083,0.0015,0.0144,0.105,0.0,0.7782,0.301,0.0,-0.9884,-0.885,0.119,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,49.0,214.0,657468.0,657502.0,89.0,15.0,14.0,9848.0,70.0,124.0,1358.0,0.0,0.0,40.0,0.0586,0.3175,0.37575,0.0066,0.4516,0.6552,0.0,1.9494,1.0,1.0792,-0.2727,-0.1925,0.2532,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,777.0,796.0,1398169.0,1398179.0,168.0,25.0,23.0,18238.0,90.0,127.0,1364.0,0.0,1.0,69.0,0.2385,0.4135,0.5454,0.0095,0.6364,0.9643,1.0,2.2279,1.1461,1.3222,0.1111,-0.1426,0.4729,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1152.0,1165.0,2368032.0,2362511.0,653.0,64.0,61.0,67978.0,105.0,135.0,1652.0,1.0,1.0,80.0,0.6561,0.4946,0.8182,0.0191,0.7857,1.0,1.0,2.8149,1.4314,1.7076,0.5294,-0.084,0.9994,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1705.0,1713.0,12987660.0,12987690.0,152655.0,7553.0,903.0,11591410.0,196.0,253.0,1794.0,1.0,1.0,300.0,0.9952,0.9275,1.0,0.6651,1.0,1.0,1.0,4.5543,2.9973,4.0333,0.9917,0.6421,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [59]:
# Names of the seven defect label columns; one classifier is trained per entry.
# 'K_Scatch' (sic) is spelled exactly as the column appears in the dataset.
target = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains','Dirtiness', 'Bumps', 'Other_Faults']

In [37]:
# Split the training frame into the feature matrix and the multi-label target
# frame. The label names come from `target` so the list is defined in exactly
# one place and the features/labels cannot drift apart.
X = df_train.drop(columns=target)
y = df_train[target]

In [55]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [57]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits.
sc = StandardScaler().fit(X_train)

scaled_X_train = sc.transform(X_train)
scaled_X_test = sc.transform(X_test)

In [63]:
# Train one independent logistic-regression classifier per defect label.
# `fit` returns the fitted estimator, so it can be stored directly.
defects = {
    label: LogisticRegression().fit(scaled_X_train, y_train[label])
    for label in target
}

In [66]:
# One column per defect: index 1 of predict_proba, i.e. the probability of
# the second class (label 1 for a 0/1 target), on the held-out split.
preds = pd.DataFrame({
    label: clf.predict_proba(scaled_X_test)[:, 1]
    for label, clf in defects.items()
})

In [76]:
# Inspect the hold-out probability table (one row per held-out sample, one column per defect).
preds

Unnamed: 0,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0.081701,0.008895,0.001179,1.002501e-03,0.071245,0.145207,0.557723
1,0.001153,0.028334,0.832689,2.978973e-07,0.003751,0.034991,0.168511
2,0.001474,0.000063,0.000266,8.646113e-05,0.002560,0.158957,0.804199
3,0.085696,0.001998,0.000340,8.874319e-06,0.005432,0.444315,0.442604
4,0.077312,0.061370,0.024320,3.444126e-03,0.051533,0.298696,0.347873
...,...,...,...,...,...,...,...
3839,0.001248,0.228879,0.001622,1.360538e-05,0.012928,0.250884,0.411395
3840,0.012027,0.000952,0.000295,9.398722e-03,0.002241,0.435509,0.362429
3841,0.151087,0.026001,0.015658,2.411535e-04,0.056456,0.267633,0.280310
3842,0.215022,0.000804,0.009818,8.436596e-06,0.030143,0.328491,0.374359


In [67]:
# ROC AUC of each per-defect classifier on the held-out split, in `target` order.
auc_scores = [roc_auc_score(y_test[name], preds[name]) for name in target]

# Report the scores one line per defect.
for defect, auc in zip(target, auc_scores):
    print(f"AUC for {defect} defect: {auc}")

AUC for Pastry defect: 0.8657143419760832
AUC for Z_Scratch defect: 0.9192968593216937
AUC for K_Scatch defect: 0.9820870453942621
AUC for Stains defect: 0.9853766025641026
AUC for Dirtiness defect: 0.8511212306151511
AUC for Bumps defect: 0.7608787047550247
AUC for Other_Faults defect: 0.6830538764705095


In [77]:
# Unweighted mean of the per-defect AUC values held in `auc_scores`.
average_auc = sum(auc_scores)/ len(auc_scores)
print(average_auc)

0.8639326658709754


In [69]:
# Re-check the raw test frame before scaling; it still carries the 'id' column.
df_test.head()

Unnamed: 0,id,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,Length_of_Conveyer,TypeOfSteel_A300,TypeOfSteel_A400,Steel_Plate_Thickness,Edges_Index,Empty_Index,Square_Index,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,19219,1015,1033,3826564,3826588,659,23,46,62357,67,127,1656,0,1,150,0.3877,0.4896,0.3273,0.0095,0.5652,1.0,1.0,2.841,1.1139,1.6628,0.6727,-0.2261,0.9172
1,19220,1257,1271,419960,419973,370,26,28,39293,92,132,1354,0,1,40,0.1629,0.4136,0.0938,0.0047,0.2414,1.0,1.0,2.5682,0.9031,1.4472,0.9063,-0.1453,0.9104
2,19221,1358,1372,117715,117724,289,36,32,29386,101,134,1360,0,1,40,0.0609,0.6234,0.4762,0.0155,0.6,0.75,0.0,2.4609,1.3222,1.3222,-0.5238,-0.0435,0.6514
3,19222,158,168,232415,232440,80,10,11,8586,107,140,1690,1,0,100,0.4439,0.3333,0.8182,0.0037,0.8,1.0,1.0,1.9031,0.699,1.0414,0.1818,-0.0738,0.2051
4,19223,559,592,544375,544389,140,19,15,15524,103,134,1688,1,0,60,0.8191,0.2619,0.4286,0.0158,0.8421,0.5333,0.0,2.1461,1.3222,1.1461,-0.5714,-0.0894,0.417


In [70]:
# Remove the identifier column, then scale the remaining test features with
# the already-fitted scaler `sc` (transform only — no refit on the test data).
test_features = df_test.drop(columns='id')
scaled_x_test2 = sc.transform(test_features)
scaled_x_test2

array([[ 0.57784256,  0.56222589,  1.03969947, ...,  1.16820855,
        -0.73720464,  1.03889526],
       [ 1.03276483,  1.03777704, -0.75085779, ...,  1.64723668,
        -0.06240197,  1.01843333],
       [ 1.22262909,  1.23958656, -0.90972186, ..., -1.28537517,
         0.78778257,  0.23907451],
       ...,
       [-1.2531256 , -1.08222198, -0.13716454, ..., -0.34023791,
         0.48211701,  1.28804935],
       [ 1.16811361,  1.17564691, -0.60248917, ...,  1.5237885 ,
        -1.04454051,  0.6392858 ],
       [-1.25688529, -1.12817861,  0.00584493, ...,  0.02723573,
         0.2724939 ,  1.28804935]])

In [72]:
# One column per defect: index 1 of predict_proba, i.e. the probability of
# the second class (label 1 for a 0/1 target), for every row of the scaled
# test set.
final_preds = pd.DataFrame({
    label: clf.predict_proba(scaled_x_test2)[:, 1]
    for label, clf in defects.items()
})

In [73]:
# Inspect the test-set probability table (one row per test sample, one column per defect).
final_preds

Unnamed: 0,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,0.344545,0.000209,0.001697,4.225456e-07,0.034867,1.998444e-01,0.468843
1,0.264299,0.033889,0.015552,6.600605e-05,0.163480,1.143163e-01,0.337212
2,0.004194,0.026837,0.099481,1.196063e-03,0.020561,2.230572e-01,0.390498
3,0.120956,0.001864,0.000561,5.019216e-04,0.004044,2.901689e-01,0.412497
4,0.001674,0.000719,0.003537,3.228588e-04,0.005516,5.307527e-01,0.385973
...,...,...,...,...,...,...,...
12809,0.114655,0.061446,0.007166,3.955055e-05,0.085775,2.046643e-01,0.354568
12810,0.222625,0.017414,0.021977,2.577499e-02,0.084014,1.258406e-01,0.365035
12811,0.000063,0.000960,0.940194,4.205842e-11,0.000024,2.858697e-07,0.064326
12812,0.388957,0.024625,0.043884,2.485959e-05,0.076529,1.946820e-01,0.250095


In [74]:
# Start from the test ids and attach one probability column per defect, in
# `target` order. `assign` aligns each added column on the row index, just
# like item assignment on the frame.
submission = df_test[['id']].assign(
    **{label: final_preds[label] for label in target}
)
submission

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.344545,0.000209,0.001697,4.225456e-07,0.034867,1.998444e-01,0.468843
1,19220,0.264299,0.033889,0.015552,6.600605e-05,0.163480,1.143163e-01,0.337212
2,19221,0.004194,0.026837,0.099481,1.196063e-03,0.020561,2.230572e-01,0.390498
3,19222,0.120956,0.001864,0.000561,5.019216e-04,0.004044,2.901689e-01,0.412497
4,19223,0.001674,0.000719,0.003537,3.228588e-04,0.005516,5.307527e-01,0.385973
...,...,...,...,...,...,...,...,...
12809,32028,0.114655,0.061446,0.007166,3.955055e-05,0.085775,2.046643e-01,0.354568
12810,32029,0.222625,0.017414,0.021977,2.577499e-02,0.084014,1.258406e-01,0.365035
12811,32030,0.000063,0.000960,0.940194,4.205842e-11,0.000024,2.858697e-07,0.064326
12812,32031,0.388957,0.024625,0.043884,2.485959e-05,0.076529,1.946820e-01,0.250095


In [75]:
# Write the submission file; index=False omits the row index so only the
# frame's own columns ('id' plus the defect probabilities) are written.
submission.to_csv('submission1.csv', index=False)