<b>Step 1</b> - Importing libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from boruta import BorutaPy

<b>Step 2</b> - importing data

In [2]:
Data = pd.read_csv("WIDS Dataset_2020_Adj.csv")

In [3]:
Data[:5] #checking if the dataset was imported successfully

Unnamed: 0,X,Date,Zone1Position,Zone2Position,Zone3Position,SKU,Zone1_Row_Num,Zone1_Col_Num,Zone2_Row_Num,Zone2_Col_num,...,Zone2_Humidity_Max,Zone2_Humidity_Range,Zone3_Humidity_Avg,Zone3_Humidity_Min,Zone3_Humidity_Max,Zone3_Humidity_Range,Block_Num,Block_Position,Block_Orientation,Result_Type
0,1,09/01/2020,3.0,2.0,3.0,A001,1.0,3.0,1.0,2.0,...,61.8,10.0,39.19,33.19,45.19,12.0,1000.0,1.0,1,Defect_1
1,2,03/12/2019,8.0,4.0,3.0,A001,2.0,4.0,2.0,2.0,...,58.12,16.0,48.5,40.5,56.5,16.0,1003.0,1.0,1,Defect_1
2,3,21/11/2019,4.0,2.0,3.0,B003,1.0,4.0,1.0,2.0,...,50.97,10.0,52.2,46.2,58.2,12.0,1001.0,4.0,1,PASS
3,4,22/11/2019,6.0,3.0,1.0,B003,2.0,2.0,2.0,1.0,...,59.13,4.0,39.29,31.29,47.29,16.0,1003.0,2.0,1,PASS
4,5,23/12/2019,3.0,2.0,3.0,B003,1.0,3.0,1.0,2.0,...,65.8,18.0,45.37,42.37,48.37,6.0,1002.0,1.0,1,PASS


<b>Step 3</b> - Exploring the data columns' name and the type of data in each column

In [4]:
Data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75750 entries, 0 to 75749
Data columns (total 53 columns):
X                         75750 non-null int64
Date                      75707 non-null object
Zone1Position             75712 non-null float64
Zone2Position             75706 non-null float64
Zone3Position             75713 non-null float64
SKU                       75711 non-null object
Zone1_Row_Num             75715 non-null float64
Zone1_Col_Num             75708 non-null float64
Zone2_Row_Num             75704 non-null float64
Zone2_Col_num             75716 non-null float64
Zone3_Row_Num             75713 non-null float64
Zone3_Col_Num             75714 non-null float64
Zone1_Left_Block_Bin      75708 non-null float64
Zone1_Right_Block_Bin     75710 non-null float64
Zone1_Area                75709 non-null object
Zone3_Area                75718 non-null object
Zone1_Dur                 75713 non-null float64
Zone2_Dur                 75708 non-null float64
Zone3_Dur      

<b>Step 4</b> - Identifying the number and percentage of missing data per each column

In [5]:
print("Number of missing values per each column:")
print(Data.isnull().sum())

Number of missing values per each column:
X                          0
Date                      43
Zone1Position             38
Zone2Position             44
Zone3Position             37
SKU                       39
Zone1_Row_Num             35
Zone1_Col_Num             42
Zone2_Row_Num             46
Zone2_Col_num             34
Zone3_Row_Num             37
Zone3_Col_Num             36
Zone1_Left_Block_Bin      42
Zone1_Right_Block_Bin     40
Zone1_Area                41
Zone3_Area                32
Zone1_Dur                 37
Zone2_Dur                 42
Zone3_Dur                 47
Zone1_Out_Zone2_In_Dur    43
Zone1_Out_Zone3_In_Dur    42
Zone2_Out_Zone3_In_Dur    41
Zone1_In_Zone3_Out_Dur    34
Zone1_In_Zone2_Out_Dur    44
Zone2_In_Zone3_Out_Dur    39
Zone1_Temp_Avg            52
Zone1_Temp_Min            46
Zone1_Temp_Max            39
Zone1_Temp_Range          41
Zone2_Temp_Avg            51
Zone2_Temp_Min            53
Zone2_Temp_Max            37
Zone2_Temp_Range          37
Z

In [6]:
print("Percentage of missing data in each column:")
sum_miss_data = (Data.isnull().sum()) / len(Data)
print(sum_miss_data)

Percentage of missing data in each column:
X                         0.000000
Date                      0.000568
Zone1Position             0.000502
Zone2Position             0.000581
Zone3Position             0.000488
SKU                       0.000515
Zone1_Row_Num             0.000462
Zone1_Col_Num             0.000554
Zone2_Row_Num             0.000607
Zone2_Col_num             0.000449
Zone3_Row_Num             0.000488
Zone3_Col_Num             0.000475
Zone1_Left_Block_Bin      0.000554
Zone1_Right_Block_Bin     0.000528
Zone1_Area                0.000541
Zone3_Area                0.000422
Zone1_Dur                 0.000488
Zone2_Dur                 0.000554
Zone3_Dur                 0.000620
Zone1_Out_Zone2_In_Dur    0.000568
Zone1_Out_Zone3_In_Dur    0.000554
Zone2_Out_Zone3_In_Dur    0.000541
Zone1_In_Zone3_Out_Dur    0.000449
Zone1_In_Zone2_Out_Dur    0.000581
Zone2_In_Zone3_Out_Dur    0.000515
Zone1_Temp_Avg            0.000686
Zone1_Temp_Min            0.000607
Zone1_Temp_M

<b>Step 4</b> - Inputing the columns with missing data based on the data from the other dependent columns, such as:
- The four routes of the items (E.g. the position in a zone can be deduced from the value in the row and column)
- Range, Min, Max (Range can be calculated by substracting the minimum value from the maximum value, Minimum can be inputed by sbstracting the range from the maximum

In [8]:
Data["Zone1Position"] = Data["Zone1Position"].fillna(value=(Data["Zone1_Row_Num"] -1) * 4 + Data["Zone1_Col_Num"])
Data["Zone2Position"] = Data["Zone2Position"].fillna(value=(Data["Zone2_Row_Num"] -1) * 2 + Data["Zone2_Col_num"])
Data["Zone3Position"] = Data["Zone3Position"].fillna(value=(Data["Zone3_Row_Num"] -1) * 3 + Data["Zone3_Col_Num"])
    
Data["Zone1_Row_Num"] = Data["Zone1_Row_Num"].fillna(value=((Data["Zone1Position"]-Data["Zone1_Col_Num"])/4)+1)
Data["Zone2_Row_Num"] = Data["Zone2_Row_Num"].fillna(value=((Data["Zone2Position"]-Data["Zone2_Col_num"])/2)+1)
Data["Zone3_Row_Num"] = Data["Zone3_Row_Num"].fillna(value=((Data["Zone3Position"]-Data["Zone3_Col_Num"])/3)+1)
    
Data["Zone1_Col_Num"] = Data["Zone1_Col_Num"].fillna(value=Data["Zone1Position"]-((Data["Zone1_Row_Num"]-1)*4))
Data["Zone2_Col_num"] = Data["Zone2_Col_num"].fillna(value=Data["Zone2Position"]-((Data["Zone2_Row_Num"]-1)*2))
Data["Zone3_Col_Num"] = Data["Zone3_Col_Num"].fillna(value=Data["Zone3Position"]-((Data["Zone3_Row_Num"]-1)*3))
    
Data["Zone1_Temp_Range"] = Data["Zone1_Temp_Range"].fillna(value=Data["Zone1_Temp_Max"]-Data["Zone1_Temp_Min"])
Data["Zone2_Temp_Range"] = Data["Zone2_Temp_Range"].fillna(value=Data["Zone2_Temp_Max"]-Data["Zone2_Temp_Min"])
Data["Zone3_Temp_Range"] = Data["Zone3_Temp_Range"].fillna(value=Data["Zone3_Temp_Max"]-Data["Zone3_Temp_Min"])

Data["Zone1_Temp_Max"] = Data["Zone1_Temp_Max"].fillna(value=Data["Zone1_Temp_Range"]+Data["Zone1_Temp_Min"])
Data["Zone2_Temp_Max"] = Data["Zone2_Temp_Max"].fillna(value=Data["Zone2_Temp_Range"]+Data["Zone2_Temp_Min"])
Data["Zone3_Temp_Max"] = Data["Zone3_Temp_Max"].fillna(value=Data["Zone3_Temp_Range"]+Data["Zone3_Temp_Min"])

Data["Zone1_Temp_Min"] = Data["Zone1_Temp_Min"].fillna(value=Data["Zone1_Temp_Max"]-Data["Zone1_Temp_Range"])
Data["Zone2_Temp_Min"] = Data["Zone2_Temp_Min"].fillna(value=Data["Zone2_Temp_Max"]-Data["Zone2_Temp_Range"])
Data["Zone3_Temp_Min"] = Data["Zone3_Temp_Min"].fillna(value=Data["Zone3_Temp_Max"]-Data["Zone3_Temp_Range"])
    
Data["Zone1_Humidity_Range"] = Data["Zone1_Humidity_Range"].fillna(value=Data["Zone1_Humidity_Max"]-
                                                                   Data["Zone1_Humidity_Min"])
Data["Zone2_Humidity_Range"] = Data["Zone2_Humidity_Range"].fillna(value=Data["Zone2_Humidity_Max"]-
                                                                   Data["Zone2_Humidity_Min"])
Data["Zone3_Humidity_Range"] = Data["Zone3_Humidity_Range"].fillna(value=Data["Zone3_Humidity_Max"]-
                                                                   Data["Zone3_Humidity_Min"])

Data["Zone1_Humidity_Max"] = Data["Zone1_Humidity_Max"].fillna(value=Data["Zone1_Humidity_Range"]+
                                                               Data["Zone1_Humidity_Min"])
Data["Zone2_Humidity_Max"] = Data["Zone2_Humidity_Max"].fillna(value=Data["Zone2_Humidity_Range"]+
                                                               Data["Zone2_Humidity_Min"])
Data["Zone3_Humidity_Max"] = Data["Zone3_Humidity_Max"].fillna(value=Data["Zone3_Humidity_Range"]+
                                                               Data["Zone3_Humidity_Min"])

Data["Zone1_Humidity_Min"] = Data["Zone1_Humidity_Min"].fillna(value=Data["Zone1_Humidity_Max"]-
                                                               Data["Zone1_Humidity_Range"])
Data["Zone2_Humidity_Min"] = Data["Zone2_Humidity_Min"].fillna(value=Data["Zone2_Humidity_Max"]-
                                                               Data["Zone2_Humidity_Range"])
Data["Zone3_Humidity_Min"] = Data["Zone3_Humidity_Min"].fillna(value=Data["Zone3_Humidity_Max"]-
                                                               Data["Zone3_Humidity_Range"])

In [9]:
Data.isnull().sum()

X                          0
Date                      43
Zone1Position              0
Zone2Position              0
Zone3Position              0
SKU                       39
Zone1_Row_Num              0
Zone1_Col_Num              0
Zone2_Row_Num              0
Zone2_Col_num              0
Zone3_Row_Num              0
Zone3_Col_Num              0
Zone1_Left_Block_Bin      42
Zone1_Right_Block_Bin     40
Zone1_Area                41
Zone3_Area                32
Zone1_Dur                 37
Zone2_Dur                 42
Zone3_Dur                 47
Zone1_Out_Zone2_In_Dur    43
Zone1_Out_Zone3_In_Dur    42
Zone2_Out_Zone3_In_Dur    41
Zone1_In_Zone3_Out_Dur    34
Zone1_In_Zone2_Out_Dur    44
Zone2_In_Zone3_Out_Dur    39
Zone1_Temp_Avg            52
Zone1_Temp_Min             0
Zone1_Temp_Max             0
Zone1_Temp_Range           0
Zone2_Temp_Avg            51
Zone2_Temp_Min             0
Zone2_Temp_Max             0
Zone2_Temp_Range           0
Zone3_Temp_Avg            43
Zone3_Temp_Min

<b>Inputing block bin and areas for all zones</b>

In [10]:
# Zone1_Left_Block_Bin
L1 = ['1','2','5','6']
L2 = ['3','4','7','8']
m1 = Data['Zone1_Left_Block_Bin'].isnull()
m2 = Data['Zone1Position'].isin(L1)
m3 = Data['Zone1Position'].isin(L2)

Data['Zone1_Left_Block_Bin'] = Data['Zone1_Left_Block_Bin'].mask(m1 & m2, 1)
Data['Zone1_Left_Block_Bin'] = Data['Zone1_Left_Block_Bin'].mask(m1 & m3, 0)

# Zone1_Right_Block_Bin
L1 = ['1','2','5','6']
L2 = ['3','4','7','8']
m1 = Data['Zone1_Right_Block_Bin'].isnull()
m2 = Data['Zone1Position'].isin(L1)
m3 = Data['Zone1Position'].isin(L2)

Data['Zone1_Right_Block_Bin'] = Data['Zone1_Right_Block_Bin'].mask(m1 & m2, 0)
Data['Zone1_Right_Block_Bin'] = Data['Zone1_Right_Block_Bin'].mask(m1 & m3, 1)

# Zone1_Area
L1 = ['1','2']
L2 = ['3','4']
L3 = ['5','6']
L4 = ['7','8']
m1 = Data['Zone1_Area'].isnull()
m2 = Data['Zone1Position'].isin(L1)
m3 = Data['Zone1Position'].isin(L2)
m4 = Data['Zone1Position'].isin(L3)
m5 = Data['Zone1Position'].isin(L4)

Data['Zone1_Area'] = Data['Zone1_Area'].mask(m1 & m2, 'Top Left')
Data['Zone1_Area'] = Data['Zone1_Area'].mask(m1 & m3, 'Top Right')
Data['Zone1_Area'] = Data['Zone1_Area'].mask(m1 & m4, 'Bottom Left')
Data['Zone1_Area'] = Data['Zone1_Area'].mask(m1 & m5, 'Bottom Right')

# Zone3_Area
L1 = ['1','2','4']
L2 = ['3','5','6']
m1 = Data['Zone3_Area'].isnull()
m2 = Data['Zone3Position'].isin(L1)
m3 = Data['Zone3Position'].isin(L2)

Data['Zone3_Area'] = Data['Zone3_Area'].mask(m1 & m2, 'Top Left')
Data['Zone3_Area'] = Data['Zone3_Area'].mask(m1 & m3, 'Bottom Right')

Data.isnull().sum()

X                          0
Date                      43
Zone1Position              0
Zone2Position              0
Zone3Position              0
SKU                       39
Zone1_Row_Num              0
Zone1_Col_Num              0
Zone2_Row_Num              0
Zone2_Col_num              0
Zone3_Row_Num              0
Zone3_Col_Num              0
Zone1_Left_Block_Bin       0
Zone1_Right_Block_Bin      0
Zone1_Area                 0
Zone3_Area                 0
Zone1_Dur                 37
Zone2_Dur                 42
Zone3_Dur                 47
Zone1_Out_Zone2_In_Dur    43
Zone1_Out_Zone3_In_Dur    42
Zone2_Out_Zone3_In_Dur    41
Zone1_In_Zone3_Out_Dur    34
Zone1_In_Zone2_Out_Dur    44
Zone2_In_Zone3_Out_Dur    39
Zone1_Temp_Avg            52
Zone1_Temp_Min             0
Zone1_Temp_Max             0
Zone1_Temp_Range           0
Zone2_Temp_Avg            51
Zone2_Temp_Min             0
Zone2_Temp_Max             0
Zone2_Temp_Range           0
Zone3_Temp_Avg            43
Zone3_Temp_Min

<b>Inputing numeric fields</b>

In [11]:
cols = Data.columns[np.concatenate([range(15,25),[28,32,36,40,44]])].to_list()
Data[cols]=Data[cols].fillna(Data.filter(Data.columns[np.concatenate([range(15,25),[28,32,36,40,44]])]).mean().iloc[0])

Data.isnull().sum()

X                          0
Date                      43
Zone1Position              0
Zone2Position              0
Zone3Position              0
SKU                       39
Zone1_Row_Num              0
Zone1_Col_Num              0
Zone2_Row_Num              0
Zone2_Col_num              0
Zone3_Row_Num              0
Zone3_Col_Num              0
Zone1_Left_Block_Bin       0
Zone1_Right_Block_Bin      0
Zone1_Area                 0
Zone3_Area                 0
Zone1_Dur                  0
Zone2_Dur                  0
Zone3_Dur                  0
Zone1_Out_Zone2_In_Dur     0
Zone1_Out_Zone3_In_Dur     0
Zone2_Out_Zone3_In_Dur     0
Zone1_In_Zone3_Out_Dur     0
Zone1_In_Zone2_Out_Dur     0
Zone2_In_Zone3_Out_Dur     0
Zone1_Temp_Avg            52
Zone1_Temp_Min             0
Zone1_Temp_Max             0
Zone1_Temp_Range           0
Zone2_Temp_Avg            51
Zone2_Temp_Min             0
Zone2_Temp_Max             0
Zone2_Temp_Range           0
Zone3_Temp_Avg            43
Zone3_Temp_Min

<b>Inputing categorical variables</b>

In [12]:
imputer_str = SimpleImputer(strategy = 'most_frequent')

imputer_str.fit(Data[["Date", "SKU", "Block_Num", "Block_Position"]])
Data[["Date", "SKU", "Block_Num", "Block_Position"]] = imputer_str.transform(Data[["Date", "SKU", 
                                                                                   "Block_Num", "Block_Position"]])

Data.isnull().sum()

X                          0
Date                       0
Zone1Position              0
Zone2Position              0
Zone3Position              0
SKU                        0
Zone1_Row_Num              0
Zone1_Col_Num              0
Zone2_Row_Num              0
Zone2_Col_num              0
Zone3_Row_Num              0
Zone3_Col_Num              0
Zone1_Left_Block_Bin       0
Zone1_Right_Block_Bin      0
Zone1_Area                 0
Zone3_Area                 0
Zone1_Dur                  0
Zone2_Dur                  0
Zone3_Dur                  0
Zone1_Out_Zone2_In_Dur     0
Zone1_Out_Zone3_In_Dur     0
Zone2_Out_Zone3_In_Dur     0
Zone1_In_Zone3_Out_Dur     0
Zone1_In_Zone2_Out_Dur     0
Zone2_In_Zone3_Out_Dur     0
Zone1_Temp_Avg            52
Zone1_Temp_Min             0
Zone1_Temp_Max             0
Zone1_Temp_Range           0
Zone2_Temp_Avg            51
Zone2_Temp_Min             0
Zone2_Temp_Max             0
Zone2_Temp_Range           0
Zone3_Temp_Avg            43
Zone3_Temp_Min

<b>Droping redundant columns</b>

In [None]:
Data = Data.drop('Date', axis=1)

<b>Exporting the cleaned data</b>

In [None]:
Data.to_csv("Data_processed.csv", encoding='utf-8', index=False)

<b>Encoding categorical variables</b>

In [None]:
categorical_feature_mask = Data.dtypes==object
print(categorical_feature_mask)

categorical_cols = Data.columns[categorical_feature_mask].tolist()

le = LabelEncoder()

Data[categorical_cols] = Data[categorical_cols].apply(lambda col: le.fit_transform(col))
print(Data[categorical_cols].head(10))

Data[categorical_cols].describe()

<b>Selecting features using Boruta</b>

In [None]:
X = Data[Data.columns[~Data.columns.isin(["Result_Type"])]]
y = Data["Result_Type"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)
boruta_feature_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=4242, max_iter = 50, perc = 90)
boruta_feature_selector.fit(X_train.values, y_train.values.ravel())