In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

from sklearn.metrics import f1_score

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Load Data

In [2]:
DATA_PATH = '../data/processed'

x_column_names = {
    'Air temperature': 'air_temperature', 
    'Process temperature': 'process_temperature',
    'Rotational speed' : 'rotational_speed',
    'Torque' : 'torque',
    'Tool wear' : 'tool_wear'
}

y_column_names = {
    'Machine failure' : 'machine_failure',
    'TWF' : 'twf',
    'HDF' : 'hdf',
    'PWF' : 'pwf',
    'OSF' : 'osf',
    'RNF' : 'rnf'
}

## Load Training Data

In [3]:
x_train_scaled = pd.read_csv(f'{DATA_PATH}/X_train_scaled.csv')
x_train_scaled = x_train_scaled.rename(columns = x_column_names)

y_train = pd.read_csv(f'{DATA_PATH}/Y_train.csv')
y_train = y_train.rename(columns = y_column_names)

In [4]:
display(x_train_scaled)
display(y_train)

Unnamed: 0,quality_variation,air_temperature,process_temperature,rotational_speed,torque,tool_wear
0,0,-0.998835,-1.149025,-0.359382,0.256796,-0.016174
1,0,0.247930,0.333286,-0.912986,0.738238,-0.817033
2,1,0.497283,0.468042,-0.248661,0.808448,0.580544
3,0,0.397542,0.468042,-0.425815,0.276856,1.020231
4,1,2.043272,2.152486,0.266191,0.106346,0.329294
...,...,...,...,...,...,...
7494,1,-1.347929,-1.283781,-0.935130,1.199619,1.664059
7495,1,1.095730,0.804930,0.127790,-0.515516,-0.031877
7496,0,-1.597283,-1.418536,-0.270805,-0.665966,0.235076
7497,0,-0.849223,-0.610003,0.432272,-0.896657,1.365700


Unnamed: 0,machine_failure,twf,hdf,pwf,osf,rnf
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
7494,0,0,0,0,0,0
7495,0,0,0,0,0,0
7496,0,0,0,0,0,0
7497,0,0,0,0,0,0


In [5]:
x_train_scaled.to_numpy()

array([[ 0.        , -0.99883525, -1.1490252 , -0.35938202,  0.25679633,
        -0.01617427],
       [ 0.        ,  0.24792986,  0.33328595, -0.91298614,  0.7382378 ,
        -0.81703327],
       [ 1.        ,  0.49728288,  0.46804151, -0.24866119,  0.80844801,
         0.58054419],
       ...,
       [ 0.        , -1.5972825 , -1.41853631, -0.27080536, -0.66596647,
         0.23507561],
       [ 0.        , -0.84922344, -0.61000296,  0.43227187, -0.89665717,
         1.36570007],
       [ 0.        ,  0.64689469,  0.26590817, -0.54207138,  0.146466  ,
         1.39710631]])

## Load Validation Data

In [6]:
x_val_scaled = pd.read_csv(f'{DATA_PATH}/X_val_scaled.csv')
x_val_scaled = x_val_scaled.rename(columns = x_column_names)

y_val = pd.read_csv(f'{DATA_PATH}/Y_val.csv')
y_val = y_val.rename(columns = y_column_names)

In [7]:
display(x_val_scaled)
display(y_val)

Unnamed: 0,quality_variation,air_temperature,process_temperature,rotational_speed,torque,tool_wear
0,0,-1.298059,-1.149025,-1.056923,0.577757,-1.021174
1,1,0.397542,0.804930,0.531921,-0.816417,1.098747
2,2,0.247930,-0.475247,-0.370454,0.517577,1.805387
3,1,0.347671,1.141819,-0.215445,-0.114315,-1.649298
4,0,0.198059,0.939686,2.203805,-1.618819,0.360701
...,...,...,...,...,...,...
1496,1,0.297800,-0.205736,-0.608504,0.256796,-1.335236
1497,1,-0.350517,0.468042,-0.542071,0.547667,0.596247
1498,0,-1.347929,-1.283781,0.166542,-0.565666,1.349997
1499,1,0.946118,0.670175,-0.586360,0.146466,-0.895549


Unnamed: 0,machine_failure,twf,hdf,pwf,osf,rnf
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
1496,0,0,0,0,0,0
1497,0,0,0,0,0,0
1498,0,0,0,0,0,0
1499,0,0,0,0,0,0


## Load Test Data

In [8]:
x_test_scaled = pd.read_csv(f'{DATA_PATH}/X_test_scaled.csv')
x_test_scaled = x_test_scaled.rename(columns = x_column_names)

y_test = pd.read_csv(f'{DATA_PATH}/Y_test.csv')
y_test = y_test.rename(columns = y_column_names)

In [9]:
display(x_test_scaled)
display(y_test)

Unnamed: 0,quality_variation,air_temperature,process_temperature,rotational_speed,torque,tool_wear
0,1,1.195471,0.265908,0.266191,-0.575696,0.643357
1,0,1.694177,1.343953,-0.093652,-0.415216,1.679762
2,0,0.347671,0.804930,1.882715,-1.578699,-1.319533
3,0,1.145601,0.670175,0.293871,-0.736177,0.988825
4,0,-0.649741,-0.070981,0.304943,0.246766,0.596247
...,...,...,...,...,...,...
995,0,-0.649741,-0.340492,0.747826,-0.926747,0.517732
996,0,-0.400388,-1.014270,1.821818,-1.307888,0.470622
997,0,-0.948965,-1.418536,-0.198837,0.176556,-0.455862
998,1,0.198059,-0.475247,-0.392598,0.026106,0.910310


Unnamed: 0,machine_failure,twf,hdf,pwf,osf,rnf
0,0,0,0,0,0,0
1,1,1,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
995,0,0,0,0,0,0
996,0,0,0,0,0,0
997,0,0,0,0,0,0
998,0,0,0,0,0,0


## EDA

In [10]:
y_train[y_train[['twf', 'hdf', 'pwf', 'osf', 'rnf']].sum(axis=1) > 1]

Unnamed: 0,machine_failure,twf,hdf,pwf,osf,rnf
49,1,0,1,0,1,0
902,1,0,0,1,1,0
1232,1,0,0,1,1,0
1559,1,1,0,0,0,1
1739,1,0,1,1,0,0
1886,1,0,0,1,1,0
1901,1,0,1,0,1,0
1923,1,0,0,1,1,0
2230,1,1,0,0,1,0
2537,1,0,1,0,1,0


In [11]:
y_train[ (y_train[['twf', 'hdf', 'pwf', 'osf', 'rnf']].sum(axis=1) == 0) & 
         (y_train[['machine_failure']].sum(axis=1) == 1)]

Unnamed: 0,machine_failure,twf,hdf,pwf,osf,rnf


In [12]:
y_train[ (y_train[['twf', 'hdf', 'pwf', 'osf', 'rnf']].sum(axis=1) > 0) & 
         (y_train[['machine_failure']].sum(axis=1) == 0)]

Unnamed: 0,machine_failure,twf,hdf,pwf,osf,rnf


In [13]:
y_val[ (y_val[['twf', 'hdf', 'pwf', 'osf', 'rnf']].sum(axis=1) == 0) & 
       (y_val[['machine_failure']].sum(axis=1) == 1)]

Unnamed: 0,machine_failure,twf,hdf,pwf,osf,rnf


In [14]:
y_val[ (y_val[['twf', 'hdf', 'pwf', 'osf', 'rnf']].sum(axis=1) > 0) & 
         (y_val[['machine_failure']].sum(axis=1) == 0)]

Unnamed: 0,machine_failure,twf,hdf,pwf,osf,rnf


In [15]:
y_test[ (y_test[['twf', 'hdf', 'pwf', 'osf', 'rnf']].sum(axis=1) > 0) & 
         (y_test[['machine_failure']].sum(axis=1) == 0)]

Unnamed: 0,machine_failure,twf,hdf,pwf,osf,rnf


In [16]:
y_test[ (y_test[['twf', 'hdf', 'pwf', 'osf', 'rnf']].sum(axis=1) > 0) & 
         (y_test[['machine_failure']].sum(axis=1) == 0)]

Unnamed: 0,machine_failure,twf,hdf,pwf,osf,rnf


# Transform Class Labels for Binary and Multilabel Classification

## Binary Class Labels

In [17]:
y_train_binary = y_train['machine_failure'].to_numpy()
print(y_train_binary[:10])
print(y_train_binary.shape)

[0 0 0 0 0 0 0 0 0 0]
(7499,)


In [18]:
y_val_binary = y_val['machine_failure'].to_numpy()
print(y_val_binary[:10])
print(y_val_binary.shape)

[0 0 0 0 0 0 0 0 0 0]
(1501,)


In [19]:
y_test_binary = y_test['machine_failure'].to_numpy()
print(y_test_binary[:10])
print(y_test_binary.shape)

[0 1 0 0 0 0 0 0 0 0]
(1000,)


## Multilabel Class Labels

In [20]:
y_multilabel_cols = ['twf', 'hdf', 'pwf', 'osf', 'rnf']

In [21]:
y_train_multi = y_train[y_multilabel_cols].to_numpy()
print(y_train_multi[:10])
print(y_train_multi.shape)

[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]
(7499, 5)


In [22]:
y_val_multi = y_val[y_multilabel_cols].to_numpy()
print(y_val_multi[:10])
print(y_val_multi.shape)

[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]
(1501, 5)


In [23]:
y_test_multi = y_test[y_multilabel_cols].to_numpy()
print(y_test_multi[:10])
print(y_test_multi.shape)

[[0 0 0 0 0]
 [1 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]]
(1000, 5)


# Decision Tree

In [24]:
features = x_train_scaled.columns.to_list()

In [25]:
dt = DecisionTreeClassifier(criterion='entropy')

In [26]:
test_dt_binary = dt.fit(x_train_scaled, y_train_binary)

print('Accuracy on training data:', dt.score(x_train_scaled, y_train_binary))
print('Accuracy on validation data:', dt.score(x_val_scaled, y_val_binary))

print('Accuracy on test data:', dt.score(x_test_scaled, y_test_binary))

y_pred = test_dt_binary.predict(x_test_scaled)
df_binary_f1 = f1_score(y_test_binary, y_pred)
print(f'f1 score: {df_binary_f1}')

Accuracy on training data: 1.0
Accuracy on validation data: 0.9740173217854764
Accuracy on test data: 0.976
f1 score: 0.7209302325581395


In [27]:
multi_output_classifier = MultiOutputClassifier(estimator=dt, n_jobs=-1).fit(x_train_scaled, y_train_multi)

print('Accuracy on training data:', multi_output_classifier.score(x_train_scaled, y_train_multi))
print('Accuracy on validation data:', multi_output_classifier.score(x_val_scaled, y_val_multi))

print('Accuracy on test data:', multi_output_classifier.score(x_test_scaled, y_test_multi))

y_pred = multi_output_classifier.predict(x_test_scaled)
df_multi_f1 = f1_score(y_test_multi, y_pred, average='weighted')
print(f'f1 score: {df_multi_f1}')

Accuracy on training data: 1.0
Accuracy on validation data: 0.9746835443037974
Accuracy on test data: 0.982
f1 score: 0.7352229780801209


# Random Forest

In [28]:
forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=40, 
                                random_state=1,
                                n_jobs=2)

In [29]:
binary_forest = forest.fit(x_train_scaled, y_train_binary)

print('Accuracy on training data:', binary_forest.score(x_train_scaled, y_train_binary))
print('Accuracy on validation data:', binary_forest.score(x_val_scaled, y_val_binary))
print('Accuracy on test data:', binary_forest.score(x_test_scaled, y_test_binary))

print('Features:', features)
print ('Feature importances:', binary_forest.feature_importances_)

y_pred = binary_forest.predict(x_test_scaled)
forest_binary_f1 = f1_score(y_test_binary, y_pred)
print(f'f1 score: {forest_binary_f1}')

Accuracy on training data: 0.9994665955460728
Accuracy on validation data: 0.977348434377082
Accuracy on test data: 0.981
Features: ['quality_variation', 'air_temperature', 'process_temperature', 'rotational_speed', 'torque', 'tool_wear']
Feature importances: [0.0197098  0.13441235 0.11073296 0.24862811 0.30052366 0.18599312]
f1 score: 0.7246376811594203


In [30]:
multi_output_forest = MultiOutputClassifier(forest, n_jobs=2).fit(x_train_scaled, y_train_multi)

print('Accuracy on training data:', multi_output_forest.score(x_train_scaled, y_train_multi))
print('Accuracy on validation data:', multi_output_forest.score(x_val_scaled, y_val_multi))

print('Accuracy on test data:', multi_output_forest.score(x_test_scaled, y_test_multi))

feat_impts = [] 
for clf in multi_output_forest.estimators_:
    feat_impts.append(clf.feature_importances_)

print('Features:', features)
print ('Feature importances:', np.mean(feat_impts, axis=0))

y_pred = multi_output_forest.predict(x_test_scaled)
forest_multi_f1 = f1_score(y_test_multi, y_pred, average='micro')
print(f'f1 score: {forest_multi_f1}')

Accuracy on training data: 0.9994665955460728
Accuracy on validation data: 0.9806795469686875
Accuracy on test data: 0.986
Features: ['quality_variation', 'air_temperature', 'process_temperature', 'rotational_speed', 'torque', 'tool_wear']
Feature importances: [0.02306586 0.13445062 0.10317048 0.21338686 0.2908579  0.23506828]
f1 score: 0.8055555555555556


## Tuned Forest

In [None]:
tuned_forest = RandomForestClassifier(criterion='entropy',
                                n_estimators=100, 
                                max_features="sqrt",
                                max_depth=10,
                                random_state=1,
                                n_jobs=2)

In [37]:
tuned_binary_forest = forest.fit(x_train_scaled, y_train_binary)

print('Accuracy on training data:', tuned_binary_forest.score(x_train_scaled, y_train_binary))
print('Accuracy on validation data:', tuned_binary_forest.score(x_val_scaled, y_val_binary))
print('Accuracy on test data:', tuned_binary_forest.score(x_test_scaled, y_test_binary))

print('Features:', features)
print ('Feature importances:', tuned_binary_forest.feature_importances_)

y_pred = tuned_binary_forest.predict(x_test_scaled)
tuned_forest_binary_f1 = f1_score(y_test_binary, y_pred)
print(f'f1 score: {tuned_forest_binary_f1}')

Accuracy on training data: 0.9994665955460728
Accuracy on validation data: 0.977348434377082
Accuracy on test data: 0.981
Features: ['quality_variation', 'air_temperature', 'process_temperature', 'rotational_speed', 'torque', 'tool_wear']
Feature importances: [0.0197098  0.13441235 0.11073296 0.24862811 0.30052366 0.18599312]
f1 score: 0.7246376811594203


In [38]:
tuned_multi_output_forest = MultiOutputClassifier(forest, n_jobs=2).fit(x_train_scaled, y_train_multi)

print('Accuracy on training data:', tuned_multi_output_forest.score(x_train_scaled, y_train_multi))
print('Accuracy on validation data:', tuned_multi_output_forest.score(x_val_scaled, y_val_multi))

print('Accuracy on test data:', tuned_multi_output_forest.score(x_test_scaled, y_test_multi))

feat_impts = [] 
for clf in tuned_multi_output_forest.estimators_:
    feat_impts.append(clf.feature_importances_)

print('Features:', features)
print ('Feature importances:', np.mean(feat_impts, axis=0))

y_pred = tuned_multi_output_forest.predict(x_test_scaled)
tuned_forest_multi_f1 = f1_score(y_test_multi, y_pred, average='micro')
print(f'f1 score: {tuned_forest_multi_f1}')

Accuracy on training data: 0.9994665955460728
Accuracy on validation data: 0.9806795469686875
Accuracy on test data: 0.986
Features: ['quality_variation', 'air_temperature', 'process_temperature', 'rotational_speed', 'torque', 'tool_wear']
Feature importances: [0.02306586 0.13445062 0.10317048 0.21338686 0.2908579  0.23506828]
f1 score: 0.8055555555555556


In [39]:
print(f'f1 score: {f1_score(y_test_multi, y_pred, average=None)}')

f1 score: [0.         1.         0.85714286 0.76923077 0.        ]


In [40]:
tuned_multi_output_forest.classes_

[array([0, 1]), array([0, 1]), array([0, 1]), array([0, 1]), array([0, 1])]