# EDA of Random Forest Classifier on Machine Failure

## libraries

In [103]:
# 3rd party
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE, ADASYN

# constants
# Single seed passed as random_state to the train/test split, the random
# forests and the SMOTE/ADASYN resamplers further down.
RANDOM_SEED = 42

## exploration

In [3]:
# Load the raw dataset; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv('../data/machine_failure.csv', index_col=0)

In [4]:
# Preview the first two rows of the raw frame.
df.head(2)

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0


In [5]:
# Count how often each combination of the five failure-mode flags occurs.
df.value_counts(subset=['TWF', 'HDF', 'PWF', 'OSF', 'RNF'])

TWF  HDF  PWF  OSF  RNF
0    0    0    0    0      9652
     1    0    0    0       106
     0    1    0    0        80
          0    1    0        78
1    0    0    0    0        42
0    0    0    0    1        18
          1    1    0        11
     1    0    1    0         6
          1    0    0         3
1    0    0    1    0         2
               0    1         1
          1    1    0         1
dtype: int64

In [8]:
# Print the number of rows flagged for each individual failure mode.
print('---Failures by Type---')
print('twf hdf pwf osf rnf')
print(' '.join(f"{int(df[col].eq(1).sum()):<3}" for col in ('TWF', 'HDF', 'PWF', 'OSF', 'RNF')))


---Failures by Type---
twf hdf pwf osf rnf
46  115 95  98  19 


In [9]:
# Class balance of the overall 'Machine failure' flag.
df['Machine failure'].value_counts()

0    9661
1     339
Name: Machine failure, dtype: int64

In [10]:
# Keep every row in which at least one of the five failure-mode flags equals 1.
a_failure = df[df[['TWF', 'HDF', 'PWF', 'OSF', 'RNF']].eq(1).any(axis=1)]

In [11]:
# Rows with a failure-mode flag set even though 'Machine failure' is 0.
a_failure.loc[a_failure['Machine failure'].eq(0)]

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1222,M16081,M,297.0,308.3,1399,46.4,132,0,0,0,0,0,1
1303,L48482,L,298.6,309.8,1505,45.7,144,0,0,0,0,0,1
1749,H31162,H,298.4,307.7,1626,31.1,166,0,0,0,0,0,1
2073,L49252,L,299.6,309.5,1570,35.5,189,0,0,0,0,0,1
2560,L49739,L,299.3,309.0,1447,50.4,140,0,0,0,0,0,1
3066,M17925,M,300.1,309.2,1687,27.7,95,0,0,0,0,0,1
3453,H32866,H,301.6,310.5,1602,32.3,2,0,0,0,0,0,1
5472,L52651,L,302.7,312.3,1346,61.2,170,0,0,0,0,0,1
5490,L52669,L,302.6,312.1,1499,35.0,215,0,0,0,0,0,1
5496,H34909,H,302.9,312.5,1357,55.0,12,0,0,0,0,0,1


### Suggested Filtering Method

In [94]:
# Binary target: 1 where HDF is set, 0 for every other row
# (other failure modes and non-failures alike).
df['target'] = df['HDF'].eq(1).astype(int)

In [96]:
# Inspect the first OSF-positive rows, including the new 'target' column.
df.loc[df['OSF'].eq(1)].head()

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF,target
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
70,L47249,L,298.9,309.0,1410,65.7,191,1,0,0,1,1,0,0
161,L47340,L,298.4,308.2,1282,60.7,216,1,0,0,0,1,0,0
162,L47341,L,298.3,308.1,1412,52.3,218,1,0,0,0,1,0,0
243,L47422,L,298.0,308.2,1348,58.8,202,1,0,0,0,1,0,0
249,L47428,L,298.0,308.3,1362,56.8,216,1,0,0,0,1,0,0


## preprocessing

In [97]:
# filtered data
# Keep HDF-positive rows plus rows that have neither a machine failure nor a
# random failure (RNF); all other rows are dropped.
# HDF is compared to 1 explicitly so the mask is boolean by construction,
# instead of relying on pandas coercing `int | bool` to bool.
hdf_df = df[(df['HDF'] == 1) | ((df['Machine failure'] == 0) & (df['RNF'] == 0))]

In [98]:
# Absolute class counts of the HDF label after filtering.
hdf_df['HDF'].value_counts()

0    9643
1     115
Name: HDF, dtype: int64

In [99]:
# Class proportions of the HDF label after filtering.
hdf_df['HDF'].value_counts(normalize=True)

0    0.988215
1    0.011785
Name: HDF, dtype: float64

In [100]:
# Count rows of hdf_df where the 'Machine failure' and 'HDF' flags disagree.
print('Machine failure with no HDF:', len(hdf_df[hdf_df['Machine failure'].eq(1) & hdf_df['HDF'].eq(0)]))
print('No machine failure with HDF:', len(hdf_df[hdf_df['Machine failure'].eq(0) & hdf_df['HDF'].eq(1)]))

Machine failure with no HDF: 0
No machine failure with HDF: 0


In [101]:
# List the available columns before selecting features.
hdf_df.columns

Index(['Product ID', 'Type', 'Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]',
       'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF', 'target'],
      dtype='object')

In [102]:
# Feature matrix: 'Type' plus five numeric columns, with 'Type' encoded
# ordinally (L=0, M=1, H=2). assign() returns a new frame, so hdf_df is untouched.
X = hdf_df[
    [
        'Type',
        'Air temperature [K]',
        'Process temperature [K]',
        'Rotational speed [rpm]',
        'Torque [Nm]',
        'Tool wear [min]',
    ]
].assign(Type=lambda features: features['Type'].map({'L': 0, 'M': 1, 'H': 2}))
X.head(2)

Unnamed: 0_level_0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,298.1,308.6,1551,42.8,0
2,0,298.2,308.7,1408,46.3,3


In [104]:
# Frequency of each distinct tool-wear value in the feature matrix.
X['Tool wear [min]'].value_counts()

0      117
2       67
5       62
7       58
59      58
      ... 
236      1
237      1
239      1
241      1
246      1
Name: Tool wear [min], Length: 243, dtype: int64

In [8]:
# Label vector: the HDF flag of the filtered frame (row-aligned with X).
y = hdf_df['HDF']

In [19]:
# Hold out 25% of the rows; stratify on y so both splits keep the same
# HDF class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_SEED,
)

In [20]:
# Class proportions in the training split.
y_train.value_counts(normalize=True)

0    0.988248
1    0.011752
Name: HDF, dtype: float64

In [21]:
# Class proportions in the test split.
y_test.value_counts(normalize=True)

0    0.988115
1    0.011885
Name: HDF, dtype: float64

## random forest

In [22]:
# Baseline random forest with library-default hyperparameters, fitted on the
# original (non-resampled) training split.
model = RandomForestClassifier(random_state=RANDOM_SEED)
model.fit(X_train, y_train)

## testing accuracy

In [80]:
def calculate_and_display_accuracy(model, X_train, y_train, X_test, y_test, verbose: bool=True, rounding: int=4) -> tuple:
    """Compare a fitted classifier's accuracy with an always-predict-0 baseline.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_train: Training features passed to ``model.predict``.
        y_train: True training labels.
        X_test: Held-out features passed to ``model.predict``.
        y_test: True held-out labels.
        verbose: When True, print the four accuracies.
        rounding: Decimal places used for the printed values only; the
            returned values are not rounded.

    Returns:
        tuple: ``(model_train_accuracy, model_test_accuracy,
        baseline_train_accuracy, baseline_test_accuracy)``.
    """
    model_train_accuracy = accuracy_score(y_train, model.predict(X_train))
    model_test_accuracy = accuracy_score(y_test, model.predict(X_test))
    # The accuracy of always guessing 0 is simply the share of 0 labels.
    # Computing it directly (instead of value_counts(normalize=True)[0]) yields
    # 0.0 rather than a KeyError when a split contains no 0 labels, and also
    # accepts plain arrays/lists, not only pandas Series.
    baseline_train_accuracy = np.mean(np.asarray(y_train) == 0)
    baseline_test_accuracy = np.mean(np.asarray(y_test) == 0)
    if verbose:
        print(f'{"Model Training accuracy:":<58} {round(model_train_accuracy, rounding)}')
        print(f'{"Model Test accuracy:":<58} {round(model_test_accuracy, rounding)}')
        print(f'{"Baseline, Naive model (always guess 0) training accuracy:":<58} {round(baseline_train_accuracy, rounding)}')
        print(f'{"Baseline, Naive model (always guess 0) test accuracy:":<58} {round(baseline_test_accuracy, rounding)}')
    return model_train_accuracy, model_test_accuracy, baseline_train_accuracy, baseline_test_accuracy

In [81]:
# Score the baseline forest on the training and test splits.
calculate_and_display_accuracy(model, X_train, y_train, X_test, y_test)

Model Training accuracy:                                   1.0
Model Test accuracy:                                       0.9951
Baseline, Naive model (always guess 0) training accuracy:  0.9882
Baseline, Naive model (always guess 0) test accuracy:      0.9881


(1.0, 0.9950819672131147, 0.9882481552336704, 0.9881147540983607)

## SMOTE and ADASYN (maybe Tomek links?)

### SMOTE

In [88]:
# Resample only the training split with SMOTE; the test split is left as is.
smote_resampler = SMOTE(random_state=RANDOM_SEED)
X_train_smote, y_train_smote = smote_resampler.fit_resample(X_train, y_train)

In [89]:
# Same default random forest, this time fitted on the SMOTE-resampled training data.
smote_rf = RandomForestClassifier(random_state=RANDOM_SEED)
smote_rf.fit(X_train_smote, y_train_smote)

In [90]:
# Score the SMOTE-trained forest on the original (non-resampled) training and test splits.
calculate_and_display_accuracy(smote_rf, X_train, y_train, X_test, y_test)

Model Training accuracy:                                   1.0
Model Test accuracy:                                       0.9943
Baseline, Naive model (always guess 0) training accuracy:  0.9882
Baseline, Naive model (always guess 0) test accuracy:      0.9881


(1.0, 0.9942622950819672, 0.9882481552336704, 0.9881147540983607)

### ADASYN

In [91]:
# Resample only the training split with ADASYN; the test split is left as is.
adasyn_resampler = ADASYN(random_state=RANDOM_SEED)
X_train_adasyn, y_train_adasyn = adasyn_resampler.fit_resample(X_train, y_train)

In [92]:
# Same default random forest, this time fitted on the ADASYN-resampled training data.
adasyn_rf = RandomForestClassifier(random_state=RANDOM_SEED)
adasyn_rf.fit(X_train_adasyn, y_train_adasyn)

In [93]:
# Score the ADASYN-trained forest on the original (non-resampled) training and test splits.
calculate_and_display_accuracy(adasyn_rf, X_train, y_train, X_test, y_test)

Model Training accuracy:                                   1.0
Model Test accuracy:                                       0.9955
Baseline, Naive model (always guess 0) training accuracy:  0.9882
Baseline, Naive model (always guess 0) test accuracy:      0.9881


(1.0, 0.9954918032786885, 0.9882481552336704, 0.9881147540983607)