# EDA and Random Forest Classifier for Machine Failure

## libraries

In [1]:
# 3rd party
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## exploration

In [2]:
# Read the machine-failure dataset; the first CSV column becomes the row index.
df = pd.read_csv(
    '../data/machine_failure.csv',
    index_col=0,
)

In [3]:
# Peek at the first two rows to check the parsed columns and the index.
df.head(2)

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0


In [4]:
# Count every observed combination of the five failure-mode flags,
# most frequent combination first.
df.value_counts(subset=['TWF', 'HDF', 'PWF', 'OSF', 'RNF'])

TWF  HDF  PWF  OSF  RNF
0    0    0    0    0      9652
     1    0    0    0       106
     0    1    0    0        80
          0    1    0        78
1    0    0    0    0        42
0    0    0    0    1        18
          1    1    0        11
     1    0    1    0         6
          1    0    0         3
1    0    0    1    0         2
               0    1         1
          1    1    0         1
dtype: int64

In [5]:
# Header row, then the number of rows with each failure-mode flag set to 1
# (each count left-aligned in a 3-character field so it lines up with the header).
print('twf', 'hdf', 'pwf', 'osf', 'rnf')
print(*(
    f"{int(df[flag].eq(1).sum()):<3}"
    for flag in ('TWF', 'HDF', 'PWF', 'OSF', 'RNF')
))


twf hdf pwf osf rnf
46  115 95  98  19 


In [6]:
# Row counts per value of the overall `Machine failure` flag.
df['Machine failure'].value_counts()

0    9661
1     339
Name: Machine failure, dtype: int64

In [7]:
# Rows where at least one of the five failure-mode flags equals 1.
a_failure = df[df[['TWF', 'HDF', 'PWF', 'OSF', 'RNF']].eq(1).any(axis=1)]

In [8]:
# Rows with a failure-mode flag set whose overall `Machine failure` label is still 0.
a_failure.loc[a_failure['Machine failure'].eq(0)]

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1222,M16081,M,297.0,308.3,1399,46.4,132,0,0,0,0,0,1
1303,L48482,L,298.6,309.8,1505,45.7,144,0,0,0,0,0,1
1749,H31162,H,298.4,307.7,1626,31.1,166,0,0,0,0,0,1
2073,L49252,L,299.6,309.5,1570,35.5,189,0,0,0,0,0,1
2560,L49739,L,299.3,309.0,1447,50.4,140,0,0,0,0,0,1
3066,M17925,M,300.1,309.2,1687,27.7,95,0,0,0,0,0,1
3453,H32866,H,301.6,310.5,1602,32.3,2,0,0,0,0,0,1
5472,L52651,L,302.7,312.3,1346,61.2,170,0,0,0,0,0,1
5490,L52669,L,302.6,312.1,1499,35.0,215,0,0,0,0,0,1
5496,H34909,H,302.9,312.5,1357,55.0,12,0,0,0,0,0,1


## preprocessing

In [9]:
# Subset used for the HDF model: keep every row with the HDF flag set, plus
# every row that has neither a machine failure nor the RNF flag.
# The HDF flag is compared explicitly (`== 1`) so both operands of `|` are
# boolean masks, rather than relying on pandas coercing the 0/1 integer
# column to bool inside the logical OR.
hdf_df = df[(df['HDF'] == 1) | ((df['Machine failure'] == 0) & (df['RNF'] == 0))]

In [10]:
# Sanity check: shape of the rows labelled as a machine failure but not as HDF
# (the first element of the tuple is the number of such rows).
hdf_df.loc[hdf_df['Machine failure'].eq(1) & hdf_df['HDF'].eq(0)].shape

(0, 13)

In [11]:
# Sanity check: shape of the rows flagged HDF but not labelled as a machine failure
# (the first element of the tuple is the number of such rows).
hdf_df.loc[hdf_df['Machine failure'].eq(0) & hdf_df['HDF'].eq(1)].shape

(0, 13)

In [12]:
# List the columns available in the filtered subset.
hdf_df.columns

Index(['Product ID', 'Type', 'Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]',
       'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'],
      dtype='object')

In [13]:
# Feature matrix: the product type plus the five numeric sensor/usage columns.
# `Type` is encoded as an integer code (L -> 0, M -> 1, H -> 2); any value
# outside that mapping would become NaN.
X = (
    hdf_df
    .loc[:, [
        'Type',
        'Air temperature [K]',
        'Process temperature [K]',
        'Rotational speed [rpm]',
        'Torque [Nm]',
        'Tool wear [min]',
    ]]
    .assign(Type=lambda frame: frame['Type'].map({'L': 0, 'M': 1, 'H': 2}))
)
X.head(2)

Unnamed: 0_level_0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
UDI,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,298.1,308.6,1551,42.8,0
2,0,298.2,308.7,1408,46.3,3


In [14]:
# Target vector: the HDF flag of each row in the filtered subset.
y = hdf_df.loc[:, 'HDF']
y.head(2)

UDI
1    0
2    0
Name: HDF, dtype: int64

In [15]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified. The HDF target is rare, so
# `stratify=y` may be worth considering -- confirm first, it changes the results.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## random forest

In [16]:
# Random forest with library-default hyperparameters; only the seed is fixed
# so repeated runs build the same trees.
model = RandomForestClassifier(random_state=0)
model.fit(X=X_train, y=y_train)

## testing

In [17]:
# Accuracy on the rows the forest was fitted on (`score` is accuracy for classifiers).
print(f'Training accuracy: {model.score(X_train, y_train)}')

Training accuracy: 1.0


In [18]:
# Accuracy on the held-out rows (`score` is accuracy for classifiers).
print(f'Test accuracy: {model.score(X_test, y_test)}')

Test accuracy: 0.9942622950819672


In [19]:
# Accuracy a constant "always predict 0" model would reach on the training
# labels, i.e. the fraction of training targets equal to 0.
print(f'Baseline model training data: {int((y_train == 0).sum()) / len(y_train)}')

Baseline model training data: 0.9892047007379066


In [20]:
# Accuracy a constant "always predict 0" model would reach on the test
# labels, i.e. the fraction of test targets equal to 0.
print(f'Baseline model test data: {int((y_test == 0).sum()) / len(y_test)}')

Baseline model test data: 0.9852459016393442


## resampling/SMOTE