# Binary Tabular Classification - Machine Failure

## 1. Setup

In [50]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Fixed seed for reproducibility. np.random.seed() returns None, so the integer
# is stored in `seed` first and NumPy's global RNG is seeded as a separate step;
# `seed` can then be passed on wherever an explicit random_state is required.
seed = 6
np.random.seed(seed)

In [51]:
# Directory holding the competition CSV files. Set the
# MACHINE_FAILURE_DATA_ROOT environment variable to point at another location;
# when it is unset the original local path is used.
data_root = os.environ.get(
    'MACHINE_FAILURE_DATA_ROOT', r'C:\data\playground-series-s3e17'
)
train_path = os.path.join(data_root, 'train.csv')
test_path = os.path.join(data_root, 'test.csv')

## 2. EDA - investigate

What columns do we have, and how are their values distributed in our dataset?

In [52]:
# Read both CSV splits with identical options; the "id" column becomes the
# row index of each frame.
csv_options = {"index_col": "id"}
train_df = pd.read_csv(train_path, **csv_options)
test_df = pd.read_csv(test_path, **csv_options)

In [53]:
# Preview the first rows of the training frame (head() defaults to 5 rows).
train_df.head()

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [54]:
# Count missing values per column of the training frame.
train_df.isnull().sum()

Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Machine failure            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64

In [55]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
train_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 136429 entries, 0 to 136428
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Product ID               136429 non-null  object 
 1   Type                     136429 non-null  object 
 2   Air temperature [K]      136429 non-null  float64
 3   Process temperature [K]  136429 non-null  float64
 4   Rotational speed [rpm]   136429 non-null  int64  
 5   Torque [Nm]              136429 non-null  float64
 6   Tool wear [min]          136429 non-null  int64  
 7   Machine failure          136429 non-null  int64  
 8   TWF                      136429 non-null  int64  
 9   HDF                      136429 non-null  int64  
 10  PWF                      136429 non-null  int64  
 11  OSF                      136429 non-null  int64  
 12  RNF                      136429 non-null  int64  
dtypes: float64(3), int64(8), object(2)
memory usage: 14.6+ MB


In [56]:
# Remove the 'Product ID' column from both frames.
# drop() returns a new frame and errors='ignore' skips the column when it is
# already gone, so this cell can be re-run without raising a KeyError.
# (The former leading `len(...unique())` expression was removed: its value was
# never displayed and it failed once the column had been dropped.)
train_df = train_df.drop(columns=['Product ID'], errors='ignore')
test_df = test_df.drop(columns=['Product ID'], errors='ignore')

In [57]:
# Replace the 'Type' column with integer codes. The encoder is fitted on the
# training frame and the same fitted encoder is then applied to the test frame.
encoder = LabelEncoder()
train_df['Type'] = encoder.fit_transform(train_df['Type'])
test_df['Type'] = encoder.transform(test_df['Type'])

# NOTE(review): the result of this expression is discarded; only the cell's
# final expression below is rendered.
len(train_df['Type'].unique())
train_df.head()

Unnamed: 0_level_0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,2,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,1,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,1,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,2,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [58]:
# Map the raw CSV headers to short snake_case names. Renaming by label, rather
# than assigning a positional list to `.columns`, cannot attach a name to the
# wrong column if the column order differs. One mapping serves both frames:
# labels that a frame does not contain are simply skipped, and columns not
# listed here ('Type', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF') keep their names.
column_renames = {
    'Air temperature [K]': 'Air_temp',
    'Process temperature [K]': 'Process_temp',
    'Rotational speed [rpm]': 'Rotational_speed',
    'Torque [Nm]': 'Torque',
    'Tool wear [min]': 'Tool_wear',
    'Machine failure': 'Machine_failure',
}
train_df = train_df.rename(columns=column_renames)
test_df = test_df.rename(columns=column_renames)
test_df.columns

Index(['Type', 'Air_temp', 'Process_temp', 'Rotational_speed', 'Torque',
       'Tool_wear', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'],
      dtype='object')

In [59]:
target_column = 'Machine_failure'

# Every training column except the target is used as a model feature.
features = list(train_df.columns)
features.remove(target_column)

# Full training set: feature matrix X and target vector y.
X, y = train_df[features], train_df[target_column]

NameError: name 'train_data' is not defined