# Binary Tabular Classification - Machine Failure

## 1. Setup

In [58]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

seed = np.random.seed(6)

In [59]:
data_root = r'C:\data\playground-series-s3e17'
train_path = os.path.join(data_root, 'train.csv')
test_path = os.path.join(data_root, 'test.csv')

## 2. EDA - investigate

what headers do we have and what is the distribution in our dataset

In [60]:
train_df = pd.read_csv(train_path, index_col="id")
test_df = pd.read_csv(test_path, index_col="id")


In [61]:
test_df.head(5)

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
136429,L50896,L,302.3,311.5,1499,38.0,60,0,0,0,0,0
136430,L53866,L,301.7,311.0,1713,28.8,17,0,0,0,0,0
136431,L50498,L,301.3,310.4,1525,37.7,96,0,0,0,0,0
136432,M21232,M,300.1,309.6,1479,47.6,5,0,0,0,0,0
136433,M19751,M,303.4,312.3,1515,41.3,114,0,0,0,0,0


In [62]:
train_df.head(5)

Unnamed: 0_level_0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


do we have any nans in our dataset? We need to clear them before we head into further investigation

In [63]:
train_df.isna().sum()

Product ID                 0
Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Machine failure            0
TWF                        0
HDF                        0
PWF                        0
OSF                        0
RNF                        0
dtype: int64

In [64]:
train_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 136429 entries, 0 to 136428
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   Product ID               136429 non-null  object 
 1   Type                     136429 non-null  object 
 2   Air temperature [K]      136429 non-null  float64
 3   Process temperature [K]  136429 non-null  float64
 4   Rotational speed [rpm]   136429 non-null  int64  
 5   Torque [Nm]              136429 non-null  float64
 6   Tool wear [min]          136429 non-null  int64  
 7   Machine failure          136429 non-null  int64  
 8   TWF                      136429 non-null  int64  
 9   HDF                      136429 non-null  int64  
 10  PWF                      136429 non-null  int64  
 11  OSF                      136429 non-null  int64  
 12  RNF                      136429 non-null  int64  
dtypes: float64(3), int64(8), object(2)
memory usage: 14.6+ MB


#### 2.1 categorical columns

In [65]:
len(train_df["Product ID"].unique())
train_df.drop(['Product ID'], inplace=True, axis=1)
test_df.drop(['Product ID'], inplace=True, axis=1)

encode the categorical values into numbers to use them for predictions

In [66]:
encoder = LabelEncoder()
a = train_df['Type']
train_df['Type']=encoder.fit_transform(a)

a = test_df['Type']
test_df['Type']=encoder.transform(a)

len(train_df['Type'].unique())
train_df.head()

Unnamed: 0_level_0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,2,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,1,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,1,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,2,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


#### 2.2 create features from existing ones

In [67]:
train_df.columns

Index(['Type', 'Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]',
       'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'],
      dtype='object')

In [68]:
test_df.columns

Index(['Type', 'Air temperature [K]', 'Process temperature [K]',
       'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'TWF',
       'HDF', 'PWF', 'OSF', 'RNF'],
      dtype='object')

In [69]:
def enhance_features(df):
    df["Power"] = df["Torque [Nm]"] * df["Rotational speed [rpm]"]
    df["temp_ratio"] = df["Process temperature [K]"] / df["Air temperature [K]"]
    df["tool_wear_speed"] = df["Tool wear [min]"] * df["Rotational speed [rpm]"]
    df["torque wear ratio"] = df["Torque [Nm]"] / (df["Tool wear [min]"] + 0.0001)
    df["torque times wear"] = df["Torque [Nm]"] * df["Tool wear [min]"]

    # Energy Consumption feature
    df['Energy Consumption'] = df['Power'] * df['Tool wear [min]'] * 60  # Assuming time is in minutes

    # Temperature Difference feature
    df['Temperature Difference'] = df['Process temperature [K]'] - df['Air temperature [K]']

    # Mechanical Stress feature
    epsilon = 0.0001  # Small value to avoid division by zero
    df['Mechanical Stress'] = df['Torque [Nm]'] / (df['Rotational speed [rpm]'] + epsilon)

    # Cooling Efficiency feature
    df['Cooling Efficiency'] = df['Temperature Difference'] / df['Rotational speed [rpm]']
    return df

In [70]:
train_df = enhance_features(train_df)
test_df = enhance_features(test_df)

train_df.columns = train_df.columns.str.replace('[\[\]]', '', regex=True)
test_df.columns = test_df.columns.str.replace('[\[\]]', '', regex=True)

In [71]:
# train_df.columns = ['Type', 'Air_temp', 'Process_temp',
       # 'Rotational_speed', 'Torque', 'Tool_wear',
       # 'Machine_failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']
# train_df.columns
# test_df.columns=['Type', 'Air_temp', 'Process_temp',
       # 'Rotational_speed', 'Torque', 'Tool_wear', 'TWF',
       # 'HDF', 'PWF', 'OSF', 'RNF']
# test_df.columns

In [72]:
target_column='Machine failure'
features = [i for i in train_df.columns]
features.remove("Machine failure")

# Entire Train DataX
X = train_df[features]
y = train_df[target_column]

In [73]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=seed, test_size=0.3)

In [74]:
rfmodel = RandomForestClassifier(random_state=seed)
rfmodel.fit(X_train, Y_train)

In [75]:
gbmodel = GradientBoostingClassifier(random_state=seed)
gbmodel.fit(X_train, Y_train)

In [76]:
xgmodel = XGBClassifier(random_state=seed)
xgmodel.fit(X_train, Y_train)

In [77]:
rfpred = rfmodel.predict(X_test)
print("ROC Area Under Curve of RandomForestClassifier : ",roc_auc_score(Y_test, rfpred))

ROC Area Under Curve of RandomForestClassifier :  0.8858851572440326


In [78]:
gbpred = gbmodel.predict(X_test)
print("ROC Area Under Curve of GradientBoostingClassifier : ", roc_auc_score(Y_test, gbpred))

ROC Area Under Curve of GradientBoostingClassifier :  0.8835223402216358


In [79]:
xgpred = xgmodel.predict(X_test)
print("ROC Area Under Curve of XGBClassifier : ", roc_auc_score(Y_test, xgpred))

ROC Area Under Curve of XGBClassifier :  0.8843471815228159


In [80]:
xgmodel.fit(X,y)

In [81]:
# preds = xgmodel.predict(test_data)
preds = xgmodel.predict_proba(test_df)[:,1]

In [82]:
submit = test_df.copy()
submit['Machine failure'] = preds
# submit['id'] = test_data.index 

In [83]:
submit.columns

Index(['Type', 'Air temperature K', 'Process temperature K',
       'Rotational speed rpm', 'Torque Nm', 'Tool wear min', 'TWF', 'HDF',
       'PWF', 'OSF', 'RNF', 'Power', 'temp_ratio', 'tool_wear_speed',
       'torque wear ratio', 'torque times wear', 'Energy Consumption',
       'Temperature Difference', 'Mechanical Stress', 'Cooling Efficiency',
       'Machine failure'],
      dtype='object')

In [84]:
col=[i for i in submit.columns]
# col.remove('id')
col.remove('Machine failure')
submit.drop(col,inplace=True, axis=1)

In [85]:
submit.head()

Unnamed: 0_level_0,Machine failure
id,Unnamed: 1_level_1
136429,0.000903
136430,0.001742
136431,0.000469
136432,0.000346
136433,0.001316


In [86]:
submit.to_csv("submission_enhanced_features.csv", index=True, header=True)