In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.discriminant_analysis import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef
from scipy.stats import hmean

from pyprojroot import here
import wandb

import seaborn as sns
import matplotlib.pyplot as plt

from xgboost import XGBClassifier

In [2]:
# Read the train/validation feature and label tables from data/processed
# (paths are resolved through `here`).
X_train = pd.read_csv(here('data/processed/X_train_scaled.csv'))
Y_train = pd.read_csv(here('data/processed/Y_train.csv'))
X_val = pd.read_csv(here('data/processed/X_val_scaled.csv'))
Y_val = pd.read_csv(here('data/processed/Y_val.csv'))

# For each split, print the shape and summary statistics of the features,
# a blank separator line, then the same for the matching labels.
for split, features, labels in (("train", X_train, Y_train), ("val", X_val, Y_val)):
    print(f"X_{split} {features.shape}:")
    print(features.describe())
    print()
    print(f"Y_{split} {labels.shape}:")
    print(labels.describe())

X_train (7499, 6):
       quality_variation  Air temperature  Process temperature  \
count        7499.000000     7.499000e+03         7.499000e+03   
mean            0.498066     1.914078e-14        -6.400830e-15   
std             0.669290     1.000067e+00         1.000067e+00   
min             0.000000    -2.345342e+00        -2.900847e+00   
25%             0.000000    -8.492234e-01        -8.121363e-01   
50%             0.000000     4.844744e-02         6.377483e-02   
75%             1.000000     7.466359e-01         7.375526e-01   
max             2.000000     2.242754e+00         2.556753e+00   

       Rotational speed        Torque     Tool wear  
count      7.499000e+03  7.499000e+03  7.499000e+03  
mean       1.549190e-16 -2.539345e-16 -7.580133e-17  
std        1.000067e+00  1.000067e+00  1.000067e+00  
min       -1.975906e+00 -3.634855e+00 -1.696408e+00  
25%       -6.417201e-01 -6.759965e-01 -8.641426e-01  
50%       -1.988368e-01  6.045571e-03 -4.711563e-04  
75%     

In [None]:
# Machine Failure
# Fit a binary classifier on the "Machine failure" column and report F1 on
# the training and validation splits.
model = XGBClassifier(n_estimators=10, max_depth=3, learning_rate=1, objective='binary:logistic')
model.fit(X_train, Y_train["Machine failure"])
preds_train = model.predict(X_train)
preds_val = model.predict(X_val)

# Compute the scores outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
f1_train = f1_score(Y_train["Machine failure"], preds_train)
f1_val = f1_score(Y_val["Machine failure"], preds_val)
print(f"Train: {f1_train} -- Val: {f1_val}")


Train: 0.8 -- Val: 0.6436781609195402


In [48]:
# Individual Failure Types
# Fit one model against all five failure-type columns at once; predictions
# are indexed as [sample, label], with labels in the order given by `types`.
types = ["TWF", "HDF", "PWF", "OSF", "RNF"]
model = XGBClassifier(
    n_estimators=10,
    max_depth=3,
    learning_rate=1,
    objective='binary:logistic',
)
model.fit(X_train, Y_train[types])
preds_train = model.predict(X_train)
preds_val = model.predict(X_val)

# Macro-averaged F1 over all failure types.
macro_train = f1_score(Y_train[types], preds_train, average='macro')
macro_val = f1_score(Y_val[types], preds_val, average='macro')
print(f"Train: {macro_train} -- Val: {macro_val}")

# Per-type F1, taking each type's column out of the prediction arrays.
for col_idx, failure in enumerate(types):
    type_train = f1_score(Y_train[failure], preds_train[:, col_idx])
    type_val = f1_score(Y_val[failure], preds_val[:, col_idx])
    print(f"{failure} Train: {type_train} -- Val: {type_val}")

Train: 0.5111268460699772 -- Val: 0.48969696969696963
TWF Train: 0.0 -- Val: 0.0
HDF Train: 0.9005847953216374 -- Val: 0.8484848484848485
PWF Train: 0.6779661016949152 -- Val: 0.8
OSF Train: 0.84375 -- Val: 0.8
RNF Train: 0.13333333333333333 -- Val: 0.0
