# Libraries

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Dataset

In [15]:
dataset = pd.read_csv('predictive_maintenance.csv')
x = dataset.iloc[:, 2:-2].values
y = dataset.iloc[:, -1].values
print(x)
print(y)

[['M' 298.1 308.6 1551 42.8 0]
 ['L' 298.2 308.7 1408 46.3 3]
 ['L' 298.1 308.5 1498 49.4 5]
 ...
 ['M' 299.0 308.6 1645 33.4 22]
 ['H' 299.0 308.7 1408 48.5 25]
 ['M' 299.0 308.7 1500 40.2 30]]
['No Failure' 'No Failure' 'No Failure' ... 'No Failure' 'No Failure'
 'No Failure']


# Take care of missing data

In [16]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:, 1:-1])
x[:, 1:-1] = imputer.transform(x[:, 1:-1])

# Encode

#### For X

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

#### For y

In [18]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [19]:
print(y)

[1 1 1 ... 1 1 1]


# Split

In [20]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

# Feature scaling

In [21]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:,1:] = sc.fit_transform(x_train[:, 1:])
x_test[:, 1:] = sc.transform(x_test[:, 1:])

In [22]:
print(x_train)

[[0.0 0.8192209681474303 -0.6569405684527755 ... -0.9497317025755081
  1.1669831404989721 -0.5603378293947024]
 [0.0 -1.2206718808252548 1.5222077125714688 ... 0.8042378777294084
  -0.8380736281669992 -0.43496250910217926]
 [0.0 0.8192209681474303 -0.6569405684527755 ... 0.6925200700666748
  -0.8480989120103292 -0.40361867902904847]
 ...
 [1.0 -1.2206718808252548 -0.6569405684527755 ... -0.46934512962575387
  0.4150868522492329 1.2732762298834497]
 [0.0 0.8192209681474303 -0.6569405684527755 ... -0.6592654026524009
  0.7459212190791179 -1.1558706007841877]
 [0.0 0.8192209681474303 -0.6569405684527755 ... -1.0782071813876517
  1.7484496034121035 0.4896804780551795]]


# XGBoost Model

In [23]:
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(x_train, y_train)

# Predicting

In [24]:
y_pred = classifier.predict(x_test)

# Confusion Matrix

In [25]:
from sklearn.metrics import accuracy_score, confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[  25    1    0    0    0    0]
 [   1 2402    1    5    0    4]
 [   0    6   17    1    0    1]
 [   0    3    0   18    0    0]
 [   0    4    0    0    0    0]
 [   0   10    1    0    0    0]]


0.9848

# k-Fold cross validation

In [26]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = x_train, y = y_train, cv = 10, n_jobs = -1)
print("Accuracy: {:.2f}%".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f}%".format(accuracies.std()*100))
print(accuracies)

Accuracy: 98.40%
Standard Deviation: 0.27%
[0.984      0.98533333 0.984      0.98533333 0.984      0.98
 0.988      0.98       0.98133333 0.988     ]


# Randomized Search

In [28]:
from sklearn.model_selection import RandomizedSearchCV
parameters = {'max_depth' : [3, 5, 7, 9, 12, 15, 17, 25],
             'learning_rate' : [0.01, 0.05, 0.1, 0.2, 0.3],
             'n_estimators' : [100, 200, 300, 400, 500],
             'subsample' : [0.2, 0.4, 0.6, 0.8, 1.0],
             'colsample_bytree' : [0.2, 0.4, 0.6, 0.8, 1.0],
             'reg_alpha' : [0, 0.1, 0.5, 1, 2, 5],
             'reg_lambda' : [0.1, 0.5, 1, 2, 5],
             'gamma' : [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],}
random_search = RandomizedSearchCV(estimator = classifier,
                           param_distributions = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1,
                           n_iter = 20,
                           error_score = 'raise')
random_search.fit(x_train, y_train)
best_accuracy = random_search.best_score_
best_parameters = random_search.best_params_
print("Best Accuracy: {:.2f}%".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 98.44%
Best Parameters: {'subsample': 0.6, 'reg_lambda': 0.5, 'reg_alpha': 0.1, 'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.3, 'gamma': 0.9, 'colsample_bytree': 1.0}
