# Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Dataset

In [2]:
dataset = pd.read_csv('predictive_maintenance.csv')
x = dataset.iloc[:, 2:-2].values
y = dataset.iloc[:, -2].values
print(x)
print(y)

[['M' 298.1 308.6 1551 42.8 0]
 ['L' 298.2 308.7 1408 46.3 3]
 ['L' 298.1 308.5 1498 49.4 5]
 ...
 ['M' 299.0 308.6 1645 33.4 22]
 ['H' 299.0 308.7 1408 48.5 25]
 ['M' 299.0 308.7 1500 40.2 30]]
[0 0 0 ... 0 0 0]


# Take care of missing data

In [3]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(x[:, 1:-1])
x[:, 1:-1] = imputer.transform(x[:, 1:-1])

# Encode

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

# Split

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

# Feature scaling

In [6]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:,1:] = sc.fit_transform(x_train[:, 1:])
x_test[:, 1:] = sc.transform(x_test[:, 1:])

In [7]:
print(x_train)

[[0.0 0.8192209681474303 -0.6569405684527755 ... -0.9497317025755081
  1.1669831404989721 -0.5603378293947024]
 [0.0 -1.2206718808252548 1.5222077125714688 ... 0.8042378777294084
  -0.8380736281669992 -0.43496250910217926]
 [0.0 0.8192209681474303 -0.6569405684527755 ... 0.6925200700666748
  -0.8480989120103292 -0.40361867902904847]
 ...
 [1.0 -1.2206718808252548 -0.6569405684527755 ... -0.46934512962575387
  0.4150868522492329 1.2732762298834497]
 [0.0 0.8192209681474303 -0.6569405684527755 ... -0.6592654026524009
  0.7459212190791179 -1.1558706007841877]
 [0.0 0.8192209681474303 -0.6569405684527755 ... -1.0782071813876517
  1.7484496034121035 0.4896804780551795]]


# CatBoost Model

In [8]:
from catboost import CatBoostClassifier
classifier = CatBoostClassifier()
classifier.fit(x_train, y_train)

Learning rate set to 0.024355
0:	learn: 0.6427768	total: 153ms	remaining: 2m 33s
1:	learn: 0.6023062	total: 159ms	remaining: 1m 19s
2:	learn: 0.5549293	total: 166ms	remaining: 55.1s
3:	learn: 0.5121469	total: 171ms	remaining: 42.6s
4:	learn: 0.4805836	total: 175ms	remaining: 34.8s
5:	learn: 0.4506952	total: 180ms	remaining: 29.7s
6:	learn: 0.4188497	total: 184ms	remaining: 26.1s
7:	learn: 0.3957221	total: 188ms	remaining: 23.3s
8:	learn: 0.3731577	total: 193ms	remaining: 21.2s
9:	learn: 0.3523465	total: 212ms	remaining: 21s
10:	learn: 0.3302721	total: 223ms	remaining: 20s
11:	learn: 0.3105816	total: 230ms	remaining: 18.9s
12:	learn: 0.2948471	total: 237ms	remaining: 18s
13:	learn: 0.2823820	total: 244ms	remaining: 17.2s
14:	learn: 0.2614628	total: 254ms	remaining: 16.7s
15:	learn: 0.2487429	total: 259ms	remaining: 16s
16:	learn: 0.2370591	total: 264ms	remaining: 15.3s
17:	learn: 0.2203938	total: 273ms	remaining: 14.9s
18:	learn: 0.2092392	total: 279ms	remaining: 14.4s
19:	learn: 0.2003

<catboost.core.CatBoostClassifier at 0x23981e8caf0>

# Predicting

In [9]:
y_pred = classifier.predict(x_test)

# Confusion Matrix

In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[2403    9]
 [  32   56]]


0.9836

# k-Fold cross validation

In [11]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = x_train, y = y_train, cv = 10, n_jobs = -1)
print("Accuracy: {:.2f}%".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f}%".format(accuracies.std()*100))
print(accuracies)

Accuracy: 98.68%
Standard Deviation: 0.50%
[0.99066667 0.99066667 0.98133333 0.992      0.98933333 0.984
 0.992      0.97866667 0.98       0.98933333]


# Randomized Search

In [13]:
from sklearn.model_selection import RandomizedSearchCV
parameters = {'iterations' : [100, 200, 500, 1000],
             'learning_rate' : [0.01, 0.05, 0.1, 0.2],
             'depth' : [4, 6, 8, 10],
             'l2_leaf_reg' : [1, 3, 5, 7, 10],
             'bagging_temperature' : [0, 0.5, 1, 2],
             'subsample' : [0.2, 0.4, 0.6, 0.8, 1.0]}
random_search = RandomizedSearchCV(estimator = classifier,
                           param_distributions = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1,
                           n_iter = 20,
                           error_score = 'raise')
random_search.fit(x_train, y_train)
best_accuracy = random_search.best_score_
best_parameters = random_search.best_params_
print("Best Accuracy: {:.2f}%".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

0:	learn: 0.5890648	total: 14.3ms	remaining: 14.3s
1:	learn: 0.5017247	total: 27.5ms	remaining: 13.7s
2:	learn: 0.4347058	total: 36.1ms	remaining: 12s
3:	learn: 0.3784580	total: 47.6ms	remaining: 11.9s
4:	learn: 0.3419039	total: 53.6ms	remaining: 10.7s
5:	learn: 0.2998750	total: 64.8ms	remaining: 10.7s
6:	learn: 0.2666741	total: 75.9ms	remaining: 10.8s
7:	learn: 0.2274253	total: 87.6ms	remaining: 10.9s
8:	learn: 0.2072745	total: 116ms	remaining: 12.8s
9:	learn: 0.1785194	total: 181ms	remaining: 17.9s
10:	learn: 0.1644767	total: 219ms	remaining: 19.7s
11:	learn: 0.1508873	total: 238ms	remaining: 19.6s
12:	learn: 0.1396391	total: 250ms	remaining: 19s
13:	learn: 0.1303724	total: 276ms	remaining: 19.4s
14:	learn: 0.1213924	total: 289ms	remaining: 19s
15:	learn: 0.1132919	total: 302ms	remaining: 18.6s
16:	learn: 0.1096265	total: 311ms	remaining: 18s
17:	learn: 0.1071225	total: 317ms	remaining: 17.3s
18:	learn: 0.1015800	total: 332ms	remaining: 17.1s
19:	learn: 0.0961411	total: 348ms	remaini