In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the hard-drive CSV from the Kaggle input dataset into `df`.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/hard-drive-test-data/harddrive.csv')

# Data Preprocessing

I'll start by removing every drive (identified by its serial number) that never recorded a failure. This helps to reduce the bias and the class imbalance of the dataset.

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Serial numbers taken from every row where `failure` equals 1.
is_failure = df['failure'] == 1
fail_hds = df.loc[is_failure, 'serial_number'].values

In [None]:
# Keep only the rows whose serial number appears in fail_hds.
df = df.loc[df['serial_number'].isin(fail_hds)]

In [None]:
# Preview the first five rows after filtering.
df.head(n=5)

In [None]:
# Renumber the rows 0..len(df)-1, discarding the old index.
df = df.reset_index(drop=True)

## Sorting values

I need to sort the rows first by serial number and then by date. This guarantees that the dataset is grouped by drive, with each HDD's "life" cycle in chronological order.

In [None]:
# Order rows by serial number, then by date within each serial number, so the
# failure row is expected to come last in each drive's cycle; then renumber
# the rows from 0.
df = (
    df
    .sort_values(by=['serial_number', 'date'])
    .reset_index(drop=True)
)

In [None]:
# Preview the first five rows after sorting.
df.head(n=5)

## Dropping NAs

I'll remove all columns in which more than 5% of the values are NA, and then drop the remaining rows that still contain an NA. This cleans up the dataset considerably.

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the number of missing values in each column.
na_counts = df.isna().sum()
plt.figure(figsize=(12, 8))
na_counts.plot(kind='bar')
plt.show()

In [None]:
# Fraction of missing values per column; keep the columns where that
# fraction does not exceed 5%.
na_share = df.isna().sum().values / len(df)
keep_columns = df.columns[~(na_share > 0.05)]
df_notna = df[keep_columns]

In [None]:
# Drop every row that still contains a missing value.
# Rebinding the name (instead of inplace=True) avoids mutating a column
# selection of `df` in place, which pandas reports with a
# SettingWithCopyWarning when copy-on-write is not enabled.
df_notna = df_notna.dropna()

In [None]:
# Continue with an independent (deep) copy of the cleaned frame.
df = df_notna.copy(deep=True)

In [None]:
# Renumber the rows 0..len(df)-1 after the NA rows were removed.
df = df.reset_index(drop=True)

# Multiclassifier Approach:

Every HDD's life cycle ends with a failure, which means the dataset is very imbalanced. Let $n$ be the total number of executions (rows) across all HDDs and $N$ the number of HDDs. Then we have $N$ failures and $n - N$ good executions.

I'll tackle this using the Multiple Classifier approach: there will be $k$ models trained on $k$ datasets, and in each dataset the last $m$ executions of every cycle (the failure itself plus the executions right before it) are labelled as failures.

## Finding the best m values

For this, we need to look at how much time passes between each HDD's first record and its failure.

In [None]:
# Row positions of the failure rows (df has a fresh 0..n-1 index at this point).
fails = df[df['failure'] == 1].index.values

# Work last-drive-first: rev_fails[j] is a failure position and shift_fails[j]
# is the failure position that precedes it in the frame.
rev_fails = fails[::-1]
shift_fails = np.roll(rev_fails, shift=-1)
if shift_fails.size:
    # The first drive has no preceding failure. Using -1 (the position just
    # before row 0) makes its length a row count like every other drive;
    # a sentinel of 0 would report one row too few for that drive only.
    shift_fails[-1] = -1

# Number of rows between consecutive failures, i.e. the length of each cycle,
# still in reversed (last-drive-first) order.
# NOTE(review): this assumes each drive contributes exactly one failure row
# and that it is the drive's last row - confirm after the NA filtering.
fails_dif = rev_fails - shift_fails

In [None]:
# Histogram of fails_dif: bin with numpy first, then draw the pre-computed
# counts as weights over the same bin edges.
counts, bins = np.histogram(fails_dif)
left_edges = bins[:-1]
plt.figure(figsize=(12, 8))
plt.hist(left_edges, bins=bins, weights=counts)
plt.show()

The idea is to select all HDDs that have more than 10 days between their first record and their failure.

In [None]:
# fails_dif is stored last-failure-first; flip it so that entry i lines up
# with the i-th failure row of df, and therefore with the i-th serial number.
fails_dif_ordered = fails_dif[::-1]

# Serial numbers of the failure rows, in frame order.
failed_serials = df[df['failure'] == 1].serial_number.values

# Keep the drives whose own entry in fails_dif is greater than 10. The
# comparison must use the re-ordered array: indexing the reversed fails_dif
# with a forward position would test each drive against another drive's value.
models = [
    serial
    for serial, cycle_len in zip(failed_serials, fails_dif_ordered)
    if cycle_len > 10
]

In [None]:
# Rows of the selected drives only, renumbered from 0.
keep_rows = df['serial_number'].isin(models)
final_df = df.loc[keep_rows].reset_index(drop=True)

In [None]:
# Preview the first five rows of the selected drives.
final_df.head(n=5)

In [None]:
# (number of rows, number of columns) of the selected data.
(len(final_df.index), len(final_df.columns))

Now we are ready to create the k datasets used to train the models. I'll create 4 datasets, with [1, 3, 5, 7] executions labelled as failures in each cycle. In the end, close to 50% of the dataset will be failures, which gives us a balanced dataset.

This approach is attractive because creating synthetic failure data is generally not recommended. It also makes it possible to see when a failure is starting to develop.

In [None]:
# Window sizes (rows labelled as failure per cycle) for the extra datasets.
m = list(range(3, 8, 2))

The code below creates 3 failure labelings, one for each value of m. The case with a single failure per cycle is already covered by the original labels.

In [None]:
# For every window size in m, build one label list the same length as
# final_df: the failure row and the rows directly before it (window rows in
# total) are labelled 1, everything else 0.
failures = []
# Positions of the failure rows; final_df has a fresh 0..n-1 index, so index
# labels and row positions coincide, and the positions are ascending.
fails = final_df[final_df['failure'] == 1].index.values
n_rows = len(final_df)
for i in m:
    failure = np.zeros(n_rows, dtype=int)
    prev_fail = -1
    for fail_idx in fails:
        # Start i-1 rows before the failure, but never before row 0 and never
        # inside the previous cycle. This keeps the failure row itself
        # labelled 1 even when a cycle is shorter than the window.
        start = max(fail_idx - i + 1, prev_fail + 1)
        failure[start:fail_idx + 1] = 1
        prev_fail = fail_idx
    # Store plain Python lists, one per window size, in the order of m.
    failures.append(failure.tolist())

In [None]:
# dfs[0] keeps the original labels; each following frame swaps in one of the
# label lists from `failures` (same order as m).
# Every entry is an independent copy, so in-place edits made to the frames in
# dfs cannot modify final_df. Iterating over `failures` also removes the
# hard-coded count of 4 that had to match len(m) + 1.
dfs = [final_df.copy()]
for labels in failures:
    aux_df = final_df.copy()
    aux_df['failure'] = labels
    dfs.append(aux_df)

In [None]:
# Last five rows of the second dataset (first widened labeling).
dfs[1].tail(n=5)

Now we have 4 datasets, and we only need to drop the date, serial number and model columns before training.

In [None]:
# Remove the identifier columns from every dataset.
# New frames are built instead of dropping in place, so that:
#   * the global `df` is not overwritten by a loop variable,
#   * a frame that is also referenced elsewhere is not mutated,
#   * re-running the cell is harmless (errors='ignore' skips columns that
#     are already gone).
dfs = [
    frame.drop(columns=['date', 'serial_number', 'model'], errors='ignore')
    for frame in dfs
]

In [None]:
# Last ten rows of the third dataset.
dfs[2].tail(n=10)

Finally, the 4 databases are ready to train.

# Building the Model

I'll use Gradient Boosting, trained on each dataset. For each one, I'll first run a grid search to find the best hyperparameters.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix

In [None]:
# Fitted grid searches and test-set metrics, one entry per dataset in dfs.
# NOTE: `models` is rebound here; earlier in the notebook the same name held
# the list of selected serial numbers.
models = []
accs = [] #accuracies
recs = [] #recalls
precs = [] #precisions
f1s = [] #f1 scores
rocs = [] #roc auc scores

# The hyper-parameter grid is the same for every dataset, so build it once.
learning_rates = [0.1, 0.05, 0.01]
n_estimators = [16, 32, 64]
max_depths = [5, 10, 15]
params = {'learning_rate': learning_rates, 'n_estimators': n_estimators, 'max_depth': max_depths}

for frame in dfs:

    X = frame.drop('failure', axis=1)
    y = frame.failure
    # Stratify so the rare failure class keeps the same share in the train
    # and test parts.
    # NOTE(review): rows of one drive can still land on both sides of the
    # split; consider a group- or time-based split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

    gbrt = GradientBoostingClassifier(max_features='sqrt', random_state=0)
    # NOTE(review): GridSearchCV selects by the estimator's default score
    # (accuracy for classifiers); confirm that is the intended criterion for
    # an imbalanced target.
    clf = GridSearchCV(gbrt, params)
    clf.fit(X_train, y_train)
    models.append(clf)

    final_pred = clf.predict(X_test)
    # Probability of the positive class, needed for a meaningful ROC AUC.
    final_score = clf.predict_proba(X_test)[:, 1]

    accs.append(accuracy_score(y_test, final_pred))
    recs.append(recall_score(y_test, final_pred))
    # zero_division=0 keeps the score at 0 without a warning when the model
    # predicts no failures at all.
    precs.append(precision_score(y_test, final_pred, zero_division=0))
    f1s.append(f1_score(y_test, final_pred, zero_division=0))
    # ROC AUC is computed from scores; hard 0/1 predictions would give a
    # single operating point instead of the ranking quality.
    rocs.append(roc_auc_score(y_test, final_score))
    print(clf.best_params_)
    

# Evaluating

Of course, for the first model (k=1) we see an almost perfect accuracy but zero recall. This means the model always predicts that no failure will happen. Despite that, the recall and AUC-ROC values improve every time k grows, while the accuracy does not drop much. This is very good.

In [None]:
# Bar labels shared by all metric plots, one per trained model.
x = ['k=1', 'k=3', 'k=5', 'k=7']

# Bar chart: accuracy of each trained model.
plt.figure(figsize=(12,8))
plt.bar(x, accs)
plt.title('Accuracy values of each model')
# Render explicitly, like the other metric plots, so the cell does not end on
# the Text object returned by plt.title.
plt.show()

In [None]:
# Bar chart: recall of each trained model, one bar per label in x.
fig = plt.figure(figsize=(12, 8))
axes = fig.gca()
axes.bar(x, recs)
axes.set_title('Recall values of each model')
plt.show()

In [None]:
# Bar chart: precision of each trained model, one bar per label in x.
fig = plt.figure(figsize=(12, 8))
axes = fig.gca()
axes.bar(x, precs)
axes.set_title('Precision values of each model')
plt.show()

In [None]:
# Bar chart: F1 score of each trained model, one bar per label in x.
fig = plt.figure(figsize=(12, 8))
axes = fig.gca()
axes.bar(x, f1s)
axes.set_title('F1-Score values of each model')
plt.show()

In [None]:
# Bar chart: ROC AUC of each trained model, one bar per label in x.
fig = plt.figure(figsize=(12, 8))
axes = fig.gca()
axes.bar(x, rocs)
axes.set_title('AUC-ROC values of each model')
plt.show()

This approach can be used in real life, either for HDDs or in large factories that rely on many automated machines, where failures occur constantly during production.

The idea for this approach comes from the paper by Gian Antonio Susto, Andrea Schirru, Simone Pampuri, Sean McLoone and Alessandro Beghi, "Machine Learning for Predictive Maintenance: A Multiple Classifier Approach".