## Objective
How many of the trucks that broke down did you actually predict?<br>
**Precision and Recall score**<br>
**Notes**: Minimize false positives and false negatives.<br>
The column we are trying to predict is called **failure**, a binary value: 0 for non-failure and 1 for failure.

In [1]:
import numpy as np
import io
import datetime
import datetime as dt
from scipy import stats
from scipy.stats import norm, skew
import datetime as dt
from math import radians, cos, sin, asin,sqrt
import glob
import pandas_profiling
import os
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
%matplotlib inline
np.random.seed(7)

  from ._conv import register_converters as _register_converters


In [2]:
# needed to upload csv to colab
#from google.colab import files
#uploaded = files.upload()

In [3]:
# Read failures.csv (path relative to the current working directory) into a DataFrame.
data = pd.read_csv('failures.csv')

In [4]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,date,device,failure,attribute1,attribute2,attribute3,attribute4,attribute5,attribute6,attribute7,attribute8,attribute9
0,2015-01-01,S1F01085,0,215630672,56,0,52,6,407438,0,0,7
1,2015-01-01,S1F0166B,0,61370680,0,3,0,6,403174,0,0,0
2,2015-01-01,S1F01E6Y,0,173295968,0,0,0,12,237394,0,0,0
3,2015-01-01,S1F01JE0,0,79694024,0,0,0,6,410186,0,0,0
4,2015-01-01,S1F01R2B,0,135970480,0,0,0,15,313173,0,0,3


In [5]:
# Collapse the log to one row per device, keeping the column-wise maximum of
# everything else. `failure` therefore becomes 1 if the device ever failed,
# and `date` becomes the largest date value recorded for that device.
data = data.groupby(by='device', as_index=False).max()

In [6]:
# Derive calendar features (day of week, month) from each row's date.

# Parse the date strings into datetime64 so the `.dt` accessor is available.
data['date'] = pd.to_datetime(data['date'])

# Full English weekday name (e.g. 'Monday').
# `Series.dt.weekday_name` was removed in pandas 1.0; `day_name()` is the
# supported replacement and yields the same labels.
data['day_of_week'] = data['date'].dt.day_name()

# Abbreviated month name (e.g. 'Jan'), computed with the vectorised
# `.dt.strftime` rather than a per-row Python loop.
data['month'] = data['date'].dt.strftime('%b')

In [7]:
# function to normalize the data
def Normal(data):
    """Standardize ``data`` to zero mean and unit (population) standard deviation.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series or numpy.ndarray
        Numeric values to standardize. A DataFrame is standardized
        column by column; any other input is standardized with a single
        mean/std taken over all of its elements.

    Returns
    -------
    Same type as ``data``
        ``(data - mean) / std``. Where the standard deviation is exactly
        zero (constant input), the divisor is replaced by 1 so the result is
        0 instead of NaN.
    """
    if isinstance(data, pd.DataFrame):
        # Be explicit about the axis: np.mean(DataFrame) forwards axis=None,
        # which recent pandas versions reduce to a single scalar over the
        # whole frame rather than one statistic per column.
        mean_data = data.mean(axis=0)
        # ddof=0 matches what np.std() requests (population std).
        std_data = data.std(axis=0, ddof=0)
        # Constant columns have std == 0; divide by 1 to avoid 0/0 -> NaN.
        std_data = std_data.mask(std_data == 0, 1.0)
    else:
        mean_data = np.mean(data)
        std_data = np.std(data)
        # Same zero-variance guard for the scalar case.
        if np.ndim(std_data) == 0 and std_data == 0:
            std_data = 1.0
    norm_data = (data - mean_data) / std_data
    return norm_data

In [8]:
# Build the model inputs: z-score the integer columns, one-hot encode the
# calendar columns, and separate the target.
# NOTE(review): the scaling statistics are computed on the full dataset,
# before the train/test split - consider fitting them on training rows only.

# Keep the device ids aside; they are identifiers, not features.
device_id = data['device']
# Target column: 1 = failure, 0 = non-failure.
y = data['failure']

# Remove the target, the identifier, the raw date, and attribute7
# (per the original note, it duplicates attribute8).
X = data.drop(['failure', 'attribute7', 'date', 'device'], axis=1)

# Partition the remaining columns by dtype: int64 columns are normalized,
# everything else is passed through untouched.
int_cols = [name for name, dtype in X.dtypes.items() if dtype == 'int64']
cat_cols = [name for name, dtype in X.dtypes.items() if dtype != 'int64']

# Normalize the integer block, then re-attach the pass-through columns.
X = Normal(X[int_cols]).join(X[cat_cols])

# One-hot encode month and weekday; drop_first removes one level of each.
X_normal = pd.get_dummies(X, columns=['month', 'day_of_week'], drop_first=True)

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the
# failure / non-failure ratio the same in both partitions, and the fixed
# random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_normal,
    y,
    test_size=0.2,
    stratify=y,
    random_state=12,
)

In [11]:
# Record the class balance of the training labels before resampling, so it
# can be compared with the counts after SMOTE.
y_train_df = y_train.to_frame()
y_train_df['failure'].value_counts()

0    849
1     85
Name: failure, dtype: int64

### Upsampling with SMOTE

In [12]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the TRAINING split only, so the test
# split keeps its original class distribution.
sm = SMOTE(random_state=12, sampling_strategy='minority')
# `fit_resample` is the supported API; `fit_sample` was a deprecated alias
# that later imbalanced-learn releases removed.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [13]:
# Confirm that resampling balanced the two classes.
y_train_res_df = pd.DataFrame(y_train_res)
# Select the single column by position rather than by the label 0: the
# resampler may return either a bare array (column label 0) or a named
# pandas Series (column label 'failure'), and a hard-coded label would raise
# KeyError in the latter case.
y_train_res_df.iloc[:, 0].value_counts()

1    849
0    849
Name: 0, dtype: int64

### Setting up the keras deep learning model

In [14]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

Using TensorFlow backend.


In [52]:
# Number of input features = number of columns in the training matrix.
input_dim = X_train.shape[1]

# Small fully-connected binary classifier:
#   Dense(32, relu)   - hidden layer fed by the input features
#   Dense(1, sigmoid) - single output squashed into (0, 1)
model = Sequential([
    Dense(32, input_dim=input_dim, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Print the layer shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 32)                800       
_________________________________________________________________
dense_20 (Dense)             (None, 1)                 33        
Total params: 833
Trainable params: 833
Non-trainable params: 0
_________________________________________________________________


For metrics I want precision and recall, which were removed from Keras's built-in metrics as of Keras 2.0, so I use the third-party `keras_metrics` package to compute them.

In [53]:
import keras_metrics
from keras import optimizers

# Adam optimizer with a learning rate of 0.001.
adam = optimizers.Adam(lr=0.001)

# Binary cross-entropy matches the single sigmoid output; precision and
# recall (from keras_metrics) are tracked alongside accuracy.
model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=[keras_metrics.precision(), keras_metrics.recall(), 'accuracy'],
)

In [54]:
# Fit on the SMOTE-resampled training set: 20 passes over the data, with
# batch_size=10 training examples per forward/backward pass.
model.fit(
    x=X_train_res,
    y=y_train_res,
    batch_size=10,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1ff92949cc0>

In [55]:
# Score the trained network on the held-out test split and print every
# reported metric as a percentage.
scores = model.evaluate(X_test, y_test, batch_size=10)
for metric_name, metric_value in zip(model.metrics_names, scores):
    print("\n%s: %.2f%%" % (metric_name, metric_value * 100))


loss: 16.37%

precision: 85.71%

recall: 85.71%

acc: 97.44%


In [56]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the TEST split.
print("[INFO] Evaluating network...")
predictions = model.predict(X_test)
# Round the sigmoid outputs to the nearest integer to get 0/1 class labels.
predicted_classes = np.round(predictions)
print(classification_report(y_test, predicted_classes))

[INFO] Evaluating network...
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       213
           1       0.86      0.86      0.86        21

   micro avg       0.97      0.97      0.97       234
   macro avg       0.92      0.92      0.92       234
weighted avg       0.97      0.97      0.97       234



In [57]:
# `confusion_matrix` is not imported anywhere else in the notebook, so it is
# imported here; without it this cell raises NameError on a fresh kernel.
from sklearn.metrics import confusion_matrix

# Confusion matrix for the TEST split: rows are the true classes, columns
# are the classes obtained by rounding the network's sigmoid outputs.
cm = confusion_matrix(y_test, np.round(predictions, 0))
list1 = ["Actual 0", "Actual 1"]
list2 = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=list1, columns=list2)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,210,3
Actual 1,3,18


In [59]:
# Per-class precision / recall / F1 on the original (non-resampled) TRAIN
# split, for comparison against the test-split report.
print("[INFO] Evaluating network...")
trainPreds = model.predict(X_train)
# Round the sigmoid outputs to the nearest integer to get 0/1 class labels.
train_classes = np.round(trainPreds)
print(classification_report(y_train, train_classes))

[INFO] Evaluating network...
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       849
           1       0.93      0.96      0.95        85

   micro avg       0.99      0.99      0.99       934
   macro avg       0.96      0.98      0.97       934
weighted avg       0.99      0.99      0.99       934



In [60]:
# `confusion_matrix` is not imported anywhere else in the notebook, so it is
# imported here; without it this cell raises NameError on a fresh kernel.
from sklearn.metrics import confusion_matrix

# Confusion matrix for the TRAIN split: rows are the true classes, columns
# are the classes obtained by rounding the network's sigmoid outputs.
cm = confusion_matrix(y_train, np.round(trainPreds, 0))
list1 = ["Actual 0", "Actual 1"]
list2 = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=list1, columns=list2)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,843,6
Actual 1,3,82
