### Import all the necessary libraries.

In [22]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#### Import the pump dataset with all the extracted features and choose only the ones with the highest correlation with the 'class' column.

In [23]:
# Load the extracted pump features; every column (including the 'class'
# target) takes part in the correlation matrix below.
df = pd.read_csv("dataset/pump_features.csv")

# Pairwise correlation between all columns of the feature table.
correlation = df.corr()

# Optional heatmap of the full correlation matrix (left disabled):
#plt.figure(figsize=(14,8))
#sns.heatmap(correlation,annot=True,linewidth=0,vmin=-1)

# Rank every column by its correlation with the target, strongest first.
class_correlation = correlation['class']
class_correlation.sort_values(ascending=False)

class             1.000000
amp_mean          0.491441
y_std             0.446353
melspec_mean      0.442739
rms_mean          0.433178
melspec_std       0.390713
y_max             0.386973
amp_min           0.345665
zcr_mean          0.103197
y_mean            0.089914
zcr_min           0.082238
flat_max          0.080119
zcr_max           0.064419
spec_cent_mean    0.062948
flat_mean         0.049926
spec_cent_min     0.027408
spec_roll_mean    0.023528
spec_roll_min    -0.004405
y_min            -0.359156
Name: class, dtype: float64

#### We can see that 'rms_mean', 'y_max', 'y_std', 'melspec_mean' and 'amp_mean' are among the features most strongly correlated with 'class' (note that 'melspec_std' ranks slightly above 'y_max'), so we will choose those features and create a model for the pump machine.

In [24]:
# Keep only the selected feature columns plus the 'class' target, then
# preview the first rows of the reduced table.
selected_columns = ['rms_mean', 'y_max', 'y_std', 'melspec_mean', 'amp_mean', 'class']
df1 = df.filter(items=selected_columns, axis='columns')
df1.head()

Unnamed: 0,rms_mean,y_max,y_std,melspec_mean,amp_mean,class
0,0.005032,0.027334,0.005068,0.005801,0.014499,0
1,0.005439,0.034021,0.005452,0.006715,0.015554,0
2,0.00521,0.024207,0.005226,0.006221,0.014665,0
3,0.005834,0.036947,0.005865,0.007527,0.017364,0
4,0.004999,0.022523,0.005028,0.005714,0.014255,0


In [28]:
# Separate features and target.
feature_columns = ['rms_mean', 'y_max', 'y_std', 'melspec_mean', 'amp_mean']
X = df.loc[:, feature_columns]

# Select the target by name rather than by position: `df.iloc[:, -1]` only
# yields the label while 'class' happens to be the last column of the CSV,
# and would silently pick a different column if the file layout changed.
y = df['class']

In [29]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# repeatable between runs.
# NOTE(review): the split is not stratified, so the share of each class in
# train and test is left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [30]:
# Standardize the features with scikit-learn.
# StandardScaler is imported in the notebook's import cell.

# Learn the per-column mean and standard deviation from the training split
# only, so no information from the test split leaks into the scaling.
scale = StandardScaler().fit(X_train)

# Apply the training statistics to both splits. `transform` returns new
# arrays and leaves X_train / X_test untouched, so no defensive copies are
# needed beforehand.
X_train_stand = scale.transform(X_train)
X_test_stand = scale.transform(X_test)

#### Create a model for predicting.

Based on our research, RandomForestClassifier is best suited for the 'pump machine'.

In [38]:
# RandomForestClassifier and metrics are already imported in the notebook's
# import cell, so they are not re-imported here.

# Fit a random forest on the standardized training features; the fixed seed
# keeps the fitted model reproducible across runs.
model1 = RandomForestClassifier(random_state=42)
model1.fit(X_train_stand, y_train)

# Predict labels for the standardized held-out features.
y_pred1 = model1.predict(X_test_stand)

# Evaluate on the held-out split. Accuracy is reported as a percentage; the
# confusion matrix and per-class report are printed as well so that the
# performance on each class can be read separately from the overall accuracy.
print('Test accuracy for Random Forest is:',metrics.accuracy_score(y_test,y_pred1)*100)
print('\nConfusion matrix:\n',metrics.confusion_matrix(y_test,y_pred1))
print('\n Classification Report: \n',metrics.classification_report(y_test,y_pred1))


Test accuracy for Random Forest is: 93.34389857369256

Confusion matrix:
 [[1119    4]
 [  80   59]]

 Classification Report: 
               precision    recall  f1-score   support

           0       0.93      1.00      0.96      1123
           1       0.94      0.42      0.58       139

    accuracy                           0.93      1262
   macro avg       0.93      0.71      0.77      1262
weighted avg       0.93      0.93      0.92      1262



#### Checking the cross-validation scores for the model.

In [39]:
# cross_val_score is imported in the notebook's import cell.
# Cross-validate a fresh random forest on the (unscaled) feature matrix using
# the default fold setup. The seed matches model1 so that re-running the cell
# returns the same scores instead of a different set on every execution.
cross_val_score(RandomForestClassifier(random_state=42), X, y)

array([0.96432818, 0.94530321, 0.91319857, 0.89179548, 0.91082045])

In [33]:
# Same cross-validation, but with class weights balanced by class frequency.
# The seed matches model1 so the scores are reproducible and any difference
# from the unweighted forest is not just random variation between runs.
cross_val_score(RandomForestClassifier(class_weight="balanced", random_state=42), X, y)

array([0.96313912, 0.94292509, 0.91795482, 0.89774078, 0.91795482])

#### Save the trained model to disk so it can be loaded and run later

In [34]:
import pickle 

In [36]:
# Serialize the trained random forest to disk.
filename = 'pump_model.sav'

# The context manager closes the file handle (and flushes the pickle to disk)
# even if dumping raises; the bare open() used before was never closed.
# NOTE(review): model1 was fit on standardized features, but the fitted
# scaler (`scale`) is not saved here - anything that loads this file needs
# the same scaler to prepare its inputs. Consider persisting it as well.
with open(filename, 'wb') as model_file:
    pickle.dump(model1, model_file)

In [37]:
# Load the model back from disk and run it on the standardized test features
# as a round-trip check.
# Unpickling can execute arbitrary code, so only load files this notebook
# (or another trusted source) produced.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.predict(X_test_stand)