### Import all the necessary libraries.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn import metrics

#### Import the valve dataset with all the extracted features and choose only the ones with highest correlation with respect to column 'class'

In [3]:
# Read the valve feature table from CSV.
df = pd.read_csv("dataset/valve_features.csv")

# Pairwise correlation matrix over the columns of df.
correlation = df.corr()

# Optional heatmap of the full matrix (left disabled):
# plt.figure(figsize=(14, 8))
# sns.heatmap(correlation, annot=True, linewidth=0, vmin=-1)

# Correlation of each column with the target, ordered from most positive
# to most negative.
class_corr = correlation.loc[:, 'class']
class_corr.sort_values(ascending=False)

class             1.000000
y_mean            0.027144
y_min             0.026164
zcr_min           0.014071
spec_roll_min     0.011795
spec_cent_min     0.010260
zcr_mean          0.008412
spec_cent_mean    0.000492
spec_roll_mean   -0.002033
flat_mean        -0.005123
amp_min          -0.005767
melspec_std      -0.008701
flat_max         -0.009779
melspec_mean     -0.029909
y_std            -0.032896
rms_mean         -0.046125
y_max            -0.052506
zcr_max          -0.052856
amp_mean         -0.084064
Name: class, dtype: float64

#### We can see that `rms_mean`, `y_max`, `zcr_max` and `amp_mean` have the largest absolute correlation with `class` (all four are negative, and all are weak: |r| < 0.09), so we will use those features to build a model for the valve machine.

In [6]:
# Keep the four selected feature columns plus the target column.
selected_columns = ['rms_mean', 'y_max', 'zcr_max', 'amp_mean', 'class']
df1 = df.filter(items=selected_columns, axis=1)
df1.head()

Unnamed: 0,rms_mean,y_max,zcr_max,amp_mean,class
0,0.004988,0.083072,0.19873,0.012551,0
1,0.004762,0.091593,0.214355,0.014089,0
2,0.005101,0.083659,0.200684,0.015995,0
3,0.005138,0.082879,0.197754,0.016069,0
4,0.005219,0.075072,0.1875,0.015526,0


In [7]:
# Separate features and target.
feature_columns = ['rms_mean', 'y_max', 'zcr_max', 'amp_mean']
X = df.loc[:, feature_columns]
# Select the target by name rather than by position, so the label stays
# correct even if the column order of the CSV changes.
y = df['class']

In [8]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [22]:
# Standardize the features with scikit-learn.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so no information from the
# test split leaks into the scaling parameters.
scale = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both splits. transform() returns new
# arrays, so X_train and X_test themselves are left untouched.
X_train_stand = scale.transform(X_train)
X_test_stand = scale.transform(X_test)

#### Create a model for predicting.

Based on our research, `RandomForestClassifier` is best suited for the 'valve machine'.

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Train a random forest on the standardized training features.
model1 = RandomForestClassifier(random_state=42).fit(X_train_stand, y_train)

# Predict labels for the held-out, standardized test features.
y_pred1 = model1.predict(X_test_stand)

# Evaluate the predictions against the true test labels.
accuracy_pct = metrics.accuracy_score(y_test, y_pred1) * 100
conf_matrix = metrics.confusion_matrix(y_test, y_pred1)
class_report = metrics.classification_report(y_test, y_pred1)

print('Test accuracy for Random Forest is:', accuracy_pct)
print('\nConfusion matrix:\n', conf_matrix)
print('\n Classification Report: \n', class_report)


Test accuracy for Random Forest is: 93.68505195843325

Confusion matrix:
 [[1093   16]
 [  63   79]]

 Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97      1109
           1       0.83      0.56      0.67       142

    accuracy                           0.94      1251
   macro avg       0.89      0.77      0.82      1251
weighted avg       0.93      0.94      0.93      1251



#### Checking the cross validation score for the model.

In [11]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy of a default random forest on the
# unscaled features. The seed is fixed so the fold scores are repeatable
# across runs instead of changing every time the cell is executed.
cross_val_score(RandomForestClassifier(random_state=42), X, y, cv=5)

array([0.96402878, 0.95443645, 0.95083933, 0.92086331, 0.86570743])

In [10]:
# 5-fold cross-validated accuracy with class weights balanced to counter
# the class imbalance. The seed is fixed so the scores are repeatable, and
# cv=5 is spelled out (it is also the default) for clarity.
cross_val_score(
    RandomForestClassifier(class_weight="balanced", random_state=42),
    X,
    y,
    cv=5,
)

array([0.95683453, 0.94364508, 0.95083933, 0.91846523, 0.8705036 ])

#### Save the trained model to disk and load it back for prediction

In [None]:
import pickle 

In [None]:
filename = 'valve_model.sav'
# Persist the trained random forest (model1). The previous code referenced
# an undefined name (KNN_model) and would fail on a fresh kernel.
# The context manager guarantees the file is flushed and closed.
# NOTE: model1 was fitted on standardized features, so the fitted scaler
# (`scale`) is also needed to prepare new data at inference time; it is
# not saved here.
with open(filename, 'wb') as model_file:
    pickle.dump(model1, model_file)

In [None]:
# Load the model back from disk; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Predict on the standardized test features with the reloaded model.
result = loaded_model.predict(X_test_stand)