### Import all the necessary libraries.

In [1]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#### Import the slider dataset with all the extracted features and choose only the ones with the highest correlation with respect to the `class` column.

In [9]:
# Load the extracted slider features, then list each column's correlation
# with the 'class' target, sorted from the largest value down to the smallest.
df = pd.read_csv("../Dataset/slider_features.csv")
correlation = df.corr()
class_correlation = correlation['class']
class_correlation.sort_values(ascending=False)

class             1.000000
amp_mean          0.519006
spec_roll_mean    0.472774
melspec_std       0.465831
spec_cent_mean    0.459535
y_std             0.451597
rms_mean          0.431885
melspec_mean      0.423381
y_max             0.379127
zcr_mean          0.329398
flat_mean         0.278730
flat_max          0.246673
zcr_min           0.196231
zcr_max           0.182205
spec_cent_min     0.152008
spec_roll_min     0.117988
y_mean           -0.007646
amp_min          -0.019480
y_min            -0.412523
Name: class, dtype: float64

#### We can see that `spec_roll_mean`, `melspec_std`, `y_std`, `amp_mean`, `spec_cent_mean` and `rms_mean` have the highest correlation with `class`, so we will choose those features and create a model for the slider machine.

In [10]:
# Narrow the frame to the six selected feature columns plus the target,
# then show the first rows as a quick sanity check.
df1 = df.filter(
    items=[
        'spec_roll_mean',
        'melspec_std',
        'y_std',
        'amp_mean',
        'spec_cent_mean',
        'rms_mean',
        'class',
    ],
    axis=1,
)
df1.head()

Unnamed: 0,spec_roll_mean,melspec_std,y_std,amp_mean,spec_cent_mean,rms_mean,class
0,3214.092862,0.029762,0.005328,0.013499,1372.581294,0.005268,0
1,2035.462247,0.024763,0.005365,0.014137,985.649834,0.00533,0
2,2356.461853,0.028967,0.005878,0.0154,1086.078007,0.005845,0
3,3284.712775,0.029873,0.005626,0.01458,1394.776445,0.005581,0
4,1914.206832,0.029198,0.005836,0.015175,938.608348,0.005797,0


In [11]:
# Separate features and target.
# Both come from df1, the frame already narrowed to the six selected features
# plus 'class', so the feature list is declared in one place only.
# The target is selected by name rather than by position, so the split does not
# silently pick the wrong column if the CSV's column order changes.
X = df1.drop(columns=['class'])
y = df1['class']

In [12]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the
# train/test partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Standardize the features with scikit-learn's StandardScaler
# (imported in the top import cell).
#
# The scaler is fitted on the training split only, so no statistics from the
# test split influence training; the same fitted scaler then transforms both
# splits. transform() returns new arrays and leaves X_train / X_test untouched,
# so no defensive copies are needed.
scale = StandardScaler().fit(X_train)

# Standardized training features (NumPy array).
X_train_stand = scale.transform(X_train)

# Standardized test features, using the training-split statistics.
X_test_stand = scale.transform(X_test)

#### Create a model for predicting.

Based on our research, RandomForestClassifier is the best-suited model for the 'slider machine'.

In [16]:
# Train a random forest on the standardized training features.
# RandomForestClassifier and metrics are already imported in the top import
# cell, so they are not re-imported here. The fixed seed makes the fitted
# forest reproducible.
model1 = RandomForestClassifier(random_state=42)
model1.fit(X_train_stand, y_train)

# Predict the class of every row in the held-out test split.
y_pred1 = model1.predict(X_test_stand)

# Evaluate on the test split: overall accuracy (as a percentage), the
# confusion matrix, and the per-class precision/recall/F1 report.
print('Test accuracy for Random Forest is:',metrics.accuracy_score(y_test,y_pred1)*100)
print('\nConfusion matrix:\n',metrics.confusion_matrix(y_test,y_pred1))
print('\n Classification Report: \n',metrics.classification_report(y_test,y_pred1))


Test accuracy for Random Forest is: 92.51423921887714

Confusion matrix:
 [[959  14]
 [ 78 178]]

 Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.99      0.95       973
           1       0.93      0.70      0.79       256

    accuracy                           0.93      1229
   macro avg       0.93      0.84      0.87      1229
weighted avg       0.93      0.93      0.92      1229



#### Checking the cross-validation scores for the model.

In [17]:
# 5-fold cross-validation of a default random forest on the selected features.
# cross_val_score is imported in the top import cell. The estimator is seeded
# with the same value as model1 so the scores are reproducible across runs.
# NOTE(review): the recorded per-fold scores differ widely between folds, which
# may mean the rows in the CSV are ordered; consider shuffled folds - confirm.
cross_val_score(RandomForestClassifier(random_state=42), X, y, cv=5)

array([0.97924298, 0.98778999, 0.92796093, 0.86446886, 0.75794621])

In [18]:
# Same 5-fold cross-validation, but with class weights balanced to offset the
# smaller positive class. The estimator is seeded so the scores are
# reproducible, and cv is spelled out (5 is also the library default).
cross_val_score(
    RandomForestClassifier(class_weight="balanced", random_state=42),
    X,
    y,
    cv=5,
)

array([0.98534799, 0.98412698, 0.92673993, 0.86568987, 0.76772616])

#### Save the trained model to disk so it can be loaded and run later

In [None]:
import pickle 

In [None]:
# Persist the trained random forest so it can be reused without retraining.
# The model fitted in this notebook is `model1`; the previously referenced
# `KNN_model` is never defined here and raised a NameError.
# The `with` block guarantees the file is flushed and closed after writing.
# NOTE: model1 was fitted on standardized features, so anything that loads this
# file must apply the same fitted `scale` transform before calling predict().
filename = 'slider_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model1, model_file)

In [None]:
# Load the model back from disk and run it on the standardized test features.
# The `with` block closes the file handle once the model has been read.
# NOTE: pickle.load can execute arbitrary code - only load files you created
# or otherwise trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
result = loaded_model.predict(X_test_stand)