<a href="https://colab.research.google.com/github/StudentHagal/Thesis/blob/main/Runtime_Evaluation_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0) Preparation

## Load Requirements

In [1]:
# Libraries
import sys
import numpy as np
import pandas as pd
import sklearn
import joblib
import hashlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from timeit import default_timer as timer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA


#Extension
!pip install ipython-autotime
%load_ext autotime
!git clone https://github.com/StudentHagal/Thesis.git

print('Load completed')

Collecting ipython-autotime
  Downloading ipython_autotime-0.3.1-py2.py3-none-any.whl (6.8 kB)
Installing collected packages: ipython-autotime
Successfully installed ipython-autotime-0.3.1
Cloning into 'Thesis'...
remote: Enumerating objects: 255, done.[K
remote: Counting objects: 100% (255/255), done.[K
remote: Compressing objects: 100% (244/244), done.[K
remote: Total 255 (delta 90), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (255/255), 16.94 MiB | 3.47 MiB/s, done.
Resolving deltas: 100% (90/90), done.
Load completed
time: 12.3 s (started: 2022-01-10 13:52:19 +00:00)


In [2]:
# Report the interpreter and library versions used for this run
# !!! WARNING !!!
# Model results may differ when the notebook runs under other versions

print('Python: {}'.format(sys.version))
for lib_name, lib in (('numpy', np), ('pandas', pd),
                      ('sklearn', sklearn), ('joblib', joblib)):
    print('{}: {}'.format(lib_name, lib.__version__))

Python: 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]
numpy: 1.19.5
pandas: 1.1.5
sklearn: 1.0.1
joblib: 1.1.0
time: 7.05 ms (started: 2022-01-10 13:52:31 +00:00)


## Global Variables

In [3]:
#Global variables declaration

#Raw dataset file
raw_file = "/content/Thesis/Resources/database.csv"
#Directories for stored models and processed data
model_dir = '/content/Thesis/Resources/models/'
process_dir = '/content/Thesis/Resources/processed/'

#Column names for dataset (one logical group per line)
col_names = [
    'No', 'Datetime', 'Control_Mode',
    'L1', 'L2', 'L3', 'Frequency',
    'kVA_Total', 'kVA_L1', 'kVA_L2', 'kVA_L3', 'KVAr',
    'Oil_Pressure', 'Coolant_Temp', 'Charger_Alternator',
    'PF_Avg', 'PF_L1', 'PF_L2', 'PF_L3',
    'L1_N', 'L2_N', 'L3_N',
    'Source_Ext_Voltage', 'ECU_Temp', 'RPM', 'Train_code',
]

#Columns used as model features
features = ['Oil_Pressure', 'Coolant_Temp', 'PF_Avg', 'ECU_Temp']

#Unused columns in dataset (every entry of col_names that is not a feature)
unused_col = [
    'No', 'Datetime', 'Control_Mode',
    'L1', 'L2', 'L3', 'Frequency',
    'kVA_Total', 'kVA_L1', 'kVA_L2', 'kVA_L3', 'KVAr',
    'Charger_Alternator', 'RPM',
    'PF_L1', 'PF_L2', 'PF_L3',
    'L1_N', 'L2_N', 'L3_N',
    'Source_Ext_Voltage', 'Train_code',
]


time: 10.2 ms (started: 2022-01-10 13:52:31 +00:00)


## Function Declaration

In [4]:
# Function that returns the SHA-2 hash of the file
def hash_file(filepath, chunk_size=1024):
    """Return the SHA-256 hex digest of the file at ``filepath``.

    The file is read in binary mode in pieces of ``chunk_size`` bytes, so
    large files are hashed without loading them into memory at once.

    Parameters
    ----------
    filepath : str or path-like
        Location of the file to hash.
    chunk_size : int, optional
        Number of bytes read per iteration (default 1024). Must be positive.

    Returns
    -------
    str
        Hexadecimal representation of the SHA-256 digest.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not a positive integer.
    OSError
        If the file cannot be opened or read.
    """
    if chunk_size <= 0:
        # read(0) returns b'' immediately and read(-1) reads the whole file;
        # neither is chunked reading, so reject them explicitly.
        raise ValueError('chunk_size must be a positive integer')

    digest = hashlib.sha256()

    with open(filepath, 'rb') as file:
        # iter() with a sentinel stops as soon as read() returns b'' (EOF)
        for chunk in iter(lambda: file.read(chunk_size), b''):
            digest.update(chunk)

    return digest.hexdigest()


time: 6.47 ms (started: 2022-01-10 13:52:31 +00:00)


# 1) Prediction Test

## Generate Random Data

In [5]:
#Runtime testing dataset
# Draw N_SAMPLES values per feature, uniformly distributed between 0 and a
# per-feature upper bound. A seeded generator is used so that every
# Restart & Run All benchmarks the models on exactly the same input.

N_SAMPLES = 3000000   # number of rows in the runtime-test dataset
SEED = 42             # fixed seed for reproducible random data
rng = np.random.default_rng(SEED)

s_oil = rng.uniform(low=0.0, high=700.0, size=N_SAMPLES)
s_coolant = rng.uniform(low=0.0, high=100.0, size=N_SAMPLES)
s_pf = rng.uniform(low=0.0, high=1.5, size=N_SAMPLES)
s_ecu = rng.uniform(low=0.0, high=75.0, size=N_SAMPLES)

# Column names and order match the feature columns used throughout the notebook
df_random = pd.DataFrame({'Oil_Pressure':s_oil, 'Coolant_Temp':s_coolant,
                          'PF_Avg':s_pf, 'ECU_Temp':s_ecu})
df_random

Unnamed: 0,Oil_Pressure,Coolant_Temp,PF_Avg,ECU_Temp
0,459.847976,55.079949,0.773934,32.694955
1,368.172066,31.582689,0.086697,20.542290
2,371.308868,67.882821,0.750865,29.305062
3,452.428671,59.691615,1.463705,14.121522
4,659.838613,45.932712,1.051276,31.413218
...,...,...,...,...
2999995,335.962051,94.569793,0.225025,55.355323
2999996,552.783977,4.010791,0.239621,51.583666
2999997,172.470790,65.491847,0.599189,59.251903
2999998,444.600232,51.262029,1.192304,31.602411


time: 315 ms (started: 2022-01-10 13:52:31 +00:00)


## Runtime test

### ML model

In [6]:
#Import and load Model
# NOTE(review): joblib.load unpickles arbitrary Python objects, so only load
# model files that come from a trusted source.
Opsi1_LR = joblib.load(model_dir+'Opsi1_LR')
Opsi1_KNN = joblib.load(model_dir+'Opsi1_KNN')
Opsi1_DT = joblib.load(model_dir+'Opsi1_DT')
Opsi1_SVM = joblib.load(model_dir+'Opsi1_SVM')
Opsi1_RF = joblib.load(model_dir+'Opsi1_RF')
Opsi1_LDA = joblib.load(model_dir+'Opsi1_LDA')
Opsi1_NB = joblib.load(model_dir+'Opsi1_NB')
Opsi3_LR = joblib.load(model_dir+'Opsi3_LR')
Opsi3_KNN = joblib.load(model_dir+'Opsi3_KNN')
Opsi3_DT = joblib.load(model_dir+'Opsi3_DT')
Opsi3_SVM = joblib.load(model_dir+'Opsi3_SVM')
Opsi3_RF = joblib.load(model_dir+'Opsi3_RF')
Opsi3_LDA = joblib.load(model_dir+'Opsi3_LDA')
Opsi3_NB = joblib.load(model_dir+'Opsi3_NB')

# Collect (display name, model) pairs so every model is timed by the same loop
models = [
  ('LR_Opsi1', Opsi1_LR),
  ('KNN_Opsi1', Opsi1_KNN),
  ('DT_Opsi1', Opsi1_DT),
  ('SVM_Opsi1', Opsi1_SVM),
  ('RF_Opsi1', Opsi1_RF),
  ('LDA_Opsi1', Opsi1_LDA),
  ('NB_Opsi1', Opsi1_NB),
  ('LR_Opsi3', Opsi3_LR),
  ('KNN_Opsi3', Opsi3_KNN),
  ('DT_Opsi3', Opsi3_DT),
  ('SVM_Opsi3', Opsi3_SVM),
  ('RF_Opsi3', Opsi3_RF),
  ('LDA_Opsi3', Opsi3_LDA),
  ('NB_Opsi3', Opsi3_NB),
]

#Loop for runtime evaluation of ML model
model_predictions = []   # one prediction array per model, in `models` order
names = []               # display names, aligned with model_predictions
runtimes = []            # prediction wall-clock time in seconds, aligned with names

for name, model in models:
  # Time only the predict() call; bookkeeping and console output happen
  # after the clock is stopped so they do not inflate the measurement.
  start = timer() #Start measure Runtime
  pred = model.predict(df_random)  #predict result from model
  end = timer() #End measure Runtime
  duration = end-start

  model_predictions.append(pred)
  names.append(name)
  runtimes.append(duration)
  print('%s Predict Completed!' % (name))
  print('Runtime (s) : %f' % (duration))



LR_Opsi1 Predict Completed!
Runtime (s) : 0.218718
KNN_Opsi1 Predict Completed!
Runtime (s) : 118.010537
DT_Opsi1 Predict Completed!
Runtime (s) : 0.210087
SVM_Opsi1 Predict Completed!
Runtime (s) : 1351.848275
RF_Opsi1 Predict Completed!
Runtime (s) : 24.397070
LDA_Opsi1 Predict Completed!
Runtime (s) : 0.178461
NB_Opsi1 Predict Completed!
Runtime (s) : 0.360470
LR_Opsi3 Predict Completed!
Runtime (s) : 0.136936
KNN_Opsi3 Predict Completed!
Runtime (s) : 114.893142
DT_Opsi3 Predict Completed!
Runtime (s) : 0.098962
SVM_Opsi3 Predict Completed!
Runtime (s) : 748.047621
RF_Opsi3 Predict Completed!
Runtime (s) : 18.266464
LDA_Opsi3 Predict Completed!
Runtime (s) : 0.136169
NB_Opsi3 Predict Completed!
Runtime (s) : 0.357519
time: 39min 37s (started: 2022-01-10 13:52:31 +00:00)


### Rule-based

In [7]:
#Rule-Based_Statistical
# Every feature is compared against fixed thresholds: its lower/upper quartile
# (q25, q75) and the fences q25 - 1.5*IQR and q75 + 1.5*IQR, with IQR = q75 - q25.

#Stat_Outlier = (value < q25 - (q75-q25) * 1.5) or (value > q75 + (q75-q25) * 1.5)
Outlier_Oil = (df_random['Oil_Pressure'] < 544.000) | (df_random['Oil_Pressure'] > 672.000)
Outlier_Coolant = (df_random['Coolant_Temp'] < 67.000) | (df_random['Coolant_Temp'] > 91.000)
Outlier_ECU = (df_random['ECU_Temp'] < 18.000) | (df_random['ECU_Temp'] > 66.000)
Outlier_PFA = (df_random['PF_Avg'] < 0.615) | (df_random['PF_Avg'] > 1.055)

#Stat_Maintenance = (q25 - (q75-q25) * 1.5 <= value <= q25) or (q75 <= value <= q75 + (q75-q25) * 1.5)
Maintenance_Oil = (df_random['Oil_Pressure'].between(544.000, 592.00)) | (df_random['Oil_Pressure'].between(624.00, 672.000))
Maintenance_Coolant = (df_random['Coolant_Temp'].between(67.000, 76.00)) | (df_random['Coolant_Temp'].between(82.00, 91.000))
Maintenance_ECU = (df_random['ECU_Temp'].between(18.000, 36.00)) | (df_random['ECU_Temp'].between(48.00, 66.000))
Maintenance_PFA = (df_random['PF_Avg'].between(0.615, 0.78)) | (df_random['PF_Avg'].between(0.89, 1.055))

#Stat_Normal = q25 < value < q75
# Written as explicit strict comparisons: the boolean form between(..., inclusive=False)
# is deprecated since pandas 1.3 and rejected by pandas 2.x, while the string
# form ('neither') is not understood by the pandas 1.1 this notebook was run on.
Normal_Oil = (df_random['Oil_Pressure'] > 592.00) & (df_random['Oil_Pressure'] < 624.00)
Normal_Coolant = (df_random['Coolant_Temp'] > 76.00) & (df_random['Coolant_Temp'] < 82.00)
Normal_ECU = (df_random['ECU_Temp'] > 36.00) & (df_random['ECU_Temp'] < 48.00)
Normal_PFA = (df_random['PF_Avg'] > 0.78) & (df_random['PF_Avg'] < 0.89)

#Label Condition
# Outlier: ALL four features are outside their fences.
# Maintenance: AT LEAST ONE feature lies in a maintenance band.
Outlier =  Outlier_Oil & Outlier_Coolant & Outlier_ECU & Outlier_PFA
Maintenance = Maintenance_Oil | Maintenance_Coolant |  Maintenance_ECU | Maintenance_PFA

# The per-feature outlier and maintenance ranges do not overlap, so no row can
# satisfy both conditions and the order of the two assignments below is irrelevant.
assert not (Outlier & Maintenance).any(), 'Outlier and Maintenance rules overlap'

#Check conditions for each row in dataframe
# NOTE(review): 'Normal' is the default for every row that is neither Outlier
# nor Maintenance; the Normal_* masks above are not used for labelling. Rows
# where only some features are outliers therefore stay 'Normal' - confirm this
# matches the intended rule.
df_RB_Statistical = df_random.copy()
df_RB_Statistical['Label'] = 'Normal'
df_RB_Statistical.loc[Outlier, 'Label'] = 'Outlier'
df_RB_Statistical.loc[Maintenance, 'Label'] = 'Maintenance'

# class distribution
print(df_RB_Statistical.groupby('Label').size())
print('Total sample : %d' % len(df_RB_Statistical))


Label
Maintenance    2140962
Normal          385479
Outlier         473559
dtype: int64
Total sample : 3000000
time: 637 ms (started: 2022-01-10 14:32:09 +00:00)


In [8]:
#Rule-based_Clustering

#Label Condition based on P1 and P2 Component from PCA result and clusters_dbscan
#need to pre-process features data with PCA
# NOTE(review): the scaler and the PCA are fitted on the random data of this
# notebook. If the 0.25 / 0.2 thresholds below were derived from a PCA fitted on
# the original dataset, the components computed here are not the same axes -
# confirm whether the fitted transformers should be loaded instead.

# Scaling and Normalize data
features_norm = ['Oil_Pressure_Norm', 'Coolant_Temp_Norm', 'PF_Avg_Norm', 'ECU_Temp_Norm']
X_scaled = StandardScaler().fit_transform(df_random)         #Bring all the attributes to a comparable level (Z-score normalization)
X_normalized = normalize(X_scaled)                            #Rescale every row (sample) to unit L2 norm (sklearn default)
X_normalized = pd.DataFrame(X_normalized, columns=features_norm)     #Converting the numpy array into a pandas DataFrame

# fit transform data
pca = PCA(n_components = 4)
PCA_comp = ['P1', 'P2', 'P3', 'P4'] #set Px value based on the numbers of features
X_transform = pca.fit_transform(X_normalized)
X_transform = pd.DataFrame(X_transform, columns=PCA_comp)

# Select two PCA components with highest variation (Dimensionality reduction)
df_cluster = pd.DataFrame(dict(x=X_transform['P1'], y=X_transform['P2']))

#cluster_0 = P1 > 0.25 AND P2 < 0.2
cluster_0 = (df_cluster['x'] > 0.25) & (df_cluster['y'] < 0.2)
#cluster_1 = P1 > 0.25 AND P2 > 0.2
cluster_1 = (df_cluster['x'] > 0.25) & (df_cluster['y'] > 0.2)
#cluster_2 = P1 <= 0.25 AND P2 <= 0.2
cluster_2 = (df_cluster['x'] <= 0.25) & (df_cluster['y'] <= 0.2)
# NOTE(review): rows with P1 <= 0.25 AND P2 > 0.2 (and P1 > 0.25 with P2 == 0.2)
# match none of the three rules and keep a NaN label - confirm this is intended.

#Add label to dataset
df_RB_Clustering = df_RB_Statistical.copy()
df_RB_Clustering['Label_cluster'] = np.nan
df_RB_Clustering.loc[cluster_0, 'Label_cluster'] = 0
df_RB_Clustering.loc[cluster_1, 'Label_cluster'] = 1
df_RB_Clustering.loc[cluster_2, 'Label_cluster'] = 2

# Label distribution result
# groupby() drops NaN keys, so the per-cluster counts do not add up to the
# total; the unlabeled rows are reported separately to make that visible.
print(df_RB_Clustering.groupby('Label_cluster').size())
print('Total sample : %d' % len(df_RB_Clustering))
print('Unlabeled sample : %d' % df_RB_Clustering['Label_cluster'].isna().sum())

Label_cluster
0.0     636020
1.0     367968
2.0    1257810
dtype: int64
Total sample : 3000000
time: 1.16 s (started: 2022-01-10 14:32:09 +00:00)


## Results

In [9]:
# Runtime summary: one row per model, with its prediction time in seconds
df_eval = pd.DataFrame(dict(Model=names, Runtime=runtimes))
df_eval

Unnamed: 0,Model,Runtime
0,LR_Opsi1,0.218718
1,KNN_Opsi1,118.010537
2,DT_Opsi1,0.210087
3,SVM_Opsi1,1351.848275
4,RF_Opsi1,24.39707
5,LDA_Opsi1,0.178461
6,NB_Opsi1,0.36047
7,LR_Opsi3,0.136936
8,KNN_Opsi3,114.893142
9,DT_Opsi3,0.098962


time: 14.6 ms (started: 2022-01-10 14:32:11 +00:00)


In [10]:
# One column per model, one row per sample. Building the frame column-wise
# from a dict avoids creating a (models x samples) frame and transposing it,
# which forces every value into an object column and is very slow at this size.
df_pred = pd.DataFrame(dict(zip(names, model_predictions)))

# class distribution
for col in df_pred.columns :
  print(df_pred.groupby(col).size())

# Rule-Based statistical distribution result
print(df_RB_Statistical.groupby('Label').size())
print('Total sample : %d' % len(df_RB_Statistical))
# Rule-Based clustering distribution result
print(df_RB_Clustering.groupby('Label_cluster').size())
print('Total sample : %d' % len(df_RB_Clustering))

LR_Opsi1
Maintenance    3000000
dtype: int64
KNN_Opsi1
Maintenance    2287424
Normal          699748
Outlier          12828
dtype: int64
DT_Opsi1
Maintenance    2688304
Normal          220849
Outlier          90847
dtype: int64
SVM_Opsi1
Maintenance    2998668
Normal            1329
Outlier              3
dtype: int64
RF_Opsi1
Maintenance    2842592
Normal          130453
Outlier          26955
dtype: int64
LDA_Opsi1
Maintenance    3000000
dtype: int64
NB_Opsi1
Maintenance    3000000
dtype: int64
LR_Opsi3
2    3000000
dtype: int64
KNN_Opsi3
2    3000000
dtype: int64
DT_Opsi3
2    3000000
dtype: int64
SVM_Opsi3
2    3000000
dtype: int64
RF_Opsi3
2    3000000
dtype: int64
LDA_Opsi3
2    3000000
dtype: int64
NB_Opsi3
2    3000000
dtype: int64
Label
Maintenance    2140962
Normal          385479
Outlier         473559
dtype: int64
Total sample : 3000000
Label_cluster
0.0     636020
1.0     367968
2.0    1257810
dtype: int64
Total sample : 3000000
time: 4min 7s (started: 2022-01-10 14:32:11 