## Anomaly Detection Using Isolation Forest and Local Outlier Factor

<div style="text-align: right"> By Smit Doshi (001475186) </div>

#### Importing Required Libraries

In [1]:
import pandas as pd
from scipy.io import arff
from io import StringIO
import numpy as np
from IPython.display import Markdown as md

In [2]:
#### Loading the Dataset

In [3]:
# Read the ARFF file (path is relative to the notebook's working directory).
# The two-element result is unpacked into the record data and `meta`;
# `meta` is not used by any of the cells visible in this notebook.
data, meta = arff.loadarff('Dataset_Challenge6.arff')

In [4]:
# Wrap the parsed ARFF records in a DataFrame, then preview the first five rows.
df = pd.DataFrame(data=data)
df.head(n=5)

Unnamed: 0,command_address,response_address,command_memory,response_memory,command_memory_count,response_memory_count,comm_read_function,comm_write_fun,resp_read_fun,resp_write_fun,...,cycletime,rate,setpoint,control_mode,control_scheme,pump,solenoid,crc_rate,measurement,time
0,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,0.0,0.0,0.0,0.528736,1.106868
1,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,0.0,0.0,0.0,0.54023,1.043098
2,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,0.0,0.0,0.0,0.54023,1.266332
3,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,0.0,0.0,0.0,0.54023,1.11628
4,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,0.0,0.0,0.0,0.528736,1.257916


In [5]:
# Print each column's name, non-null count and dtype to check for missing
# values and non-numeric columns before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97019 entries, 0 to 97018
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   command_address        97019 non-null  float64
 1   response_address       97019 non-null  float64
 2   command_memory         97019 non-null  float64
 3   response_memory        97019 non-null  float64
 4   command_memory_count   97019 non-null  float64
 5   response_memory_count  97019 non-null  float64
 6   comm_read_function     97019 non-null  float64
 7   comm_write_fun         97019 non-null  float64
 8   resp_read_fun          97019 non-null  float64
 9   resp_write_fun         97019 non-null  float64
 10  sub_function           97019 non-null  float64
 11  command_length         97019 non-null  float64
 12  resp_length            97019 non-null  float64
 13  gain                   97019 non-null  float64
 14  reset                  97019 non-null  float64
 15  de

In [6]:
# Summary statistics for every column.
# NOTE(review): in the output, `measurement` has min/max on the order of 1e38
# while its quartiles lie between roughly 0.2 and 5.3 — this looks like
# sentinel or corrupt readings; confirm before relying on distance-based methods.
df.describe()

Unnamed: 0,command_address,response_address,command_memory,response_memory,command_memory_count,response_memory_count,comm_read_function,comm_write_fun,resp_read_fun,resp_write_fun,...,cycletime,rate,setpoint,control_mode,control_scheme,pump,solenoid,crc_rate,measurement,time
count,97019.0,97019.0,97019.0,97019.0,97019.0,97019.0,97019.0,97019.0,97019.0,97019.0,...,97019.0,97019.0,97019.0,97019.0,97019.0,97019.0,97019.0,97019.0,97019.0,97019.0
mean,4.585226,3.719436,182.918099,216.65717,8.990012,16.737464,3.045878,10.0,2.449963,9.298591,...,1.0,0.0,24.166607,0.899659,0.992146,0.056381,0.027366,0.0,-2.7725819999999997e+34,1.139626
std,8.951157,1.021543,3.674067,59.504855,0.282658,4.596942,0.724514,0.0,0.893052,2.553856,...,0.0,0.0,14.322356,0.991003,0.088275,0.230657,0.163148,0.0,1.818736e+36,0.091356
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,1.0,0.0,...,1.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,-1.51532e+38,1.0
25%,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,1.0,10.0,...,1.0,0.0,20.0,0.0,1.0,0.0,0.0,0.0,0.2183908,1.057822
50%,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,0.0,1.0,0.0,0.0,0.0,0.3218391,1.138303
75%,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,0.0,0.0,0.0,5.258621,1.219292
max,249.0,4.0,255.0,233.0,9.0,18.0,19.0,10.0,3.0,10.0,...,1.0,0.0,90.0,2.0,1.0,1.0,1.0,0.0,2.299564e+38,1.299998


#### Importing Machine Learning Libraries

In [7]:
from sklearn.ensemble import IsolationForest 
from sklearn.neighbors import LocalOutlierFactor

#### Initializing the Isolation Forest Algorithm

In [8]:
# Isolation Forest with a fixed seed so the fitted trees are reproducible.
# n_jobs=-1 asks for all available cores; verbose=1 prints the joblib
# progress log while fitting.
model = IsolationForest(
    n_jobs=-1,
    random_state=101,
    verbose=1,
)

In [9]:
# Fit the forest on every column of df (unsupervised: no labels are passed).
# The call is the cell's last expression, so the estimator repr is displayed.
model.fit(df)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 out of   8 | elapsed:    0.6s remaining:    1.8s
[Parallel(n_jobs=8)]: Done   8 out of   8 | elapsed:    0.7s finished


IsolationForest(n_jobs=-1, random_state=101, verbose=1)

#### Displaying all the outliers

In [10]:
# Select the rows that the fitted Isolation Forest labels -1 (outlier).
df.loc[model.predict(df) == -1]

Unnamed: 0,command_address,response_address,command_memory,response_memory,command_memory_count,response_memory_count,comm_read_function,comm_write_fun,resp_read_fun,resp_write_fun,...,cycletime,rate,setpoint,control_mode,control_scheme,pump,solenoid,crc_rate,measurement,time
721,135.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,0.0,0.0,0.0,0.597701,1.291552
1505,130.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,0.0,0.0,0.0,0.632184,1.040217
1510,132.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,0.0,0.0,0.0,0.609195,1.231605
1516,137.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,0.0,0.0,0.0,0.597701,1.024334
1542,147.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,0.0,0.0,0.0,0.620690,1.019239
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89893,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,90.0,0.0,1.0,0.0,0.0,0.0,0.264368,1.011546
90081,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,90.0,0.0,1.0,0.0,0.0,0.0,0.264368,1.001464
90088,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,90.0,0.0,1.0,0.0,0.0,0.0,0.252874,1.025803
90209,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,90.0,0.0,1.0,0.0,0.0,0.0,0.229885,1.290393


In [11]:
# Count the rows labelled -1 straight from the boolean mask and render the
# total as Markdown.
n_outliers = (model.predict(df) == -1).sum()
md('The Number of Outliers Detected are {}.'.format(n_outliers))

The Number of Outliers Detected are 11770.

In [12]:
# Score every row with the fitted forest and keep the array for the threshold
# checks in the following cells, which treat scores <= -0.2 as the strongest
# anomalies.
anomaly_scores = model.decision_function(df)
anomaly_scores

array([0.14151037, 0.12747443, 0.1300129 , ..., 0.14617157, 0.15242374,
       0.14986186])

In [13]:
# Positional indices of the rows whose anomaly score is at or below -0.2.
# (Single-argument np.where is shorthand for nonzero on the condition.)
np.nonzero(anomaly_scores <= -0.2)

(array([73134, 78490, 79224]),)

In [14]:
# Show the rows whose Isolation Forest score is at or below -0.2.
# Reuses `anomaly_scores` (already computed from model.decision_function(df))
# instead of re-scoring every row, so this view is guaranteed to match the
# index list printed in the previous cell.
df[anomaly_scores <= -0.2]

Unnamed: 0,command_address,response_address,command_memory,response_memory,command_memory_count,response_memory_count,comm_read_function,comm_write_fun,resp_read_fun,resp_write_fun,...,cycletime,rate,setpoint,control_mode,control_scheme,pump,solenoid,crc_rate,measurement,time
73134,4.0,0.0,183.0,0.0,9.0,0.0,3.0,10.0,3.0,0.0,...,1.0,0.0,20.0,0.0,1.0,1.0,1.0,0.0,-1.5845629999999998e+29,1.001061
78490,4.0,0.0,183.0,0.0,9.0,0.0,3.0,10.0,3.0,0.0,...,1.0,0.0,20.0,0.0,1.0,1.0,1.0,0.0,0.0,1.000422
79224,4.0,0.0,183.0,0.0,9.0,0.0,3.0,10.0,3.0,0.0,...,1.0,0.0,20.0,0.0,1.0,1.0,1.0,0.0,0.0,1.000622


In [15]:
# Report how many rows score at or below -0.2.
# Counts directly from the cached `anomaly_scores` mask rather than re-running
# decision_function over the whole frame and building a filtered DataFrame.
n_strong_outliers = (anomaly_scores <= -0.2).sum()
md('The Number of Outliers Detected that have Average Anomaly scores less than or equal to -0.2 are {}.'.format(n_strong_outliers))

The Number of Outliers Detected that have Average Anomaly scores less than or equal to -0.2 are 3.

#### Initializing the Local Outlier Factor Algorithm

In [16]:
# Local Outlier Factor that compares each row with its 3 nearest neighbours;
# n_jobs=-1 asks for all available cores during the neighbour search.
LOF = LocalOutlierFactor(
    n_jobs=-1,
    n_neighbors=3,
)

In [17]:
# Fit LOF on every column of df (unsupervised: no labels are passed).
# NOTE(review): LOF is distance-based and df is passed unscaled; the
# describe() output earlier shows `measurement` values near +/-1e38, which
# would dominate neighbour distances — consider cleaning/scaling first.
LOF.fit(df)

LocalOutlierFactor(n_jobs=-1, n_neighbors=3)

#### Displaying all the outliers detected by the LOF algorithm

In [18]:
# Shape of the subset that LOF flags as outliers.
# LOF was already fitted on df in the previous cell, so the labels are derived
# from the stored scores instead of refitting the whole model: fit_predict
# marks a row -1 exactly when negative_outlier_factor_ < offset_.
df[LOF.negative_outlier_factor_ < LOF.offset_].shape

(10564, 26)

In [19]:
# Report how many rows LOF flags as outliers.
# Uses the scores stored by the earlier LOF.fit(df) rather than refitting:
# fit_predict marks a row -1 exactly when negative_outlier_factor_ < offset_.
n_lof_outliers = (LOF.negative_outlier_factor_ < LOF.offset_).sum()
md('The Number of Outliers Detected are {}.'.format(n_lof_outliers))

The Number of Outliers Detected are 10564.

#### Displaying all the outliers having LOF scores less than or equal to -40

In [20]:
# Select the rows whose stored LOF score is at or below -40.
df.loc[LOF.negative_outlier_factor_ <= -40]

Unnamed: 0,command_address,response_address,command_memory,response_memory,command_memory_count,response_memory_count,comm_read_function,comm_write_fun,resp_read_fun,resp_write_fun,...,cycletime,rate,setpoint,control_mode,control_scheme,pump,solenoid,crc_rate,measurement,time
7421,4.0,4.0,171.0,233.0,1.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,0.0,1.0,0.0,0.0,0.0,0.000000e+00,1.108957
15861,4.0,4.0,183.0,233.0,9.0,18.0,0.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,1.0,0.0,0.0,1.344828e+01,1.148771
15868,4.0,4.0,183.0,233.0,9.0,18.0,2.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,1.0,1.0,0.0,2.018391e+01,1.115493
16205,4.0,4.0,183.0,233.0,9.0,18.0,0.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,0.0,0.0,0.0,9.436782e+00,1.027672
16210,4.0,4.0,183.0,233.0,9.0,18.0,0.0,10.0,3.0,10.0,...,1.0,0.0,20.0,2.0,1.0,0.0,0.0,0.0,8.827586e+00,1.108582
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84878,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,0.0,1.0,0.0,0.0,0.0,-4.951374e+37,1.060862
84885,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,0.0,1.0,0.0,0.0,0.0,-2.814530e+24,1.077522
84900,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,20.0,0.0,1.0,0.0,0.0,0.0,-5.637325e+22,1.157789
85490,4.0,4.0,183.0,233.0,9.0,18.0,3.0,10.0,3.0,10.0,...,1.0,0.0,80.0,2.0,1.0,1.0,1.0,0.0,6.436781e-01,1.179524


In [21]:
# Count the rows whose stored LOF score is at or below -40 straight from the
# boolean mask and render the total as Markdown.
n_extreme_lof = (LOF.negative_outlier_factor_ <= -40).sum()
md('The Number of Outliers Having LOF scores less than or equal to -40 are {}.'.format(n_extreme_lof))

The Number of Outliers Having LOF scores less than or equal to -40 are 302.