# Anomali Detection using Isolation Forest

**Setup**

In [1]:
from azureml.core import Workspace, Datastore, Dataset

from azureml.core.experiment import Experiment

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from pandas.plotting import parallel_coordinates

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np

#import seaborn as sns
from sklearn.ensemble import IsolationForest


In [2]:
ws = Workspace.from_config("./config/config_2")
az_store = Datastore.get(ws, 'azureml_ds_b01')
az_dataset = Dataset.get_by_name(ws, "Petrosani_01")
#az_dataset_Turda = Dataset.get_by_name(ws, "Turda_01")
az_default_store = ws.get_default_datastore()
df = az_dataset.to_pandas_dataframe()
#df_Turda_ = az_dataset_Turda.to_pandas_dataframe()

In [3]:
df_clean = df.drop(['Temp_Id', 'Location_Id'], 1)
df_clean

  """Entry point for launching an IPython kernel.


Unnamed: 0,Date_Id,CoolingSource_Id,Temp_Out,Temp_Floor,Temp_Room,FanSpeed,ControllerStateNumber
0,2020-12-31 23:00:00,0,5.6,16.3,24.7,57,7.0
1,2020-12-31 23:02:00,0,5.6,16.0,24.8,58,7.0
2,2020-12-31 23:04:00,0,5.6,15.7,24.9,59,7.0
3,2020-12-31 23:06:00,0,5.8,15.4,24.7,57,7.0
4,2020-12-31 23:08:00,0,5.5,15.1,24.7,58,7.0
...,...,...,...,...,...,...,...
333624,2022-05-18 14:20:00,0,17.7,18.3,26.2,63,6.0
333625,2022-05-18 14:22:00,0,17.8,18.4,26.2,63,6.0
333626,2022-05-18 14:24:00,0,17.9,18.3,26.2,63,6.0
333627,2022-05-18 14:26:00,0,18.9,18.8,26.2,63,6.0


# Isolation Forest fitting & adjusting

In [16]:
model=IsolationForest(n_estimators=500, max_samples='auto', contamination=float(0.1),max_features=1.0)
model.fit(df_clean[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])

  "X does not have valid feature names, but"


IsolationForest(contamination=0.1, n_estimators=500)

**Assign labels and prediction values**

In [17]:
df_clean['scores']=model.decision_function(df_clean[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])
df_clean['anomaly']=model.predict(df_clean[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])
df_clean.head(20)

Unnamed: 0,Date_Id,CoolingSource_Id,Temp_Out,Temp_Floor,Temp_Room,FanSpeed,ControllerStateNumber,scores,anomaly
0,2020-12-31 23:00:00,0,5.6,16.3,24.7,57,7.0,0.167976,1
1,2020-12-31 23:02:00,0,5.6,16.0,24.8,58,7.0,0.174815,1
2,2020-12-31 23:04:00,0,5.6,15.7,24.9,59,7.0,0.173038,1
3,2020-12-31 23:06:00,0,5.8,15.4,24.7,57,7.0,0.16447,1
4,2020-12-31 23:08:00,0,5.5,15.1,24.7,58,7.0,0.16201,1
5,2020-12-31 23:10:00,0,5.3,14.8,24.6,56,7.0,0.152098,1
6,2020-12-31 23:12:00,0,5.6,14.5,24.6,56,7.5,0.143716,1
7,2020-12-31 23:14:00,0,5.5,14.2,25.7,67,7.5,0.131474,1
8,2020-12-31 23:16:00,0,5.5,14.1,25.7,66,7.5,0.138259,1
9,2020-12-31 23:18:00,0,5.6,14.2,25.7,67,7.5,0.130903,1


**Observe total anomalies**

In [18]:
anomaly=df_clean.loc[df_clean['anomaly']==-1]
anomaly_index=list(anomaly.index)
print(anomaly[['FanSpeed', 'Temp_Out', 'Temp_Floor','anomaly', 'scores']])

        FanSpeed  Temp_Out  Temp_Floor  anomaly    scores
968           39      -2.3        19.0       -1 -0.057370
969           16      -2.3        19.2       -1 -0.098225
970           16      -1.9        18.8       -1 -0.102677
971           16      -1.3        18.6       -1 -0.097823
972           16      -0.5        18.7       -1 -0.092963
...          ...       ...         ...      ...       ...
332946         0      22.0        25.3       -1 -0.066416
332947         0      21.3        27.4       -1 -0.087497
332948         0      21.1        27.8       -1 -0.092061
332949         0      20.9        28.1       -1 -0.102992
332950        70      19.9        26.3       -1 -0.085254

[33363 rows x 5 columns]


**Isolate anomalies based on conditions and other criteria**

In [19]:
#outliers_counter = len(df[df['FanSpeed'] > 80 & df['Temp_Room'] > 20 & (df['Temp_Room'] > (df['Temp_Out'] + 10))])
outliers_counter = len(df_clean[df_clean['scores'] < 0 ])
outliers_counter

33363

In [20]:
scores=df_clean.loc[((df_clean['scores'] < 0) & (df_clean['FanSpeed'] > 0) & (df_clean['ControllerStateNumber'] > 0) \
    & (df_clean['ControllerStateNumber'] != 71.0) & (df_clean['Temp_Out'] < 26)
    & (df_clean['ControllerStateNumber'] != 71.1)
    & (df_clean['ControllerStateNumber'] != 72.0)
    & (df_clean['ControllerStateNumber'] != 73.0)
    & (df_clean['ControllerStateNumber'] != 74.0)
    & (df_clean['ControllerStateNumber'] != 74.4)
    & (df_clean['ControllerStateNumber'] != 71.4)
    )]
    
scores

Unnamed: 0,Date_Id,CoolingSource_Id,Temp_Out,Temp_Floor,Temp_Room,FanSpeed,ControllerStateNumber,scores,anomaly
983,2021-01-02 07:46:00,1,-0.9,18.0,21.1,22,9.5,-0.082650,-1
984,2021-01-02 07:48:00,1,-1.9,16.6,20.3,15,9.5,-0.085031,-1
985,2021-01-02 07:50:00,1,-2.2,16.3,19.9,15,9.5,-0.084558,-1
986,2021-01-02 07:52:00,1,-2.2,15.9,19.5,15,9.5,-0.082637,-1
987,2021-01-02 07:54:00,0,-2.6,10.3,21.6,26,9.1,-0.104069,-1
...,...,...,...,...,...,...,...,...,...
332912,2022-05-17 14:31:00,2,21.7,21.3,28.6,79,15.1,-0.032422,-1
332913,2022-05-17 14:33:00,2,22.7,21.3,28.6,79,15.1,-0.033507,-1
332914,2022-05-17 14:35:00,2,21.7,21.3,28.6,79,15.1,-0.032422,-1
332915,2022-05-17 14:37:00,1,21.5,23.3,27.7,70,24.2,-0.015860,-1


In [9]:
scores=df_clean.loc[((df_clean['scores'] < -0.05) & (df_clean['Temp_Out'] < 25) )]
scores[['anomaly']].count()

anomaly    5239
dtype: int64

In [10]:
scaled = df_clean.loc[((df_clean['scores'] < -0) & (df_clean['Temp_Out'] < 25) & (df_clean['FanSpeed'] > 0) )]
scaled = scaled[['scores', 'anomaly', 'FanSpeed']]
scaler = StandardScaler()
X = StandardScaler().fit_transform(scaled)
X

array([[-0.69346035,  0.        , -1.06546084],
       [-2.4638674 ,  0.        , -2.017024  ],
       [-2.43690287,  0.        , -2.017024  ],
       ...,
       [ 0.25062278,  0.        ,  0.58943162],
       [ 0.02097192,  0.        ,  0.21708081],
       [-2.20565908,  0.        ,  0.21708081]])

# Accuracy calculations

In [11]:
accuracy = 100*list(df_clean['anomaly']).count(-1)/(outliers_counter)
print("Accuracy percentage:", accuracy)

Accuracy percentage: 100.0


In [12]:
result = (scores['anomaly'].count()/df_clean['anomaly'].count()) * 100
print("Percentage of anomalies:", result)

Percentage of anomalies: 1.5703071375689763


In [15]:
outliers_counter = len(df_clean[df_clean['scores'] < 0])
outliers_counter

33363

# Isolation Forest for March, April and May

In [14]:
marchstart = '2021-03-01'
marchend = '2021-03-31'
#
aprilstart = '2021-04-01'
aprilend = '2021-04-30'
#
maystart = '2021-05-01'
mayend = '2021-05-31'

# create features
maskmarch = (df['Date_Id'] > marchstart) & (df['Date_Id'] <= marchend)
maskapril = (df['Date_Id'] > aprilstart) & (df['Date_Id'] <= aprilend)
maskmay = (df['Date_Id'] > maystart) & (df['Date_Id'] <= mayend)



df_march = df.loc[maskmarch]
df_april = df.loc[maskapril]
df_may = df.loc[maskmay]


In [None]:
# march
model_march = IsolationForest(n_estimators=100, max_samples='auto', contamination=float(0.1),max_features=1.0)
model_march.fit(df_march[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])

In [None]:
# april
model_april = IsolationForest(n_estimators=100, max_samples='auto', contamination=float(0.1),max_features=1.0)
model_april.fit(df_april[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])


In [None]:

# may
model_may = IsolationForest(n_estimators=100, max_samples='auto', contamination=float(0.1),max_features=1.0)
model_may.fit(df_may[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])

In [None]:
# march
df_march['scores']=model_march.decision_function(df_march[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])
df_march['anomaly']=model_march.predict(df_march[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])

In [None]:
# april
df_april['scores']=model_april.decision_function(df_april[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])
df_april['anomaly']=model_april.predict(df_april[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])

In [None]:
# may
df_may['scores']=model_may.decision_function(df_may[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])
df_may['anomaly']=model_may.predict(df_may[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])

In [231]:
df_total = pd.DataFrame()

df_total[['March-FanSpeed', 'March-Temp_Room', 'March-Temp_Out', 'March-Temp_Floor', 'March-scores', 'March-anomaly']] = df_march[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor', 'scores', 'anomaly']]
df_total[['April-FanSpeed', 'April-Temp_Room', 'April-Temp_Out', 'April-Temp_Floor', 'April-scores', 'April-anomaly']] = df_april[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor', 'scores', 'anomaly']]
df_total[['May-FanSpeed', 'May-Temp_Room', 'May-Temp_Out', 'May-Temp_Floor', 'May-scores', 'May-anomaly']] = df_may[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor', 'scores', 'anomaly']]


In [None]:
df_total