# Anomaly Detection using Isolation Forest

**Setup**

In [1]:
from azureml.core import Workspace, Datastore, Dataset

from azureml.core.experiment import Experiment

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from pandas.plotting import parallel_coordinates

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np

#import seaborn as sns
from sklearn.ensemble import IsolationForest


In [2]:
# Connect to the Azure ML workspace described by the local config file.
ws = Workspace.from_config("./config/config_2")
# Look up the named datastore in that workspace (not used by the cells visible below).
az_store = Datastore.get(ws, 'azureml_ds_b01')
# Fetch the dataset that the rest of the notebook analyses.
az_dataset = Dataset.get_by_name(ws, "Petrosani_01")
#az_dataset_Turda = Dataset.get_by_name(ws, "Turda_01")
# Workspace default datastore (also unused by the cells visible below).
az_default_store = ws.get_default_datastore()
# Materialise the dataset as a pandas DataFrame; `df` is the raw input for later cells.
df = az_dataset.to_pandas_dataframe()
#df_Turda_ = az_dataset_Turda.to_pandas_dataframe()

In [None]:
# Remove the two identifier columns before modelling; `df` itself is left untouched.
# `columns=` replaces the positional axis argument (`df.drop([...], 1)`), which
# pandas deprecated in 1.3 and removed in 2.0, where the old call raises TypeError.
df_clean = df.drop(columns=['Temp_Id', 'Location_Id'])
df_clean

# Isolation Forest fitting & adjusting

In [None]:
# Fit an Isolation Forest on the four sensor columns of the cleaned frame.
# random_state is pinned so that Restart & Run All reproduces the same trees,
# and therefore the same scores and anomaly labels, on every run.
model = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.1,  # share of rows the decision threshold is tuned to flag
    max_features=1.0,
    random_state=42,
)
model.fit(df_clean[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])

**Assign labels and prediction values**

In [None]:
# Score every row with the fitted forest: `scores` holds the decision-function
# value and `anomaly` the label returned by predict (-1 marks an outlier).
feature_frame = df_clean[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']]
df_clean['scores'] = model.decision_function(feature_frame)
df_clean['anomaly'] = model.predict(feature_frame)
df_clean.head(20)

**Observe total anomalies**

In [None]:
# Keep only the rows labelled -1 and remember their index labels.
is_flagged = df_clean['anomaly'] == -1
anomaly = df_clean.loc[is_flagged]
anomaly_index = list(anomaly.index)
print(anomaly[['FanSpeed', 'Temp_Out', 'Temp_Floor', 'anomaly', 'scores']])

**Isolate anomalies based on conditions and other criteria**

In [None]:
#outliers_counter = len(df[df['FanSpeed'] > 80 & df['Temp_Room'] > 20 & (df['Temp_Room'] > (df['Temp_Out'] + 10))])
# Count the rows whose decision score is negative.
negative_score_rows = df_clean[df_clean['scores'] < 0]
outliers_counter = len(negative_score_rows)
outliers_counter

In [None]:
# Narrow the negative-score rows to those with a running fan, a positive
# controller state and Temp_Out below 26, then remove the controller states
# listed in the loop below.
state_number = df_clean['ControllerStateNumber']
keep_rows = (
    (df_clean['scores'] < 0)
    & (df_clean['FanSpeed'] > 0)
    & (state_number > 0)
    & (df_clean['Temp_Out'] < 26)
)
for excluded_state in (71.0, 71.1, 71.4, 72.0, 73.0, 74.0, 74.4):
    keep_rows = keep_rows & (state_number != excluded_state)

scores = df_clean.loc[keep_rows]
scores

In [None]:
# Stricter cut: scores below -0.05 combined with Temp_Out below 25.
# Rebinds `scores`, replacing the selection made in the previous cell.
strict_mask = (df_clean['scores'] < -0.05) & (df_clean['Temp_Out'] < 25)
scores = df_clean.loc[strict_mask]
scores[['anomaly']].count()

In [None]:
# Standardise score, label and fan speed for rows with a negative score,
# Temp_Out below 25 and a running fan.
scaled = df_clean.loc[
    (df_clean['scores'] < 0) & (df_clean['Temp_Out'] < 25) & (df_clean['FanSpeed'] > 0)
]
scaled = scaled[['scores', 'anomaly', 'FanSpeed']]
# Fit the named scaler itself. Previously a second, throwaway StandardScaler
# was fitted, which left `scaler` unfitted and unusable for
# transform / inverse_transform afterwards.
scaler = StandardScaler()
X = scaler.fit_transform(scaled)
X

# Accuracy calculations

In [None]:
# Ratio (in percent) of rows labelled -1 to rows with a negative decision score.
# NOTE(review): no ground-truth labels are involved, so this measures how well
# the two model outputs agree rather than accuracy in the usual sense.
flagged_count = int((df_clean['anomaly'] == -1).sum())
# Computed here instead of read from `outliers_counter`, because a later cell
# rebinds that name to a different quantity (rows scoring above 0.1).
negative_score_count = int((df_clean['scores'] < 0).sum())
# Guard against ZeroDivisionError when no row has a negative score.
accuracy = 100 * flagged_count / negative_score_count if negative_score_count else float('nan')
print("Accuracy percentage:", accuracy)

In [None]:
# Share (in percent) of labelled rows that are present in the current `scores` selection.
selected_rows = scores['anomaly'].count()
labelled_rows = df_clean['anomaly'].count()
result = (selected_rows / labelled_rows) * 100
print("Percentage of anomalies:", result)

In [None]:
# Rebinds `outliers_counter`: it now holds the number of rows scoring above 0.1.
high_score_rows = df_clean[df_clean['scores'] > 0.1]
outliers_counter = len(high_score_rows)
outliers_counter

# Isolation Forest for March, April and May

In [213]:
# First and last calendar day of each month under study.
marchstart = '2021-03-01'
marchend = '2021-03-31'
#
aprilstart = '2021-04-01'
aprilend = '2021-04-30'
#
maystart = '2021-05-01'
mayend = '2021-05-31'
# Exclusive upper bound for the May window.
junestart = '2021-06-01'

# Build one boolean mask per month using half-open windows
# [month start, next month start). The previous `> start` / `<= end` bounds
# dropped rows at the edges: records stamped exactly at the start bound and,
# when Date_Id carries a time of day, everything after midnight on the last day.
# NOTE(review): assumes Date_Id is datetime-like or an ISO-formatted string - confirm.
maskmarch = (df['Date_Id'] >= marchstart) & (df['Date_Id'] < aprilstart)
maskapril = (df['Date_Id'] >= aprilstart) & (df['Date_Id'] < maystart)
maskmay = (df['Date_Id'] >= maystart) & (df['Date_Id'] < junestart)

# Take explicit copies so later cells can add columns to the monthly frames
# without writing into a slice of `df` (avoids SettingWithCopyWarning).
df_march = df.loc[maskmarch].copy()
df_april = df.loc[maskapril].copy()
df_may = df.loc[maskmay].copy()


In [None]:
# march
# Isolation Forest fitted on the March rows only. random_state is pinned so
# the fitted trees, and every score derived from them, are reproducible.
model_march = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.1,
    max_features=1.0,
    random_state=42,
)
model_march.fit(df_march[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])

In [None]:
# april
# Isolation Forest fitted on the April rows only. random_state is pinned so
# the fitted trees, and every score derived from them, are reproducible.
model_april = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.1,
    max_features=1.0,
    random_state=42,
)
model_april.fit(df_april[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])


In [None]:

# may
# Isolation Forest fitted on the May rows only. random_state is pinned so
# the fitted trees, and every score derived from them, are reproducible.
model_may = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.1,
    max_features=1.0,
    random_state=42,
)
model_may.fit(df_may[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']])

In [None]:
# march
# Score the March rows with the March model. `assign` returns a new frame
# rather than writing columns into `df_march` in place; `df_march` was produced
# by filtering `df`, and in-place column writes on such a frame can raise
# SettingWithCopyWarning.
march_features = df_march[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']]
df_march = df_march.assign(
    scores=model_march.decision_function(march_features),
    anomaly=model_march.predict(march_features),
)

In [None]:
# april
# Score the April rows with the April model. `assign` returns a new frame
# rather than writing columns into `df_april` in place; `df_april` was produced
# by filtering `df`, and in-place column writes on such a frame can raise
# SettingWithCopyWarning.
april_features = df_april[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']]
df_april = df_april.assign(
    scores=model_april.decision_function(april_features),
    anomaly=model_april.predict(april_features),
)

In [None]:
# may
# Score the May rows with the May model. `assign` returns a new frame
# rather than writing columns into `df_may` in place; `df_may` was produced
# by filtering `df`, and in-place column writes on such a frame can raise
# SettingWithCopyWarning.
may_features = df_may[['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor']]
df_may = df_may.assign(
    scores=model_may.decision_function(may_features),
    anomaly=model_may.predict(may_features),
)

In [231]:
# Put the three monthly results side by side, one prefixed column group per month.
#
# The monthly frames keep the row labels they had in `df`, and the months cover
# different rows. Assigning them column-by-column into one frame aligns on those
# labels, so after March fixed the index, the April and May columns came out as
# all NaN. Resetting each index first lines the months up by row position
# instead; shorter months are padded with NaN at the bottom.
result_columns = ['FanSpeed', 'Temp_Room', 'Temp_Out', 'Temp_Floor', 'scores', 'anomaly']
monthly_frames = [('March', df_march), ('April', df_april), ('May', df_may)]

df_total = pd.concat(
    [
        frame[result_columns].reset_index(drop=True).add_prefix(month + '-')
        for month, frame in monthly_frames
    ],
    axis=1,
)


In [None]:
# Display the combined March / April / May frame built in the previous cell.
df_total