<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-Libraries" data-toc-modified-id="Import-Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import Libraries</a></span></li><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load Data</a></span></li><li><span><a href="#Models" data-toc-modified-id="Models-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Models</a></span><ul class="toc-item"><li><span><a href="#IsolationForest" data-toc-modified-id="IsolationForest-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>IsolationForest</a></span></li><li><span><a href="#LocalOutlierFactor" data-toc-modified-id="LocalOutlierFactor-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>LocalOutlierFactor</a></span></li><li><span><a href="#OneClassSVM" data-toc-modified-id="OneClassSVM-3.3"><span class="toc-item-num">3.3&nbsp;&nbsp;</span>OneClassSVM</a></span></li><li><span><a href="#DBSCAN" data-toc-modified-id="DBSCAN-3.4"><span class="toc-item-num">3.4&nbsp;&nbsp;</span>DBSCAN</a></span></li></ul></li></ul></div>

## Import Libraries

In [277]:
import os
import plotly.express as px
import numpy as np
import pandas as pd
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objects as go

init_notebook_mode(connected = True)
import warnings
warnings.filterwarnings('ignore')

from sklearn.cluster import DBSCAN
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.cluster import KMeans

## Load Data

In [315]:
# Data source: https://github.com/numenta/NAB/blob/master/data/realKnownCause/
# The CSV location can be overridden with the EC2_LATENCY_CSV environment
# variable so the notebook is not tied to one machine's drive layout; when the
# variable is unset the original path is used unchanged.
DATA_PATH = os.environ.get(
    "EC2_LATENCY_CSV",
    "E:/github/data/ec2_request_latency_system_failure.csv",
)
data = pd.read_csv(DATA_PATH)

In [258]:
# Preview the loaded frame; the rendered output abbreviates the middle rows.
data

Unnamed: 0,timestamp,value
0,2014-03-07 03:41:00,45.868
1,2014-03-07 03:46:00,47.606
2,2014-03-07 03:51:00,42.580
3,2014-03-07 03:56:00,46.030
4,2014-03-07 04:01:00,44.992
...,...,...
4027,2014-03-21 03:21:00,25.352
4028,2014-03-21 03:26:00,38.216
4029,2014-03-21 03:31:00,22.864
4030,2014-03-21 03:36:00,66.260


In [259]:
# Count missing entries per column (isna is the canonical spelling of isnull).
data.isna().sum()

timestamp    0
value        0
dtype: int64

In [260]:
# Scatter every observation against its timestamp to eyeball the raw series.
fig = px.scatter(x=data["timestamp"], y=data["value"]).update_layout(
    title="Data Distribution"
)
fig.show()

## Models

### IsolationForest

In [261]:
# Fit the isolation forest on the loaded values. The previous version fitted on
# `train`, a name that is not defined anywhere in this notebook, so the cell
# raised NameError on a fresh kernel (Restart & Run All).
# scikit-learn expects a 2-D (n_samples, n_features) input, hence the
# single-column selection converted to an array.
clf = IsolationForest(
    random_state=0,
    n_estimators=10,
    warm_start=True,
).fit(data[["value"]].to_numpy())

In [262]:
# Show the raw predictions for the loaded values (+1 = inlier, -1 = outlier).
# The previous version predicted on `test`, which is never defined in this
# notebook and therefore raised NameError on a fresh kernel.
clf.predict(data[["value"]].to_numpy())

array([ 1, -1,  1, ..., -1, -1, -1])

In [263]:
# Store the current model's prediction for every row in the 'outlier' column.
# reshape(-1, 1) turns the value column into the (n_samples, 1) matrix that
# predict() expects.
data["outlier"] = clf.predict(data["value"].to_numpy().reshape(-1, 1))

In [264]:
# Colour each observation by the label stored in data.outlier.
fig = px.scatter(x=data.timestamp, y=data.value, color=data.outlier)
# Every other model plot in this notebook carries a title; add one here so the
# figure can be read on its own.
fig.update_layout(title="Outlier detection using IsolationForest (1 for inliers, -1 for outliers)")
fig.show()

Reference: [scikit-learn `IsolationForest` documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html)
    

`predict` returns +1 for inliers and -1 for outliers.

> Based on the isolation forest model, the outliers are predicted well in this case

### LocalOutlierFactor

In [265]:
from sklearn.neighbors import LocalOutlierFactor
# LocalOutlierFactor only exposes predict() when it is created with
# novelty=True (novelty-detection mode).

In [266]:
# Build the LOF estimator in novelty mode and fit it on the value column,
# reshaped to the (n_samples, 1) matrix scikit-learn expects. The fitted
# estimator is the last expression, so the cell displays its repr.
clf = LocalOutlierFactor(novelty=True, n_neighbors=2)
clf.fit(data["value"].to_numpy().reshape(-1, 1))

LocalOutlierFactor(n_neighbors=2, novelty=True)

In [267]:
# Label every row with LOF in outlier-detection mode (+1 inlier, -1 outlier).
# scikit-learn documents that an estimator created with novelty=True must only
# be asked to predict() on new, unseen data: calling it on its own training
# samples gives results that differ from fit_predict(). The rows being labelled
# here are the same rows the model is fitted on, so a default (novelty=False)
# estimator with the same n_neighbors is fitted and scored in one fit_predict()
# call instead.
lof_detector = LocalOutlierFactor(n_neighbors=2)
data['LocalOutlierFactor'] = lof_detector.fit_predict(data[["value"]].to_numpy())

In [268]:
# Scatter of the series, coloured by the LocalOutlierFactor column.
fig = px.scatter(
    x=data["timestamp"],
    y=data["value"],
    color=data["LocalOutlierFactor"],
)
fig.update_layout(
    title="LocalOutlierFactor (Negative scores represent outliers, positive scores represent inliers)"
)
fig.show()

### OneClassSVM

In [269]:
# Fit a one-class SVM on the loaded values, label every row, and plot the
# result. The previous version fitted on `train`, which is not defined anywhere
# in this notebook and raised NameError on a fresh kernel; the same
# (n_samples, 1) matrix is now used for both fitting and prediction.
svm_values = data[["value"]].to_numpy()
clf = OneClassSVM(gamma='auto')
clf.fit(svm_values)
data['OneClassSVM'] = clf.predict(svm_values)
fig = px.scatter(x=data.timestamp, y=data.value, color=data.OneClassSVM)
fig.update_layout(title = "Outlier detection using OneClassSVM (1 for inliers, -1 for outliers)")
fig.show()

### DBSCAN

In [323]:
# DBSCAN-based outlier detection on the value column.
dbscan_values = data[["value"]].to_numpy()

# The Mahalanobis metric needs the inverse covariance matrix (VI); scikit-learn
# raises an error when neither V nor VI is supplied. np.cov collapses a single
# feature to a scalar, so atleast_2d restores the 1x1 matrix before inversion.
inverse_covariance = np.linalg.inv(
    np.atleast_2d(np.cov(dbscan_values, rowvar=False))
)

outlier_detection = DBSCAN(
    eps=.2,
    metric="mahalanobis",
    metric_params={"VI": inverse_covariance},
    min_samples=50,
    n_jobs=-1,
)

# The previous version called fit_predict on `clusters`, a name that is never
# defined; the estimator built above is `outlier_detection`.
data['DBSCAN'] = outlier_detection.fit_predict(dbscan_values)

# DBSCAN labels noise points -1 and numbers clusters from 0 upwards, so the
# earlier `!= 0` test would also have tagged every cluster after the first.
labels = [
    " outlier->                  " if label == -1 else ""
    for label in data['DBSCAN']
]

fig = px.scatter(x=data.timestamp, y=data.value,
                 text=labels,
                 color=data.DBSCAN)
# The axes show the raw value against its timestamp, not an outlier score
# against an observation index, so the axis titles say so.
fig.update_layout(title = "Outlier predictions from DBSCAN using mahalanobis distance (System Latency failure data)",
                 yaxis_title="Value",
                 xaxis_title="Timestamp")
fig.show()