# Anomaly detection in time series

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
# Suppress every warning from here on (no category or module filter is given).
# NOTE: this also hides warnings that may point at real problems.
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Default size (width, height in inches) for every figure created below.
plt.rcParams.update({"figure.figsize": (9, 6)})

## Read the data
The dataset can be downloaded from `GitHub`

The labels can be obtained from `GitHub`

This is real-life data on the CPU utilization of an EC2 instance in the AWS cloud. Data was recorded every 5 minutes, starting on February 14th, 2014 at 14:30. The dataset contains 4032 data points (14 days of readings at that rate). It is available through the Numenta Anomaly Benchmark (NAB) repository under the AGPL-3.0 license.

In [None]:
# Load the CPU-utilization CSV from the local data folder and preview it.
df = pd.read_csv(filepath_or_buffer='./data/ec2_cpu_utilization.csv')
df.head()

In [None]:
# Ground-truth anomaly timestamps for the ec2_cpu_utilization_24ae8d dataset,
# copied from the labels file linked above.
anomalies_timestamp = ["2014-02-26 22:05:00", "2014-02-27 17:15:00"]

In [None]:
# Convert the timestamp column to pandas datetimes so it can be compared
# against the labelled anomaly times.
df['timestamp'] = df['timestamp'].pipe(pd.to_datetime)

df.head()

In [None]:
# Start with every row labelled as an inlier (1) ...
df['is_anomaly'] = 1

# ... then mark the rows whose timestamp matches a known anomaly with -1.
anomaly_mask = df['timestamp'].isin(pd.to_datetime(anomalies_timestamp))
df.loc[anomaly_mask, 'is_anomaly'] = -1

df.sample(10)

In [None]:
# Separate the labelled anomalies (-1) from the inliers (1) for plotting.
anomaly_df = df[df['is_anomaly'].eq(-1)]
inlier_df = df[df['is_anomaly'].eq(1)]

In [None]:
fig, ax = plt.subplots()

# The x-axis is the DataFrame row index, not the timestamp column.
# Inliers are drawn as small blue dots, labelled anomalies as default-size red dots.
for subset, style in (
    (inlier_df, {'color': 'blue', 's': 3, 'label': 'Inlier'}),
    (anomaly_df, {'color': 'red', 'label': 'Anomaly'}),
):
    ax.scatter(subset.index, subset['value'], **style)

ax.set(xlabel='Time', ylabel='CPU usage')
ax.legend(loc='upper left')  # same position as the numeric code 2

ax.grid(False)
fig.autofmt_xdate()
fig.tight_layout()

## Baseline: median absolute deviation (MAD)

In [None]:
import seaborn as sns

# Kernel density estimate of the CPU values.
kde_ax = sns.kdeplot(df['value'])
kde_ax.grid(False)

# Dashed full-height marker at x = 0.134.
# NOTE(review): presumably the median of the series - confirm against the
# value printed by the MAD cell.
kde_ax.axvline(x=0.134, ymin=0, ymax=1, color='black', linestyle='--')
plt.tight_layout()

In [None]:
from scipy.stats import median_abs_deviation

# Robust centre and spread of the CPU values; both are read as globals by
# compute_robust_z_score.
median = np.median(df['value'])
mad = median_abs_deviation(df['value'])

print(median, mad, sep='\n')

def compute_robust_z_score(x):
    """Return the robust (modified) z-score of ``x``.

    The distance of ``x`` from the module-level ``median`` is scaled by
    0.6745 and divided by the module-level ``mad``. The constant 0.6745 is
    approximately the 75th percentile of the standard normal distribution,
    the usual scaling for MAD-based z-scores.

    Args:
        x: A number (or any object supporting the same arithmetic, such as
            a NumPy array or pandas Series).

    Returns:
        ``0.6745 * (x - median) / mad``.
    """
    deviation = x - median
    return 0.6745 * deviation / mad

In [None]:
# Score the whole column in one vectorized call: compute_robust_z_score is
# plain arithmetic, so it accepts a Series directly and no per-row apply is
# needed. The resulting values are the same as with Series.apply.
df['z-score'] = compute_robust_z_score(df['value'])

df.sample(10)

In [None]:
# Baseline prediction: inlier (1) by default, anomaly (-1) when the robust
# z-score is at least 3.5 in magnitude (either direction).
df['baseline'] = 1
df.loc[df['z-score'].abs() >= 3.5, 'baseline'] = -1

### Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Compare the true labels with the MAD baseline over the full dataset.
# Classes are ordered inlier (1) first, anomaly (-1) second.
cm = confusion_matrix(
    y_true=df['is_anomaly'],
    y_pred=df['baseline'],
    labels=[1, -1],
)

disp_cm = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[1, -1])
disp_cm.plot();

plt.grid(False)
plt.tight_layout()

## Isolation forest

In [None]:
from sklearn.ensemble import IsolationForest

In [None]:
# Positional split that keeps row order (no shuffling): the first 3550 rows
# are used for fitting, the remainder for evaluation.
train, test = df.iloc[:3550], df.iloc[3550:]

In [None]:
# Expected outlier fraction, set to one point out of the training slice.
# NOTE(review): this assumes exactly one labelled anomaly falls inside
# `train` - confirm against the split.
contamination = 1 / train.shape[0]

# Single feature as a 2-D (n_samples, 1) array, the shape the estimator is fit on.
X_train = train[['value']].to_numpy()

iso_forest = IsolationForest(random_state=42, contamination=contamination)

iso_forest.fit(X_train)

In [None]:
# Predict on the held-out rows, passing 'value' as a 2-D column array.
preds_iso_forest = iso_forest.predict(test[['value']].to_numpy())

### Evaluation

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Compare the true labels with the isolation forest predictions on the test
# rows. Classes are ordered inlier (1) first, anomaly (-1) second.
cm = confusion_matrix(
    y_true=test['is_anomaly'],
    y_pred=preds_iso_forest,
    labels=[1, -1],
)

disp_cm = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[1, -1])
disp_cm.plot();

plt.grid(False)
plt.tight_layout()

## Local outlier factor (LOF)

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# novelty=True makes predict() available for data not seen during fit;
# the contamination value is reused from the isolation forest cell.
lof = LocalOutlierFactor(novelty=True, contamination=contamination)

lof.fit(X=X_train)

In [None]:
# Predict on the held-out rows, passing 'value' as a 2-D column array.
preds_lof = lof.predict(test[['value']].to_numpy())

In [None]:
# Compare the true labels with the LOF predictions on the test rows.
# Classes are ordered inlier (1) first, anomaly (-1) second.
# Relies on confusion_matrix / ConfusionMatrixDisplay imported in an earlier cell.
cm = confusion_matrix(
    y_true=test['is_anomaly'],
    y_pred=preds_lof,
    labels=[1, -1],
)

disp_cm = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[1, -1])

disp_cm.plot();