# Model Comparison
    - One-Class SVM
    - Isolation Forest
    - Local Outlier Factor
    - Novelty Local Outlier Factor

In [6]:
import warnings
import numpy as np 
import pandas as pd 

from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,f1_score, recall_score, precision_score

from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

warnings.filterwarnings('ignore')

df_train = pd.read_csv('train.csv')

- Data Pre-processing

In [7]:
df_train['is_anomaly'] = np.where(df_train['is_anomaly'] == False, 0, 1) 
df_train["Is_anomaly"] = df_train['is_anomaly'].values

df_train.drop('is_anomaly', axis = 1 ,inplace = True)

X_train = df_train[['timestamp','value','predicted']]
y_train = df_train['Is_anomaly']
df_train.head(10)

Unnamed: 0,timestamp,value,predicted,Is_anomaly
0,1425008573,42,44.0725,0
1,1425008873,41,50.70939,0
2,1425009173,41,81.40512,0
3,1425009473,61,39.950367,0
4,1425009773,44,35.35016,0
5,1425010073,27,27.713638,0
6,1425010373,37,41.54571,0
7,1425010673,36,38.74362,0
8,1425010973,49,40.859787,0
9,1425011273,36,25.444733,0


- Model Comparison

In [3]:
anomaly_detectors = {  
    "Isolation Forest Detector"   :IsolationForest(n_estimators  = 100, 
                                                   max_samples   = 50, 
                                                   contamination = 0.03, 
                                                   bootstrap     = True, 
                                                   random_state  = 99),
    "Local Outlier Factor"        :LocalOutlierFactor(n_neighbors = 2, 
                                                      algorithm   = 'brute', 
                                                      leaf_size   = 10, 
                                                      metric      = 'minkowski',
                                                      novelty     = False, 
                                                      p           = 1, 
                                                      metric_params = None, 
                                                      contamination = 0.03,
                                                      n_jobs        = -1),
    "Novelty Local Outlier Factor":LocalOutlierFactor(n_neighbors = 2, 
                                                      algorithm   = 'brute', 
                                                      leaf_size   = 10, 
                                                      metric      = 'minkowski',
                                                      novelty     = True, 
                                                      p           = 1, 
                                                      metric_params = None, 
                                                      contamination = 0.03,
                                                      n_jobs        = -1),
    "One-Class SVM Detector"      :OneClassSVM(kernel= 'rbf', 
                                               gamma = 0.001 ,
                                               nu    = 0.001)}

In [4]:
for i, (name, model) in enumerate(anomaly_detectors.items()):
    if name == 'Isolation Forest Detector':
        model.fit(X_train)
        y_pred = model.predict(X_train)
    elif name == "Local Outlier Factor":
        y_pred = model.fit_predict(X_train)
    elif name == "Novelty Local Outlier Factor":
        model.fit(X_train, y_train)
        y_pred = model.predict(X_train)
    elif name == "One-Class SVM Detector":
        model.fit(X_train)
        y_pred = model.predict(X_train)
    else:    
        break
        
    #Update values, 1 for Anomalies
    y_pred[y_pred ==  1] = 0
    y_pred[y_pred == -1] = 1
    num_errors = (y_pred != y_train).sum()

    print("--------------{}--------------".format(name))
    print("Total errors: {}".format(num_errors))
    print("Classification Report :")
    print(classification_report(y_train,y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_train, y_pred))

--------------Isolation Forest Detector--------------
Total errors: 1033
Classification Report :
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     15054
           1       0.23      0.14      0.17       776

    accuracy                           0.93     15830
   macro avg       0.59      0.56      0.57     15830
weighted avg       0.92      0.93      0.93     15830

Confusion Matrix:
[[14688   366]
 [  667   109]]
--------------Local Outlier Factor--------------
Total errors: 921
Classification Report :
              precision    recall  f1-score   support

           0       0.96      0.98      0.97     15054
           1       0.35      0.21      0.26       776

    accuracy                           0.94     15830
   macro avg       0.65      0.60      0.62     15830
weighted avg       0.93      0.94      0.94     15830

Confusion Matrix:
[[14744   310]
 [  611   165]]
--------------Novelty Local Outlier Factor--------------
Tot

- Test Data Analysis

In [5]:
df_test  = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,timestamp,value,predicted
0,1396332000,20.0,20.0
1,1396332300,20.0,20.0
2,1396332600,20.0,20.0
3,1396332900,20.0,20.0
4,1396333200,20.0,20.0


In [6]:
X_test = df_test

- Specify Novelty Detector

In [7]:
IRF_detector = IsolationForest(n_estimators  = 100, 
                                                   max_samples   = len(X_train), 
                                                   contamination = 0.03, 
                                                   bootstrap     = True, 
                                                   random_state  = 99).fit(X_train.values)
IRF_predictions = pd.Series(IRF_detector.predict(X_test.values))

- Inliers  1
- Outliers -1

In [8]:
IRF_predictions.value_counts()

 1    3951
-1       9
dtype: int64

In [9]:
data={"timestamp":[],"is_anomaly":[]}
for id,pred in zip(df_test["timestamp"].unique(), IRF_predictions):
    data["timestamp"].append(id)
    data["is_anomaly"].append(pred)


In [10]:
output = pd.DataFrame(data,columns=["timestamp","is_anomaly"])
output

Unnamed: 0,timestamp,is_anomaly
0,1396332000,1
1,1396332300,1
2,1396332600,1
3,1396332900,1
4,1396333200,1
...,...,...
3955,1397518500,1
3956,1397518800,1
3957,1397519100,1
3958,1397519400,1
