<a href="https://colab.research.google.com/github/VictoKu1/Anomaly_Detection/blob/master/Task1_Anomaly_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Anomaly Detection**

---

**Moriya Bitton || Victor Kushnir**

In [8]:
# Imports


import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


## **Part 1 - Data** 

---



---



In [11]:
from google.colab import files

data_to_load = files.upload()
target_to_load = files.upload()

ModuleNotFoundError: No module named 'google'

In [None]:
# Reading the data and fixing columns names

import io

df = pd.read_csv(io.BytesIO(data_to_load['conn_attack.csv']), 
                 names=["record_id", "duration_", "src_bytes", "dst_bytes"], 
                 index_col='record_id')
columns = df.columns

target = pd.read_csv(io.BytesIO(target_to_load['conn_attack_anomaly_labels.csv']), 
                 names=["id", "label"], 
                 index_col='id')

In [None]:
df.head()

In [None]:
target.head()

##### **Shape**

In [None]:
df.shape

In [None]:
columns.shape

In [None]:
target.shape

##### **Analyzing the data frame** 

In [None]:
df.info()

In [None]:
df.describe()

##### **Data visualization**

In [None]:
fig, axes = plt.subplots(ncols=3, figsize=(18, 6))

# Data distributed only on the "duration_" feature 
sns.scatterplot(data=df.duration_, ax=axes[0]).set(title="duration_")

# Data distributed only on the "src_bytes" feature 
sns.scatterplot(data=df.src_bytes, ax=axes[1]).set(title="src_bytes")

# Data distributed only on the "dst_bytes" feature 
sns.scatterplot(data=df.dst_bytes, ax=axes[2]).set(title="dst_bytes")

## **Part 2 - Possible labels to compare**

---



---



##### **Assumption**

Essentially, we want to find ways to locate outliers without relying on any model to find the most appropriate model.


I found that the clustering algorithm will probably work because the data is mostly clustered in the center and only a small amount is scattered far away from it. As a result, isolation forests won't work, but DB SCAN will.

In [None]:
q = df.duration_.quantile(0.99)
q_low_1 = df.duration_.quantile(0.01)
q_hi_1 = df.duration_.quantile(0.99)

q = df.src_bytes.quantile(0.99)
q_low_2 = df.src_bytes.quantile(0.01)
q_hi_2  = df.src_bytes.quantile(0.99)

q = df.duration_.quantile(0.99)
q_low_3 = df.dst_bytes.quantile(0.01)
q_hi_3  = df.dst_bytes.quantile(0.99)

# creating the guess
guess = np.zeros_like(df.src_bytes)

guess[(df.duration_>q_hi_1) | (df.duration_<q_low_1) | 
      (df.src_bytes>q_hi_2) | (df.src_bytes<q_low_2) | 
      (df.dst_bytes>q_hi_3) | (df.dst_bytes<q_low_3)] = 1 

## **Part 3 - Models**


---



---




In [None]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [None]:
# Finding important features

df1 = df.drop(columns=columns[0])
df2 = df.drop(columns=columns[1])
df3 = df.drop(columns=columns[2])

df4 = df.drop(columns=[columns[0], columns[1]])
df5 = df.drop(columns=[columns[0], columns[2]])
df6 = df.drop(columns=[columns[1], columns[2]])

### **Part 3.1 - Isolation Forest**


In [None]:
from sklearn.ensemble import IsolationForest

# Find if Isolation Forest might have good prediction
def run_if_models(max_f_, df):
  isf_prediction = []
  for i in range(100, 1000, 100):
      isf = IsolationForest(n_estimators=i, max_features=max_f_, n_jobs=-1)
      isf_labels = pd.Series(isf.fit_predict(df))
      isf_labels = isf_labels.map({1: 0, -1: 1})
      isf_prediction.append([i, f1_score(y_true=guess, y_pred=isf_labels)])

  sns.lineplot(x=np.array(isf_prediction)[:, 0], y=np.array(isf_prediction)[:, 1])  

In [None]:
# Running the model using all features

run_if_models(3, df)

In [None]:
# Running the model without "duration"

run_if_models(2, df1)

In [None]:
# Running the model without "src_bytes"

run_if_models(2, df2)

In [None]:
# Running the model without "dst_bytes"

run_if_models(2, df3)

In [None]:
# Running the model without "duration" , "src_bytes"

run_if_models(1, df4)

In [None]:
# Running the model without "duration" , "dst_bytes"

run_if_models(1, df5)

In [None]:
# Running the model without "src_bytes" , "dst_bytes"

run_if_models(1, df6)

### **Part 3.2 - DBSCAN**

In [None]:
from sklearn.cluster import DBSCAN

# Mapping function to fit the correct labels
def _map(x):
    if x == -1:
        return 0
    return 1

# find if DBSCAN might have good prediction.
def run_dbs_models(df):
  dbs_prediction = []
  for i in range(50, 300, 50):
      dbs = DBSCAN(min_samples=i)
      dbs_labels = pd.Series(dbs.fit_predict(df))
      dbs_labels = dbs_labels.apply(lambda x: _map(x))
      dbs_prediction.append([i, f1_score(y_true=guess, y_pred=dbs_labels)])

  sns.lineplot(x=np.array(dbs_prediction)[:, 0], y=np.array(dbs_prediction)[:, 1])

In [None]:
# Running the model using all features

run_dbs_models(df)

In [None]:
# Running the model without "duration"

run_dbs_models(df1)

In [None]:
# Running the model without "src_bytes"

run_dbs_models(df2)

In [None]:
# Running the model without "dst_bytes"

run_dbs_models(df3)

In [None]:
# Running the model without "duration" , "src_bytes"

run_dbs_models(df4)

In [None]:
# Running the model without "duration" , "dst_bytes"

run_dbs_models(df5)

In [None]:
# Running the model without "src_bytes" , "dst_bytes"

# run_dbs_models(df6) --> Memory error!

## **Part 4 - Model evaluation and comparison**

---



In [None]:
from sklearn.metrics import plot_confusion_matrix

In [None]:
DBS = DBSCAN(min_samples=150, n_jobs=-1)
DBS_labels = pd.Series(DBS.fit_predict(df)).apply(lambda x: _map(x))

##### **Evaluation:** y_guess & y_DB_Scan

In [None]:
confusion_matrix(y_true=guess, y_pred=DBS_labels)

In [None]:
print(classification_report(y_true=guess, y_pred=DBS_labels))

##### **Evaluation:** y_guess & y_true

In [None]:
confusion_matrix(y_true=guess, y_pred=target.label)

In [None]:
print(classification_report(y_true=guess, y_pred=target.label))

##### **Evaluation:** y_DB_Scan & y_true

In [None]:
confusion_matrix(y_true=DBS_labels, y_pred=target.label)

In [None]:
print(classification_report(y_true=DBS_labels, y_pred=target.label))

## **Part 5 - Export the output**

---


In [None]:
DBS_labels = DBS_labels.set_axis(target.index)
df["is_anomaly"] = DBS_labels

output = df.drop(columns=["duration_", "src_bytes", "dst_bytes"])

In [None]:
output.info()

In [None]:
output.head()

In [None]:
output.describe()

In [None]:
output.corr()

In [None]:
output.to_csv('output.csv', encoding='utf-8', index=False)

## **Part 6 - Conclusion**

---

*   According to the F_scores, the isolation forest is not the right solution.

*   DBSCAN does fit this problem

*   While "src_bytes" is the most important feature, utilizing all features helps detect outliers more effectively.

*   Our guesses weren't accurate.

*   There is a high correlation between the real label and the prediction, which means that the prediction is very close to the real label. 

