In [1]:
# required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load a single day of netflow records.
# The file is read with header=None, so the column labels are supplied here
# and attached to the frame after loading.
file_name = 'netflow_day-03'
column_names = [
    'Time', 'Duration',
    'SrcDevice', 'DstDevice',
    'Protocol',
    'SrcPort', 'DstPort',
    'SrcPackets', 'DstPackets',
    'SrcBytes', 'DstBytes',
]
data = pd.read_csv(file_name, header=None)
data.columns = column_names

In [3]:
# Connections are reported cumulatively, so when several rows share the same
# key only the last of them is kept.
# Note: the key is 'Time' plus the five flow-identifying fields.
five_tuple = ['Time', 'SrcDevice', 'DstDevice', 'Protocol', 'SrcPort', 'DstPort']
is_superseded = data.duplicated(subset=five_tuple, keep='last')
data = data[~is_superseded]

# Untested alternative (possibly faster; the column to rank by is undecided).
# It keeps the row with the largest Duration per key rather than the last row,
# so it is only equivalent if the last row always has the largest Duration.
# data = data.loc[data.groupby(five_tuple).Duration.idxmax()]

In [4]:
# Remove every non-digit character from the port columns, then convert them
# to integers so they can be binned later.
#
# The cleaned Series is assigned back to the frame rather than calling
# replace(..., inplace=True) on data[col]. That chained in-place form acts on
# a temporary Series under pandas Copy-on-Write and leaves `data` untouched,
# and the resulting warning is hidden by warnings.filterwarnings('ignore').
for port_col in ('SrcPort', 'DstPort'):
    digits_only = data[port_col].replace(to_replace=r'\D', value=r'', regex=True)
    # Raises ValueError if a value has no digits at all (empty string left).
    data[port_col] = digits_only.astype(int)

In [5]:
# Remove all digits from 'SrcDevice' and 'DstDevice', keeping only the
# non-digit part of each device name.
#
# The result is assigned back to the frame rather than calling
# replace(..., inplace=True) on data[col]. That chained in-place form acts on
# a temporary Series under pandas Copy-on-Write and leaves `data` untouched.
for device_col in ('SrcDevice', 'DstDevice'):
    data[device_col] = data[device_col].replace(to_replace=r'\d', value=r'', regex=True)

In [6]:
# Replace the raw 'Time' values with the index of the 3600-wide interval they
# fall in.
# The edges start one unit before the earliest timestamp, so with the default
# left-sided search the earliest timestamp gets index 1, not 0.
start_time = data['Time'].min()
end_time = data['Time'].max()
hour_bins = np.arange(start_time - 1, end_time + 2, 3600)
hour_index = np.searchsorted(hour_bins, data['Time'].values, side='left')
data['Time'] = hour_index

In [7]:
# Bin both port columns into three categories (for now):
#   1 -> 0-1023, 2 -> 1024-49151, 3 -> 49152 and above
# Each edge is the inclusive upper bound of its category, because the default
# left-sided searchsorted returns the first edge >= the value.
port_bins = [-1, 1023, 49151]
for port_col in ('SrcPort', 'DstPort'):
    port_numbers = data[port_col].values
    data[port_col] = np.searchsorted(port_bins, port_numbers)

In [9]:
# 'Time', 'Protocol', 'SrcPort' and 'DstPort' become categorical variables
# for the model.
for cat_col in ('Time', 'Protocol', 'SrcPort', 'DstPort'):
    data[cat_col] = data[cat_col].astype('category')

# The device columns hold character data, so they are label-encoded:
#   1. cast to 'category' first (per the original note, this makes the
#      encoding step faster),
#   2. take the integer category codes as the encoding,
#   3. cast back to 'category', because the codes are plain integers.
for device_col in ('SrcDevice', 'DstDevice'):
    device_codes = data[device_col].astype('category').cat.codes
    data[device_col] = device_codes.astype('category')

In [18]:
# Preview up to the first 50 rows of the preprocessed frame as a sanity check
# on the binning and encoding steps.
data.head(50)

Unnamed: 0,Time,Duration,SrcDevice,DstDevice,Protocol,SrcPort,DstPort,SrcPackets,DstPackets,SrcBytes,DstBytes
0,1,0,1,1,6,2,1,0,5,0,784
1,1,0,1,1,17,3,1,1,0,77,0
2,1,0,1,1,6,2,3,6,5,1379,1770
3,1,0,1,1,17,3,1,1,0,64,0
4,1,0,1,1,17,3,1,1,0,64,0
5,1,0,1,1,6,2,3,4,2,186,94
6,1,0,1,1,6,3,2,3,4,164,216
7,1,0,1,1,17,3,1,1,1,59,59
8,1,0,1,0,6,3,1,5,0,2048,0
9,1,0,1,1,17,3,1,1,1,70,70


In [19]:
# Show the dtype of every column to confirm which ones were converted to
# 'category' and which ones remain numeric.
data.dtypes

Time          category
Duration         int64
SrcDevice     category
DstDevice     category
Protocol      category
SrcPort       category
DstPort       category
SrcPackets       int64
DstPackets       int64
SrcBytes         int64
DstBytes         int64
dtype: object

In [None]:
# isolation forest model
# The forest is built from random sub-samples and random splits. A fixed
# random_state makes the fitted model, and so the predictions, the same on
# every run of the notebook.
RANDOM_STATE = 42
clf = IsolationForest(
    n_estimators = 100,
    max_samples = 100000,
    random_state = RANDOM_STATE,
)
clf.fit(data)

# NOTE: despite its name, `isolation_scores` holds the labels returned by
# predict(): -1 for rows flagged as outliers and +1 for inliers. The name is
# kept so later cells keep working. For a continuous anomaly score, use
# clf.decision_function(data) or clf.score_samples(data).
isolation_scores = clf.predict(data)

In [None]:
# performance of model