In [1]:
# required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest
import warnings
warnings.filterwarnings('ignore')

In [7]:
# load one day of netflow records; the file is read without a header row
file_name = 'netflow_day-03'
column_names = [
    'Time', 'Duration', 'SrcDevice', 'DstDevice', 'Protocol',
    'SrcPort', 'DstPort', 'SrcPackets', 'DstPackets', 'SrcBytes', 'DstBytes',
]
day = pd.read_csv(file_name, header=None)
# label the positional columns with the names listed above
day.columns = column_names

In [8]:
# Rows sharing the key below are treated as the same transaction; because the
# records of a connection are cumulative, only the last row of each is kept.
# (the key is the 5-tuple of devices, ports and protocol, plus 'Time')
five_tuple = ['Time', 'SrcDevice', 'DstDevice', 'Protocol', 'SrcPort', 'DstPort']
# flag every row that is followed by a later row with the same key, then drop those
superseded_rows = day.duplicated(subset=five_tuple, keep='last')
dataset = day[~superseded_rows]

# TODO: possibly faster alternative, not tried yet (unsure which column to rank on):
# dataset = day.loc[day.groupby(five_tuple).Duration.idxmax()]

In [9]:
# Strip every non-digit character from the port columns and convert them to
# integers so they can be binned later.
# The cleaned values are assigned back to `dataset` explicitly: calling
# replace(..., inplace=True) on `dataset[col]` only mutates the selected
# Series, which under pandas copy-on-write never reaches `dataset` and would
# leave the unstripped strings in place for the integer conversion below.
for port_column in ['SrcPort', 'DstPort']:
    digits_only = dataset[port_column].replace(to_replace=r'\D', value='', regex=True)
    dataset[port_column] = digits_only.astype(int)

In [10]:
# Remove the digits from 'SrcDevice' and 'DstDevice', keeping only the
# non-numeric part of each device name.
# The result is assigned back to `dataset` explicitly: calling
# replace(..., inplace=True) on `dataset[col]` only mutates the selected
# Series, which under pandas copy-on-write never updates `dataset` itself.
for device_column in ['SrcDevice', 'DstDevice']:
    dataset[device_column] = dataset[device_column].replace(
        to_replace=r'\d', value='', regex=True
    )

In [11]:
# replace 'Time' with the index of the hour-wide interval each value falls in
bin_width = 3600  # one hour, assuming 'Time' is expressed in seconds
start_time, end_time = dataset['Time'].min(), dataset['Time'].max()
# edges start just below the earliest value and run past the latest one, so
# every row lands in an interval numbered from 1 upwards
hour_bins = np.arange(start_time - 1, end_time + 2, bin_width)
time_values = dataset['Time'].values
dataset['Time'] = np.searchsorted(hour_bins, time_values)

In [12]:
# bucket source and destination port numbers into three ranges (for now):
#   0 -> 0-1023, 1 -> 1024-49151, 2 -> 49152 and above
port_bins = [1023, 49151]
for port_column in ('SrcPort', 'DstPort'):
    port_numbers = dataset[port_column].values
    dataset[port_column] = np.searchsorted(port_bins, port_numbers)

In [None]:
# cast 'Time', 'Protocol', 'SrcPort' and 'DstPort' to the pandas 'category'
# dtype before they are passed to the model
for category_column in ('Time', 'Protocol', 'SrcPort', 'DstPort'):
    dataset[category_column] = dataset[category_column].astype('category')

In [None]:
# turn the string-valued device columns into integer codes
categorical_variables = ['SrcDevice', 'DstDevice']
labelencoder = LabelEncoder()
for column in categorical_variables:
    # the same encoder object is re-fitted on each column in turn
    encoded_values = labelencoder.fit_transform(dataset[column])
    dataset[column] = encoded_values

In [None]:
# preview the first five rows of the prepared dataset
dataset.head(n=5)

In [None]:
# isolation forest model
# A fixed random_state makes the randomly built trees, and therefore the
# predictions below, identical on every run of the notebook.
RANDOM_STATE = 42
clf = IsolationForest(
    n_estimators=100,
    max_samples=100000,
    random_state=RANDOM_STATE,
)
clf.fit(dataset)
# NOTE: predict() returns class labels (+1 for inliers, -1 for outliers), not
# continuous anomaly scores; the variable name is kept for compatibility.
# Use clf.decision_function(dataset) or clf.score_samples(dataset) for scores.
isolation_scores = clf.predict(dataset)

In [1]:
# performance of model