### Importing libraries

In [1]:
# required libraries
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest
import warnings
warnings.filterwarnings('ignore')

### Getting data prepared for the model

In [2]:
# function to get data for one day
def process_data_for_model(file_name):
    # start time
    start_time = time.time()
    
    # data for one day
    column_names = ['Time','Duration','SrcDevice','DstDevice','Protocol','SrcPort','DstPort','SrcPackets','DstPackets','SrcBytes','DstBytes']
    day = pd.read_csv(file_name, header = None)
    day.columns = column_names
    print('File read in successfully.')
    
    # keeping only the last row of the same transaction since connections are cumulative
    five_tuple = ['Time', 'SrcDevice', 'DstDevice', 'Protocol', 'SrcPort', 'DstPort']
    day = day.drop_duplicates(subset = five_tuple, keep = 'last')
    
    # faster way i think...yet to try (not sure which column to use)
    # day = day.loc[day.groupby(five_tuple).Duration.idxmax()]
    print('Dropped duplicates.')
    
    # removing characters from port numbers
    day['SrcPort'].replace(regex = True, inplace = True, to_replace = r'\D', value = r'')
    day['DstPort'].replace(regex = True, inplace = True, to_replace = r'\D', value = r'')
    
    # convert to numeric for binning later
    day['SrcPort'] = day['SrcPort'].astype(int)
    day['DstPort'] = day['DstPort'].astype(int)
    print('Removed non-digits from SrcPort and DstPort.')
    
    # removing digits from 'SrcDevice' and 'DestDevice'
    day['SrcDevice'].replace(regex = True, inplace = True, to_replace = r'\d', value = r'')
    day['DstDevice'].replace(regex = True, inplace = True, to_replace = r'\d', value = r'')
    print('Removed digits from SrcDevice and DstDevice.')
    
    # replace 'Time' column with hour intervals
    start_time = day['Time'].min()
    end_time = day['Time'].max()
    hour_bins = np.arange(start_time - 1, end_time + 2, step = 3600)
    day['Time'] = np.searchsorted(hour_bins, day['Time'].values)
    print('Time column binned into hourly intervals.')
    
    # bin source and destination port numbers into categories
    # 3 categories for now...0-1023, 1024-49151, 49152-65535...
    port_bins = [-1, 1023, 49151]
    day['SrcPort'] = np.searchsorted(port_bins, day['SrcPort'].values)
    day['DstPort'] = np.searchsorted(port_bins, day['DstPort'].values)
    print('SrcPort and DstPort put into bins.')
    
    # converting 'Time', 'Protocol', 'SrcPort' and 'DstPort' to categorical variables for model
    day['Time'] = day['Time'].astype('category')
    day['Protocol'] = day['Protocol'].astype('category')
    day['SrcPort'] = day['SrcPort'].astype('category')
    day['DstPort'] = day['DstPort'].astype('category')
    
    # converting to categorical reduces time while label encoding
    day['SrcDevice'] = day['SrcDevice'].astype('category')
    day['DstDevice'] = day['DstDevice'].astype('category')
    
    # converting character variables to encodings
    day['SrcDevice'] = day['SrcDevice'].cat.codes
    day['DstDevice'] = day['DstDevice'].cat.codes
    
    # converting them back to categorical since encoding makes them numeric
    day['SrcDevice'] = day['SrcDevice'].astype('category')
    day['DstDevice'] = day['DstDevice'].astype('category')
    print('Categorical variables encoded and dtype changed to category.')
    
    # print time taken for execution
    time_taken_in_seconds = time.time() - start_time
    print('Time taken:', time_taken_in_seconds, 'seconds')
    
    # return processed dataset
    return day

In [3]:
# file names from day 03-09
file_names = ['netflow_day-03', 'netflow_day-04', 'netflow_day-05', 'netflow_day-06', 'netflow_day-07', 'netflow_day-08', 'netflow_day-09']

# get data for day03
data = process_data_for_model(file_names[0])

File read in successfully.
Dropped duplicates.
Removed non-digits from SrcPort and DstPort.
Removed digits from SrcDevice and DstDevice.
Time column binned into hourly intervals.
SrcPort and DstPort put into bins.
Categorical variables encoded and dtype changed to category.
Time taken: 1549218087.127815 seconds


In [4]:
data.head(50)

Unnamed: 0,Time,Duration,SrcDevice,DstDevice,Protocol,SrcPort,DstPort,SrcPackets,DstPackets,SrcBytes,DstBytes
0,1,0,1,1,6,2,1,0,5,0,784
1,1,0,1,1,17,3,1,1,0,77,0
2,1,0,1,1,6,2,3,6,5,1379,1770
3,1,0,1,1,17,3,1,1,0,64,0
4,1,0,1,1,17,3,1,1,0,64,0
5,1,0,1,1,6,2,3,4,2,186,94
6,1,0,1,1,6,3,2,3,4,164,216
7,1,0,1,1,17,3,1,1,1,59,59
8,1,0,1,0,6,3,1,5,0,2048,0
9,1,0,1,1,17,3,1,1,1,70,70


### Model building

In [None]:
# isolation forest model
clf = IsolationForest(n_estimators = 100, max_samples = 100000)
clf.fit(data)
isolation_scores = clf.predict(data)

In [None]:
# model evaluation