Train a model using NetFlow records in CSV format

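Import dependencies
pip install numpy pandas matplotlib xgboost scikit-learn
(pickle and time are part of the Python standard library and do not need to be installed)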

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier,LocalOutlierFactor
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier,IsolationForest
from sklearn.cluster import DBSCAN,KMeans
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import model_selection
from pickle import dump
import time
from pickle import load

The dataset can be downloaded from here:

https://rdm.uq.edu.au/files/650f1fa0-ef9c-11ed-b5f6-b1a04f482c13

Columns needed from csv files
Example:
172.31.66.17,51128,23.36.69.189,443,6,91.0,152,0,3,0,194,4285680,0,anomaly_name

Before scaling, some fields are dropped:
'src_ip', 'dst_ip', 'l7_proto', 'anomaly' (and the trailing 'notneeded' column)
Because these fields are dropped, they may contain arbitrary values; they are not used in the learning process

Field 'label': 0 means normal traffic, 1 means anomaly. The model does not take it into consideration during training; it is only needed for validation

In [2]:
# Column names applied, in file order, to the CSV columns when loading
# (the file is read with names=..., so no header row is expected).
flow_fields = [
    "src_ip", "src_port",
    "dst_ip", "dst_port",
    "ip_protocol", "l7_proto",
    "in_bytes", "out_bytes",
    "in_pkts", "out_pkts",
    "tcp_flags", "duration",
    "label", "anomaly",
    "notneeded",
]

# The whole file is loaded into a single DataFrame in one call.
with open("datasets/NF-UQ-NIDS.csv", "r") as dataset_file:
    df_new = pd.read_csv(dataset_file, names=flow_fields)

In [69]:
# Boolean mask selecting rows whose 'anomaly' column is exactly 'Benign';
# only those rows are kept as the source frame for training.
df_src_msk = df_new['anomaly'].eq('Benign')
df_src = df_new.loc[df_src_msk]

Scaler for data standardization

In [89]:
def do_scl(df_num, cols, verbose=True):
    """Scale the given columns with scikit-learn's ``RobustScaler``.

    Parameters
    ----------
    df_num : pandas.DataFrame
        Frame holding only the columns that should be scaled.
    cols : sequence of str
        Column labels for the returned frame, one per column of ``df_num``.
    verbose : bool, default True
        When true, print the frame before and after scaling (the original
        behaviour). Pass ``False`` to keep the cell output quiet.

    Returns
    -------
    pandas.DataFrame
        The scaled values, labelled with ``cols`` and carrying the same row
        index as ``df_num``.
    """
    if verbose:
        print("Original values:\n", df_num)

    # NOTE(review): the fitted scaler is local to this function and is not
    # returned or saved, so the same scaling cannot be re-applied to new data.
    scaler = RobustScaler()
    scaler_temp = scaler.fit_transform(df_num)

    # Keep the input's row index. Without it the result gets a fresh 0..n-1
    # index, and any index-aligned assignment back into the source frame
    # produces NaN/misaligned rows unless the caller reset its index first.
    std_df = pd.DataFrame(scaler_temp, columns=cols, index=df_num.index)

    if verbose:
        print("\nScaled values:\n", std_df)

    return std_df

# Columns that process() leaves out of scaling.
cat_cols = ['ip_protocol']

Standardization and normalization helper

In [90]:
def process(dataframe):
    """Return a copy of ``dataframe`` with its non-categorical columns scaled.

    Every column not listed in ``cat_cols`` is passed through ``do_scl``; the
    ``cat_cols`` columns are carried over unchanged. The input frame is not
    modified, so the calling cell can be re-run without scaling data that was
    already scaled.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the ``cat_cols`` columns plus the columns to scale.

    Returns
    -------
    pandas.DataFrame
        New frame with the ``cat_cols`` columns first, followed by the scaled
        columns in their original relative order.
    """
    df_num = dataframe.drop(cat_cols, axis=1)
    num_cols = df_num.columns
    scaled_df = do_scl(df_num, num_cols)

    # Match scaled rows to source rows by position: the assignment below is
    # index-aligned, so a scaled frame numbered 0..n-1 would otherwise be
    # misaligned against a source frame with any other index.
    scaled_df.index = dataframe.index

    # drop() without inplace=True yields a new frame, leaving the caller's
    # frame untouched.
    processed = dataframe.drop(labels=num_cols, axis="columns")
    processed[num_cols] = scaled_df[num_cols]

    # Categorical encoding is currently disabled; uncomment to one-hot encode.
    #processed = pd.get_dummies(processed, columns = ['ip_protocol'])

    return processed

Drop unnecessary columns and apply scaling

In [117]:
# Build the training frame by removing the columns that are not used as
# model inputs; 'label' is kept for validation later on.
df = df_src.drop(columns=['src_ip', 'dst_ip', 'l7_proto', 'anomaly', 'notneeded'])


In [118]:
# Renumber the rows 0..n-1 in place and discard the previous index instead of
# keeping it as an 'index' column.
df.reset_index(drop=True, inplace=True)

In [120]:
# Scale every column of the training frame except those listed in cat_cols.
scaled_train = process(dataframe=df)

Original values:
          src_port  dst_port  in_bytes  out_bytes  in_pkts  out_pkts  \
0           62073     56082      9672        416       11         8   
1           32284      1526      1776        104        6         2   
2              21     21971      1842       1236       26        22   
3           23800     46893       528       8824       10        12   
4           63062        21      1786       2340       32        34   
...           ...       ...       ...        ...      ...       ...   
9208043        80        80    240264          0      852         0   
9208044        80        80   2330065          0     2523         0   
9208045         0         0   1054423          0     1513         0   
9208046       365       565     62422          0     1357         0   
9208047     50850      8883     11300       1664       32        32   

         tcp_flags  duration  label  
0               25        15      0  
1               25         0      0  
2              

Split data into training and validation sets

In [122]:
# Take the ground-truth labels from the unscaled source frame. process()
# scales every column that is not in cat_cols, 'label' included, so the copy
# inside scaled_train only keeps its 0/1 values when the scaler happens to
# leave the column untouched. df_src has the same rows in the same order.
y = df_src['label'].to_numpy().astype('int')
assert len(y) == len(scaled_train), "label and feature row counts differ"

# Features: everything except the label column.
X = scaled_train.drop(['label'], axis=1)

x_train, x_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=42)
# Same arguments and seed as above, so this is an identical second split.
x_train_reduced, x_test_reduced, y_train_reduced, y_test_reduced = \
    train_test_split(X, y, test_size=0.2, random_state=42)

n_estimators - the higher the value, the longer the training takes (e.g. n_estimators=1500)

Start the learning process
These values can be tuned for best performance, for example:
'random_state=47, contamination=0.01, n_estimators=1000'

In [40]:
# Isolation forest with a fixed seed, using all available cores (n_jobs=-1).
clf = IsolationForest(
    n_estimators=100,
    contamination=0.001,
    random_state=47,
    n_jobs=-1,
)
clf.fit(x_train)



Start prediction. The trained model has only seen the x_train data

In [48]:
# Score the held-out split first, then the training split.
predict_test, predict_train = (
    clf.predict(split) for split in (x_test, x_train)
)

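Map the predicted values onto the label convention of the source data so that predictions can be checked: 0 is normal behaviour, 1 is an anomaly (IsolationForest.predict returns 1 for inliers and -1 for outliers)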

In [124]:
# Rewrite both prediction arrays in place: -1 becomes 1, everything else 0.
# NOTE: running this cell a second time on already-remapped arrays turns
# every 1 back into 0, so re-run the prediction cell first.
for predicted in (predict_test, predict_train):
    predicted[...] = np.where(predicted == -1, 1, 0)

Compare predicted values against the known labels and calculate accuracy

In [126]:
# Fraction of samples whose remapped prediction equals the known label.
train_accuracy = metrics.accuracy_score(y_true=y_train, y_pred=predict_train)
test_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predict_test)

(1.0, 1.0, 0.9999425874061887)

Print results for accuracy

In [None]:
# Number of samples flagged as anomalous (value 1 after remapping) in the
# test split and in the training split respectively.
n_error_test = int(np.count_nonzero(predict_test == 1))
n_error_outliers = int(np.count_nonzero(predict_train == 1))

# Report the counts against the real split sizes; the previous message used
# a hard-coded "/40" and labels that did not match what was counted.
print("flagged as anomaly: test %d/%d ; train %d/%d"
      % (n_error_test, predict_test.size, n_error_outliers, predict_train.size))

print(
    f"Training Accuracy IsolationForestClassifier {train_accuracy*100}"
    f"  Test Accuracy IsolationForestClassifier {test_accuracy*100}"
)

Save trained model for future usage

In [127]:
# Serialise the fitted model with pickle protocol 5.
# NOTE(review): only clf is written here; the scaler used on the training
# data is not saved by this cell.
with open("IsolationForestModel_100.pkl", mode="wb") as model_file:
    dump(clf, model_file, protocol=5)