In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier,LocalOutlierFactor
from sklearn.preprocessing import RobustScaler,normalize, StandardScaler
from sklearn.ensemble import RandomForestClassifier,IsolationForest
from sklearn.cluster import DBSCAN,KMeans
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import model_selection
from pickle import dump
import time

In [2]:
flow_fields = [
    "src_ip",
    "src_port",
    "dst_ip",
    "dst_port",
    "ip_protocol",
    "l7_proto",
    "in_bytes",
    "out_bytes",
    "in_pkts",
    "out_pkts",
    "tcp_flags",
    "duration",
    "label",
    "anomaly"
]

with open("datasets/NF-CSE-CIC-IDS2018.csv", "r") as csvfile:
    # pass input data stream as open("data.csv", "r") to csv.reader for testing
    # read and process line by line don't read into list
    df_src = pd.read_csv(csvfile, names=flow_fields)

In [3]:
def do_scl(df_num, cols):
    print("Original values:\n", df_num)

    scaler = RobustScaler()
    scaler_temp = scaler.fit_transform(df_num)

    with open("IsolationForestModel_test.pkl", "wb") as f:
        dump(scaler, f, protocol=5)
    
    std_df = pd.DataFrame(scaler_temp, columns =cols)

    print("\nScaled values:\n", std_df)

    return std_df

cat_cols = ['ip_protocol']

In [4]:
def process(dataframe):
    df_num = dataframe.drop(cat_cols, axis=1)
    num_cols = df_num.columns
    scaled_df = do_scl(df_num, num_cols)

    dataframe.drop(labels=num_cols, axis="columns", inplace=True)
    dataframe[num_cols] = scaled_df[num_cols]

    print("Before encoding:")
    print(dataframe['ip_protocol'])

    dataframe = pd.get_dummies(dataframe, columns = ['ip_protocol'])

    print("\nColumns after encoding:")
    print(dataframe.filter(regex='^protocol_type_'))
    
    return dataframe

In [5]:
df = df_src.drop(['src_ip', 'dst_ip','l7_proto','anomaly'] ,axis=1)
scaled_train = process(df)

Original values:
          src_port  dst_port  in_bytes  out_bytes  in_pkts  out_pkts  \
0           51128       443       152          0        3         0   
1             443     51036       994        979        7         7   
2           12262       445       585        344        5         4   
3           61023        53       136        168        2         2   
4             443     51037        72         40        1         1   
...           ...       ...       ...        ...      ...       ...   
8392396        22     40810      2601          0       12         0   
8392397     15476        23        44          0        1         0   
8392398        23     15476        40          0        1         0   
8392399     56407        53        72          0        1         0   
8392400        53     56407       126          0        1         0   

         tcp_flags  duration  label  
0              194   4285680      0  
1               24   4234714      0  
2              

In [6]:
y = scaled_train['label'].values
y = y.astype('int')

X = scaled_train.drop(['label'], axis=1)

x_train, x_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=42)
x_train_reduced, x_test_reduced, y_train_reduced, y_test_reduced = \
    train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
kernal_evals = dict()

In [24]:
#clf = IsolationForest(random_state=47,n_jobs=-1, contamination=0.05,n_estimators=1000) #Work 84.3%
#clf = IsolationForest(random_state=47,n_jobs=-1, contamination=0.01,n_estimators=1000) #Work 86.3%
#clf = IsolationForest(random_state=47,n_jobs=-1, contamination=0.02,n_estimators=1500) #Work 86.26% time 83 min
clf = IsolationForest(random_state=47,n_jobs=-1, contamination=0.001,n_estimators=100) #Work 86.9% time 112 min
clf.fit(x_train)



In [25]:
predict_test = clf.predict(x_test)
predict_train = clf.predict(x_train)
#time 129min

In [26]:
predict_test[predict_test == 1] = 0
predict_train[predict_train == 1] = 0

predict_test[predict_test == -1] = 1
predict_train[predict_train == -1] = 1

In [27]:
test_accuracy = metrics.accuracy_score(y_test,predict_test)
train_accuracy = metrics.accuracy_score(y_train,predict_train)

train_accuracy,test_accuracy

(0.8693296911491349, 0.8691126083643484)

In [28]:
n_error_test = predict_test[predict_test == 1].size
n_error_outliers = predict_train[predict_train == 1].size

print( "errors novel regular: %d/40 ; errors novel abnormal: %d/40"
    % (n_error_test, n_error_outliers))

print("Training Accuracy " + "IsolationForestClassifier" + " {}  Test Accuracy ".format(train_accuracy*100) + 'IsolationForestClassifier' + " {}".format(test_accuracy*100))

errors novel regular: 16744/40 ; errors novel abnormal: 66665/40
Training Accuracy IsolationForestClassifier 86.93296911491349  Test Accuracy IsolationForestClassifier 86.91126083643485


In [34]:
with open("IsolationForestModel_86_91.pkl", "wb") as f:
    dump(clf, f, protocol=5)

In [38]:
X_val = x_train.values

In [40]:
X_std = StandardScaler().fit_transform(X_val)

In [42]:
# Apply DBSCAN for anomaly detection with increased epsilon
dbscan = DBSCAN(eps=1, min_samples = 2) # Increase eps
labels = dbscan.fit(X_std)

MemoryError: 

In [None]:
print(labels.labels_ )
pred_y = labels.fit_predict(x_test)

In [31]:
plt.scatter(X[:, 0], X[:, 1], c=pred_y, cmap="plasma")
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")

NameError: name 'labels' is not defined

In [None]:
anomalies = x_test[labels == 1]