In [None]:
import pandas as pd
import numpy as np
import keras
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten

from keras.preprocessing import sequence
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.metrics import accuracy_score
import os
import matplotlib.pyplot as plt
from keras.utils import plot_model
from sklearn.metrics import confusion_matrix
import time

In [None]:
# Sets the KMP_DUPLICATE_LIB_OK environment variable for this process.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort seen
# when several numeric libraries are loaded together -- confirm it is still required.
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

In [None]:
# Notebook-wide configuration: dataset locations and the shared column layout.
data_set_path = 'NSL-KDD/KDDTrain+.txt'
test_data_set_path = 'NSL-KDD/KDDTest+.txt'

# Column names handed to pd.read_csv(names=...). Columns 1-3 are the categorical
# ones that get one-hot encoded; the last two entries ('type', 'difficulty') are
# cut off with iloc[:, 0:-2] when the feature matrix is built, and 'type' is the label.
headers = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'type',
    'difficulty',
]

In [None]:
def prepare_test_data(attack1, attack2):
    """Load the test file, drop the two training attacks and relabel the rest.

    Args:
        attack1: Attack name whose rows are removed from the test data.
        attack2: Second attack name whose rows are removed from the test data.

    Returns:
        A new DataFrame with the `headers` columns in which every 'type' value
        that does not contain the substring 'normal' has been replaced by
        'attack'.
    """
    # NOTE(review): header=0 makes pandas discard the first line of the file in
    # favour of `names`; if the file has no header row, one record is lost -- confirm.
    test_dataset = pd.read_csv(test_data_set_path, names=headers, header=0)

    is_held_out = (test_dataset['type'] != attack1) & (test_dataset['type'] != attack2)
    # Copy so the relabelling below writes to an independent frame instead of a
    # slice of `test_dataset` (which pandas flags as chained assignment).
    test_dt = test_dataset.loc[is_held_out].copy()

    # Plain substring test with the same outcome as the former regex
    # r"^(.(?<!normal))*?$": labels without 'normal' become 'attack'. It does not
    # depend on the `regex` default of Series.str.replace, which pandas 2.0 changed.
    keeps_label = test_dt['type'].str.contains('normal', regex=False, na=True)
    test_dt['type'] = test_dt['type'].where(keeps_label, 'attack')
    return test_dt

In [None]:
def prepare_train_subset(normal_data, full_dataset, attack1, attack2):
    """Build a shuffled training frame from normal traffic plus two attack types.

    Args:
        normal_data: DataFrame of rows to always include (not modified).
        full_dataset: DataFrame with a 'type' column from which attack rows are taken.
        attack1: First attack name to include.
        attack2: Second attack name to include.

    Returns:
        A shuffled DataFrame containing `normal_data` followed by the rows of both
        attacks, with every occurrence of either attack name replaced by 'attack'.
    """
    attack_frames = [
        full_dataset.loc[full_dataset['type'] == attack_name]
        for attack_name in (attack1, attack2)
    ]
    # DataFrame.append was removed in pandas 2.0; concat builds the same frame
    # (original index labels preserved) in a single step and never mutates inputs.
    train_subset = pd.concat([normal_data] + attack_frames)
    # Like the original, the replacement is applied to every column of the frame.
    train_subset = train_subset.replace([attack1, attack2], 'attack')
    print("Training subset with " + attack1 + " and " + attack2 + " with shape ")
    print(train_subset.shape)
    return shuffle(train_subset)

In [None]:
def get_unique_categories(train_data, test_data):
    """Collect the categorical values seen in either dataset, for uniform encoding.

    Args:
        train_data: DataFrame with 'protocol_type', 'service' and 'flag' columns.
        test_data: DataFrame with the same three columns.

    Returns:
        A list of three sorted lists, in the order
        [protocol values, service values, flag values].
    """
    def _combined_values(column):
        # Sorting makes the category (and therefore one-hot column) order
        # reproducible; iterating a set of strings varies with hash randomisation.
        seen = set(train_data[column].unique().tolist())
        seen.update(test_data[column].unique().tolist())
        return sorted(seen)

    return [_combined_values(column) for column in ('protocol_type', 'service', 'flag')]

In [None]:
def encode_values(X, test_X, Y, test_Y, categories):
    """Label-encode the targets and one-hot encode feature columns 1-3.

    Args:
        X: 2-D training feature array; columns 1, 2 and 3 are categorical.
        test_X: 2-D test feature array with the same column layout.
        Y: Training labels.
        test_Y: Test labels (must only contain labels present in `Y`).
        categories: One list of allowed values per categorical column, in column
            order, passed to OneHotEncoder.

    Returns:
        (processed_X, processed_test_X, processed_Y, processed_test_Y), where each
        feature array is laid out as [column 0 | one-hot block | columns 4 onward].
    """
    # Fit the label mapping on the training labels and reuse it for the test labels.
    label_encoder = LabelEncoder()
    processed_Y = label_encoder.fit_transform(Y)
    processed_test_Y = label_encoder.transform(test_Y)

    one_hot = OneHotEncoder(categories=categories)
    train_onehot = one_hot.fit_transform(X[:, 1:4]).toarray()
    test_onehot = one_hot.transform(test_X[:, 1:4]).toarray()

    # The numeric tail starts at column 4. The previous `5:` slice silently
    # dropped column 4, the first column after the three categorical ones.
    processed_X = np.hstack([X[:, :1], train_onehot, X[:, 4:]])
    processed_test_X = np.hstack([test_X[:, :1], test_onehot, test_X[:, 4:]])

    print("Shape of train data after encoding is {0} and test data is {1}".format(processed_X.shape, processed_test_X.shape))
    return processed_X, processed_test_X, processed_Y, processed_test_Y


In [None]:
def prepare_data_for_classification(train_data, test_data):
    """Turn raw train/test frames into scaled arrays ready for the classifiers.

    The last two columns of each frame are excluded from the features and the
    second-to-last column is used as the label. Features are encoded with
    `encode_values`, standardised with a scaler fitted on the training features
    only, and 15% of the training rows are then held out for validation.

    Returns:
        (x_train, x_valid, y_train, y_valid, X_Test, Y_Test)
    """
    def _features_and_label(frame):
        return frame.iloc[:, 0:-2].values, frame.iloc[:, -2].values

    categories = get_unique_categories(train_data, test_data)
    X, Y = _features_and_label(train_data)
    test_X, test_Y = _features_and_label(test_data)
    print("Training data has shape of {0} and test data has shape of {1}".format(X.shape, test_X.shape))

    encoded_train, encoded_test, Y_Train, Y_Test = encode_values(X, test_X, Y, test_Y, categories)

    # Scaling statistics come from the training features; the test set is only transformed.
    scaler = StandardScaler()
    X_Train = scaler.fit_transform(encoded_train)
    X_Test = scaler.transform(encoded_test)
    print("Shapes of train and test data are {0}, {1}".format(X_Train.shape, X_Test.shape))

    x_train, x_valid, y_train, y_valid = train_test_split(
        X_Train, Y_Train, test_size=0.15, random_state=42
    )
    return x_train, x_valid, y_train, y_valid, X_Test, Y_Test

In [None]:
def train_nn(x_train, x_valid, y_train, y_valid, X_Test, Y_Test,
             threshold=0.9, epochs=100, batch_size=100):
    """Train a small feed-forward binary classifier and score it on the test set.

    The inputs must already be split, encoded and scaled; this function only
    builds the network (two 12-unit ReLU layers and a sigmoid output), fits it and
    predicts.

    Args:
        x_train, y_train: Training features and labels.
        x_valid, y_valid: Validation data passed to `fit`.
        X_Test, Y_Test: Test features and labels.
        threshold: A test sample is predicted True when the sigmoid output is
            strictly greater than this value. Defaults to the previous constant 0.9.
        epochs: Number of training epochs (default 100, as before).
        batch_size: Mini-batch size (default 100, as before).

    Returns:
        (test accuracy, Keras History object, Y_Test, thresholded predictions,
        training time in seconds, prediction time in seconds).
    """
    start_time = time.time()
    classifier = Sequential()
    classifier.add(Dense(units = 12, kernel_initializer = 'uniform', activation = 'relu', input_dim = x_train.shape[1]))
    classifier.add(Dense(units = 12, kernel_initializer = 'uniform', activation = 'relu'))
    classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
    # The 'acc' metric name determines the history keys ('acc', 'val_acc') that
    # gather_metrics reads, so it must not be renamed here.
    classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])
    classifierHistory = classifier.fit(x_train, y_train, batch_size = batch_size,
                                       validation_data = (x_valid, y_valid), epochs = epochs)
    # Training time covers model construction, compilation and fitting.
    train_time = time.time() - start_time

    start_time = time.time()
    y_pred = classifier.predict(X_Test)
    y_pred = (y_pred > threshold)
    predict_time = time.time() - start_time
    return accuracy_score(Y_Test, y_pred), classifierHistory, Y_Test, y_pred, train_time, predict_time

In [None]:
def _build_windows(features, labels, n_timesteps):
    """Slice features and labels into aligned windows of `n_timesteps` consecutive rows."""
    # The upper bound is row count + 1 so that the window ending on the final row
    # is produced too; range(n_timesteps, rows) never emitted it.
    stops = range(n_timesteps, np.shape(features)[0] + 1)
    feature_windows = np.array([features[stop - n_timesteps:stop, :] for stop in stops])
    label_windows = np.array([labels[stop - n_timesteps:stop] for stop in stops])
    return feature_windows, label_windows


def train_rnn(_nTimesteps, x_train, x_valid, y_train, y_valid, X_Test, Y_Test,
              threshold=0.99999, epochs=100, batch_size=100):
    """Train an LSTM classifier on sliding windows of rows and score it on the test set.

    Each of the train, validation and test sets is cut into overlapping windows of
    `_nTimesteps` consecutive rows; the network emits one sigmoid output per row
    of the window.

    NOTE(review): windows follow the row order of the arrays passed in. The callers
    in this notebook pass rows that went through `train_test_split` and `shuffle`,
    so the windows are not chronological -- confirm that this is intended.

    Args:
        _nTimesteps: Window length (number of consecutive rows per sequence).
        x_train, y_train: Training features (2-D) and labels (1-D).
        x_valid, y_valid: Validation features and labels.
        X_Test, Y_Test: Test features and labels.
        threshold: An output is predicted True when strictly greater than this
            value. Defaults to the previous constant 0.99999.
        epochs: Number of training epochs (default 100, as before).
        batch_size: Mini-batch size (default 100, as before).

    Returns:
        (test accuracy, Keras History object, windowed test labels, thresholded
        predictions, training time in seconds, prediction time in seconds).
    """
    # As before, the training timer also covers building the windows.
    start_time = time.time()
    X_Test_Sequence, Y_Test_Sequence = _build_windows(X_Test, Y_Test, _nTimesteps)
    X_train_sequence, y_train_sequence = _build_windows(x_train, y_train, _nTimesteps)
    x_valid_sequence, y_valid_sequence = _build_windows(x_valid, y_valid, _nTimesteps)

    # Initializing the RNN
    classifier = Sequential()
    classifier.add(LSTM(units = 20, return_sequences = True))
    classifier.add(Dropout(0.2))
    classifier.add(Flatten())
    # One output unit per timestep, matching the (windows, _nTimesteps) label arrays.
    classifier.add(Dense(_nTimesteps, activation = 'sigmoid'))
    classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])
    classifierHistory = classifier.fit(X_train_sequence, y_train_sequence, batch_size = batch_size,
                                       validation_data = (x_valid_sequence, y_valid_sequence),
                                       epochs = epochs)
    train_time = time.time() - start_time

    start_time = time.time()
    y_pred = classifier.predict(X_Test_Sequence)
    y_pred = (y_pred > threshold)
    predict_time = time.time() - start_time
    return accuracy_score(Y_Test_Sequence, y_pred), classifierHistory, Y_Test_Sequence, y_pred, train_time, predict_time

In [None]:
def plot_graph(values, title, xlabel, ylabel, legend):
    """Draw one line per series in `values` on a cleared figure and show it.

    Args:
        values: Iterable of series; each is passed to `plt.plot` on its own.
        title: Figure title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        legend: Legend entries, placed in the upper-left corner.
    """
    # Start from a clean figure so earlier plots do not bleed into this one.
    plt.clf()
    for series in values:
        plt.plot(series)

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend(legend, loc='upper left')
    plt.show()

In [None]:
def show_bar(xLabels, values, title, yLabel):
    """Show a horizontal bar chart with one bar per entry of `xLabels`.

    Args:
        xLabels: Bar names, used as tick labels on the vertical axis.
        values: Bar lengths, in the same order as `xLabels`.
        title: Figure title.
        yLabel: Caption for the value axis (set via `plt.xlabel`, because the
            bars are horizontal).
    """
    bar_positions = np.arange(len(xLabels))

    plt.clf()
    plt.barh(bar_positions, values)
    plt.yticks(bar_positions, xLabels)
    plt.title(title)
    plt.xlabel(yLabel)
    plt.show()

In [None]:
def visualize(results):
    """Plot the per-epoch curves and the summary bar charts for a results dict.

    Args:
        results: Dict of lists as filled by `gather_metrics`; the keys read here are
            'attacks', 'acc', 'val_acc', 'loss', 'val_loss', 'accuracy' and
            'train_time'.
    """
    legends = results['attacks']

    # (results key, figure title, y-axis label) for each per-epoch line plot.
    per_epoch_plots = (
        ('acc', "Model Accuracy", "Accuracy"),
        ('val_acc', "Model Validation Accuracy", "Valdiation Accuracy"),
        ('loss', "Model Loss", "Loss"),
        ('val_loss', "Model Validation Loss", "Validation Loss"),
    )
    for key, title, ylabel in per_epoch_plots:
        plot_graph(results[key], title, "Epochs", ylabel, legends)

    show_bar(legends, results['accuracy'], "Test Accuracy of each combination", "Test Accuracy")
    # Training durations are stored under 'train_time'; the results dicts never
    # have a 'time' key, so reading it raised KeyError.
    show_bar(legends, results['train_time'], "Time Taken to train each comibation", "Time (in seconds)")


In [None]:
def gather_metrics(results, classifierHistory, train_time, predict_time, accuracy, attack1, attack2):
    """Append the outcome of one training run to `results` and return it.

    Args:
        results: Dict of lists, mutated in place; must already hold the keys
            'attacks', 'accuracy', 'loss', 'val_loss', 'acc', 'val_acc',
            'train_time' and 'predict_time'.
        classifierHistory: Object whose `.history` dict provides the per-epoch
            'loss', 'val_loss', 'acc' and 'val_acc' series.
        train_time: Training duration to record.
        predict_time: Prediction duration to record.
        accuracy: Test accuracy to record.
        attack1, attack2: Attack names, stored as "<attack1> and <attack2>".

    Returns:
        The same `results` dict that was passed in.
    """
    results["attacks"].append("{0} and {1}".format(attack1, attack2))
    results["accuracy"].append(accuracy)

    # Copy the per-epoch curves over under identical key names.
    for curve in ("loss", "val_loss", "acc", "val_acc"):
        results[curve].append(classifierHistory.history[curve])

    for key, seconds in (("train_time", train_time), ("predict_time", predict_time)):
        results[key].append(seconds)
    return results

In [None]:
def computeConfusionmatrix(results, y_test, y_pred):
    """Append the flattened confusion matrix of one run to results["cm"].

    The matrix is flattened and unpacked into exactly four cells, so a ValueError
    is raised when it is not 2x2.
    NOTE(review): the tn/fp/fn/tp naming assumes the second class in label order is
    the "positive" one -- confirm against the label encoding used by the caller.

    Returns:
        The same `results` dict that was passed in.
    """
    matrix = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = matrix.ravel()
    results["cm"].append([tn, fp, fn, tp])
    return results

In [None]:
def enumerate_n_values(x_train,x_valid,y_train,y_valid,X_Test,Y_Test, attack1, attack2):
    """Train the RNN once for each window length n = 1..5 and collect the metrics.

    Returns:
        A list with one results dict per n (in increasing order of n). Each dict
        has the usual metric lists filled by `gather_metrics` plus an "n" entry
        holding the window length used.
    """
    metric_keys = ("attacks", "loss", "val_loss", "accuracy", "acc", "val_acc",
                   "train_time", "predict_time", "cm")
    runs = []
    for n_steps in range(1, 6):
        print("Training rrn with n =  {0}".format(n_steps))
        outcome = train_rnn(n_steps, x_train, x_valid, y_train, y_valid, X_Test, Y_Test)
        # Test labels and predictions (positions 2 and 3) are not needed here.
        rnn_accuracy, history, _, _, train_time, predict_time = outcome

        run_metrics = gather_metrics({key: [] for key in metric_keys}, history,
                                     train_time, predict_time, rnn_accuracy, attack1, attack2)
        run_metrics["n"] = n_steps
        runs.append(run_metrics)
    return runs

In [None]:
def project1main():
    """Driver for part 1: train the feed-forward network on four attack pairs.

    For each (attack1, attack2) pair the model is trained on normal traffic plus
    those two attacks, evaluated on the test data with those attacks removed, and
    the metrics are accumulated and finally plotted with `visualize`.
    """
    # The two lists are paired element-wise by zip() below.
    attack2_types = ['neptune', 'warezmaster', 'nmap', 'teardrop']
    attack1_types = ['satan', 'portsweep', 'buffer_overflow','multihop']

    # NOTE(review): header=0 makes pandas discard the first line of the file in
    # favour of `names`; if the file has no header row, one record is lost -- confirm.
    full_dataset = pd.read_csv(data_set_path, names=headers, header=0)

    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    val = full_dataset['type'].value_counts()
    Labels = val.index.values
    Values = val.tolist()
    show_bar(Labels, Values,'Distribution of Attacks', 'Number of Samples')

    # First grab the normal data out
    normal_data = full_dataset.loc[full_dataset['type'] == 'normal']
    ann_results = { "attacks" : [], "loss" : [], "val_loss" :[], "accuracy": [], "acc": [], "val_acc":[], "train_time": [], "predict_time":[], "cm": []}
    for attack1, attack2 in zip(attack1_types, attack2_types):
        test_dataset = prepare_test_data(attack1, attack2)
        train_df = prepare_train_subset(normal_data, full_dataset, attack1, attack2)
        x_train,x_valid,y_train,y_valid,X_Test, Y_Test = prepare_data_for_classification(train_df, test_dataset)
        accuracy, classifierHistory, y_test, y_pred, train_time, predict_time = train_nn(x_train, x_valid, y_train, y_valid, X_Test, Y_Test)
        ann_results = gather_metrics(ann_results, classifierHistory, train_time, predict_time, accuracy, attack1, attack2)
        computeConfusionmatrix(ann_results, y_test, y_pred)
        # Only the ANN is trained in this driver; the message used to reference an
        # undefined `rnn_accuracy`, which raised NameError on the first iteration.
        print("Dataset has {0} and {1} attacks. The accuracy score was {2}".format(attack1, attack2, accuracy))
    visualize(ann_results)

In [None]:
def compare_ANN_RNN():
    """Train the ANN and the RNN (for n = 1..5) on the warezmaster/portsweep subset.

    Returns:
        (results, ann_results): `results` is the list of per-n RNN result dicts from
        `enumerate_n_values`; `ann_results` is the results dict holding the metrics
        of the single ANN run.
    """
    # NOTE(review): header=0 makes pandas discard the first line of the file in
    # favour of `names`; if the file has no header row, one record is lost -- confirm.
    full_dataset = pd.read_csv(data_set_path, names=headers, header=0)
    normal_data = full_dataset.loc[full_dataset['type'] == 'normal']
    attack1 = 'warezmaster'
    attack2 = 'portsweep'
    test_dataset = prepare_test_data(attack1, attack2)
    train_df = prepare_train_subset(normal_data, full_dataset, attack1, attack2)
    x_train, x_valid, y_train, y_valid, X_Test, Y_Test = prepare_data_for_classification(train_df, test_dataset)

    ann_results = { "attacks" : [], "loss" : [], "val_loss" :[], "accuracy": [], "acc": [], "val_acc":[], "train_time": [], "predict_time":[], "cm": []}
    accuracy, classifierHistory, y_test, y_pred, train_time, predict_time = train_nn(x_train, x_valid, y_train, y_valid, X_Test, Y_Test)
    # Record the ANN run; previously its metrics were discarded and `ann_results`
    # was printed and returned with every list still empty.
    ann_results = gather_metrics(ann_results, classifierHistory, train_time, predict_time, accuracy, attack1, attack2)

    results = enumerate_n_values(x_train,x_valid,y_train,y_valid,X_Test,Y_Test,attack1, attack2)
    print(ann_results)
    print(results)
    return results, ann_results

In [None]:
def compareAnnRnnNewData(csv_path='MachineLearningCVE/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv'):
    """Compare the ANN and the single-timestep RNN on a port-scan flow CSV.

    Args:
        csv_path: CSV file to load; its header row is replaced by the fixed column
            list below. Defaults to the path that was previously hard-coded.

    Returns:
        (ANN test accuracy, RNN test accuracy)
    """
    port_Scan_headers = ['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count', 'Down/Up Ratio', 'Average Packet Size', 'Avg Fwd Segment Size', 'Avg Bwd Segment Size', 'Fwd Header Length.1', 'Fwd Avg Bytes/Bulk', 'Fwd Avg Packets/Bulk', 'Fwd Avg Bulk Rate', 'Bwd Avg Bytes/Bulk', 'Bwd Avg Packets/Bulk', 'Bwd Avg Bulk Rate', 'Subflow Fwd Packets', 'Subflow Fwd Bytes', 'Subflow Bwd Packets', 'Subflow Bwd Bytes', 'Init_Win_bytes_forward', 'Init_Win_bytes_backward', 'act_data_pkt_fwd', 'min_seg_size_forward', 'Active Mean', 'Active Std', 'Active Max', 'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min', 'Label']
    port_scan_dataset = pd.read_csv(csv_path, names = port_Scan_headers,header=0)

    # Series.value_counts replaces the deprecated top-level pd.value_counts.
    val = port_scan_dataset['Label'].value_counts()
    Labels = val.index.values
    Values = val.tolist()
    show_bar(Labels, Values,'Distribution of Attacks', 'Number of Samples')

    # Neutralise values the scaler cannot handle: the float maximum, the literal
    # text "Infinity", actual +/-inf floats (which is what a numeric column holds
    # once "Infinity" has been parsed as a number) and missing values.
    port_scan_dataset = port_scan_dataset.replace(1.7976931348623157e+308, 0.0)
    port_scan_dataset = port_scan_dataset.replace("Infinity", 0.0)
    port_scan_dataset = port_scan_dataset.replace([np.inf, -np.inf], 0.0)
    port_scan_dataset = port_scan_dataset.fillna(0)

    X = port_scan_dataset.iloc[:, 0:-1].values
    Y = port_scan_dataset.iloc[:, -1].values
    labelencoder_Y = LabelEncoder()
    Y = labelencoder_Y.fit_transform(Y)

    x_train,x_test,y_train,y_test = train_test_split(X, Y, test_size = 0.15, random_state = 42)
    # Fit the scaler on the training rows only and reuse it for the test rows, the
    # same way prepare_data_for_classification does, so that test-set statistics
    # do not leak into the features the models are trained on.
    sc = StandardScaler()
    x_train = sc.fit_transform(x_train)
    x_test = sc.transform(x_test)
    x_train,x_valid,y_train,y_valid = train_test_split(x_train,y_train, test_size=0.10, random_state = 42)

    accuracy, classifierHistory, y_test, y_pred, train_time, predict_time = train_nn(x_train, x_valid, y_train, y_valid, x_test, y_test)
    rnn_accuracy, rnn_classifierHistory, rnn_y_test, rnn_y_pred, rnn_train_time, rnn_predict_time = train_rnn(1,x_train,x_valid,y_train,y_valid, x_test, y_test)
    print("ANN Accuracy: {0} took {1} to train {2} to predict".format(accuracy, train_time, predict_time))
    print("RNN Accuracy: {0} took {1} to train {2} to predict".format(rnn_accuracy, rnn_train_time, rnn_predict_time))
    return accuracy, rnn_accuracy
    

In [None]:
def project2main():
    """Driver for part 2: run both ANN-vs-RNN comparisons and chart the RNN runs.

    The per-n RNN results from `compare_ANN_RNN` are turned into three bar charts
    (accuracy, training time and prediction time against n). The return values of
    `compareAnnRnnNewData` are not used further here.
    """
    results, _ann_results = compare_ANN_RNN()
    _accuracy, _rnn_accuracy = compareAnnRnnNewData()

    # Each results dict stores single-element lists; take the first entry of each.
    rows = [
        (run['n'], run['accuracy'][0], run['train_time'][0], run['predict_time'][0])
        for run in results
    ]
    n_values = [row[0] for row in rows]
    accuracies = [row[1] for row in rows]
    train_times = [row[2] for row in rows]
    predict_times = [row[3] for row in rows]

    show_bar(n_values,accuracies, "Accuracy vs N", "Accuracy")
    show_bar(n_values,train_times, "Train Time vs N", "Train Time")
    show_bar(n_values,predict_times, "Predict Time vs N", "Predict Time")

In [None]:
# Entry point: run the part-1 experiment first, then the part-2 comparison.
if __name__ == "__main__":
    project1main()
    project2main()