In [None]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
import time

In [None]:
def parse_attack_types(filename):
    """
    Build a lookup table of attack types from a whitespace-separated text file.

    Every non-blank line of ``filename`` must contain exactly two fields,
    ``<attack> <category>``. The returned mapping looks like:

    {
        'teardrop': {
            'encoding': 0,
            'category': 'dos'
        },
        'smurf': {
            'encoding': 1,
            'category': 'dos'
        },
        ...
    }

    'encoding' is an integer assigned in order of first appearance, so that
    algorithms which cannot handle text can work with numbers instead. If an
    attack name appears more than once, only its first line is used.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped.

    Raises:
        ValueError: if a non-blank line does not contain exactly two fields.
    """
    attack_map = {}
    with open(filename) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Nothing on this line; unpacking it would raise ValueError.
                continue
            attack, category = fields
            if attack not in attack_map:
                attack_map[attack] = {
                    # Size before insertion == number of attacks seen so far.
                    'encoding': len(attack_map),
                    'category': category
                }
    return attack_map


def encode_data(train_data, cols):
    """
    Replace the values of the given columns with integer codes, in place.

    For every column in ``cols`` the distinct values are numbered 0, 1, 2, ...
    in the order ``Series.unique()`` returns them, and the column of
    ``train_data`` is overwritten with those numbers, e.g.
    ('tcp', 'udp', 'icmp') becomes (0, 1, 2).

    (Fulbert: the columns to encode are passed in through ``cols``.)

    Returns:
        dict: ``{column: {code: original value}}``, usable to decode later.
    """
    encodings = {}
    for col in cols:
        # code -> original value
        code_to_value = dict(enumerate(train_data[col].unique()))
        # original value -> code, used to rewrite the column
        value_to_code = {value: code for code, value in code_to_value.items()}
        train_data[col] = train_data[col].map(value_to_code)
        encodings[col] = code_to_value
    return encodings


def parse_data(filename):
    """Read a CSV file that has no header row; columns are labelled 0, 1, 2, ..."""
    frame = pd.read_csv(filename, header=None)
    return frame

In [None]:
# Load the attack-type lookup and the 10% data set, then integer-encode the
# string columns.
# NOTE(review): both paths are absolute paths on one specific Windows machine;
# the notebook cannot run elsewhere until they are made configurable.
print('Running project')
attack_map = parse_attack_types(r'C:\Users\CODER\CTDUML\CTDUML\.venv\CyberThreatDetection\Dataset/attack_types.txt')
print('Attack mapping:')
print(attack_map)
train_data = parse_data(r'C:\Users\CODER\CTDUML\CTDUML\.venv\CyberThreatDetection\Dataset/Dataset.data_10_percent')
print('Raw data:')
print(train_data[:2])
# Encode columns 1, 2 and 3 in place (presumably protocol, service and flag in
# the KDD layout - TODO confirm). `encodings` maps each code back to its text.
encodings = encode_data(train_data, (1, 2, 3))
print('Encoded data:')
print(train_data[:2])
print('Encodings:')
print(encodings)

In [None]:
## Plot the frequency of each raw label (column 41), then show the counts
label_counts = train_data[41].value_counts()
label_counts.plot(kind='bar')
label_counts

In [None]:
# Summary statistics (DataFrame.describe) for the 10% data set.
'''Describe Data using Panda Describe for 10%''' 
train_data.describe()

In [None]:
"""Try different attack mapping
        normal = 0  
        dos = 1
        probe = 2 
        r2l = 3 
        u2r = 4 
        
        We will end up with 5 classes instead of 23 classes
        
        One thing to note when mapping the 5 classes, there is a period at the end of the 
        label on the original file!!! Don't forget about that. 
"""

def revised_attack_mapping(attack_map):
    """
    Turn the attack lookup into a ``{raw label: category}`` dictionary.

    The labels in the data file end with a period, so a '.' is appended to
    every attack name to form the key. The value is the entry's 'category'.
    """
    return {
        attack + ".": details['category']  # Don't forget the period at the end!
        for attack, details in attack_map.items()
    }

def attack_category_encoding():
    """
    Return a new dictionary giving the integer code of each attack category:

        normal = 0
        dos = 1
        probe = 2
        r2l = 3
        u2r = 4
    """
    ordered_categories = ('normal', 'dos', 'probe', 'r2l', 'u2r')
    return {category: code for code, category in enumerate(ordered_categories)}

# Raw label (attack name + '.') -> attack category.
category_attack_map = revised_attack_mapping(attack_map)
# Add the non-attack label explicitly: the key keeps the trailing period used
# in the data file, the category name does not.
category_attack_map['normal.'] = "normal"

In [None]:
# Display the raw label -> category lookup.
category_attack_map

In [None]:
# Category name -> integer code, displayed for reference.
# Its only other use in this notebook is in a commented-out cell.
attack_category_map = attack_category_encoding()
attack_category_map

In [None]:
## Map 10% Dataset: collapse the raw labels in column 41 into attack categories
category_labels = train_data[41].map(category_attack_map)
# Series.map yields NaN for every label missing from the lookup. Check before
# overwriting the column so a missing entry (or running this cell a second
# time, when the labels are already categories) fails loudly and leaves the
# data untouched instead of silently replacing labels with NaN.
unmapped_labels = train_data.loc[category_labels.isna(), 41].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "Labels without a category mapping (was this cell already run?): %s"
        % list(unmapped_labels)
    )
train_data[41] = category_labels
# Bar chart of the category frequencies, then the counts as the cell output.
category_counts = train_data[41].value_counts()
category_counts.plot(kind='bar')
category_counts

In [None]:
# Print the per-label counts of column 41 as plain text.
print(train_data[41].value_counts())

In [None]:
## Method to test sklearn classifier
def test_classifier(clf):
    """
    Fit ``clf`` on the current split, evaluate it and record the run.

    Uses the notebook-level variables X_train, y_train, X_test and y_test.
    Prints the classification report, the accuracy and the wall-clock fit and
    predict times, and appends the fit time, predict time and accuracy to the
    notebook-level lists train_time, test_time and accuracy.

    Returns:
        tuple: (fitted classifier, classification report as a dict).
    """
    t_start = time.time()
    clf = clf.fit(X_train, y_train)
    t_fitted = time.time()
    prediction = clf.predict(X_test)
    t_predicted = time.time()

    fit_seconds = t_fitted - t_start
    predict_seconds = t_predicted - t_fitted

    # Dict form is returned to the caller; text form is what gets printed.
    report_dict = metrics.classification_report(y_test, prediction, output_dict = True)
    print(metrics.classification_report(y_test, prediction))
    acc = metrics.accuracy_score(y_test, prediction)
    for template, value in (
        ("Accuracy Score: %s", acc),
        ("Classifier Training time = %s", fit_seconds),
        ("Classifier Prediction time = %s", predict_seconds),
    ):
        print(template % value)

    train_time.append(fit_seconds)
    test_time.append(predict_seconds)
    accuracy.append(acc)
    return clf, report_dict


In [None]:
## Split train and test set
# Features: every column except the label column 41.
X = train_data.drop(columns=[41])
# Select the label as a 1-D Series. `train_data[[41]]` would give a one-column
# DataFrame, which scikit-learn estimators accept only after emitting a
# DataConversionWarning ("column-vector y was passed").
y = train_data[41]
# 70/30 split with a fixed seed so the results are repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=15)
# Bookkeeping lists: test_classifier appends to the last three, the classifier
# cells append the algorithm name.
algorithm= []
train_time = []
test_time = []
accuracy = []

In [None]:
# Fit and score each classifier on the 10% split. test_classifier records the
# timings and accuracy; the matching name is appended to `algorithm` here.

## Multinomial Naive Bayes Classifier
clf_MultinomialNB = MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
clf_MultinomialNB, mreport= test_classifier(clf_MultinomialNB)
algorithm.append("MultinomialNB")
## Bernoulli Naive Bayes Classifier
clf_BernoulliNB = BernoulliNB(alpha=0.01, class_prior=None, fit_prior=True)
clf_BernoulliNB, breport = test_classifier(clf_BernoulliNB)
algorithm.append("BernoulliNB")
## Complement Naive Bayes Classifier
clf_ComplementNB = ComplementNB(alpha=0.01, class_prior=None, fit_prior=True)
clf_ComplementNB, creport = test_classifier(clf_ComplementNB)
algorithm.append("ComplementNB")
## Linear SVC Classifier
clf_LinearSVC = LinearSVC(random_state=0, tol=1e-5)
clf_LinearSVC, lreport= test_classifier(clf_LinearSVC)
algorithm.append("LinearSVC")
## Decision Tree Classifier
clf_DecisionTree = DecisionTreeClassifier()
# Keep the fitted tree under its own name; this line used to assign it to
# clf_LinearSVC, which replaced the fitted SVC with the decision tree.
clf_DecisionTree, treereport = test_classifier(clf_DecisionTree)
algorithm.append("DecisionTree")

In [None]:
"""
    normal = 0  
    dos = 1
    probe = 2 
    r2l = 3 
    u2r = 4 
"""    

def extract_algorithm_result (report):
    """
    Flatten the per-class entries of a classification report into one list.

    The classes are visited in the fixed order normal, dos, probe, r2l, u2r;
    getresult_inlist appends the values of each class to the same list.
    """
    flattened = []
    for category in ('normal', 'dos', 'probe', 'r2l', 'u2r'):
        getresult_inlist(report, category, flattened)
    return flattened
def getresult_inlist (report, label, resultlist,
                      metric_names=('precision', 'recall', 'f1-score', 'support')):
    """
    Append the metrics of one class to ``resultlist`` (modified in place).

    The values are appended in the order given by ``metric_names`` rather than
    in the dictionary's own iteration order, because callers pick metrics out
    of the flat list by fixed position. Keys of ``report[label]`` that are not
    named in ``metric_names`` are ignored.

    Args:
        report: dict of ``{class label: {metric name: value}}``.
        label: class whose metrics are appended.
        resultlist: list that receives the values.
        metric_names: metric keys to append, in order.

    Raises:
        KeyError: if ``label`` or one of ``metric_names`` is missing; in that
            case ``resultlist`` is left unchanged.
    """
    class_metrics = report[label]
    # Build the full list first so a missing key cannot leave a partial append.
    resultlist.extend([class_metrics[name] for name in metric_names])
    

In [None]:
def plot_algorithm_result(report, title):
    """
    Draw four bar charts for one classification report: precision, recall,
    F1-score and support of the five classes.

    The flattened result holds four consecutive values per class, so metric
    number k of every class sits at positions k, k + 4, k + 8, k + 12, k + 16.
    """
    result = extract_algorithm_result(report)

    # Ratio metrics (offsets 0..2) are drawn by plot_metric.
    for title_template, offset in (
        ("Precision %s", 0),
        ("Recall %s", 1),
        ("F1-Score of %s", 2),
    ):
        plot_metric(result, list(range(offset, 20, 4)), title_template % title)

    # Support is a raw count, so it uses the separate plot_support function.
    plot_support(result, list(range(3, 20, 4)), "Support %s" % title)

def plot_metric(result, indexes, algorithm_title):
    """
    Bar chart of one metric for the five classes, with values multiplied by 100.

    ``indexes`` selects the five values from ``result``; they are drawn in the
    order normal, dos, probe, r2l, u2r, each in its own colour.
    """
    fig, ax = plt.subplots()
    heights = [result[position] * 100 for position in indexes]
    x_positions = np.linspace(1,7,5)
    bars = plt.bar(x_positions, heights)
    class_colors = ('g', 'r', 'magenta', 'yellow', 'darkblue')
    for bar_number, color in enumerate(class_colors):
        bars[bar_number].set_color(color)
    plt.suptitle(algorithm_title)
    plt.legend(
        tuple(bars[bar_number] for bar_number in range(5)),
        ('normal', 'dos', 'probe', 'r2l', 'u2r' ),
        loc='best',
    )

def plot_support(result, indexes, algorithm_title):
    """
    Bar chart of the support values for the five classes, unscaled.

    ``indexes`` selects the five values from ``result``; they are drawn in the
    order normal, dos, probe, r2l, u2r, each in its own colour.
    """
    fig, ax = plt.subplots()
    heights = [result[position] for position in indexes]
    x_positions = np.linspace(1,7,5)
    bars = plt.bar(x_positions, heights)
    class_colors = ('g', 'r', 'magenta', 'yellow', 'darkblue')
    for bar_number, color in enumerate(class_colors):
        bars[bar_number].set_color(color)
    plt.suptitle(algorithm_title)
    plt.legend(
        tuple(bars[bar_number] for bar_number in range(5)),
        ('normal', 'dos', 'probe', 'r2l', 'u2r' ),
        loc='best',
    )

In [None]:
## Plot MultinomialNB Result
# Precision, recall, F1-score and support charts built from mreport.
plot_algorithm_result(mreport, "MultinomialNB on 10% datasets")

In [None]:
## Plot BernoulliNB Result
# Precision, recall, F1-score and support charts built from breport.
plot_algorithm_result(breport, "BernoulliNB on 10% datasets")

In [None]:
## Plot ComplementNB Result
# Precision, recall, F1-score and support charts built from creport.
plot_algorithm_result(creport, "ComplementNB on 10% datasets")


In [None]:
## Plot LinearSVC Result
# Precision, recall, F1-score and support charts built from lreport.
plot_algorithm_result(lreport, "LinearSVC on 10% datasets")


In [None]:
## Plot DecisionTree Result
# Precision, recall, F1-score and support charts built from treereport.
plot_algorithm_result(treereport, "DecisionTree on 10% datasets")

In [None]:
## Load full dataset. Plot and print frequency of each class in the full dataset
# NOTE(review): absolute path on one specific Windows machine.
train_data_2 = parse_data(r'C:\Users\CODER\CTDUML\CTDUML\.venv\CyberThreatDetection\Dataset\kddcup.data.corrected')
# Not the last expression of the cell, so this preview is not displayed.
train_data_2.head()
# Overwrites the encodings of the 10% data set. Codes are numbered in order of
# appearance within each file, so they need not match the 10% codes.
encodings = encode_data(train_data_2, (1, 2, 3))
train_data_2[41].value_counts().plot(kind='bar')
# Also not the last expression, so these counts are not displayed either.
train_data_2[41].value_counts()
# Keep the results of the 10% runs under new names ...
algorithm_10 = algorithm
train_time_10 = train_time
test_time_10 = test_time
accuracy_10 = accuracy
# ... and start fresh lists for the full-data runs. The names are rebound to
# new lists, so the *_10 lists above are not affected.
algorithm= []
train_time = []
test_time = []
accuracy = []

In [None]:
## Plot and print frequency of each class after mapping
category_labels_2 = train_data_2[41].map(category_attack_map)
# Series.map yields NaN for every label missing from the lookup. Check before
# overwriting the column so a missing entry (or running this cell a second
# time, when the labels are already categories) fails loudly and leaves the
# data untouched instead of silently replacing labels with NaN.
unmapped_labels_2 = train_data_2.loc[category_labels_2.isna(), 41].unique()
if len(unmapped_labels_2) > 0:
    raise ValueError(
        "Labels without a category mapping (was this cell already run?): %s"
        % list(unmapped_labels_2)
    )
train_data_2[41] = category_labels_2
# Bar chart of the category frequencies, then the counts as the cell output.
category_counts_2 = train_data_2[41].value_counts()
category_counts_2.plot(kind='bar')
category_counts_2

In [None]:
#train_data_2[41] = train_data_2[41].map(attack_category_encoding())
#train_data_2[41].value_counts().plot(kind='bar')

In [None]:
# Summary statistics (DataFrame.describe) for the full data set.
'''Describe Data using Panda Describe for 100%''' 
train_data_2.describe()

In [None]:
# Features: every column of the full data set except the label column 41.
X = train_data_2.drop(columns=[41])
# Select the label as a 1-D Series. `train_data_2[[41]]` would give a
# one-column DataFrame, which scikit-learn estimators accept only after
# emitting a DataConversionWarning ("column-vector y was passed").
y = train_data_2[41]
#X = X.drop(columns=[19])
#X = X.drop(columns=[20])
# Same 70/30 split and seed as for the 10% data set. This rebinds the
# X_train/X_test/y_train/y_test names used by test_classifier and
# tune_classifier.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=15)

In [None]:
## Bernoulli Naive Bayes Classifier
# Same hyper-parameters as the 10% run, fitted on the current split.
clf_BernoulliNB = BernoulliNB(alpha=0.01, class_prior=None, fit_prior=True)
# NOTE(review): test_classifier appends to train_time/test_time/accuracy, but
# no name is appended to `algorithm` in this cell, so the lists differ in length.
clf_BernoulliNB, breport= test_classifier(clf_BernoulliNB)
plot_algorithm_result(breport, "BernoulliNB on 100% datasets")

In [None]:
## Decision Tree Classifier
# Default hyper-parameters, fitted on the current split.
clf_DecisionTree = DecisionTreeClassifier()
# NOTE(review): test_classifier appends to train_time/test_time/accuracy, but
# no name is appended to `algorithm` in this cell, so the lists differ in length.
clf_DecisionTree, treereport = test_classifier(clf_DecisionTree)
plot_algorithm_result(treereport, "DecisionTree on 100% datasets")

In [None]:
def tune_classifier(clf):
    """
    Fit ``clf`` on the current split and evaluate it, without recording the run.

    Uses the notebook-level variables X_train, y_train, X_test and y_test.
    Prints the classification report, the accuracy and the wall-clock fit and
    predict times. Unlike test_classifier, nothing is appended to the
    train_time, test_time or accuracy lists.

    Returns:
        tuple: (fitted classifier, classification report as a dict).
    """
    t_start = time.time()
    clf = clf.fit(X_train, y_train)
    t_fitted = time.time()
    prediction = clf.predict(X_test)
    t_predicted = time.time()

    fit_seconds = t_fitted - t_start
    predict_seconds = t_predicted - t_fitted

    # Dict form is returned to the caller; text form is what gets printed.
    report_dict = metrics.classification_report(y_test, prediction, output_dict = True)
    print(metrics.classification_report(y_test, prediction))
    acc = metrics.accuracy_score(y_test, prediction)
    for template, value in (
        ("Accuracy Score: %s", acc),
        ("Classifier Training time = %s", fit_seconds),
        ("Classifier Prediction time = %s", predict_seconds),
    ):
        print(template % value)
    return clf, report_dict

In [None]:
## Decision Tree Classifier Parameter tuning
clf_DecisionTree = DecisionTreeClassifier(criterion = 'entropy')
# test_classifier returns (fitted classifier, report). Unpack both so that
# clf_DecisionTree stays an estimator instead of becoming a tuple.
# NOTE(review): test_classifier also appends this run to the timing/accuracy
# lists; tune_classifier (defined in the cell above) may have been intended.
clf_DecisionTree, treereport_entropy = test_classifier(clf_DecisionTree)

In [None]:
# Entropy criterion with class weights balanced automatically by scikit-learn.
clf_DecisionTree = DecisionTreeClassifier(criterion = 'entropy', class_weight = 'balanced')
# test_classifier returns (fitted classifier, report). Unpack both so that
# clf_DecisionTree stays an estimator instead of becoming a tuple.
clf_DecisionTree, treereport_entropy_balanced = test_classifier(clf_DecisionTree)

In [None]:
# Sweep the u2r class weight over 1, 26, ..., 201 with the entropy criterion,
# keeping every other class at weight 1, and collect one report per weight.
weight = {'dos': 1, 'normal': 1, 'probe': 1, 'r2l': 1}
tree_report_entropy = []
for classweight in range(1, 202, 25):
    weight['u2r'] = classweight
    clf_DecisionTree, report = tune_classifier(
        DecisionTreeClassifier(criterion = 'entropy', class_weight = weight)
    )
    tree_report_entropy.append(report)

In [None]:
def plot_parameter_tuning(report, title, weights=None):
    """
    Plot the u2r precision, recall and F1-score against the u2r class weight.

    Args:
        report: sequence of classification-report dicts, one per weight tried.
            Each must have a 'u2r' entry with 'precision', 'recall' and
            'f1-score'. The values are read from this argument; the function
            used to ignore it and always plot the global tree_report_entropy.
        title: figure title.
        weights: x value of each report. Defaults to 1, 26, 51, ... (step 25),
            one per report, which for nine reports equals range(1, 202, 25).
    """
    if weights is None:
        weights = range(1, 1 + 25 * len(report), 25)
    weights = list(weights)
    u2r_rows = [entry['u2r'] for entry in report]

    fig, ax = plt.subplots()
    plt.plot(weights, [row['precision'] for row in u2r_rows], label = "Precision")
    plt.plot(weights, [row['recall'] for row in u2r_rows], label = "Recall")
    plt.plot(weights, [row['f1-score'] for row in u2r_rows], label = "f1_score_list")
    plt.xlabel("u2r Weight")
    plt.legend(loc = "best")
    plt.suptitle(title)

In [None]:
# Same u2r weight sweep (1, 26, ..., 201) with the classifier's default
# criterion, keeping every other class at weight 1.
weight = {'dos': 1, 'normal': 1, 'probe': 1, 'r2l': 1}
tree_report = []
for classweight in range(1, 202, 25):
    weight['u2r'] = classweight
    clf_DecisionTree, report = tune_classifier(
        DecisionTreeClassifier(class_weight = weight)
    )
    tree_report.append(report)

In [None]:
def plot_parameter_tuning(report, title, weights=None):
    """
    Plot the u2r precision, recall and F1-score against the u2r class weight.

    This function is also defined in an earlier cell; this later definition is
    the one in effect for the calls below it.

    Args:
        report: sequence of classification-report dicts, one per weight tried.
            Each must have a 'u2r' entry with 'precision', 'recall' and
            'f1-score'. The values are read from this argument; the function
            used to ignore it and always plot the global tree_report_entropy.
        title: figure title.
        weights: x value of each report. Defaults to 1, 26, 51, ... (step 25),
            one per report, which for nine reports equals range(1, 202, 25).
    """
    if weights is None:
        weights = range(1, 1 + 25 * len(report), 25)
    weights = list(weights)
    u2r_rows = [entry['u2r'] for entry in report]

    fig, ax = plt.subplots()
    plt.plot(weights, [row['precision'] for row in u2r_rows], label = "Precision")
    plt.plot(weights, [row['recall'] for row in u2r_rows], label = "Recall")
    plt.plot(weights, [row['f1-score'] for row in u2r_rows], label = "f1_score_list")
    plt.xlabel("u2r Weight")
    plt.legend(loc = "best")
    plt.suptitle(title)
# One figure per sweep: the reports collected with criterion='entropy', then
# the reports collected with the classifier's default criterion.
plot_parameter_tuning(tree_report_entropy, 'Decision Tree Parameter Tuning (Entropy Criteria)')
plot_parameter_tuning(tree_report, 'Decision Tree Parameter Tuning (GINI Criteria)')

In [None]:
## Final Decision Tree Classifier
# Entropy criterion, u2r weighted 102, every other class weighted 1.
# NOTE(review): 102 is not one of the values tried in the sweeps
# (1, 26, ..., 201) - confirm it is intended.
weight = {'dos': 1, 'normal': 1, 'probe': 1, 'r2l': 1, 'u2r': 102}
clf_DecisionTree_final, treereport_final = test_classifier(
    DecisionTreeClassifier(criterion = 'entropy', class_weight = weight)
)
## Plot Final DecisionTree Result
plot_algorithm_result(treereport_final, "Final DecisionTree on 100% datasets")

In [None]:
## Saving Model
import pickle
# Write the fitted final classifier to DT_model.pkl in the working directory.
# The context manager closes the file, so the pickle is flushed to disk even
# if dumping fails part-way; the bare open() used before never closed it.
with open("DT_model.pkl", "wb") as model_file:
    pickle.dump(clf_DecisionTree_final, model_file)