## UNSW-NB15
Implementación de los IDS separadamente para cada ataque usando algoritmos de aprendizaje supervisado

In [1]:
#  Se requiere el archivo all_data.csv para la ejecución de este código.
#  El archivo all_data.csv debe estar ubicado en el mismo directorio que el programa.

#   El objetivo de este código es aplicar algoritmos de aprendizaje automático al dataset y observar su desempeño
#   Los algoritmos usados son: Naive Bayes, QDA, Random Forest, ID3, AdaBoost, MLP y K Nearest Neighbors
#   Las medidas de rendimiento calculadas y mostradas son accuracy, precision, recall y F1-score. También se recoge para cada uno el tiempo que ha tardado en calcular.
#   Se creará también un archivo CSV (results/results_1.csv) con los resultados y un directorio (results/result_graph_1) para los correspondientes gráficos

##  Some parts of the code used for calculation and graphing are taken from the following site:
##  http://scikit-learn.org


from sklearn import metrics
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

import matplotlib.pyplot as plt
import numpy as np
#%matplotlib inline
import os
import pandas as pd
import csv
import time
import warnings
import math
warnings.filterwarnings("ignore")


# Per-run results (one row per file, algorithm and repetition) are written to this CSV.
result = "./results/results_1.csv"
# The attack names are taken from the CSV files found in the "attacks" directory.
# Non-CSV entries are skipped because the main loop derives the attack name by
# stripping a 4-character ".csv" suffix; sorting makes the processing order deterministic.
csv_files = sorted(name for name in os.listdir("attacks") if name.lower().endswith(".csv"))
# Directory prefix that is prepended to each file name. Built with os.path.join so the
# separator matches the host OS (the original hard-coded Windows backslashes).
path = os.path.join(".", "attacks", "")
# Number of train/test repetitions run for every algorithm.
repetition = 10


def folder(f_name):
    """Create the directory ``f_name`` (including missing parents) if needed.

    An already existing directory is left untouched. Any ``OSError`` raised
    while creating it (for example when a regular file occupies the path) is
    reported on stdout instead of being propagated, so the script keeps running.
    """
    try:
        # exist_ok=True replaces the separate os.path.exists() check, which was
        # racy and silently accepted a non-directory sitting at f_name.
        os.makedirs(f_name, exist_ok=True)
    except OSError:
        print ("The folder could not be created!")

# Make sure the output directories exist. After the loop, folder_name still holds
# the graph directory, which is the value the (disabled) plotting code refers to.
for folder_name in ("./results/", "./results/result_graph_1/"):
    folder(folder_name)


# Classifiers under evaluation, keyed by the display name that is used in the
# printed table and in the results CSV.
ml_list = {
    "Naive Bayes": GaussianNB(),
    "QDA": QDA(),
    "Random Forest": RandomForestClassifier(
        n_estimators=10,
        max_depth=5,
        max_features=1,
    ),
    "ID3": DecisionTreeClassifier(criterion="entropy", max_depth=5),
    "AdaBoost": AdaBoostClassifier(),
    "MLP": MLPClassifier(hidden_layer_sizes=(13, 13, 13), max_iter=500),
    "Nearest Neighbors": KNeighborsClassifier(n_neighbors=3),
}



# Columns read for each attack file: the four features selected for that attack
# (see importance_list_for_attack_files.csv) followed by the 'Label' column.
features = {
    attack: [*columns, 'Label']
    for attack, columns in (
        ('Generic', ("service", "dbytes", "dsport", "Sload")),
        ('Exploits', ("sttl", "dsport", "sbytes", "dbytes")),
        ('Fuzzers', ("sttl", "sloss", "sport", "Sload")),
        ('DoS', ("sttl", "dsport", "dbytes", "sbytes")),
        ('Reconnaissance', ("sttl", "dsport", "proto", "Dpkts")),
        ('Analysis', ("sttl", "dsport", "dstip", "proto")),
        ('Backdoor', ("sttl", "dsport", "dbytes", "Dload")),
        ('Shellcode', ("sttl", "dsport", "sbytes", "dbytes")),
        ('Worms', ("sttl", "dsport", "sbytes", "sport")),
    )
}

seconds = time.time()  # start of the whole run, used for the total time reported at the end

# Create (or truncate) the results CSV and write its header row.
with open(result, "w", newline="", encoding="utf-8") as f:
    wrt = csv.writer(f)
    wrt.writerow(["File", "ML algorithm", "accuracy", "Precision", "Recall", "F1-score", "Time"])

# For each attack file, train and evaluate every classifier in ml_list.
for j in csv_files:
    print ('%-17s %-17s  %-15s %-15s %-15s %-15s %-15s' % ("File","ML algorithm","accuracy","Precision", "Recall" , "F1-score","Time"))
    a = []  # one list of F1 scores per algorithm; only read by the disabled box-plot code below

    attack_name = j[0:-4]  # file name without its 4-character ".csv" extension
    feature_list = list(features[attack_name])  # copy, so removing 'Label' below leaves `features` intact
    df = pd.read_csv(path + j, usecols=feature_list)  # read only the selected columns of the attack file
    df = df.fillna(0)

    # Binary target: rows labelled "BENIGN" become 1, every other label becomes 0.
    df["Label"] = [1 if label == "BENIGN" else 0 for label in df["Label"]]

    # Separate the target (y) from the feature columns (X).
    y = df.pop("Label")
    feature_list.remove('Label')
    X = df[feature_list]

    for ii in ml_list:  # iterate over the algorithm names
        precision = []
        recall = []
        f1 = []
        accuracy = []
        t_time = []
        for i in range(repetition):
            second = time.time()  # start of this repetition

            # 80% train / 20% test hold-out split. The repetition index is used as the
            # seed so that every repetition evaluates a different (but reproducible)
            # split; seeding with the constant `repetition` made all splits identical.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.20, random_state=i)

            # Fit the selected classifier and predict the held-out rows.
            clf = ml_list[ii]
            clf.fit(X_train, y_train)
            predict = clf.predict(X_test)

            # Macro-averaged metrics over the two classes.
            f1.append(float(f1_score(y_test, predict, average='macro')))
            precision.append(float(precision_score(y_test, predict, average='macro')))
            recall.append(float(recall_score(y_test, predict, average='macro')))
            # Accuracy is computed from the predictions already made instead of
            # clf.score(), which would run the prediction a second time.
            accuracy.append(metrics.accuracy_score(y_test, predict))
            t_time.append(float(time.time() - second))

        # Print the mean of each metric over all repetitions.
        print ('%-17s %-17s  %-15s %-15s %-15s %-15s %-15s' % (attack_name,ii,str(round(np.mean(accuracy),2)),str(round(np.mean(precision),2)),
            str(round(np.mean(recall),2)),str(round(np.mean(f1),2)),str(round(np.mean(t_time),4))))

        # Append one CSV row per repetition: file, algorithm, accuracy, precision, recall, F1, time.
        with open(result, "a", newline="", encoding="utf-8") as f:
            wrt = csv.writer(f)
            wrt.writerows(
                [attack_name, ii, acc, pr, rc, f_1, elapsed]
                for acc, pr, rc, f_1, elapsed in zip(accuracy, precision, recall, f1, t_time)
            )
        a.append(f1)


    #   Box-and-whisker plots (currently disabled) would be generated here and saved in result_graph_1:

    #ml=["Naive Bayes","QDA","Random Forest","ID3","AdaBoost","MLP","Nearest Neighbors"]
    #temp=0
    #fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(12, 6), sharey=True)
    #for c in range(2):
    #    for b in range(4):
    #        axes[c, b].boxplot(a[temp] )
    #        axes[c, b].set_title(str(j[0:-4])+" - "+str(ml[temp]),fontsize=7)
    #        axes[c, b].set_ylabel(("F measure"))
    #        temp+=1
    #        if temp==7:
    #            break
    #    if temp==7:
    #        break
    #plt.savefig(folder_name+j[0:-4]+".pdf",bbox_inches='tight', papertype = 'a4', orientation = 'portrait', format = 'pdf')
    #plt.show()
    print("\n------------------------------------------------------------------------------------------------------\n\n")

print("Cálculo completado")
print("Tiempo total de ejecución: = ",time.time()- seconds ,"segundos")


File              ML algorithm       accuracy        Precision       Recall          F1-score        Time           
Analysis          Naive Bayes        0.85            0.83            0.83            0.83            0.0153         
Analysis          QDA                0.99            0.98            0.98            0.98            0.0171         
Analysis          Random Forest      1.0             1.0             1.0             1.0             0.0316         
Analysis          ID3                1.0             1.0             0.99            0.99            0.0116         
Analysis          AdaBoost           1.0             1.0             1.0             1.0             0.1863         
Analysis          MLP                0.94            0.95            0.93            0.93            0.7534         
Analysis          Nearest Neighbors  0.99            0.99            0.99            0.99            0.27           

---------------------------------------------------------------

In [8]:
##  "all_data.csv" file is required for the operation of the program.
##  "all_data.csv" file must be located in the same directory as the program.

##  the purpose of this program is to apply machine learning algorithms to the dataset and observe the performance of algorithms.
##  The algorithms used are: Logistic Regression, Support Vector Machine, Multinomial Naive Bayes, SGD Classifier and Gradient Boosting Classifier.
##  For each run the program displays: file name, machine learning algorithm name, accuracy, precision, recall, F1-score and time.
##  The program also writes the results to a CSV file (results_2.csv); the creation of the graphics folder is currently commented out.

##  Some parts of the code used for calculation and graphing are taken from the following site:
##  http://scikit-learn.org


from sklearn import metrics


from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

#Añadimos 5 nuevos algoritmos de clasificación (supervisado):
from sklearn.linear_model import LogisticRegression #Logistic Regression
from sklearn.svm import SVC #Support Vector Machine
from sklearn.naive_bayes import MultinomialNB #Naive Bayes Multinomial
from sklearn.linear_model import SGDClassifier #Stochastic Gradient Descent Classifier
from sklearn.ensemble import GradientBoostingClassifier #Gradient Boosting Classifier
#from lightgbm import LGBMClassifier
#from xgboost.sklearn import XGBClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import Normalizer

import matplotlib.pyplot as plt
import numpy as np
#%matplotlib inline
import os
import pandas as pd
import csv
import time
import warnings
import math
warnings.filterwarnings("ignore")


# CSV file where the results of this cell are stored.
result = "./results/results_2.csv"
#csv_files=os.listdir("attacks")
# Temporary hand-picked selection of attack files; un-comment entries to include them.
# TODO: remove this list later and go back to listing the "attacks" directory.
csv_files = [
            #'Generic.csv',
            'Exploits.csv',
            #'Fuzzers.csv',
            #'DoS.csv',
            #'Reconnaissance.csv',
            #'Analysis.csv',
            #'Backdoor.csv',
            #'Shellcode.csv',
            #'Worms.csv'
]
# Directory prefix that is prepended to each file name. Built with os.path.join so the
# separator matches the host OS (the original hard-coded Windows backslashes).
path = os.path.join(".", "attacks", "")
# Number of train/test repetitions run for every algorithm.
repetition = 1


def folder(f_name):
    """Create the directory ``f_name`` (including missing parents) if needed.

    An already existing directory is left untouched. Any ``OSError`` raised
    while creating it (for example when a regular file occupies the path) is
    reported on stdout instead of being propagated, so the script keeps running.
    """
    try:
        # exist_ok=True replaces the separate os.path.exists() check, which was
        # racy and silently accepted a non-directory sitting at f_name.
        os.makedirs(f_name, exist_ok=True)
    except OSError:
        print ("The folder could not be created!")

#folder_name="./results/"
#folder(folder_name)
#folder_name="./results/result_graph_2/"
#folder(folder_name)


# Classifiers evaluated in this cell, keyed by the display name that is used in
# the printed table, in the results CSV and to pick the feature scaler.
ml_list = {
    "Logistic Regression": LogisticRegression(
        penalty='l2',
        dual=False,
        solver='newton-cg',
        class_weight='balanced',
    ),
    "Support Vector Machine": SVC(kernel='linear', C=5, gamma='auto'),
    "Naive Bayes Multinomial": MultinomialNB(alpha=0.0, fit_prior=False),
    "SGD Classifier": SGDClassifier(
        loss='squared_hinge',
        penalty='l1',
        max_iter=2000,
        tol=1e-4,
    ),
    "Gradient Boosting Classifier": GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=1.0,
        max_depth=1,
        random_state=0,
    ),
}


# Columns read for each attack file: four feature columns followed by 'Label'.
features = {
    attack: [*columns, 'Label']
    for attack, columns in (
        ('Generic', ("service", "dbytes", "dsport", "Sload")),
        ('Exploits', ("sttl", "dsport", "sbytes", "dbytes")),
        ('Fuzzers', ("sttl", "sloss", "sport", "Sload")),
        ('DoS', ("sttl", "dsport", "dbytes", "sbytes")),
        ('Reconnaissance', ("sttl", "dsport", "proto", "Dpkts")),
        ('Analysis', ("sttl", "dsport", "dstip", "proto")),
        ('Backdoor', ("sttl", "dsport", "dbytes", "Dload")),
        ('Shellcode', ("sttl", "dsport", "sbytes", "dbytes")),
        ('Worms', ("sttl", "dsport", "sbytes", "sport")),
    )
}

seconds = time.time()  # start of the whole run, used for the total time reported at the end

# The folder-creation calls of this cell are commented out, so make sure the
# directory of the results file exists before opening it for writing.
results_dir = os.path.dirname(result)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

# Create (or truncate) the results CSV and write its header row.
with open(result, "w", newline="", encoding="utf-8") as f:
    wrt = csv.writer(f)
    wrt.writerow(["File", "ML algorithm", "accuracy", "Precision", "Recall", "F1-score", "Time"])

# Feature scaler used per algorithm; algorithms that are not listed are trained
# on the unscaled features.
scaler_for = {
    "Naive Bayes Multinomial": MinMaxScaler,
    "SGD Classifier": MinMaxScaler,
    "Logistic Regression": MinMaxScaler,
    "Support Vector Machine": StandardScaler,
}

# For each selected attack file, train and evaluate every classifier in ml_list.
for j in csv_files:
    print ('%-17s %-27s  %-15s %-15s %-15s %-15s %-15s' % ("File","ML algorithm","accuracy","Precision", "Recall" , "F1-score","Time"))# print output header
    a = []  # one list of F1 scores per algorithm

    attack_name = j[0:-4]  # file name without its 4-character ".csv" extension
    feature_list = list(features[attack_name])  # copy, so removing 'Label' below leaves `features` intact
    df = pd.read_csv(path + j, usecols=feature_list)
    # Work on a random sample of at most 50000 rows; capping at len(df) avoids the
    # ValueError that sample() raises when the file has fewer rows than requested.
    df = df.sample(min(len(df), 50000))
    df = df.fillna(0)

    # Binary target: rows labelled "BENIGN" become 1, every other label becomes 0.
    df["Label"] = [1 if label == "BENIGN" else 0 for label in df["Label"]]

    # Separate the target (y) from the feature columns (X).
    y = df.pop("Label")
    feature_list.remove('Label')
    X = df[feature_list]

    for ii in ml_list:  # iterate over the algorithm names
        precision = []
        recall = []
        f1 = []
        accuracy = []
        t_time = []
        for i in range(repetition):
            second = time.time()  # start of this repetition

            # 80% train / 20% test hold-out split. The repetition index is used as the
            # seed so that every repetition evaluates a different (but reproducible)
            # split; seeding with the constant `repetition` made all splits identical.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.20, random_state=i)

            # Scale the features when the algorithm has a scaler assigned. The scaler
            # is fitted on the training part only and then applied to the test part.
            scaler_cls = scaler_for.get(ii)
            if scaler_cls is not None:
                scaler = scaler_cls()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)

            # Fit the selected classifier and predict the held-out rows.
            clf = ml_list[ii]
            clf.fit(X_train, y_train)
            predict = clf.predict(X_test)

            # Macro-averaged metrics over the two classes.
            f1.append(float(f1_score(y_test, predict, average='macro')))
            precision.append(float(precision_score(y_test, predict, average='macro')))
            recall.append(float(recall_score(y_test, predict, average='macro')))
            # Accuracy is computed from the predictions already made instead of
            # clf.score(), which would run the prediction a second time.
            accuracy.append(metrics.accuracy_score(y_test, predict))
            t_time.append(float(time.time() - second))

        # Print the mean of each metric over all repetitions.
        print ('%-17s %-27s  %-15s %-15s %-15s %-15s %-15s' % (attack_name,ii,str(round(np.mean(accuracy),2)),str(round(np.mean(precision),2)),
            str(round(np.mean(recall),2)),str(round(np.mean(f1),2)),str(round(np.mean(t_time),4))))

        # Append one CSV row per repetition: file, algorithm, accuracy, precision, recall, F1, time.
        with open(result, "a", newline="", encoding="utf-8") as f:
            wrt = csv.writer(f)
            wrt.writerows(
                [attack_name, ii, acc, pr, rc, f_1, elapsed]
                for acc, pr, rc, f_1, elapsed in zip(accuracy, precision, recall, f1, t_time)
            )
        a.append(f1)




    print("\n------------------------------------------------------------------------------------------------------\n\n")

print("mission accomplished!")
print("Total operation time: = ",time.time()- seconds ,"seconds")


File              ML algorithm                 accuracy        Precision       Recall          F1-score        Time           
Exploits          Logistic Regression          0.96            0.96            0.95            0.95            0.1152         
Exploits          Support Vector Machine       0.87            0.89            0.79            0.82            58.2061        
Exploits          Naive Bayes Multinomial      0.62            0.63            0.65            0.6             0.0914         
Exploits          SGD Classifier               0.79            0.75            0.73            0.74            0.6082         
Exploits          Gradient Boosting Classifier  0.98            0.98            0.99            0.98            2.5647         

------------------------------------------------------------------------------------------------------


mission accomplished!
Total operation time: =  62.11910939216614 seconds
