## UNSW-NB15
Implementación de los IDS para el conjunto total de datos usando algoritmos de aprendizaje supervisado

In [7]:
####  ALL_DATA: 7 FEATURES - P1 ALGORITHMS

#  This code requires the file all_data_preproc.csv.
#  all_data_preproc.csv must be located in the same directory as the program.

#   The goal of this code is to apply machine learning algorithms to the dataset and observe their performance.
#   The algorithms used are: Naive Bayes, QDA, Random Forest, ID3, AdaBoost, MLP and K Nearest Neighbors.
#   The performance measures calculated and displayed are accuracy, precision, recall and F1-score. The time each algorithm takes to compute is also recorded.
#   A CSV file (results/results_2.csv) with the results and a directory (results/result_graph_1) for the corresponding graphs are also created.

##  Some parts of the code used for calculation and graphing are taken from the following site:
##  http://scikit-learn.org


from sklearn import metrics
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

import matplotlib.pyplot as plt
import numpy as np
#%matplotlib inline
import os
import pandas as pd
import csv
import time
import warnings
import math
warnings.filterwarnings("ignore")


# Run configuration.
# NOTE(review): every cell of this notebook uses this same results path and
# opens it in "w" mode first, so each run replaces the results written by the
# previous one - confirm that this is intended.
path = ""                                # prefix prepended to each dataset file name when it is read
csv_files = ["all_data_preproc.csv"]     # names of the dataset files to evaluate
result = "./results/results_2.csv"       # CSV file in which the scores are saved
repetition = 10                          # number of train/test rounds run for each algorithm


def folder(f_name):
    """Create the directory ``f_name`` (with any missing parents) if it is absent.

    Args:
        f_name: Path of the directory to create.

    Returns:
        None. An already existing directory is left untouched. Any failure
        (including ``f_name`` existing as a regular file) is reported on
        stdout instead of being raised, so the caller keeps running.
    """
    try:
        # exist_ok=True replaces the separate os.path.exists() test, which was
        # open to a check-then-create race and silently accepted a path that
        # exists but is not a directory.
        os.makedirs(f_name, exist_ok=True)
    except OSError:
        print ("The folder could not be created!")

# Output locations: the results folder first, then its graph sub-folder.
# After the loop folder_name holds the graph folder path.
for folder_name in ("./results/", "./results/result_graph_1/"):
    folder(folder_name)


# The classifiers under evaluation are kept in a dictionary keyed by the
# display name used in the printed table and in the results CSV.
ml_list = {
    "Naive Bayes": GaussianNB(),
    "QDA": QDA(),
    "Random Forest": RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1
    ),
    "ID3": DecisionTreeClassifier(max_depth=5, criterion="entropy"),
    "AdaBoost": AdaBoostClassifier(),
    "MLP": MLPClassifier(hidden_layer_sizes=(13, 13, 13), max_iter=500),
    "Nearest Neighbors": KNeighborsClassifier(3),
}


# Columns read from each dataset, keyed by the file name without extension.
# "Label" is the target column; the remaining columns are the model inputs.
features = {
    'all_data_preproc': [
        "dsport",
        "sbytes",
        "sport",
        "Sload",
        "dbytes",
        "Spkts",
        "dstip",
        "Label",
    ],
}

seconds = time.time()  # start of the whole run; used for the total-time report at the end

header_row = ["File", "ML algorithm", "accuracy", "Precision", "Recall", "F1-score", "Time"]
row_format = '%-17s %-17s  %-15s %-15s %-15s %-15s %-15s'

# Start a fresh results file that contains only the header row.
with open(result, "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerow(header_row)

# Run every classifier in ml_list on every dataset file.
for csv_file in csv_files:
    print(row_format % tuple(header_row))  # table header on screen

    # The file name without its extension is the key into the features dictionary.
    dataset_name = os.path.splitext(csv_file)[0]
    columns = features[dataset_name]
    df = pd.read_csv(path + csv_file, usecols=columns)
    df = df.fillna(0)

    # Binary target: 1 where the label is "BENIGN", 0 for every other label.
    y = (df["Label"] == "BENIGN").astype(int)
    # Model inputs: the configured columns except the target, in their configured order.
    feature_list = [column for column in columns if column != "Label"]
    X = df[feature_list]

    for ml_name, clf in ml_list.items():
        precision = []
        recall = []
        f1 = []
        accuracy = []
        t_time = []
        for rep in range(repetition):  # repeated hold-out evaluation
            second = time.time()  # time stamp for the processing time of this repetition

            # 80% train / 20% test split. The seed is offset by the repetition
            # index: a constant seed would evaluate the very same split in
            # every repetition. The first repetition keeps the seed used
            # before (the value of `repetition`).
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.20, random_state=repetition + rep)

            # Train the selected algorithm and predict the held-out part.
            clf.fit(X_train, y_train)
            predict = clf.predict(X_test)

            # Macro-averaged scores for this repetition.
            precision.append(float(precision_score(y_test, predict, average='macro')))
            recall.append(float(recall_score(y_test, predict, average='macro')))
            f1.append(float(f1_score(y_test, predict, average='macro')))
            # Accuracy is computed from the predictions already available;
            # clf.score() would run predict() on the test set a second time.
            accuracy.append(float(metrics.accuracy_score(y_test, predict)))
            t_time.append(float(time.time() - second))

        # Mean of each score over the repetitions, printed as one table row.
        print(row_format % (
            dataset_name,
            ml_name,
            str(round(np.mean(accuracy), 2)),
            str(round(np.mean(precision), 2)),
            str(round(np.mean(recall), 2)),
            str(round(np.mean(f1), 2)),
            str(round(np.mean(t_time), 4)),
        ))

        # Append one CSV row per repetition: file name, algorithm name,
        # accuracy, precision, recall, F1-score and elapsed time.
        with open(result, "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerows(
                [dataset_name, ml_name, *scores]
                for scores in zip(accuracy, precision, recall, f1, t_time)
            )

        # Optional: box plot of the per-repetition F1 scores, saved in the
        # folder named by folder_name. Remove the leading "#" to enable it.
        # plt.boxplot(f1)
        # plt.title("All Dataset - " +str(ml_name))
        # plt.ylabel('F-measure')
        # plt.savefig(folder_name+dataset_name+str(ml_name)+".pdf",bbox_inches='tight', papertype = 'a4', orientation = 'portrait', format = 'pdf')
        # plt.show()# shows the graphics while the script is running

print("Cálculo completado")
print("Tiempo total de ejecución: = ", time.time() - seconds, "segundos")


File              ML algorithm       accuracy        Precision       Recall          F1-score        Time           
all_data_preproc  Naive Bayes        0.86            0.59            0.52            0.51            2.2354         
all_data_preproc  QDA                0.87            0.66            0.55            0.56            2.4654         
all_data_preproc  Random Forest      0.98            0.98            0.91            0.94            19.0587        
all_data_preproc  ID3                0.98            0.98            0.95            0.96            9.4779         
all_data_preproc  AdaBoost           0.99            0.97            0.97            0.97            165.1903       
all_data_preproc  MLP                0.87            0.92            0.5             0.47            133.4903       
all_data_preproc  Nearest Neighbors  0.99            0.97            0.97            0.97            132.4598       
Cálculo completado
Tiempo total de ejecución: =  4652.0393166542

In [7]:
####  ALL_DATA: 7 FEATURES - P2 ALGORITHMS

#  This code requires the file all_data_preproc.csv.
#  all_data_preproc.csv must be located in the same directory as the program.

#   The goal of this code is to apply machine learning algorithms to the dataset and observe their performance.
#   The candidate algorithms are: Logistic Regression, Support Vector Machine, Multinomial Naive Bayes, SGD Classifier and Gradient Boosting (only the entries not commented out in ml_list are run).
#   The performance measures calculated and displayed are accuracy, precision, recall and F1-score. The time each algorithm takes to compute is also recorded.
#   A CSV file (results/results_2.csv) with the results and a directory (results/result_graph_1) for the corresponding graphs are also created.

##  Some parts of the code used for calculation and graphing are taken from the following site:
##  http://scikit-learn.org


from sklearn import metrics
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression #Logistic Regression
from sklearn.svm import SVC #Support Vector Machine
from sklearn.naive_bayes import MultinomialNB #Naive Bayes Multinomial
from sklearn.linear_model import SGDClassifier #Stochastic Gradient Descent Classifier
from sklearn.ensemble import GradientBoostingClassifier #Gradient Boosting Classifier

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import Normalizer

import matplotlib.pyplot as plt
import numpy as np
#%matplotlib inline
import os
import pandas as pd
import csv
import time
import warnings
import math
warnings.filterwarnings("ignore")


# Run configuration.
# NOTE(review): every cell of this notebook uses this same results path and
# opens it in "w" mode first, so each run replaces the results written by the
# previous one - confirm that this is intended.
path = ""                                # prefix prepended to each dataset file name when it is read
csv_files = ["all_data_preproc.csv"]     # names of the dataset files to evaluate
result = "./results/results_2.csv"       # CSV file in which the scores are saved
repetition = 1                           # number of train/test rounds run for each algorithm


def folder(f_name):
    """Create the directory ``f_name`` (with any missing parents) if it is absent.

    Args:
        f_name: Path of the directory to create.

    Returns:
        None. An already existing directory is left untouched. Any failure
        (including ``f_name`` existing as a regular file) is reported on
        stdout instead of being raised, so the caller keeps running.
    """
    try:
        # exist_ok=True replaces the separate os.path.exists() test, which was
        # open to a check-then-create race and silently accepted a path that
        # exists but is not a directory.
        os.makedirs(f_name, exist_ok=True)
    except OSError:
        print ("The folder could not be created!")

# Output locations: the results folder first, then its graph sub-folder.
# After the loop folder_name holds the graph folder path.
for folder_name in ("./results/", "./results/result_graph_1/"):
    folder(folder_name)


# The classifiers under evaluation are kept in a dictionary keyed by the
# display name used in the printed table and in the results CSV.
# Entries that are commented out are disabled candidates.
ml_list = {
    # "Logistic Regression": LogisticRegression(penalty='l2', dual=False, solver='newton-cg', class_weight='balanced'),
    # "Support Vector Machine": SVC(gamma='auto', kernel='linear', C=5),
    # "Naive Bayes Multinomial": MultinomialNB(alpha=0.0, fit_prior=False),
    # "SGD Classifier": SGDClassifier(max_iter=2000, tol=1e-4, loss='squared_hinge', penalty='l1'),
    "Gradient Boosting Classifier": GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=1.0,
        max_depth=1,
        random_state=0,
    ),
}


# Columns read from each dataset, keyed by the file name without extension.
# "Label" is the target column; the remaining columns are the model inputs.
features = {
    'all_data_preproc': [
        "dsport",
        "sbytes",
        "sport",
        "Sload",
        "dbytes",
        "Spkts",
        "dstip",
        "Label",
    ],
}

seconds = time.time()  # start of the whole run; used for the total-time report at the end

header_row = ["File", "ML algorithm", "accuracy", "Precision", "Recall", "F1-score", "Time"]
row_format = '%-17s %-17s  %-15s %-15s %-15s %-15s %-15s'

# Scaler class applied to the features of each algorithm that needs scaling,
# keyed by the algorithm's name in ml_list.
scaler_by_model = {
    "Naive Bayes Multinomial": MinMaxScaler,
    "SGD Classifier": MinMaxScaler,
    "Logistic Regression": MinMaxScaler,
    "Support Vector Machine": StandardScaler,
}

# Start a fresh results file that contains only the header row.
with open(result, "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerow(header_row)

# Run every classifier in ml_list on every dataset file.
for csv_file in csv_files:
    print(row_format % tuple(header_row))  # table header on screen

    # The file name without its extension is the key into the features dictionary.
    dataset_name = os.path.splitext(csv_file)[0]
    columns = features[dataset_name]
    df = pd.read_csv(path + csv_file, usecols=columns)
    df = df.fillna(0)
    # Work on a random subsample of at most 100000 rows. The size is capped at
    # the number of available rows because sample() raises when asked for more
    # rows than exist, and the seed makes the subsample repeatable between runs.
    df = df.sample(n=min(len(df), 100000), random_state=0)

    # Binary target: 1 where the label is "BENIGN", 0 for every other label.
    y = (df["Label"] == "BENIGN").astype(int)
    # Model inputs: the configured columns except the target, in their configured order.
    feature_list = [column for column in columns if column != "Label"]
    X = df[feature_list]

    for ml_name, clf in ml_list.items():
        precision = []
        recall = []
        f1 = []
        accuracy = []
        t_time = []
        scaler_cls = scaler_by_model.get(ml_name)
        for rep in range(repetition):  # repeated hold-out evaluation
            second = time.time()  # time stamp for the processing time of this repetition

            # 80% train / 20% test split. The seed is offset by the repetition
            # index: a constant seed would evaluate the very same split in
            # every repetition. The first repetition keeps the seed used
            # before (the value of `repetition`).
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.20, random_state=repetition + rep)

            if scaler_cls is not None:
                # Fit the scaler on the training part only, so that test-set
                # statistics do not leak into training, and leave X itself
                # unscaled so later repetitions and algorithms start from the
                # raw features instead of already-scaled ones.
                scaler = scaler_cls()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)

            # Train the selected algorithm and predict the held-out part.
            clf.fit(X_train, y_train)
            predict = clf.predict(X_test)

            # Macro-averaged scores for this repetition.
            precision.append(float(precision_score(y_test, predict, average='macro')))
            recall.append(float(recall_score(y_test, predict, average='macro')))
            f1.append(float(f1_score(y_test, predict, average='macro')))
            # Accuracy is computed from the predictions already available;
            # clf.score() would run predict() on the test set a second time.
            accuracy.append(float(metrics.accuracy_score(y_test, predict)))
            t_time.append(float(time.time() - second))

        # Mean of each score over the repetitions, printed as one table row.
        print(row_format % (
            dataset_name,
            ml_name,
            str(round(np.mean(accuracy), 2)),
            str(round(np.mean(precision), 2)),
            str(round(np.mean(recall), 2)),
            str(round(np.mean(f1), 2)),
            str(round(np.mean(t_time), 4)),
        ))

        # Append one CSV row per repetition: file name, algorithm name,
        # accuracy, precision, recall, F1-score and elapsed time.
        with open(result, "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerows(
                [dataset_name, ml_name, *scores]
                for scores in zip(accuracy, precision, recall, f1, t_time)
            )

        # Optional: box plot of the per-repetition F1 scores, saved in the
        # folder named by folder_name. Remove the leading "#" to enable it.
        # plt.boxplot(f1)
        # plt.title("All Dataset - " +str(ml_name))
        # plt.ylabel('F-measure')
        # plt.savefig(folder_name+dataset_name+str(ml_name)+".pdf",bbox_inches='tight', papertype = 'a4', orientation = 'portrait', format = 'pdf')
        # plt.show()# shows the graphics while the script is running

print("Cálculo completado")
print("Tiempo total de ejecución: = ", time.time() - seconds, "segundos")



File              ML algorithm       accuracy        Precision       Recall          F1-score        Time           
all_data_preproc  Gradient Boosting Classifier  0.94            0.97            0.74            0.81            4.1431         
Cálculo completado
Tiempo total de ejecución: =  11.26466989517212 segundos


In [2]:
####  ALL_DATA: 12 FEATURES - P1 ALGORITHMS

#  This code requires the file all_data_preproc.csv.
#  all_data_preproc.csv must be located in the same directory as the program.

#   The goal of this code is to apply machine learning algorithms to the dataset and observe their performance.
#   The candidate algorithms are: Naive Bayes, QDA, Random Forest, ID3, AdaBoost, MLP and K Nearest Neighbors (only the entries not commented out in ml_list are run).
#   The performance measures calculated and displayed are accuracy, precision, recall and F1-score. The time each algorithm takes to compute is also recorded.
#   A CSV file (results/results_2.csv) with the results and a directory (results/result_graph_1) for the corresponding graphs are also created.

##  Some parts of the code used for calculation and graphing are taken from the following site:
##  http://scikit-learn.org


from sklearn import metrics
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

import matplotlib.pyplot as plt
import numpy as np
#%matplotlib inline
import os
import pandas as pd
import csv
import time
import warnings
import math
warnings.filterwarnings("ignore")


# Run configuration.
# NOTE(review): every cell of this notebook uses this same results path and
# opens it in "w" mode first, so each run replaces the results written by the
# previous one - confirm that this is intended.
path = ""                                # prefix prepended to each dataset file name when it is read
csv_files = ["all_data_preproc.csv"]     # names of the dataset files to evaluate
result = "./results/results_2.csv"       # CSV file in which the scores are saved
repetition = 10                          # number of train/test rounds run for each algorithm


def folder(f_name):
    """Create the directory ``f_name`` (with any missing parents) if it is absent.

    Args:
        f_name: Path of the directory to create.

    Returns:
        None. An already existing directory is left untouched. Any failure
        (including ``f_name`` existing as a regular file) is reported on
        stdout instead of being raised, so the caller keeps running.
    """
    try:
        # exist_ok=True replaces the separate os.path.exists() test, which was
        # open to a check-then-create race and silently accepted a path that
        # exists but is not a directory.
        os.makedirs(f_name, exist_ok=True)
    except OSError:
        print ("The folder could not be created!")

# Output locations: the results folder first, then its graph sub-folder.
# After the loop folder_name holds the graph folder path.
for folder_name in ("./results/", "./results/result_graph_1/"):
    folder(folder_name)


# The classifiers under evaluation are kept in a dictionary keyed by the
# display name used in the printed table and in the results CSV.
# Entries that are commented out are disabled candidates.
ml_list = {
    # "Naive Bayes": GaussianNB(),
    # "QDA": QDA(),
    # "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    # "ID3": DecisionTreeClassifier(max_depth=5, criterion="entropy"),
    # "AdaBoost": AdaBoostClassifier(),
    # "MLP": MLPClassifier(hidden_layer_sizes=(13,13,13), max_iter=500),
    "Nearest Neighbors": KNeighborsClassifier(3),
}

# Columns read from each dataset, keyed by the file name without extension.
# "Label" is the target column; the remaining columns are the model inputs.
features = {
    'all_data_preproc': [
        "service",
        "dbytes",
        "dsport",
        "Sload",
        "sttl",
        "sbytes",
        "sloss",
        "sport",
        "proto",
        "Dpkts",
        "dstip",
        "Dload",
        "Label",
    ],
}


seconds = time.time()  # start of the whole run; used for the total-time report at the end

header_row = ["File", "ML algorithm", "accuracy", "Precision", "Recall", "F1-score", "Time"]
row_format = '%-17s %-17s  %-15s %-15s %-15s %-15s %-15s'

# Start a fresh results file that contains only the header row.
with open(result, "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerow(header_row)

# Run every classifier in ml_list on every dataset file.
for csv_file in csv_files:
    print(row_format % tuple(header_row))  # table header on screen

    # The file name without its extension is the key into the features dictionary.
    dataset_name = os.path.splitext(csv_file)[0]
    columns = features[dataset_name]
    df = pd.read_csv(path + csv_file, usecols=columns)
    df = df.fillna(0)

    # Binary target: 1 where the label is "BENIGN", 0 for every other label.
    y = (df["Label"] == "BENIGN").astype(int)
    # Model inputs: the configured columns except the target, in their configured order.
    feature_list = [column for column in columns if column != "Label"]
    X = df[feature_list]

    for ml_name, clf in ml_list.items():
        precision = []
        recall = []
        f1 = []
        accuracy = []
        t_time = []
        for rep in range(repetition):  # repeated hold-out evaluation
            second = time.time()  # time stamp for the processing time of this repetition

            # 80% train / 20% test split. The seed is offset by the repetition
            # index: a constant seed would evaluate the very same split in
            # every repetition. The first repetition keeps the seed used
            # before (the value of `repetition`).
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.20, random_state=repetition + rep)

            # Train the selected algorithm and predict the held-out part.
            clf.fit(X_train, y_train)
            predict = clf.predict(X_test)

            # Macro-averaged scores for this repetition.
            precision.append(float(precision_score(y_test, predict, average='macro')))
            recall.append(float(recall_score(y_test, predict, average='macro')))
            f1.append(float(f1_score(y_test, predict, average='macro')))
            # Accuracy is computed from the predictions already available;
            # clf.score() would run predict() on the test set a second time.
            accuracy.append(float(metrics.accuracy_score(y_test, predict)))
            t_time.append(float(time.time() - second))

        # Mean of each score over the repetitions, printed as one table row.
        print(row_format % (
            dataset_name,
            ml_name,
            str(round(np.mean(accuracy), 2)),
            str(round(np.mean(precision), 2)),
            str(round(np.mean(recall), 2)),
            str(round(np.mean(f1), 2)),
            str(round(np.mean(t_time), 4)),
        ))

        # Append one CSV row per repetition: file name, algorithm name,
        # accuracy, precision, recall, F1-score and elapsed time.
        with open(result, "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerows(
                [dataset_name, ml_name, *scores]
                for scores in zip(accuracy, precision, recall, f1, t_time)
            )

        # Optional: box plot of the per-repetition F1 scores, saved in the
        # folder named by folder_name. Remove the leading "#" to enable it.
        # plt.boxplot(f1)
        # plt.title("All Dataset - " +str(ml_name))
        # plt.ylabel('F-measure')
        # plt.savefig(folder_name+dataset_name+str(ml_name)+".pdf",bbox_inches='tight', papertype = 'a4', orientation = 'portrait', format = 'pdf')
        # plt.show()# shows the graphics while the script is running

print("Cálculo completado")
print("Tiempo total de ejecución: = ", time.time() - seconds, "segundos")


File              ML algorithm       accuracy        Precision       Recall          F1-score        Time           
all_data_preproc  Nearest Neighbors  0.99            0.98            0.97            0.97            268.961        
Cálculo completado
Tiempo total de ejecución: =  820.0980660915375 segundos


In [5]:
####  ALL_DATA: 12 FEATURES - P2 ALGORITHMS

#  This code requires the file all_data_preproc.csv.
#  all_data_preproc.csv must be located in the same directory as the program.

#   The goal of this code is to apply machine learning algorithms to the dataset and observe their performance.
#   The candidate algorithms are: Logistic Regression, Support Vector Machine, Multinomial Naive Bayes, SGD Classifier and Gradient Boosting (only the entries not commented out in ml_list are run).
#   The performance measures calculated and displayed are accuracy, precision, recall and F1-score. The time each algorithm takes to compute is also recorded.
#   A CSV file (results/results_2.csv) with the results and a directory (results/result_graph_1) for the corresponding graphs are also created.

##  Some parts of the code used for calculation and graphing are taken from the following site:
##  http://scikit-learn.org


from sklearn import metrics
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression #Logistic Regression
from sklearn.svm import SVC #Support Vector Machine
from sklearn.naive_bayes import MultinomialNB #Naive Bayes Multinomial
from sklearn.linear_model import SGDClassifier #Stochastic Gradient Descent Classifier
from sklearn.ensemble import GradientBoostingClassifier #Gradient Boosting Classifier

from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import Normalizer

import matplotlib.pyplot as plt
import numpy as np
#%matplotlib inline
import os
import pandas as pd
import csv
import time
import warnings
import math
warnings.filterwarnings("ignore")


# Run configuration.
# NOTE(review): every cell of this notebook uses this same results path and
# opens it in "w" mode first, so each run replaces the results written by the
# previous one - confirm that this is intended.
path = ""                                # prefix prepended to each dataset file name when it is read
csv_files = ["all_data_preproc.csv"]     # names of the dataset files to evaluate
result = "./results/results_2.csv"       # CSV file in which the scores are saved
repetition = 1                           # number of train/test rounds run for each algorithm


def folder(f_name):
    """Create the directory ``f_name`` (with any missing parents) if it is absent.

    Args:
        f_name: Path of the directory to create.

    Returns:
        None. An already existing directory is left untouched. Any failure
        (including ``f_name`` existing as a regular file) is reported on
        stdout instead of being raised, so the caller keeps running.
    """
    try:
        # exist_ok=True replaces the separate os.path.exists() test, which was
        # open to a check-then-create race and silently accepted a path that
        # exists but is not a directory.
        os.makedirs(f_name, exist_ok=True)
    except OSError:
        print ("The folder could not be created!")

# Output locations: the results folder first, then its graph sub-folder.
# After the loop folder_name holds the graph folder path.
for folder_name in ("./results/", "./results/result_graph_1/"):
    folder(folder_name)


# The classifiers under evaluation are kept in a dictionary keyed by the
# display name used in the printed table and in the results CSV.
# Entries that are commented out are disabled candidates.
ml_list = {
    # "Logistic Regression": LogisticRegression(penalty='l2', dual=False, solver='newton-cg', class_weight='balanced'),
    # "Support Vector Machine": SVC(gamma='auto', kernel='linear', C=5),
    # "Naive Bayes Multinomial": MultinomialNB(alpha=0.0, fit_prior=False),
    # "SGD Classifier": SGDClassifier(max_iter=2000, tol=1e-4, loss='squared_hinge', penalty='l1'),
    "Gradient Boosting Classifier": GradientBoostingClassifier(
        n_estimators=100,
        learning_rate=1.0,
        max_depth=1,
        random_state=0,
    ),
}


# Columns read from each dataset, keyed by the file name without extension.
# "Label" is the target column; the remaining columns are the model inputs.
features = {
    'all_data_preproc': [
        "service",
        "dbytes",
        "dsport",
        "Sload",
        "sttl",
        "sbytes",
        "sloss",
        "sport",
        "proto",
        "Dpkts",
        "dstip",
        "Dload",
        "Label",
    ],
}

seconds = time.time()  # start of the whole run; used for the total-time report at the end

header_row = ["File", "ML algorithm", "accuracy", "Precision", "Recall", "F1-score", "Time"]
row_format = '%-17s %-17s  %-15s %-15s %-15s %-15s %-15s'

# Scaler class applied to the features of each algorithm that needs scaling,
# keyed by the algorithm's name in ml_list.
scaler_by_model = {
    "Naive Bayes Multinomial": MinMaxScaler,
    "SGD Classifier": MinMaxScaler,
    "Logistic Regression": MinMaxScaler,
    "Support Vector Machine": StandardScaler,
}

# Start a fresh results file that contains only the header row.
with open(result, "w", newline="", encoding="utf-8") as f:
    csv.writer(f).writerow(header_row)

# Run every classifier in ml_list on every dataset file.
for csv_file in csv_files:
    print(row_format % tuple(header_row))  # table header on screen

    # The file name without its extension is the key into the features dictionary.
    dataset_name = os.path.splitext(csv_file)[0]
    columns = features[dataset_name]
    df = pd.read_csv(path + csv_file, usecols=columns)
    df = df.fillna(0)
    # Work on a random subsample of at most 100000 rows. The size is capped at
    # the number of available rows because sample() raises when asked for more
    # rows than exist, and the seed makes the subsample repeatable between runs.
    df = df.sample(n=min(len(df), 100000), random_state=0)

    # Binary target: 1 where the label is "BENIGN", 0 for every other label.
    y = (df["Label"] == "BENIGN").astype(int)
    # Model inputs: the configured columns except the target, in their configured order.
    feature_list = [column for column in columns if column != "Label"]
    X = df[feature_list]

    for ml_name, clf in ml_list.items():
        precision = []
        recall = []
        f1 = []
        accuracy = []
        t_time = []
        scaler_cls = scaler_by_model.get(ml_name)
        for rep in range(repetition):  # repeated hold-out evaluation
            second = time.time()  # time stamp for the processing time of this repetition

            # 80% train / 20% test split. The seed is offset by the repetition
            # index: a constant seed would evaluate the very same split in
            # every repetition. The first repetition keeps the seed used
            # before (the value of `repetition`).
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.20, random_state=repetition + rep)

            if scaler_cls is not None:
                # Fit the scaler on the training part only, so that test-set
                # statistics do not leak into training, and leave X itself
                # unscaled so later repetitions and algorithms start from the
                # raw features instead of already-scaled ones.
                scaler = scaler_cls()
                X_train = scaler.fit_transform(X_train)
                X_test = scaler.transform(X_test)

            # Train the selected algorithm and predict the held-out part.
            clf.fit(X_train, y_train)
            predict = clf.predict(X_test)

            # Macro-averaged scores for this repetition.
            precision.append(float(precision_score(y_test, predict, average='macro')))
            recall.append(float(recall_score(y_test, predict, average='macro')))
            f1.append(float(f1_score(y_test, predict, average='macro')))
            # Accuracy is computed from the predictions already available;
            # clf.score() would run predict() on the test set a second time.
            accuracy.append(float(metrics.accuracy_score(y_test, predict)))
            t_time.append(float(time.time() - second))

        # Mean of each score over the repetitions, printed as one table row.
        print(row_format % (
            dataset_name,
            ml_name,
            str(round(np.mean(accuracy), 2)),
            str(round(np.mean(precision), 2)),
            str(round(np.mean(recall), 2)),
            str(round(np.mean(f1), 2)),
            str(round(np.mean(t_time), 4)),
        ))

        # Append one CSV row per repetition: file name, algorithm name,
        # accuracy, precision, recall, F1-score and elapsed time.
        with open(result, "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerows(
                [dataset_name, ml_name, *scores]
                for scores in zip(accuracy, precision, recall, f1, t_time)
            )

        # Optional: box plot of the per-repetition F1 scores, saved in the
        # folder named by folder_name. Remove the leading "#" to enable it.
        # plt.boxplot(f1)
        # plt.title("All Dataset - " +str(ml_name))
        # plt.ylabel('F-measure')
        # plt.savefig(folder_name+dataset_name+str(ml_name)+".pdf",bbox_inches='tight', papertype = 'a4', orientation = 'portrait', format = 'pdf')
        # plt.show()# shows the graphics while the script is running

print("Cálculo completado")
print("Tiempo total de ejecución: = ", time.time() - seconds, "segundos")



File              ML algorithm       accuracy        Precision       Recall          F1-score        Time           


all_data_preproc  Gradient Boosting Classifier  0.97            0.92            0.98            0.95            12.9346        
Cálculo completado
Tiempo total de ejecución: =  21.877104997634888 segundos
