In [1]:
import numpy as np  
import pandas as pd 
import pickle
from prettytable import PrettyTable
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
# Leer los datasets
path = './combined_data.csv'  # Importar de los csv
dataset = pd.read_csv(path, header = None, skiprows=1) # Concatenar datos de csv

In [3]:
dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
0,175.45.176.0,26088,149.171.126.14,80,tcp,CON,10.147043,2236,457700,62,...,0,0,0,1,1,1,1,1,1,1
1,175.45.176.2,44712,149.171.126.11,179,tcp,FIN,0.195649,172,258,254,...,0,0,0,1,1,1,1,1,1,1
2,59.166.0.3,1066,149.171.126.6,143,tcp,FIN,0.513527,4190,10244,31,...,0,0,0,1,1,1,1,1,1,1
3,149.171.126.11,4433,175.45.176.0,20108,udp,CON,4.631897,2652,2144,60,...,0,0,0,1,1,1,1,1,1,1
4,175.45.176.0,36014,149.171.126.11,25,tcp,CON,5.477102,300727,2640,62,...,0,0,0,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39809,59.166.0.4,15966,149.171.126.3,53,udp,CON,0.001002,146,178,31,...,0,0,0,1,1,1,1,1,1,1
39810,59.166.0.4,63665,149.171.126.1,80,tcp,CON,0.024294,994,8896,31,...,1,0,0,1,1,1,1,1,1,1
39811,59.166.0.8,18586,149.171.126.7,5190,tcp,FIN,0.004097,1470,1728,31,...,0,0,0,1,1,1,1,1,1,1
39812,59.166.0.9,61840,149.171.126.6,53,udp,CON,0.001048,146,178,31,...,0,0,0,1,1,1,1,1,1,1


In [4]:
# Importar nombres columnas
columnas_dataset = pd.read_csv('./machinelearningbinario/NUSW-NB15_features.csv', encoding='ISO-8859-1')
columnas_dataset

Unnamed: 0,No.,Name,Type,Description
0,1,srcip,nominal,Source IP address
1,2,sport,integer,Source port number
2,3,dstip,nominal,Destination IP address
3,4,dsport,integer,Destination port number
4,5,proto,nominal,Transaction protocol
5,6,state,nominal,Indicates to the state and its dependent proto...
6,7,dur,Float,Record total duration
7,8,sbytes,Integer,Source to destination transaction bytes
8,9,dbytes,Integer,Destination to source transaction bytes
9,10,sttl,Integer,Source to destination time to live value


In [5]:
# Todos los nombres en letra pequeña y eliminando espacios
columnas_dataset = columnas_dataset.iloc[:47, :]
columnas_dataset['Name'] = columnas_dataset['Name'].apply(lambda x: x.strip().replace(' ', '').lower())
columnas_dataset

Unnamed: 0,No.,Name,Type,Description
0,1,srcip,nominal,Source IP address
1,2,sport,integer,Source port number
2,3,dstip,nominal,Destination IP address
3,4,dsport,integer,Destination port number
4,5,proto,nominal,Transaction protocol
5,6,state,nominal,Indicates to the state and its dependent proto...
6,7,dur,Float,Record total duration
7,8,sbytes,Integer,Source to destination transaction bytes
8,9,dbytes,Integer,Destination to source transaction bytes
9,10,sttl,Integer,Source to destination time to live value


In [6]:
# Renombrar las columnas del dataset
dataset.columns = columnas_dataset['Name']

In [7]:
dataset.shape

(39814, 47)

In [8]:
dataset.head()

Name,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_flw_http_mthd,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
0,175.45.176.0,26088,149.171.126.14,80,tcp,CON,10.147043,2236,457700,62,...,0,0,0,1,1,1,1,1,1,1
1,175.45.176.2,44712,149.171.126.11,179,tcp,FIN,0.195649,172,258,254,...,0,0,0,1,1,1,1,1,1,1
2,59.166.0.3,1066,149.171.126.6,143,tcp,FIN,0.513527,4190,10244,31,...,0,0,0,1,1,1,1,1,1,1
3,149.171.126.11,4433,175.45.176.0,20108,udp,CON,4.631897,2652,2144,60,...,0,0,0,1,1,1,1,1,1,1
4,175.45.176.0,36014,149.171.126.11,25,tcp,CON,5.477102,300727,2640,62,...,0,0,0,1,1,1,1,1,1,1


In [9]:
dataset.isnull().sum()

Name
srcip                  0
sport                  0
dstip                  0
dsport                 0
proto                  0
state                  0
dur                    0
sbytes                 0
dbytes                 0
sttl                   0
dttl                   0
sloss                  0
dloss                  0
service                0
sload                  0
dload                  0
spkts                  0
dpkts                  0
swin                   0
dwin                   0
stcpb                  0
dtcpb                  0
smeansz                0
dmeansz                0
trans_depth            0
res_bdy_len            0
sjit                1269
djit                 290
stime                  0
ltime                  0
sintpkt                0
dintpkt                0
tcprtt                 0
synack                 0
ackdat                 0
is_sm_ips_ports        0
ct_state_ttl           0
ct_flw_http_mthd       0
is_ftp_login           0
ct_ftp_cmd          

In [10]:
dataset['sjit'] = dataset.sjit.fillna(value=0)

In [11]:
dataset['djit'] = dataset.djit.fillna(value=0)

In [12]:
dataset.isnull().sum()

Name
srcip               0
sport               0
dstip               0
dsport              0
proto               0
state               0
dur                 0
sbytes              0
dbytes              0
sttl                0
dttl                0
sloss               0
dloss               0
service             0
sload               0
dload               0
spkts               0
dpkts               0
swin                0
dwin                0
stcpb               0
dtcpb               0
smeansz             0
dmeansz             0
trans_depth         0
res_bdy_len         0
sjit                0
djit                0
stime               0
ltime               0
sintpkt             0
dintpkt             0
tcprtt              0
synack              0
ackdat              0
is_sm_ips_ports     0
ct_state_ttl        0
ct_flw_http_mthd    0
is_ftp_login        0
ct_ftp_cmd          0
ct_srv_src          0
ct_srv_dst          0
ct_dst_ltm          0
ct_src_ltm          0
ct_src_dport_ltm    0
ct_ds

In [13]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39814 entries, 0 to 39813
Data columns (total 47 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   srcip             39814 non-null  object 
 1   sport             39814 non-null  object 
 2   dstip             39814 non-null  object 
 3   dsport            39814 non-null  object 
 4   proto             39814 non-null  object 
 5   state             39814 non-null  object 
 6   dur               39814 non-null  object 
 7   sbytes            39814 non-null  int64  
 8   dbytes            39814 non-null  int64  
 9   sttl              39814 non-null  int64  
 10  dttl              39814 non-null  int64  
 11  sloss             39814 non-null  int64  
 12  dloss             39814 non-null  int64  
 13  service           39814 non-null  object 
 14  sload             39814 non-null  float64
 15  dload             39814 non-null  float64
 16  spkts             39814 non-null  int64 

In [14]:
dataset['service'].replace('smb', '-', inplace=True)
dataset['service'].replace('ntlm', '-', inplace=True)
dataset['service'].replace('gssapi,ntlm,smb', '-', inplace=True)
dataset['service'].replace('gssapi,smb', '-', inplace=True)
dataset['service'].replace('syslog', '-', inplace=True)
dataset['service'].replace('dce_rpc', '-', inplace=True)
dataset['service'].replace('gssapi,ntlm,smb,dce_rpc', '-', inplace=True)
dataset['service'].replace('modbus', '-', inplace=True)
dataset['service'].replace('ntlm,smb,dce_rpc', '-', inplace=True)
dataset['service'].replace('imap', '-', inplace=True)
dataset['service'].replace('mysql', '-', inplace=True)
dataset['service'].replace('gssapi', '-', inplace=True)
dataset['service'].replace('smb,gssapi', '-', inplace=True)
dataset['service'].replace('smb,gssapi,ntlm', '-', inplace=True)
dataset['service'].replace('rfb', '-', inplace=True)
dataset['service'].replace('smb,dce_rpc,gssapi,ntlm', '-', inplace=True)

In [15]:
dataset['smeansz'] = dataset['smeansz'].astype(int)
dataset['dmeansz'] = dataset['dmeansz'].astype(int)
dataset['ct_flw_http_mthd'] = dataset['ct_flw_http_mthd'].astype(float)
dataset['ct_ftp_cmd'] = dataset['ct_ftp_cmd'].astype(str)
dataset['dur'] = dataset['dur'].replace('-', 0)
dataset['dur'] = np.array(dataset['dur']).astype(np.float64)
dataset['state'] = dataset['state'].replace('0', 'FIN')

In [16]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39814 entries, 0 to 39813
Data columns (total 47 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   srcip             39814 non-null  object 
 1   sport             39814 non-null  object 
 2   dstip             39814 non-null  object 
 3   dsport            39814 non-null  object 
 4   proto             39814 non-null  object 
 5   state             39814 non-null  object 
 6   dur               39814 non-null  float64
 7   sbytes            39814 non-null  int64  
 8   dbytes            39814 non-null  int64  
 9   sttl              39814 non-null  int64  
 10  dttl              39814 non-null  int64  
 11  sloss             39814 non-null  int64  
 12  dloss             39814 non-null  int64  
 13  service           39814 non-null  object 
 14  sload             39814 non-null  float64
 15  dload             39814 non-null  float64
 16  spkts             39814 non-null  int64 

In [17]:
# Path to load files
file_path = "./machinelearningbinario/final-ipynb"

In [18]:
# Cargar datos de test
x_test = dataset

# Diccionarios
saved_dict = pickle.load(open(file_path+'/saved_dict.pkl', 'rb'))
mode_dict = pickle.load(open(file_path+'/mode_dict.pkl', 'rb'))

# Standard scaler
scaler = pickle.load(open(file_path+'/scaler.pkl', 'rb'))

# Encoders one-hot
ohe_proto = pickle.load(open(file_path+'/ohe_proto.pkl', 'rb'))
ohe_service = pickle.load(open(file_path+'/ohe_service.pkl', 'rb'))
ohe_state = pickle.load(open(file_path+'/ohe_state.pkl', 'rb'))

# El mejor modelo, es el Randomforest Classifier 
best_model = pickle.load(open(file_path+'/rf_best_clf.pkl', 'rb'))

In [19]:
saved_dict['num_col']

['sttl',
 'tcprtt',
 'smeansz_log1p',
 'res_bdy_len',
 'is_sm_ips_ports',
 'ct_state_ttl',
 'ct_ftp_cmd',
 'dtcpb_log1p',
 'dttl',
 'trans_depth',
 'synack',
 'sintpkt',
 'stime',
 'ct_srv_src',
 'stcpb_log1p',
 'ct_dst_ltm',
 'dload_log1p',
 'sload_log1p',
 'sjit_log1p',
 'dintpkt',
 'is_ftp_login',
 'dur_log1p',
 'dmeansz_log1p',
 'ackdat',
 'djit_log1p',
 'ct_dst_sport_ltm',
 'swin',
 'ct_flw_http_mthd',
 'dbytes_log1p',
 'spkts_log1p',
 'sbytes_log1p',
 'ct_src_ltm']

In [20]:
#Limpieza de datos

def clean_data(data):

    #Elimina los valores nulos y erróneos 
    
    numerical_col = data.select_dtypes(include=np.number).columns 
    categorical_col = data.select_dtypes(exclude=np.number).columns
    
    # Cleaning the data
    for col in data.columns:
        val = mode_dict[col]  # Sustituye los valores nulos por el valor modal
        data[col] = data[col].fillna(value=val)
        data[col] = data[col].replace(' ', value=val)
        data[col] = data[col].apply(lambda x:"None" if x=="-" else x)

        # Si son binarias y el valor es >1 se sustituye por el valor modal
        if col in saved_dict['binary_col']:
            data[col] = np.where(data[col]>1, val, data[col])

    # Convertir columnas de tipo erróneo
    bad_dtypes = list(set(categorical_col) - set(saved_dict['cat_col']))
    for bad_col in bad_dtypes:
        data[col] = data[col].astype(float)
    
    return data

#Aplicar log1p

def apply_log1p(data):
    
    #Ejecuta el log1p, crea una nueva columna y elimina la original
    
    for col in saved_dict['log1p_col']:
        new_col = col + '_log1p'
        data[new_col] = data[col].apply(np.log1p)
        data.drop(col, axis=1, inplace=True)
    return data

#Estandarizar los valores

def standardize(data):
    
    #Estandariza las columnas numéricas
    
    data[saved_dict['num_col']] = scaler.transform(data[saved_dict['num_col']])
    return data


#Encoders one-hot para las columnas categóricas

def ohencoding(data):
    '''
    Onehot encoding the categoricla columns.
    Add the ohe columns with the data and removes categorical columns.
    Using Onehotencoder objects trained on train data.
    '''
    X = ohe_service.transform(data['service'].values.reshape(-1, 1))
    Xm = ohe_proto.transform(data['proto'].values.reshape(-1, 1))
    Xmm = ohe_state.transform(data['state'].values.reshape(-1, 1))
    
    #Incluir las columnas con el encoder y eliminar las originales
    
    data = pd.concat([data,
                      pd.DataFrame(Xm.toarray(), columns=['proto_'+i for i in ohe_proto.categories_[0]]),
                      pd.DataFrame(X.toarray(), columns=['service_'+i for i in ohe_service.categories_[0]]),
                      pd.DataFrame(Xmm.toarray(), columns=['state_'+i for i in ohe_state.categories_[0]])],
                      axis=1)
    
    data.drop(['proto', 'service', 'state'], axis=1, inplace=True)
    return data

In [21]:
saved_dict['columns'] = saved_dict['columns'][:47]

In [22]:
saved_dict['to_drop'] = saved_dict['to_drop'][:4]

In [23]:
saved_dict['corr_col']

['sloss',
 'dloss',
 'dpkts',
 'dwin',
 'ltime',
 'ct_srv_dst',
 'ct_src_dport_ltm',
 'ct_dst_src_ltm']

In [24]:
#Funcion para realizar el preprocesamiento y las predicciones a partir de los datos

def final_fun_1(X):
   
    if isinstance(X, pd.core.series.Series):
        
        # Para una entrada puntual
        data = pd.DataFrame(X.values.reshape(1, -1)).copy()
    else:
        data = X.copy()
 

    data.reset_index(drop=True, inplace=True)
    data.columns = saved_dict['columns']


    dropable_col = saved_dict['to_drop'] + saved_dict['corr_col']
    data.drop(columns=dropable_col, inplace=True)

    data = clean_data(data)
    data = apply_log1p(data)
    data = standardize(data)
    data = ohencoding(data)

    
    # Realizar predicción con el modelo
    
    predictions = best_model.predict(data)
    
    return predictions

In [25]:
y_pred = final_fun_1(x_test.iloc[0:40])

In [26]:
print(y_pred)

[1 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0
 0 0 0]


In [27]:
y_pred = final_fun_1(x_test)

In [42]:
path = './datasetmulticlase.csv'  # Importar de los csv
dataset = pd.read_csv(path, header = None, skiprows=1)
columnas_dataset = pd.read_csv('./machinelearningbinario/NUSW-NB15_features.csv', encoding='ISO-8859-1')
columnas_dataset = columnas_dataset.iloc[:48, :]
columnas_dataset['Name'] = columnas_dataset['Name'].apply(lambda x: x.strip().replace(' ', '').lower())
dataset.columns = columnas_dataset['Name']

In [43]:
dataset.head

<bound method NDFrame.head of Name            srcip  sport           dstip dsport proto state        dur  \
0        175.45.176.0  26088  149.171.126.14     80   tcp   CON  10.147043   
1        175.45.176.2  44712  149.171.126.11    179   tcp   FIN   0.195649   
2          59.166.0.3   1066   149.171.126.6    143   tcp   FIN   0.513527   
3      149.171.126.11   4433    175.45.176.0  20108   udp   CON   4.631897   
4        175.45.176.0  36014  149.171.126.11     25   tcp   CON   5.477102   
...               ...    ...             ...    ...   ...   ...        ...   
39809      59.166.0.4  15966   149.171.126.3     53   udp   CON   0.001002   
39810      59.166.0.4  63665   149.171.126.1     80   tcp   CON   0.024294   
39811      59.166.0.8  18586   149.171.126.7   5190   tcp   FIN   0.004097   
39812      59.166.0.9  61840   149.171.126.6     53   udp   CON   0.001048   
39813      59.166.0.9  56250   149.171.126.2     80   tcp   CON   0.039019   

Name   sbytes  dbytes  sttl  ... 

In [44]:
dataset['label'] = y_pred

In [85]:
dataset.to_csv('./datasetcompleto.csv', index=False)