**Importar**

In [1]:
import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


**Funciones auxiliares**

In [2]:
def load_kdd_dataset(data_path):
    """Lectura del conjunto de datos NSL-KDD"""
    with open(data_path) as train_set:
        dataset = arff.load(train_set)
        atributos = [atrr[0] for atrr in dataset['attributes']]
        return pd.DataFrame(dataset["data"], columns =atributos)


In [3]:
def train_val_test_split(df,rstate=42, shuffle = True, stratify = None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(df, test_size = 0.4, random_state = rstate, shuffle = shuffle, stratify = strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(test_set, test_size = 0.5, random_state = rstate, shuffle = shuffle, stratify = strat)
    return (train_set,val_set,test_set)

**Lectura de datos**

In [4]:
df =  load_kdd_dataset("datasets\\datasets\\NSL-KDD\\KDDTrain+.arff") 

**2.0 Divisiòn del Conjunto de datos**

In [5]:
train_set,val_set,test_set = train_val_test_split(df, stratify = "protocol_type")

In [6]:
print("Longitud del training Set: (60%) ", len(train_set))
print("Longitus del Valitation Set: (20%)", len(val_set))
print("Longitus del Test Set: (20%)", len(test_set))

Longitud del training Set: (60%)  75583
Longitus del Valitation Set: (20%) 25195
Longitus del Test Set: (20%) 25195


In [7]:
#Separamos las caracteristicas de entrada de las de salida
X_train = train_set.drop('class', axis = 1)
y_train = train_set['class'].copy()

In [8]:
#Para simular algunas caracteristicas nulas añadiremos NAN
X_train.loc[(X_train["src_bytes"]>400)&(X_train['src_bytes']<800), "src_bytes"]= np.nan
X_train.loc[(X_train["dst_bytes"]>400)&(X_train['dst_bytes']<800), "dst_bytes"]= np.nan

**Opciòn 1: Eliminamos las filas con valores nulos**

In [9]:
#Copiamos el conjunto de datos para no alterar el principal#
X_train_copy = X_train.copy()

In [10]:
X_train_copy.dropna(subset = ["src_bytes","dst_bytes"])
X_train_copy

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,tcp,http,SF,,53508.0,0,0.0,0.0,0.0,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
31899,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
108116,0.0,tcp,http,SF,304.0,,0,0.0,0.0,0.0,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
89913,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,icmp,eco_i,SF,8.0,0.0,0,0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,tcp,systat,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
67272,0.0,tcp,http,SF,210.0,,0,0.0,0.0,0.0,...,119.0,255.0,1.00,0.00,0.01,0.02,0.02,0.01,0.0,0.0
32452,3.0,tcp,smtp,SF,889.0,328.0,0,0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0
112657,0.0,tcp,http,SF,284.0,,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


**Opciòn 2: Eliminamos los atributos con valores nulos**

In [11]:
#Copiamos el conjunto de datos para no alterar el principal 
X_train_copy = X_train.copy()

In [12]:
X_train_copy.drop(["src_bytes","dst_bytes"], axis = 1, inplace = True)
X_train_copy

Unnamed: 0,duration,protocol_type,service,flag,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,tcp,http,SF,0,0.0,0.0,0.0,0.0,1,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
31899,0.0,tcp,private,S0,0,0.0,0.0,0.0,0.0,0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
108116,0.0,tcp,http,SF,0,0.0,0.0,0.0,0.0,1,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
89913,0.0,tcp,private,S0,0,0.0,0.0,0.0,0.0,0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,icmp,eco_i,SF,0,0.0,0.0,0.0,0.0,0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,tcp,systat,S0,0,0.0,0.0,0.0,0.0,0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
67272,0.0,tcp,http,SF,0,0.0,0.0,0.0,0.0,1,...,119.0,255.0,1.00,0.00,0.01,0.02,0.02,0.01,0.0,0.0
32452,3.0,tcp,smtp,SF,0,0.0,0.0,0.0,0.0,1,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0
112657,0.0,tcp,http,SF,0,0.0,0.0,0.0,0.0,1,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


**Opciòn 3: Rellenamos los valores nulos con un valor determinado**

In [13]:
#Copiamos el conjunto de datos para no alterar el principal 
X_train_copy = X_train.copy()

In [14]:
#Rellenamos los valores nulos con la media 1
media_srcbytes = X_train_copy['src_bytes'].mean()
media_dstbytes = X_train_copy['dst_bytes'].mean()

print(media_srcbytes,media_dstbytes)

X_train_copy["src_bytes"]=X_train_copy['src_bytes'].fillna(media_srcbytes)
X_train_copy["dst_bytes"]=X_train_copy['dst_bytes'].fillna(media_dstbytes)

X_train_copy.sample(5)

66914.53076150673 8719.688279925516


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
92421,0.0,icmp,urp_i,SF,78.0,0.0,0,0.0,0.0,0.0,...,255.0,26.0,0.1,0.01,0.1,0.0,0.0,0.0,0.0,0.0
114398,0.0,tcp,http,SF,319.0,2835.0,0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
109270,0.0,tcp,ftp_data,SF,66914.530762,0.0,0,0.0,0.0,0.0,...,235.0,79.0,0.34,0.04,0.36,0.0,0.0,0.01,0.0,0.0
120368,0.0,tcp,http,SF,274.0,268.0,0,0.0,0.0,0.0,...,21.0,255.0,1.0,0.0,0.05,0.08,0.05,0.0,0.0,0.0
58435,0.0,tcp,http,SF,324.0,3285.0,0,0.0,0.0,0.0,...,57.0,255.0,1.0,0.0,0.02,0.08,0.0,0.0,0.0,0.0


In [15]:
#Rellenamos los valores nulos con la media 2
media_scrbytes = X_train_copy["src_bytes"].mean()
media_dstbytes = X_train_copy["dst_bytes"].mean()
print(media_scrbytes,media_dstbytes)
X_train_copy["src_bytes"].fillna(media_scrbytes, inplace = True)
X_train_copy["dst_bytes"].fillna(media_dstbytes, inplace = True)

X_train_copy

66914.53076150674 8719.688279925516


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_copy["src_bytes"].fillna(media_scrbytes, inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train_copy["dst_bytes"].fillna(media_dstbytes, inplace = True)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
113467,0.0,tcp,http,SF,66914.530762,53508.00000,0,0.0,0.0,0.0,...,9.0,255.0,1.00,0.00,0.11,0.03,0.00,0.00,0.0,0.0
31899,0.0,tcp,private,S0,0.000000,0.00000,0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
108116,0.0,tcp,http,SF,304.000000,8719.68828,0,0.0,0.0,0.0,...,39.0,255.0,1.00,0.00,0.03,0.06,0.00,0.00,0.0,0.0
89913,0.0,tcp,private,S0,0.000000,0.00000,0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,icmp,eco_i,SF,8.000000,0.00000,0,0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64559,0.0,tcp,systat,S0,0.000000,0.00000,0,0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
67272,0.0,tcp,http,SF,210.000000,8719.68828,0,0.0,0.0,0.0,...,119.0,255.0,1.00,0.00,0.01,0.02,0.02,0.01,0.0,0.0
32452,3.0,tcp,smtp,SF,889.000000,328.00000,0,0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0
112657,0.0,tcp,http,SF,284.000000,8719.68828,0,0.0,0.0,0.0,...,255.0,255.0,1.00,0.00,0.00,0.00,0.00,0.00,0.0,0.0


In [16]:
#Existen otras alternativas par ala ocpiòn 3 que consiste en usar una clase imputer de sklearn
X_train_copy = X_train.copy()

In [17]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = "median")

In [18]:
#La clase imputer no admite valores categoricos, eliminamos los atributos categoricos
X_train_copy_num = X_train_copy.select_dtypes(exclude=["object"])
#X_train_copy_num.info()#

In [19]:
imputer.fit(X_train_copy_num)

In [22]:
#Rellenar los valores nulos
X_train_copy_num_nonan = imputer.transform(X_train_copy_num)
X_train_copy = pd.DataFrame(X_train_copy_num_nonan, columns = X_train_copy_num.columns)

**APIs DE SKLEARN**

**Estimador**

*Es un objeto que se implemente con el metodo fit() y opcinalmente con fit_transform(). Se utiliza para ajustar modelos o aprender patrones*

**Transformers**

*Se tiliza para transformar datos de entrada en una representacòn diferente, se implementara con el metodo transform*

**Predictors**

*Se tiliza para hacer predicciones basadas en un modelo ajustado, se implementa con el metodo predict*

**4.- Transformacion de atributos categoricos a nùmericos**

*Antes de comenzar , vamos a recuperar e conjunto de datos limpios y vamos a separar las etiquetas del resto de datos*

In [23]:
#Separamos las características de entrada de la salida
X_train = train_set.drop("class", axis = 1)
y_train = train_set["class"].copy()


In [None]:
#X_train.info()#

*Existe diferentes formas de convertir los atributos categoricos en numericos, Probablemente la mas sencilla es la que proporciona el metodod factorize, el cual los transforma a numeros secuanciales*

In [24]:
protocol_type = X_train["protocol_type"]
protocal_type_encoded, categorias = protocol_type.factorize()



In [25]:
#Mostramos por pantalla como se han codigficado

for i in range(10):
    print(protocol_type.iloc[i],"=",protocal_type_encoded[i])

tcp = 0
tcp = 0
tcp = 0
tcp = 0
icmp = 1
udp = 2
tcp = 0
tcp = 0
tcp = 0
tcp = 0


In [26]:
print(categorias)

Index(['tcp', 'icmp', 'udp'], dtype='object')


**TRANSFORMACIONES AVANZADAS MEDIANTE SKLEARN**

**Ordinal Encoding**

*Realiza la misma codificaciòn que el mètodo factorize() de Pandas*

In [27]:
from sklearn.preprocessing import OrdinalEncoder

protocol_type = X_train[["protocol_type"]]

ordinal_encoder = OrdinalEncoder()
protocol_type_encoded = ordinal_encoder.fit_transform(protocol_type)
protocol_type_encoded

array([[1.],
       [1.],
       [1.],
       ...,
       [1.],
       [1.],
       [1.]])

In [28]:
#Mostramos en pantalla como se han codificado#

for i  in range(10):
    print(protocol_type['protocol_type'].iloc[i], "=", protocol_type_encoded[i])

tcp = [1.]
tcp = [1.]
tcp = [1.]
tcp = [1.]
icmp = [0.]
udp = [2.]
tcp = [1.]
tcp = [1.]
tcp = [1.]
tcp = [1.]


In [29]:
print(ordinal_encoder.categories_)

[array(['icmp', 'tcp', 'udp'], dtype=object)]


In [None]:
#help(OrdinalEncoder)

**One-Hot Encoding**

*Genera para cada categoria del atributo categorico una matriz binaria que representa valor*

In [30]:
from sklearn.preprocessing import OneHotEncoder
protocol_type = X_train[["protocol_type"]]

oh_encoder = OneHotEncoder(handle_unknown= 'ignore')
protocol_type_oh = oh_encoder.fit_transform(protocol_type)
protocol_type_oh

<75583x3 sparse matrix of type '<class 'numpy.float64'>'
	with 75583 stored elements in Compressed Sparse Row format>

In [31]:
#Mostrar 
protocol_type_oh.toarray()

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [32]:
#Mostramos por pantalla como se ha codificado
for i in range (10):
    print(protocol_type["protocol_type"].iloc[i], "=", protocol_type_oh.toarray()[i] )

tcp = [0. 1. 0.]
tcp = [0. 1. 0.]
tcp = [0. 1. 0.]
tcp = [0. 1. 0.]
icmp = [1. 0. 0.]
udp = [0. 0. 1.]
tcp = [0. 1. 0.]
tcp = [0. 1. 0.]
tcp = [0. 1. 0.]
tcp = [0. 1. 0.]


**Get Dummies**

*Es un metodo que permite aplicar One Hot Encoding a un dataframe de Pandas*

In [33]:
pd.get_dummies(X_train["protocol_type"])

Unnamed: 0,icmp,tcp,udp
113467,False,True,False
31899,False,True,False
108116,False,True,False
89913,False,True,False
106319,True,False,False
...,...,...,...
64559,False,True,False
67272,False,True,False
32452,False,True,False
112657,False,True,False


**5.0 Escalado Conjunto de datos**

*Antes de comenzar, vamos a recuperar el conjunto de datos limpios y vamos a separar las etiquetas del resto de datos*

In [None]:
#Separamos las características de entrada de la salida
X_train = train_set.drop("class", axis = 1)
y_train = train_set["class"].copy()


**Nomalizacion** Los valores del atributo se escalan para adquirir valores entre 0 y 1 (Util en algoritmos basados en distancia) --> MinMaxScaler

**Estandarizaciòn** Los valores del atributo se escalan y reciben un valor similar pero no se encuentre dentro de un rango (Adecuadro cuando no hay outliers significativos)--> StandardScaler

**RiobustScaler** Beneficiso cuando hay outliers significativos y se desa reducir su impacto.

In [40]:
from sklearn.preprocessing import RobustScaler
scale_attrs = X_train[['src_bytes','dst_bytes']]
robust_scaled = RobustScaler()
X_train_scaled = robust_scaled.fit_transform(scale_attrs)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=['src_bytes','dst_bytes'])

In [46]:
X_train_scaled.sample(10)

Unnamed: 0,src_bytes,dst_bytes
48973,7680.058394,0.0
57473,8009.39781,0.0
26093,-0.160584,0.0
32798,-0.160584,0.0
62034,-0.160584,0.0
48977,-0.160584,0.0
32577,2.613139,0.620952
52456,-0.160584,0.0
46777,-0.160584,0.0
15677,-0.032847,0.247619
