**Imports**

In [1]:
import arff
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

**Funciones auxiliares**

In [2]:
def load_kdd_dataset(data_path):
    """Lectura del conjunto de datos NSL-KDD"""
    with open(data_path) as train_set:
        dataset = arff.load(train_set)
        atributos = [atrr[0] for atrr in dataset['attributes']]
        return pd.DataFrame(dataset["data"], columns =atributos)

In [3]:
def train_val_test_split(df,rstate=42, shuffle = True, stratify = None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(df, test_size = 0.4, random_state = rstate, shuffle = shuffle, stratify = strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(test_set, test_size = 0.5, random_state = rstate, shuffle = shuffle, stratify = strat)
    return (train_set,val_set,test_set)

**1. Lectura del conjunto de datos**

In [4]:
df =  load_kdd_dataset("datasets\\datasets\\NSL-KDD\\KDDTrain+.arff") 

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  float64
 1   protocol_type                125973 non-null  object 
 2   service                      125973 non-null  object 
 3   flag                         125973 non-null  object 
 4   src_bytes                    125973 non-null  float64
 5   dst_bytes                    125973 non-null  float64
 6   land                         125973 non-null  object 
 7   wrong_fragment               125973 non-null  float64
 8   urgent                       125973 non-null  float64
 9   hot                          125973 non-null  float64
 10  num_failed_logins            125973 non-null  float64
 11  logged_in                    125973 non-null  object 
 12  num_compromised              125973 non-null  float64
 13 

**2.0 Divisiòn del conjunto de datos**

In [6]:
train_set,val_set,test_set = train_val_test_split(df, stratify = "protocol_type")

In [7]:
print("Longitud del training Set: (60%) ", len(train_set))
print("Longitus del Valitation Set: (20%)", len(val_set))
print("Longitus del Test Set: (20%)", len(test_set))

Longitud del training Set: (60%)  75583
Longitus del Valitation Set: (20%) 25195
Longitus del Test Set: (20%) 25195


**Contruyendo transformadores perzonalizados**

*La creación de transformadores personalizados permite mantener el código mucho más limpio y estructurado a la hora
de preparar los datos para los algoritmos de ML. Además permite la reutilización de código para otros proyectos.*

*Antes de comenzar, vamor a recuperar el conjunto de datos limpio y vamos a separar las etiquetas del resto de datos*

In [8]:
#Separamos las características de entrada de la salida
X_train = train_set.drop("class", axis = 1)
y_train = train_set["class"].copy()

In [9]:
X_train.loc[(X_train["src_bytes"]>400)&(X_train["src_bytes"]<800), "src_bytes"] = np.nan
X_train.loc[(X_train["dst_bytes"]>400)&(X_train["dst_bytes"]<800), "dst_bytes"] = np.nan
#X_train

**Transformadores para atributos númericos**

In [10]:
#Tranformador cread para eliminar filas con valores nulos
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler

class DeleteNanRows(TransformerMixin, BaseEstimator):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X , y = None):
        return X.dropna()

In [11]:
delete_nan = DeleteNanRows()
X_train_prep = delete_nan.fit_transform(X_train)

In [12]:
X_train_prep

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
31899,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.00,0.00,1.00,1.00,0.0,0.0
89913,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.00,0.00,1.00,1.00,0.0,0.0
106319,0.0,icmp,eco_i,SF,8.0,0.0,0,0.0,0.0,0.0,...,2.0,7.0,1.00,0.00,1.00,0.57,0.00,0.00,0.0,0.0
98007,0.0,udp,domain_u,SF,46.0,139.0,0,0.0,0.0,0.0,...,255.0,254.0,1.00,0.01,0.00,0.00,0.00,0.00,0.0,0.0
16447,0.0,tcp,smtp,SF,1790.0,363.0,0,0.0,0.0,0.0,...,141.0,137.0,0.55,0.04,0.01,0.01,0.00,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55359,0.0,tcp,http,SF,283.0,237.0,0,0.0,0.0,0.0,...,91.0,255.0,1.00,0.00,0.01,0.05,0.00,0.01,0.0,0.0
90665,0.0,tcp,ftp_data,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,63.0,0.25,0.02,0.02,0.00,1.00,1.00,0.0,0.0
64559,0.0,tcp,systat,S0,0.0,0.0,0,0.0,0.0,0.0,...,255.0,20.0,0.08,0.06,0.00,0.00,1.00,1.00,0.0,0.0
32452,3.0,tcp,smtp,SF,889.0,328.0,0,0.0,0.0,0.0,...,111.0,155.0,0.64,0.04,0.01,0.01,0.01,0.00,0.0,0.0


**Transformados para scalar**

In [13]:
class CustomScaler(TransformerMixin, BaseEstimator):
    def __init__(self,attributes):
        self.attributes = attributes
    def fit(self, X, y=None):
        return self
    def transform(self, X , y = None):
        X_copy = X.copy()
        scale_attrs = X_copy[self.attributes]
        robust_scaler = RobustScaler()
        X_scaled = robust_scaler.fit_transform(scale_attrs)
        X_scaled = pd.DataFrame(X_scaled, columns = self.attributes)
        for attr in self.attributes:
            X_copy[attr] = X_scaled[attr]
        return X_copy


In [14]:
custom_scaler = CustomScaler(["src_bytes","dst_bytes"])
X_train_prep = custom_scaler.fit_transform(X_train_prep)


In [15]:
X_train_prep.head(10)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
31899,0.0,tcp,private,S0,-0.119342,0.0,0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.05,0.0,0.0,1.0,1.0,0.0,0.0
89913,0.0,tcp,private,S0,,,0,0.0,0.0,0.0,...,255.0,15.0,0.06,0.07,0.0,0.0,1.0,1.0,0.0,0.0
106319,0.0,icmp,eco_i,SF,,,0,0.0,0.0,0.0,...,2.0,7.0,1.0,0.0,1.0,0.57,0.0,0.0,0.0,0.0
98007,0.0,udp,domain_u,SF,,,0,0.0,0.0,0.0,...,255.0,254.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
16447,0.0,tcp,smtp,SF,0.765432,0.0,0,0.0,0.0,0.0,...,141.0,137.0,0.55,0.04,0.01,0.01,0.0,0.0,0.0,0.0
100052,0.0,tcp,http,SF,,,0,0.0,0.0,0.0,...,255.0,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28800,0.0,tcp,ftp_data,SF,-0.119342,0.0,0,0.0,0.0,0.0,...,8.0,28.0,1.0,0.0,1.0,0.11,0.0,0.0,0.0,0.0
78082,0.0,tcp,ftp_data,SF,,,0,0.0,0.0,0.0,...,93.0,51.0,0.23,0.03,0.23,0.04,0.0,0.0,0.0,0.0
69315,0.0,tcp,systat,S0,0.814815,22.016438,0,0.0,0.0,0.0,...,255.0,5.0,0.02,0.07,0.0,0.0,1.0,1.0,0.0,0.0
100360,0.0,tcp,private,S0,,,0,0.0,0.0,0.0,...,255.0,13.0,0.05,0.07,0.0,0.0,1.0,1.0,0.0,0.0


**Transformadores para atrinbutos nùmericos**

In [44]:
# Definición de la clase CustomOneHotEncoding que hereda de TransformerMixin y BaseEstimator
class CustomOneHotEncoding(TransformerMixin, BaseEstimator):
    # Método constructor de la clase
    def __init__(self, attributes):
        # Inicializa el codificador One-Hot de scikit-learn
        self.oh = OneHotEncoder()
        # Inicializa una variable para almacenar los nombres de las columnas resultantes de la codificación
        self._columns = None

    # Método para ajustar el transformador a los datos de entrada
    def fit(self, X, y=None):
        # Selecciona las columnas categóricas (de tipo 'object') del DataFrame X
        X_cat = X.select_dtypes(include=["object"])
        # Utiliza pd.get_dummies para obtener temporalmente los nombres de las columnas que resultarán de la codificación
        self._columns = pd.get_dummies(X_cat).columns
        # Ajusta el codificador One-Hot a las columnas categóricas seleccionadas
        self.oh.fit(X_cat)
        # Devuelve el objeto transformador ajustado (esto es estándar en los métodos fit de scikit-learn)
        return self

    # Método para transformar los datos de entrada utilizando la codificación one-hot
    def transform(self, X, y=None):
        # Crea una copia del DataFrame de entrada para evitar modificar los datos originales
        X_copy = X.copy()
        # Selecciona las columnas categóricas del DataFrame copiado
        X_cat = X_copy.select_dtypes(include=["object"])
        # Selecciona las columnas no categóricas (numéricas) del DataFrame copiado
        X_num = X_copy.select_dtypes(exclude=["object"])
        # Transforma las columnas categóricas utilizando el codificador One-Hot
        X_cat_oh = self.oh.transform(X_cat)
        # Convierte el resultado de la transformación en un DataFrame de pandas, 
        # asignando los nombres de columnas y los índices originales
        X_cat_oh = pd.DataFrame(X_cat_oh.toarray(), columns=self._columns, index=X_copy.index)
        # Elimina las columnas categóricas originales del DataFrame
        X_copy.drop(list(X_cat), axis=1, inplace=True)
        # Une las columnas numéricas restantes con las nuevas columnas codificadas en one-hot
        return X_copy.join(X_cat_oh)

**CONSTRUYENDO PIPELINES**

*lOS PIPELINES NOS PERMITE AGRUPAR EN UN FLUJO DE EJECUCIÒN TODAS LAS OPERACIONES DE TRASNFORMACIÒN QUE NECESITEMOS REALIZAR SOBRE EL CONJUNTO DE DATOS ESTO FACILITA MUCHSIMO LAS TRASNFORMACIONES PARA DIFERENTES CONJUNTOS DE DATOS*

In [43]:
#cONSTRUCCIÒN DE UN PIPELINE PARA ATRIBUTOS NÙMERIDOS#
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler


# Construcción de un pipeline para atributos numéricos
num_pipeline = Pipeline([
    # Paso 1: Imputación de valores faltantes utilizando la mediana de cada columna numérica
    ('imputer', SimpleImputer(strategy='median')),
    
    # Paso 2: Escalado de características numéricas utilizando RobustScaler, 
    # que escala las características utilizando la mediana y el rango intercuartílico (IQR)
    # para hacerlas menos sensibles a los valores atípicos
    ('rbst_scaler', RobustScaler())
])

In [41]:
#La clase imputer no admite valores categoricos, eliminamos los atributos categoricos
# Selección de atributos numéricos
X_train_num = X_train.select_dtypes(exclude=['object'])

# Aplicación del pipeline numérico
X_train_prep = num_pipeline.fit_transform(X_train_num)

# Crear un DataFrame con los datos transformados
X_train_prep = pd.DataFrame(X_train_prep, columns=X_train_num.columns, index=X_train_num.index)




In [42]:
X_train_num.sample(6)

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
101178,28.0,1494.0,4152.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,...,119.0,52.0,0.44,0.04,0.01,0.0,0.0,0.0,0.0,0.0
6360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,4.0,0.02,0.08,0.0,0.0,1.0,1.0,0.0,0.0
29372,0.0,46.0,133.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,254.0,1.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0
10285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,20.0,0.08,0.07,0.0,0.0,1.0,1.0,0.0,0.0
115132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,255.0,8.0,0.03,0.06,0.0,0.0,1.0,1.0,0.0,0.0
22954,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,68.0,68.0,1.0,0.0,0.01,0.0,0.0,0.0,1.0,1.0


In [45]:
X_train_prep.sample(6)

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
12420,26.0,5.882591,12.431138,0.0,0.0,30.0,0.0,0.0,0.0,0.0,...,-1.271676,-0.22449,-0.326316,1.571429,0.5,0.0,0.0,0.0,0.0,0.0
43918,0.0,-0.1417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.468208,-0.004082,0.515789,-0.428571,16.666667,13.0,0.0,0.0,0.0,0.0
16418,0.0,0.004049,0.338323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.109827,-0.016327,0.431579,0.0,0.333333,0.0,0.0,0.0,0.0,0.0
24375,0.0,0.40081,80.437126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.83237,0.787755,0.515789,-0.428571,0.166667,1.0,0.01,0.01,0.0,0.0
17301,0.0,3.781377,0.997006,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.745665,0.404082,0.284211,0.0,0.333333,0.5,0.0,0.0,0.0,0.0
23448,0.0,-0.174089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.167347,-0.452632,0.428571,0.0,0.0,1.0,1.0,0.0,0.0


In [47]:
X_train_prep.sample(6)

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
65206,0.0,-0.174089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.445087,-0.24898,-0.326316,13.857143,3.333333,0.0,0.0,0.0,0.8,1.0
24147,0.0,4.004049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.693878,0.421053,-0.285714,15.166667,0.0,0.0,0.0,0.0,0.0
3808,0.0,-0.174089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.171429,-0.452632,0.571429,0.0,0.0,1.0,1.0,0.0,0.0
26346,0.0,-0.048583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.109827,-0.130612,-0.421053,0.0,1.833333,3.5,0.0,0.0,0.0,0.0
87981,0.0,-0.174089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.22449,-0.505263,0.571429,0.0,0.0,1.0,1.0,0.0,0.0
22654,0.0,220.635628,24.892216,0.0,0.0,2.0,0.0,1.0,0.0,0.0,...,-1.040462,0.053061,0.515789,-0.428571,0.166667,0.0,0.0,0.0,0.01,0.01


In [None]:
A continuacionn el metodo ColumnTrnsformer, que ejecuta los pipelines en paralelo y concatena el resultado para ello el resultado de los pipelines debe ser un valor numerico

In [52]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Selección de los nombres de las columnas numéricas del conjunto de datos X_train
num_attribs = list(X_train.select_dtypes(exclude=['object']))
# Selección de los nombres de las columnas categóricas del conjunto de datos X_train
cat_attribs = list(X_train.select_dtypes(include=['object']))

# Creación de un pipeline completo para procesar tanto columnas numéricas como categóricas
full_pipeline = ColumnTransformer([
    # Transformación para atributos numéricos: utiliza un pipeline (num_pipeline) predefinido para procesar las columnas numéricas
    ('num', num_pipeline, num_attribs),
    
    # Transformación para atributos categóricos: aplica OneHotEncoder a las columnas categóricas
    ('cat', OneHotEncoder(), cat_attribs)
])

In [56]:
# Aplica el pipeline completo de preprocesamiento a los datos de entrenamiento (X_train)
# Esto ajusta el pipeline a los datos (si es necesario) y los transforma según las reglas definidas.
X_train_prep = full_pipeline.fit_transform(X_train)

# Convierte el resultado del preprocesamiento en un DataFrame de pandas
# 'pd.get_dummies(X_train)' genera las columnas que resultan de aplicar una codificación one-hot a X_train
# 'list(...)' toma esos nombres de columna para usarlos en el DataFrame resultante
# 'index=X_train.index' mantiene los mismos índices que el conjunto de datos original
X_train_prep = pd.DataFrame(X_train_prep, columns=list(pd.get_dummies(X_train)), index=X_train.index)

In [55]:
X_train_prep.sample(6)

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,flag_S3,flag_SF,flag_SH,land_0,land_1,logged_in_0,logged_in_1,is_host_login_0,is_guest_login_0,is_guest_login_1
90588,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
14526,0.0,-0.1417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
20438,0.0,48.497976,4.281437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
109045,0.0,-0.174089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
97106,0.0,-0.174089,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
511,4.0,5.121457,0.982036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0
