In [44]:
import os

import pandas as pd
from scipy.io import arff
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [31]:
# Path to the KDDTrain+ ARFF file.
# The default below is a machine-specific location. Set the KDD_TRAIN_ARFF
# environment variable to run the notebook on another machine without editing
# this cell; an unset or empty variable falls back to the default.
ruta_archivo = os.environ.get("KDD_TRAIN_ARFF") or (
    "C:\\Users\\seelro06\\OneDrive - Arca Continental S.A.B. de C.V\\"
    "Documentos\\uni\\KDDTrain+.arff"
)
ruta_archivo

'C:\\Users\\seelro06\\OneDrive - Arca Continental S.A.B. de C.V\\Documentos\\uni\\KDDTrain+.arff'

In [35]:
# Read the file. loadarff returns a (records, metadata) pair. The metadata is
# bound to a regular name instead of `_`, because assigning to `_` in IPython
# stops the shell from updating its built-in "last output" variable.
datos_orig, metadatos_arff = arff.loadarff(ruta_archivo)
datos_orig[:5]

array([(0., b'tcp', b'ftp_data', b'SF', 491.,    0., b'0', 0., 0., 0., 0., b'0', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0',   2.,  2., 0. , 0. , 0., 0., 1.  , 0.  , 0.  , 150.,  25., 0.17, 0.03, 0.17, 0.  , 0.  , 0.  , 0.05, 0.  , b'normal'),
       (0., b'udp', b'other', b'SF', 146.,    0., b'0', 0., 0., 0., 0., b'0', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0',  13.,  1., 0. , 0. , 0., 0., 0.08, 0.15, 0.  , 255.,   1., 0.  , 0.6 , 0.88, 0.  , 0.  , 0.  , 0.  , 0.  , b'normal'),
       (0., b'tcp', b'private', b'S0',   0.,    0., b'0', 0., 0., 0., 0., b'0', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0', 123.,  6., 1. , 1. , 0., 0., 0.05, 0.07, 0.  , 255.,  26., 0.1 , 0.05, 0.  , 0.  , 1.  , 1.  , 0.  , 0.  , b'anomaly'),
       (0., b'tcp', b'http', b'SF', 232., 8153., b'0', 0., 0., 0., 0., b'1', 0., 0., 0., 0., 0., 0., 0., 0., b'0', b'0',   5.,  5., 0.2, 0.2, 0., 0., 1.  , 0.  , 0.  ,  30., 255., 1.  , 0.  , 0.03, 0.04, 0.03, 0.01, 0.  , 0.01, b'normal'),
       (0., b'tcp', b'http', b'

In [40]:
def limpiar_octetos(columna):
    """Decode the ``bytes`` values of a pandas Series as UTF-8 text.

    Args:
        columna: Series whose elements may be ``bytes`` objects.

    Returns:
        The result of applying the decoding element by element; values that
        are not ``bytes`` are returned unchanged.
    """
    def _decodificar(valor):
        # Only bytes objects need decoding; everything else passes through.
        if isinstance(valor, bytes):
            return valor.decode('utf-8')
        return valor

    return columna.apply(_decodificar)

datos = pd.DataFrame(datos_orig)

# Remove the b'' prefix by decoding byte strings into regular text.
# Only object-dtype columns can hold bytes, so numeric columns are skipped
# instead of being passed element by element through a Python function.
columnas_texto = datos.select_dtypes(include=['object']).columns
for col in columnas_texto:
    datos[col] = limpiar_octetos(datos[col])

# Replace class names with numeric values. Series.map turns every label that
# is missing from the mapping into NaN, so stop here rather than silently
# training on NaN targets.
codigos_clase = {'normal': 0, 'anomaly': 1}
etiquetas = datos['class'].map(codigos_clase)
sin_mapear = etiquetas.isna()
if sin_mapear.any():
    desconocidas = datos.loc[sin_mapear, 'class'].unique().tolist()
    raise ValueError(f"Unmapped class labels: {desconocidas}")
datos['class'] = etiquetas

# Identify the remaining text columns and convert them to categorical dtype
columnas_categoricas = datos.select_dtypes(include=['object']).columns
datos[columnas_categoricas] = datos[columnas_categoricas].astype('category')

datos.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0.0,tcp,ftp_data,SF,491.0,0.0,0,0.0,0.0,0.0,...,25.0,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,0
1,0.0,udp,other,SF,146.0,0.0,0,0.0,0.0,0.0,...,1.0,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0
2,0.0,tcp,private,S0,0.0,0.0,0,0.0,0.0,0.0,...,26.0,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1
3,0.0,tcp,http,SF,232.0,8153.0,0,0.0,0.0,0.0,...,255.0,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,0
4,0.0,tcp,http,SF,199.0,420.0,0,0.0,0.0,0.0,...,255.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [47]:
# Split the frame into the target vector and the feature matrix
y_entrenamiento = datos['class']
X_entrenamiento = datos.drop(columns='class')
X_entrenamiento.head(), y_entrenamiento.head()

(   duration protocol_type   service flag  src_bytes  dst_bytes land  \
 0       0.0           tcp  ftp_data   SF      491.0        0.0    0   
 1       0.0           udp     other   SF      146.0        0.0    0   
 2       0.0           tcp   private   S0        0.0        0.0    0   
 3       0.0           tcp      http   SF      232.0     8153.0    0   
 4       0.0           tcp      http   SF      199.0      420.0    0   
 
    wrong_fragment  urgent  hot  ...  dst_host_count dst_host_srv_count  \
 0             0.0     0.0  0.0  ...           150.0               25.0   
 1             0.0     0.0  0.0  ...           255.0                1.0   
 2             0.0     0.0  0.0  ...           255.0               26.0   
 3             0.0     0.0  0.0  ...            30.0              255.0   
 4             0.0     0.0  0.0  ...           255.0              255.0   
 
    dst_host_same_srv_rate  dst_host_diff_srv_rate  \
 0                    0.17                    0.03   
 1    

In [39]:
# Define the categorical and numeric feature columns.
# Both are derived from X_entrenamiento (features only): the previous
# reference to `X` was never defined in this notebook and raised NameError on
# a fresh kernel, and using the feature frame also guarantees the target
# column cannot end up among the preprocessor inputs.
caracteristicas_categoricas = X_entrenamiento.select_dtypes(include=['category']).columns
caracteristicas_numericas = X_entrenamiento.columns.difference(caracteristicas_categoricas)
caracteristicas_categoricas, caracteristicas_numericas

(Index(['protocol_type', 'service', 'flag', 'land', 'logged_in',
        'is_host_login', 'is_guest_login'],
       dtype='object'),
 Index(['count', 'diff_srv_rate', 'dst_bytes', 'dst_host_count',
        'dst_host_diff_srv_rate', 'dst_host_rerror_rate',
        'dst_host_same_src_port_rate', 'dst_host_same_srv_rate',
        'dst_host_serror_rate', 'dst_host_srv_count',
        'dst_host_srv_diff_host_rate', 'dst_host_srv_rerror_rate',
        'dst_host_srv_serror_rate', 'duration', 'hot', 'num_access_files',
        'num_compromised', 'num_failed_logins', 'num_file_creations',
        'num_outbound_cmds', 'num_root', 'num_shells', 'rerror_rate',
        'root_shell', 'same_srv_rate', 'serror_rate', 'src_bytes', 'srv_count',
        'srv_diff_host_rate', 'srv_rerror_rate', 'srv_serror_rate',
        'su_attempted', 'urgent', 'wrong_fragment'],
       dtype='object'))

In [43]:
# Preprocessing: each entry is (name, transformer, columns it applies to)
escalado_numerico = ('num', StandardScaler(), caracteristicas_numericas)
codificacion_categorica = (
    'cat',
    OneHotEncoder(handle_unknown='ignore'),
    caracteristicas_categoricas,
)
preprocesador = ColumnTransformer(
    transformers=[escalado_numerico, codificacion_categorica]
)
preprocesador

In [46]:
# Build the pipeline for the regression model: preprocessing, then the forest.
# random_state is fixed so that repeated runs of the notebook fit the same
# forest; without it every run produces a different model.
# NOTE(review): 'class' is encoded as 0/1 earlier in the notebook, so this
# regressor predicts a continuous score; confirm a classifier is not intended.
modelo = Pipeline(steps=[
    ('preprocesador', preprocesador),
    ('regresor', RandomForestRegressor(random_state=42))
])
modelo

In [48]:
# Fit the whole pipeline (preprocessing + regressor) on the training data
modelo.fit(X=X_entrenamiento, y=y_entrenamiento)
modelo