Версия с автоэнкодером: используем его латентное пространство + MLP для классификации

In [None]:
import pandas as pd
import numpy as np
import hashlib
from sklearn.preprocessing import RobustScaler

# Load the dataset from CSV and immediately discard the columns that are
# not needed.
df = pd.read_csv(
    r"D:\Проекты\Дипломаня работа\DoFitN\Code\DoFitN\Data\Data_20\combined_features.csv"
).drop(columns=['session_id', 'is_broadcast'])


# One-hot encoding would produce too many unique values for these columns,
# so the strings are hashed into a fixed number of buckets instead, which
# also generalises better.
def hash_str(s: object, size: int = 256) -> int:
    """Map a value to a deterministic bucket index in ``range(size)``.

    The value is hashed with MD5 and reduced modulo ``size``. MD5 is used
    only to obtain a bucket that is stable across runs, not for security.

    Args:
        s: Value to hash. Strings are hashed as-is. Any other object
            (for example the NaN/None that pandas yields for a missing
            cell) is first converted with ``str()`` so that it lands in a
            regular bucket instead of raising ``AttributeError``.
        size: Number of buckets; the result is in ``[0, size)``.

    Returns:
        The bucket index as an ``int``.

    Raises:
        ZeroDivisionError: If ``size`` is 0.
    """
    if not isinstance(s, str):
        # Missing cells reach this function as float NaN or None, which have
        # no .encode(); hash their textual form instead of crashing.
        s = str(s)
    digest = hashlib.md5(s.encode()).hexdigest()
    return int(digest, 16) % size

# Treat +/-inf as missing so that they go through the same imputation as NaN.
df = df.replace([np.inf, -np.inf], np.nan)

# Impute every numeric column: a column with no valid values at all gets a
# fixed large placeholder; any other column gets twice its maximum wherever
# a value is missing.
# NOTE(review): when a column's maximum is <= 0 the doubled value is not
# larger than the existing data - confirm that this is acceptable.
for column in df.select_dtypes(include=[np.number]).columns:
    values = df[column]
    if not values.isna().all():
        df[column] = values.fillna(values.max() * 2)
        continue
    df[column] = 1000000

# Hash every IP/MAC address column into a new "*_hash" feature column
# (hashed column name -> raw column it is derived from).
hash_sources = {
    'src_ip_hash': 'src_ip',
    'dst_ip_hash': 'dst_ip',
    'src_mac_hash': 'src_mac',
    'dst_mac_hash': 'dst_mac',
}
for hashed_name, raw_name in hash_sources.items():
    df[hashed_name] = df[raw_name].apply(hash_str)

# The raw address columns are no longer needed once hashed.
df = df.drop(columns=list(hash_sources.values()))

# Set the labels aside so that they are not scaled together with the features.
labels = df['label'].copy()
df = df.drop(columns='label')

# Fit RobustScaler on all remaining columns and scale them.
scaler = RobustScaler()
scaled_features = scaler.fit_transform(df)

# Rebuild a DataFrame with the original feature names, then re-attach the
# labels positionally (the rebuilt frame has a fresh default index).
df_scaled = pd.DataFrame(scaled_features, columns=df.columns)
df_scaled = df_scaled.assign(label=labels.values)

# df_scaled is the RobustScaler-normalised frame for the autoencoder and MLP.
df_scaled


Unnamed: 0,timestamp,opcode,duplicates,requests,replies,packet_rate,multiple_macs,request_reply_ratio,time_since_last_packet,unique_ip_count,...,target_ip_diversity,is_gateway,unanswered_requests,reply_percentage,log_packet_rate,src_ip_hash,dst_ip_hash,src_mac_hash,dst_mac_hash,label
0,-0.014668,0.0,-0.142857,-0.142857,0.0,-0.031978,0.0,0.0,-0.268779,-1.0,...,-0.036,-1.0,-0.111111,0.0,-0.046418,-0.377953,0.635294,0.000000,30.0,0
1,-0.014668,1.0,-0.142857,-0.158730,1.0,-0.031978,0.0,-50.0,-0.268779,-1.0,...,-0.036,0.0,-0.126984,0.0,-0.046418,0.000000,0.070588,-0.007407,31.0,0
2,-0.014665,0.0,-0.134454,-0.142857,1.0,0.025323,0.0,-49.0,8.184499,-1.0,...,-0.036,0.0,-0.126984,100.0,0.035667,0.000000,0.070588,-0.007407,0.0,0
3,-0.014665,1.0,-0.134454,-0.142857,1.0,0.025283,0.0,-49.0,8.190432,-1.0,...,-0.036,-1.0,-0.126984,100.0,0.035611,-0.377953,0.635294,0.000000,30.0,0
4,-0.014658,0.0,-0.126050,-0.126984,1.0,-0.007331,0.0,-48.0,19.384236,-1.0,...,-0.036,-1.0,-0.111111,50.0,-0.010503,-0.377953,0.635294,0.000000,30.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11593,0.000287,0.0,0.168067,0.444444,0.0,0.006991,1.0,0.0,12.161064,-1.0,...,-0.012,0.0,0.476190,0.0,0.009941,0.000000,-0.823529,-0.007407,0.0,1
11594,0.000287,0.0,-0.042017,0.047619,0.0,0.467722,1.0,0.0,0.700568,-1.0,...,-0.012,0.0,0.079365,0.0,0.543138,0.000000,0.000000,-0.007407,0.0,1
11595,0.000287,0.0,0.176471,0.460317,0.0,0.531037,1.0,0.0,0.591559,-1.0,...,-0.012,0.0,0.492063,0.0,0.602451,0.000000,-0.823529,-0.007407,0.0,1
11596,0.000287,0.0,-0.042017,0.047619,0.0,8.721803,1.0,0.0,-0.213444,-1.0,...,-0.012,0.0,0.079365,0.0,3.153616,0.000000,-0.082353,-0.007407,0.0,1
