In [7]:
import datetime
import platform
import os
import sys
import time
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras.layers import Dense
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

# Evaluation
from sklearn.metrics import confusion_matrix, classification_report

warnings.simplefilter('ignore')

In [20]:
# Config



from calendar import EPOCH


ROOT_DIR = os.path.join(os.getcwd(), "..")
USE_FULLDATA = False
RESTRECTED_FEATURES = False
RANDOM_SEED = 2023
N_SPLITS= 4
ACTIVATION = 'relu'
ENCODER_SIZES = [10, 5]
Model_type = 'LogisticRegression'

# AutoEncoder
EPOCHS = 5
BATCH_SIZE = 32

# 使用する機械学習モデル
from sklearn.linear_model import LogisticRegression
params = {
    'penalty': 'l2',
    'solver': 'lbfgs',
    'random_state': RANDOM_SEED,
    'max_iter': 10
}
Model = LogisticRegression

In [9]:
print("ROOT DIRECTORY: ", ROOT_DIR)
print("USE: ", "Full data" if USE_FULLDATA else "10% data")
print("ran: ", datetime.datetime.now())
print(f"python:      {platform.python_version()}")
print(f"sklearn:     {sklearn.__version__}")
print(f"tensorflow:  {tf.__version__}")
print(f"keras:       {keras.__version__}")
print(f"numpy:       {np.__version__}")
print(f"pandas:      {pd.__version__}")

ROOT DIRECTORY:  D:\ml-study-archive-rsato\experiments\..
USE:  10% data
ran:  2023-10-29 23:27:35.475727
python:      3.10.5
sklearn:     1.2.2
tensorflow:  2.13.0-rc0
keras:       2.13.1rc0
numpy:       1.23.5
pandas:      1.5.3


In [10]:
# KDD'99 ラベルデータの読み込み
with open(ROOT_DIR + "/datasets/kddcup.names", "r") as f:
        # 一行目は不要なので無視
    _ = f.readline()
    # `:`より手前がラベルなので，その部分を抽出してリストに追加
    names = [line.split(':')[0] for line in f]
# 　正解ラベルを追加
names.append("true_label")

# KDD'99 クラスラベルデータの読み込み
with open(ROOT_DIR + "/datasets/training_attack_types", "r") as f:
    lines = f.read().split("\n")
    classes = {'normal': 'normal'}
    for line in lines:
        if len(line) == 0:
            continue
        k, v = tuple(line.split(" "))
        classes[k] = v

# 除外する特徴量のリスト
ignore_names = [
    "hot", "num_compromised", "num_file_creations",      
    "num_outbound_cmds", "is_host_login", "srv_count",
    "srv_serror_rate", "srv_rerror_rate", "same_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_diff_srv_rate"
    ]

# KDD'99 データの読み込み
if USE_FULLDATA:
    df = pd.read_csv(ROOT_DIR + "/datasets/kddcup.data", names=names, index_col=False)
else:
    df = pd.read_csv(ROOT_DIR + "/datasets/kddcup.data_10_percent", names=names, index_col=False)

# カテゴリー特徴量を削除
data_x: pd.DataFrame = df.copy().drop(columns=['protocol_type', 'service', 'flag'], axis=1)

# 除外する特徴量を削除
if RESTRECTED_FEATURES:
    data_x = data_x.drop(columns=ignore_names, axis=1)


# ラベルデータを切り分ける
data_y = data_x.pop("true_label").map(lambda x: x.replace('.', ''))
 
# namesを更新
names = data_x.columns

 # 正規化
data_x = pd.DataFrame(StandardScaler().fit_transform(data_x), columns=names)

# ラベルを変換
data_y = data_y.map(lambda x: classes[x])


In [11]:
# k分割
k_fold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)

In [12]:
def generate_encoder(x: pd.DataFrame):
    _model = keras.Sequential( 
        [
            Dense(ENCODER_SIZES[0], activation=ACTIVATION, input_shape=(x.shape[1],), name="encoder0"),
            *[
                Dense(hidden_layer_size, activation=ACTIVATION, name=f"encoder{idx + 1}")
                for idx, hidden_layer_size in enumerate(ENCODER_SIZES[1:])
            ],
            *[
                Dense(hidden_layer_size, activation=ACTIVATION)
                for hidden_layer_size in ENCODER_SIZES[-2::-1]
            ],
            Dense(x.shape[1], activation=ACTIVATION),
        ]
    )
    _model.summary()
    _model.compile(optimizer="adam", loss="mse")
    _model.fit(x, x, epochs=EPOCHS, batch_size=BATCH_SIZE)
    return keras.Sequential(_model.layers[: len(ENCODER_SIZES)])

encoder = generate_encoder(data_x)
new_features = pd.DataFrame(encoder.predict(data_x),
columns=[f"ae_{idx}" for idx in range(ENCODER_SIZES[-1])])
# データを結合
data_x_ae = pd.concat([data_x, new_features], axis=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder0 (Dense)            (None, 10)                390       
                                                                 
 encoder1 (Dense)            (None, 5)                 55        
                                                                 
 dense (Dense)               (None, 10)                60        
                                                                 
 dense_1 (Dense)             (None, 38)                418       
                                                                 
Total params: 923 (3.61 KB)
Trainable params: 923 (3.61 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
data_x.shape, data_x_ae.shape

((494021, 38), (494021, 43))

In [14]:
results_default = dict()
results_ae = dict()


def predict(_x, _y):
    _generator = k_fold.split(_x, _y)
    accuracies = []
    for fold, (train_idx, test_idx) in enumerate(_generator):
        print(f"fold: {fold}")
        # データを分割
        x_train = _x.iloc[train_idx]
        y_train = _y.iloc[train_idx]
        x_test = _x.iloc[test_idx]
        y_test = _y.iloc[test_idx]

        # モデルを学習
        model = Model(**params)
        model.fit(x_train, y_train)
        # テストデータで評価
        accuracy = classification_report(y_test, model.predict(x_test), output_dict=True)
        accuracies.append(accuracy)
        print(f"accuracy: {accuracy['macro avg']['recall']}") # type: ignore
    return accuracies

In [15]:
# 元の特徴量のみで学習
results_default = predict(data_x, data_y)

# AE特徴量を追加して学習
results_ae = predict(data_x_ae, data_y)

fold: 0
accuracy: 0.7207328405557556
fold: 1
accuracy: 0.7414721330838735
fold: 2
accuracy: 0.7616946771778198
fold: 3
accuracy: 0.7679793235349159
fold: 0
accuracy: 0.6193339615226862
fold: 1
accuracy: 0.6660122396166361
fold: 2
accuracy: 0.5651071753340396
fold: 3
accuracy: 0.700306768812134


In [28]:
results = dict()
results['config'] = {
    'USE_FULLDATA': USE_FULLDATA,
    'RESTRECTED_FEATURES': RESTRECTED_FEATURES,
    'RANDOM_SEED': RANDOM_SEED,
    'N_SPLITS': N_SPLITS,
    'Date': datetime.datetime.now().__str__(),
    'ACTIVATION': ACTIVATION,
    'ENCODER_SIZES': ENCODER_SIZES,
    'EPOCHS': EPOCHS,
    'BATCH_SIZE': BATCH_SIZE,
    'Model': Model_type,
    'params': params
}
results['default'] = dict()
for k0 in ['default', 'ae']:
    results[k0] = dict()
    for k1 in results_default[0].keys():
        results[k0][k1] = dict()
        if not hasattr(results_default[0][k1], 'keys'):
            continue
        for k2 in results_default[0][k1].keys():
            results[k0][k1][k2] = np.mean([results_default[i][k1][k2] for i in range(4)])

In [29]:
results

{'config': {'USE_FULLDATA': False,
  'RESTRECTED_FEATURES': False,
  'RANDOM_SEED': 2023,
  'N_SPLITS': 4,
  'Date': '2023-10-30 01:02:20.161794',
  'ACTIVATION': 'relu',
  'ENCODER_SIZES': [10, 5],
  'EPOCHS': 5,
  'BATCH_SIZE': 32,
  'Model': 'LogisticRegression',
  'params': {'penalty': 'l2',
   'solver': 'lbfgs',
   'random_state': 2023,
   'max_iter': 10}},
 'default': {'dos': {'precision': 0.9983640610875981,
   'recall': 0.9972640807933347,
   'f1-score': 0.9978124117242654,
   'support': 97864.5},
  'normal': {'precision': 0.9823062010811222,
   'recall': 0.9894837511984348,
   'f1-score': 0.9858524051653282,
   'support': 24319.5},
  'probe': {'precision': 0.8757492902557835,
   'recall': 0.847825808435402,
   'f1-score': 0.861508336547864,
   'support': 1026.75},
  'r2l': {'precision': 0.618676891001053,
   'recall': 0.5398904621286691,
   'f1-score': 0.5732038873506753,
   'support': 281.5},
  'u2r': {'precision': 0.7642857142857143,
   'recall': 0.3653846153846154,
   'f1-s