In [1]:
import datetime
import platform
import os
import sys
import json
import time
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras.layers import Dense
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold

# Evaluation
from sklearn.metrics import confusion_matrix, classification_report

warnings.simplefilter('ignore')



In [2]:
# Config
DEBUG = False

from calendar import EPOCH

DATASET_NAME = 'kdd99'
AUTOENCODER_USED_DATA = 'all' # all, normal, anomaly, dos, probe, u2r, r2l
USE_FULLDATA = False
DESCRIPTION = 'None'

ROOT_DIR = os.path.join(os.getcwd(), "..")
RESTRECTED_FEATURES = False
RANDOM_SEED = 2023
N_SPLITS= 4
ACTIVATION = 'relu'
ENCODER_SIZES = [10, 5]
Model_type = 'LogisticRegression'
TIME_STAMP = datetime.datetime.now(tz=datetime.timezone(datetime.timedelta(hours=9)))

# AutoEncoder
EPOCHS = 5
BATCH_SIZE = 32

# 使用する機械学習モデル
from sklearn.linear_model import LogisticRegression
Model = LogisticRegression
params = {
    'penalty': 'l2',
    'solver': 'lbfgs',
    'random_state': RANDOM_SEED,
    'max_iter': 200
}

In [3]:
print("ROOT DIRECTORY: ", ROOT_DIR)
print("USE: ", "Full data" if USE_FULLDATA else "10% data")
print("ran: ", TIME_STAMP)
print(f"python:      {platform.python_version()}")
print(f"sklearn:     {sklearn.__version__}")
print(f"tensorflow:  {tf.__version__}")
print(f"keras:       {keras.__version__}")
print(f"numpy:       {np.__version__}")
print(f"pandas:      {pd.__version__}")

ROOT DIRECTORY:  /Users/rsato/ml/experiments/..
USE:  10% data
ran:  2023-11-08 16:43:20.428923+09:00
python:      3.9.6
sklearn:     1.3.1
tensorflow:  2.14.0
keras:       2.14.0
numpy:       1.26.1
pandas:      2.1.1


In [4]:
# KDD'99 ラベルデータの読み込み
with open(ROOT_DIR + "/datasets/kddcup.names", "r") as f:
        # 一行目は不要なので無視
    _ = f.readline()
    # `:`より手前がラベルなので，その部分を抽出してリストに追加
    names = [line.split(':')[0] for line in f]
# 　正解ラベルを追加
names.append("true_label")

# KDD'99 クラスラベルデータの読み込み
with open(ROOT_DIR + "/datasets/training_attack_types", "r") as f:
    lines = f.read().split("\n")
    classes = {'normal': 'normal'}
    for line in lines:
        if len(line) == 0:
            continue
        k, v = tuple(line.split(" "))
        classes[k] = v

# 除外する特徴量のリスト
ignore_names = [
    "hot", "num_compromised", "num_file_creations",      
    "num_outbound_cmds", "is_host_login", "srv_count",
    "srv_serror_rate", "srv_rerror_rate", "same_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_diff_srv_rate"
    ]
category_names = ["protocol_type", "service", "flag"]

# KDD'99 データの読み込み
if USE_FULLDATA:
    df = pd.read_csv(ROOT_DIR + "/datasets/kddcup.data", names=names, index_col=False)
else:
    df = pd.read_csv(ROOT_DIR + "/datasets/kddcup.data_10_percent", names=names, index_col=False)

# カテゴリー特徴量を削除
data_x: pd.DataFrame = df.copy().drop(columns=category_names, axis=1)

# 除外する特徴量を削除
if RESTRECTED_FEATURES:
    data_x = data_x.drop(columns=ignore_names, axis=1)


# ラベルデータを切り分ける
data_y = data_x.pop("true_label").map(lambda x: x.replace('.', ''))
 
# namesを更新
names = data_x.columns

 # 正規化
data_x = pd.DataFrame(StandardScaler().fit_transform(data_x), columns=names)

# ラベルを変換
data_y = data_y.map(lambda x: classes[x])


In [5]:
# k分割
k_fold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)

In [6]:
def generate_encoder(x: pd.DataFrame):
    if ENCODER_SIZES is None:
        return None
    _model = keras.Sequential( 
        [
            Dense(ENCODER_SIZES[0], activation=ACTIVATION, input_shape=(x.shape[1],), name="encoder0"),
            *[
                Dense(hidden_layer_size, activation=ACTIVATION, name=f"encoder{idx + 1}")
                for idx, hidden_layer_size in enumerate(ENCODER_SIZES[1:])
            ],
            *[
                Dense(hidden_layer_size, activation=ACTIVATION)
                for hidden_layer_size in ENCODER_SIZES[-2::-1]
            ],
            Dense(x.shape[1], activation=ACTIVATION),
        ]
    )
    _model.summary()
    _model.compile(optimizer="adam", loss="mse")
    _model.fit(x, x, epochs=EPOCHS, batch_size=BATCH_SIZE)
    return keras.Sequential(_model.layers[: len(ENCODER_SIZES)])

if ENCODER_SIZES is not None:
    encoder = generate_encoder(data_x)
    new_features = pd.DataFrame(encoder.predict(data_x),
    columns=[f"ae_{idx}" for idx in range(ENCODER_SIZES[-1])])
    # データを結合
    data_x = pd.concat([data_x, new_features], axis=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder0 (Dense)            (None, 10)                390       
                                                                 
 encoder1 (Dense)            (None, 5)                 55        
                                                                 
 dense (Dense)               (None, 10)                60        
                                                                 
 dense_1 (Dense)             (None, 38)                418       
                                                                 
Total params: 923 (3.61 KB)
Trainable params: 923 (3.61 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [7]:
data_x.shape

(494021, 43)

In [8]:
def predict(_x, _y):
    _generator = k_fold.split(_x, _y)
    accuracies = []
    for fold, (train_idx, test_idx) in enumerate(_generator):
        print(f"fold: {fold}")
        # データを分割
        x_train = _x.iloc[train_idx]
        y_train = _y.iloc[train_idx]
        x_test = _x.iloc[test_idx]
        y_test = _y.iloc[test_idx]

        # モデルを学習
        model = Model(**params)
        model.fit(x_train, y_train)
        # テストデータで評価
        accuracy = classification_report(y_test, model.predict(x_test), output_dict=True)
        accuracies.append(accuracy)
        print(f"f1-score: {accuracy['macro avg']['f1-score']}") # type: ignore
    return accuracies

In [9]:
# 学習
results = predict(data_x, data_y)

fold: 0
f1-score: 0.8262941811949478
fold: 1
f1-score: 0.9245261298720449
fold: 2
f1-score: 0.8743471668141941
fold: 3
f1-score: 0.9164669584476229


In [10]:
outputs = {
    'feature_size': data_x.shape[1],
    'dropped': RESTRECTED_FEATURES,
    'seed': RANDOM_SEED,
    'splits': N_SPLITS,
    'datetime': TIME_STAMP,
    'dataset': {
        'name': DATASET_NAME,
        'description': DESCRIPTION,
        'ratio': 1 if USE_FULLDATA else 0.1,
        'size': data_x.shape[1] - ENCODER_SIZES[-1] if ENCODER_SIZES is not None else data_x.shape[1],
    },
    'autoencoder': {
        'used_data': AUTOENCODER_USED_DATA,
        'layers': ENCODER_SIZES,
        'epochs': EPOCHS,
        'activation': ACTIVATION,
        'batch_size': BATCH_SIZE
    },
    'model': {
        'type': Model_type,
        **params
    }
}
outputs['result'] = dict()
for k1 in results[0].keys():
    outputs['result'][k1] = dict()
    if not hasattr(results[0][k1], 'keys'):
        continue
    for k2 in results[0][k1].keys():
        if k2 == 'support':
            outputs['result'][k1][k2] = np.sum([results[i][k1][k2] for i in range(4)])
        else:
            outputs['result'][k1][k2] = np.mean([results[i][k1][k2] for i in range(4)])

In [11]:
if not DEBUG:
    from pymongo.mongo_client import MongoClient
    from pymongo.server_api import ServerApi
    from dotenv import load_dotenv
    load_dotenv()
    uri = os.getenv('MongoDBURI')

    # Create a new client and connect to the server
    client = MongoClient(uri, server_api=ServerApi('1'))

    # Send a ping to confirm a successful connection
    client.admin.command('ping')
    db = client.get_database('ml')
    assert db is not None, "db is None"
    collection = db.get_collection('results')
    assert collection is not None, "collection is None"
    _result = collection.insert_one(outputs)
    if _result is None:
        print("successfully reflected results to DB")


successfully connected to MongoDB!


In [14]:
if not DEBUG:
    assert Model_type in locals().keys(), f"Model_type: {Model_type} does not match"
    dropped = "-d" if RESTRECTED_FEATURES else ""
    fname = f'{ROOT_DIR}/results/{DATASET_NAME}{dropped}.{Model_type}.{ENCODER_SIZES}.json'
    outputs['datetime'] = str(outputs['datetime'])
    del outputs['_id']
    with open(fname, 'w') as f:
        json.dump(outputs, f, indent=4)