In [2]:
import keras

from utils_kdd99 import *
print_version()

ran:  2023-10-06 09:34:11.837329
python:      3.10.11
sklearn:     1.2.2
tensorflow:  2.12.0
keras:       2.12.0
numpy:       1.23.5
pandas:      1.5.3


## General
- 使用するモデル：　ロジスティック回帰
- 使用するデータ：　KDD99(10%)
- 使用する特徴量の数： 38
- k分割交差検証の分割数： 4
## Autoencoder
- 特徴量の数： 8
- 構造： 38-32-16-8-16-32-38
- 活性化関数： relu
- 最適化関数： adam
- 損失関数： mse
- エポック数： 10
- バッチサイズ： 32
- 乱数シード： 2023

In [3]:
# configration

In [4]:
# データの読み込み (10% data)
X, y = load_data(verbose=0, to_number_labels=True)

In [5]:
y.value_counts()

0    391458
1     97278
2      4107
3      1126
4        52
Name: true_label, dtype: int64

In [6]:
X.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,-0.067792,-0.002879,0.138664,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,-3.451536,-1.694315,0.599396,-0.282867,-1.022077,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
1,-0.067792,-0.00282,-0.011578,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,-3.297085,-1.600011,0.599396,-0.282867,-1.146737,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
2,-0.067792,-0.002824,0.014179,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,-3.142633,-1.505707,0.599396,-0.282867,-1.188291,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
3,-0.067792,-0.00284,0.014179,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,-2.988182,-1.411403,0.599396,-0.282867,-1.188291,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464
4,-0.067792,-0.002842,0.035214,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,-2.833731,-1.3171,0.599396,-0.282867,-1.209067,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464


In [7]:
# k分割
N_SPLITS= 4
from sklearn.model_selection import StratifiedKFold
k_fold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=2023)
k_fold_generator = k_fold.split(X, y)

In [8]:
#　オートエンコーダ特徴量の作成関数
def generate_autoencoder_features(dataset, encoder_sizes, activation='relu')-> pd.DataFrame:
    model = keras.Sequential(
        [
            Dense(encoder_sizes[0], activation=activation, input_shape=(dataset.shape[1],), name="encoder0"),
            *[
                Dense(hidden_layer_size, activation=activation, name=f"encoder{idx + 1}")
                for idx, hidden_layer_size in enumerate(encoder_sizes[1:])
            ],
            *[
                Dense(hidden_layer_size, activation=activation)
                for hidden_layer_size in encoder_sizes[-2::-1]
            ],
            Dense(dataset.shape[1], activation=activation),
        ]
    )
    model.summary()
    model.compile(optimizer="adam", loss="mse")
    model.fit(dataset, dataset, epochs=10, batch_size=32)
    encoder = keras.Sequential(model.layers[: len(encoder_sizes)])
    X_pred = encoder.predict(dataset)
    return pd.DataFrame(X_pred, columns=[f"ae_{idx}" for idx in range(X_pred.shape[1])])

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
# 元の特徴量のみでの学習
lr = RandomForestClassifier(random_state=2023,  n_jobs=8)
results_38 = dict()
for i, (train_idx, test_idx) in enumerate(k_fold_generator):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    lr.fit(X_train, y_train)
    results_38[i] = classification_report(y_test, lr.predict(X_test), output_dict=True)

In [10]:
for i in range(N_SPLITS):
    print(results_38[i]['macro avg']['recall'])

0.8836329360128976
0.9325912920590591
0.9143440309590034
0.9287028667552917


In [11]:
recall = np.mean([results_38[i]['macro avg']['recall'] for i in range(N_SPLITS)])
recall

0.914817781446563

In [12]:
results_38

{0: {'0': {'precision': 0.9999795630581839,
   'recall': 0.9999591269516881,
   'f1-score': 0.9999693449005241,
   'support': 97864},
  '1': {'precision': 0.9988909426987062,
   'recall': 0.9999177631578947,
   'f1-score': 0.9994040891811363,
   'support': 24320},
  '2': {'precision': 1.0,
   'recall': 0.9922103213242454,
   'f1-score': 0.9960899315738024,
   'support': 1027},
  '3': {'precision': 1.0,
   'recall': 0.9645390070921985,
   'f1-score': 0.9819494584837545,
   'support': 282},
  '4': {'precision': 0.75,
   'recall': 0.46153846153846156,
   'f1-score': 0.5714285714285714,
   'support': 13},
  'accuracy': 0.9997490000485806,
  'macro avg': {'precision': 0.949774101151378,
   'recall': 0.8836329360128976,
   'f1-score': 0.9097682791135577,
   'support': 123506},
  'weighted avg': {'precision': 0.9997391032464711,
   'recall': 0.9997490000485806,
   'f1-score': 0.9997395276073827,
   'support': 123506}},
 1: {'0': {'precision': 0.9999591273693353,
   'recall': 0.999969345213766

In [13]:
# オートエンコーダ特徴量を加えたXの作成
X_ae = pd.concat([X, generate_autoencoder_features(X, [10, 5])], axis=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder0 (Dense)            (None, 10)                390       
                                                                 
 encoder1 (Dense)            (None, 5)                 55        
                                                                 
 dense (Dense)               (None, 10)                60        
                                                                 
 dense_1 (Dense)             (None, 38)                418       
                                                                 
Total params: 923
Trainable params: 923
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10


2023-10-06 09:34:23.246654: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
k_fold_generator = k_fold.split(X_ae, y)

In [15]:
from sklearn.ensemble import RandomForestClassifier

# オートエンコーダ特徴量を加えた特徴量での学習
lr_ae = RandomForestClassifier(random_state=2023)
results = dict()
for i, (train_idx, test_idx) in enumerate(k_fold_generator):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    lr_ae.fit(X_train, y_train)
    results[i] = classification_report(y_test, lr_ae.predict(X_test), output_dict=True)

In [16]:
recall = np.mean([results[i]['macro avg']['recall'] for i in range(N_SPLITS)])
recall

0.914817781446563

In [17]:
X_ae.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,ae_0,ae_1,ae_2,ae_3,ae_4
0,-0.067792,-0.002879,0.138664,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,1.858684,2.94754,4.101514,2.833884,2.709509
1,-0.067792,-0.00282,-0.011578,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,1.810396,2.926773,4.052037,2.759405,2.69086
2,-0.067792,-0.002824,0.014179,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,1.768042,2.904612,4.003182,2.695909,2.682333
3,-0.067792,-0.00284,0.014179,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,1.727767,2.887534,3.957442,2.634125,2.67667
4,-0.067792,-0.002842,0.035214,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.39698,-0.005679,...,-0.158629,-0.464418,-0.463202,-0.25204,-0.249464,1.686351,2.866448,3.908681,2.571404,2.669727


In [18]:
results

{0: {'0': {'precision': 0.9999795630581839,
   'recall': 0.9999591269516881,
   'f1-score': 0.9999693449005241,
   'support': 97864},
  '1': {'precision': 0.9988909426987062,
   'recall': 0.9999177631578947,
   'f1-score': 0.9994040891811363,
   'support': 24320},
  '2': {'precision': 1.0,
   'recall': 0.9922103213242454,
   'f1-score': 0.9960899315738024,
   'support': 1027},
  '3': {'precision': 1.0,
   'recall': 0.9645390070921985,
   'f1-score': 0.9819494584837545,
   'support': 282},
  '4': {'precision': 0.75,
   'recall': 0.46153846153846156,
   'f1-score': 0.5714285714285714,
   'support': 13},
  'accuracy': 0.9997490000485806,
  'macro avg': {'precision': 0.949774101151378,
   'recall': 0.8836329360128976,
   'f1-score': 0.9097682791135577,
   'support': 123506},
  'weighted avg': {'precision': 0.9997391032464711,
   'recall': 0.9997490000485806,
   'f1-score': 0.9997395276073827,
   'support': 123506}},
 1: {'0': {'precision': 0.9999591273693353,
   'recall': 0.999969345213766

In [19]:
np.mean([results[i]['macro avg']['recall'] for i in range(N_SPLITS)])

0.914817781446563

In [20]:
np.mean([results_38[i]['macro avg']['recall'] for i in range(N_SPLITS)])

0.914817781446563

In [21]:
np.mean([results[i]['macro avg']['f1-score'] for i in range(N_SPLITS)])

0.935085926162575

In [22]:
np.mean([results_38[i]['macro avg']['f1-score'] for i in range(N_SPLITS)])

0.935085926162575

In [23]:
np.mean([results[i]['4']['recall'] for i in range(N_SPLITS)])

0.6153846153846154

In [24]:
np.mean([results_38[i]['4']['recall'] for i in range(N_SPLITS)])

0.6153846153846154

0.6153846153846154