In [61]:
# libraryの読み込み
import platform
import sklearn
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import keras
import matplotlib.pyplot as plt


try:
    # colab環境上
    from google.colab import drive
    drive.mount('/content/drive')
    pwd = '/content/drive/MyDrive/dataset/'
    is_colab = True
except ImportError:
    import os
    # ローカル環境
    pwd = os.getcwd() + '/../dataset/'
    is_colab = False


# 環境出力
print(f"colab: {is_colab}")
print(f"python:      {platform.python_version()}")
print(f"sklearn:     {sklearn.__version__}")
print(f"tensorflow:  {tf.__version__}")
print(f"keras:       {keras.__version__}")
print(f"numpy:       {np.__version__}")
print(f"pandas:      {pd.__version__}")

# enable TPU
# if is_colab:
#     # detect TPU
#     try:
#         tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
#         print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
#     except ValueError:
#         pass

#     tf.config.experimental_connect_to_cluster(tpu)
#     tf.tpu.experimental.initialize_tpu_system(tpu)
#     tpu_strategy = tf.distribute.TPUStrategy(tpu)


colab: False
python:      3.10.11
sklearn:     1.2.2
tensorflow:  2.12.0
keras:       2.12.0
numpy:       1.23.5
pandas:      1.5.3


In [66]:
def load_data():
    """
    データの読み込み
    :return:
    """
    # 特徴量名の読み込み
    with open(pwd + "kddcup.names") as fp:
        # 一行目は不要なので無視
        _ = fp.readline()
        # `:`より手前がラベルなので，その部分を抽出してリストに追加
        names = [line.split(':')[0] for line in fp]
    print(f"特徴量の数：{len(names)}")
    print(f"各特徴量の名前：{', '.join(names)}")
    #　正解ラベルを追加
    names.append("true_label")
    data = pd.read_csv(pwd + "kddcup.data_10_percent", names=names, index_col=False)
    data_x = data.copy()
    data_x = data_x.drop(columns=['protocol_type', 'service', 'flag'], axis=1)
    true_label = data_x.pop('true_label')# 正解ラベルのピリオドを外す．
    names = data_x.columns
    from sklearn.preprocessing import StandardScaler
    data_x = StandardScaler().fit_transform(data_x)
    data_x = pd.DataFrame(data_x, columns=names)
    true_label = true_label.map(lambda x: x.replace('.', ''))
    print(true_label.value_counts())
    return data_x, true_label

# attack_class_labels -> key: class, value: list[label]
attack_class_labels = {
    'normal': ['normal'],
    'dos': ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop'],
    'u2r': ['buffer_overflow', 'loadmodule', 'perl', 'rootkit'],
    'r2l': ['ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy', 'warezclient', 'warezmaster'],
    'probe': ['ipsweep', 'nmap', 'portsweep', 'satan']
}
# attack_class_label -> key: label, value: class
attack_label_class = {}
for c, labels in attack_class_labels.items():
    for label in labels:
        attack_label_class[label] = c


In [67]:
data_x, data_y = load_data()

# 4つのクラスラベルに変換する
data_y = data_y.map(lambda x: attack_label_class[x])
data_y.value_counts()


特徴量の数：41
各特徴量の名前：duration, protocol_type, service, flag, src_bytes, dst_bytes, land, wrong_fragment, urgent, hot, num_failed_logins, logged_in, num_compromised, root_shell, su_attempted, num_root, num_file_creations, num_shells, num_access_files, num_outbound_cmds, is_host_login, is_guest_login, count, srv_count, serror_rate, srv_serror_rate, rerror_rate, srv_rerror_rate, same_srv_rate, diff_srv_rate, srv_diff_host_rate, dst_host_count, dst_host_srv_count, dst_host_same_srv_rate, dst_host_diff_srv_rate, dst_host_same_src_port_rate, dst_host_srv_diff_host_rate, dst_host_serror_rate, dst_host_srv_serror_rate, dst_host_rerror_rate, dst_host_srv_rerror_rate
smurf              280790
neptune            107201
normal              97278
back                 2203
satan                1589
ipsweep              1247
portsweep            1040
warezclient          1020
teardrop              979
pod                   264
nmap                  231
guess_passwd           53
buffer_overflow        30


dos       391458
normal     97278
probe       4107
r2l         1126
u2r           52
Name: true_label, dtype: int64

In [68]:
from sklearn.model_selection import train_test_split
# data_y = pd.get_dummies(data_y)
# データを分割する。テストと，学習の比は1：2なるようにします。
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.33,random_state=2018, stratify=data_y)
print(f"x_train: {len(x_train)}, x_test: {len(x_test)}")
print(f'y_train')
print((y_train.value_counts()))
print(f'y_test')
print(y_test.value_counts())

x_train: 330994, x_test: 163027
y_train
dos       262277
normal     65176
probe       2752
r2l          754
u2r           35
Name: true_label, dtype: int64
y_test
dos       129181
normal     32102
probe       1355
r2l          372
u2r           17
Name: true_label, dtype: int64


In [69]:
y_test.head()

38204     normal
176442       dos
384838       dos
261369       dos
17533     normal
Name: true_label, dtype: object

### scikit-learnのLigisticRegressionによる多クラス分類

In [70]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state=2018, solver='liblinear')

model.fit(x_train, y_train)
y_predict = model.predict(x_test)

In [71]:
from sklearn.metrics import accuracy_score,classification_report
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       0.99      0.99      0.99     32102
       probe       0.93      0.86      0.89      1355
         r2l       0.87      0.86      0.86       372
         u2r       0.54      0.41      0.47        17

    accuracy                           1.00    163027
   macro avg       0.86      0.83      0.84    163027
weighted avg       1.00      1.00      1.00    163027


## オートエンコーダを用いて，学習したモデルのエンコーダ部分を教師あり学習に用いる

In [72]:
from keras.layers import Dense

model = keras.Sequential([
    Dense(units=19, activation='relu', input_dim=38, name='encoder1'),
    Dense(units=10, activation='relu', name='encoder2'),
    Dense(units=19, activation='relu'),
    Dense(units=38, activation='relu'),
])
model.summary()


Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder1 (Dense)            (None, 19)                741       
                                                                 
 encoder2 (Dense)            (None, 10)                200       
                                                                 
 dense_8 (Dense)             (None, 19)                209       
                                                                 
 dense_9 (Dense)             (None, 38)                760       
                                                                 
Total params: 1,910
Trainable params: 1,910
Non-trainable params: 0
_________________________________________________________________


In [73]:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
model.fit(x_train, x_train,
          epochs=1, # データセットを使って学習する回数
        batch_size=32,
        shuffle=True, # データをシャッフルすることで，．
        validation_data=(x_train, x_train), # 評価用データ（検証データ）の指定
        verbose=1,
        use_multiprocessing=True
          )




<keras.callbacks.History at 0x2820e8940>

In [74]:
x_predict = model.predict(x_test)
def reconstruction_errors(original_df: pd.DataFrame, reconstructed_df: pd.DataFrame) -> np.ndarray:
    """
    元の特徴量行列と，新たに再構成された特徴量行列の間の再構成誤差を計算する．
    各特徴量ごとの再構成前後の誤差を二乗し，全ての特徴量を足し合わせる．正規化して0〜1に納める．
    0に近いほど正常，１に近いほど異常
    :param original_df: 元の特徴量の行列
    :param reconstructed_df: 再構成された特徴量の行列
    :param reversed:
    :return:
    """
    loss: np.ndarray = np.sum((np.array(original_df) - np.array(reconstructed_df)) ** 2, axis=1)
    loss: pd.Series = pd.Series(loss, index=original_df.index)
    loss: np.ndarray = (loss - np.min(loss)) / (np.max(loss) - np.min(loss))
    return loss



In [75]:
encoder = keras.Sequential([model.get_layer('encoder1'),
                            model.get_layer('encoder2')])


In [76]:
columns = list(map(lambda x: 'feature' + str(x), range(10)))
x_train_encoded = pd.DataFrame(data=encoder.predict(x_train), index=x_train.index, columns=columns)
x_test_encoded = pd.DataFrame(data=encoder.predict(x_test), index=x_test.index, columns=columns)
x_train_new_feature = x_train.merge(x_train_encoded, right_index=True, left_index=True)
x_test_new_feature = x_test.merge(x_test_encoded, right_index=True, left_index=True)




In [77]:
x_train_new_feature

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9
212221,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,0.000000,1.277205,0.000000,0.108720,0.422462,1.648855,0.077227,0.175952,0.000000,1.477143
30903,-0.067792,-0.002774,0.472896,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.396980,-0.005679,...,1.958844,0.378546,6.753596,6.021462,0.000000,2.715844,2.443421,0.000000,0.484922,2.399273
9739,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,0.000000,1.276137,0.000000,0.107538,0.423957,1.647022,0.075201,0.175824,0.000000,1.477987
37540,-0.067792,-0.002776,-0.014120,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,2.396980,-0.005679,...,2.131836,0.000000,6.723519,6.154136,0.000000,2.330154,2.873864,0.000000,0.620295,2.420650
418638,-0.067792,-0.002535,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,0.000000,1.276377,0.000000,0.107716,0.424246,1.647340,0.075353,0.176048,0.000000,1.477969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443069,-0.067792,-0.002535,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,0.000000,1.276377,0.000000,0.107716,0.424246,1.647340,0.075353,0.176048,0.000000,1.477969
282911,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,0.000000,1.276137,0.000000,0.107538,0.423957,1.647022,0.075201,0.175824,0.000000,1.477987
156862,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,0.000000,1.277205,0.000000,0.108720,0.422462,1.648855,0.077227,0.175952,0.000000,1.477143
227651,-0.067792,-0.002017,-0.026287,-0.006673,-0.04772,-0.002571,-0.044136,-0.009782,-0.417192,-0.005679,...,0.000000,1.276137,0.000000,0.107538,0.423957,1.647022,0.075201,0.175824,0.000000,1.477987


In [78]:
model = LogisticRegression(random_state=2018, solver='liblinear')

model.fit(x_train_new_feature, y_train)
y_predict = model.predict(x_test_new_feature)
print(classification_report(y_test, y_predict))



              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       0.99      1.00      1.00     32102
       probe       0.99      0.94      0.96      1355
         r2l       0.90      0.80      0.85       372
         u2r       0.54      0.41      0.47        17

    accuracy                           1.00    163027
   macro avg       0.88      0.83      0.85    163027
weighted avg       1.00      1.00      1.00    163027


## カテゴリデータを除いた３８次元の特徴量を入力に用いたロジスティック回帰分類
              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       0.99      0.99      0.99     32102
       probe       0.93      0.86      0.89      1355
         r2l       0.87      0.86      0.86       372
         u2r       0.54      0.41      0.47        17

    accuracy                           1.00    163027
   macro avg       0.86      0.83      0.84    163027
weighted avg       1.00      1.00      1.00    163027




## カテゴリデータを除いた３８次元の特徴量と，オートエンコーダで学習したエンコーダー部分の１０次元の特徴量を合わせた４８次元を入力に用いたロジスティック回帰分類
              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       0.99      1.00      1.00     32102
       probe       0.99      0.94      0.96      1355
         r2l       0.90      0.80      0.85       372
         u2r       0.54      0.41      0.47        17

    accuracy                           1.00    163027
   macro avg       0.88      0.83      0.85    163027
weighted avg       1.00      1.00      1.00    163027

## エンコーダ部分の出力を10->5次元に圧縮して再度行う

In [79]:
model = keras.Sequential([
    Dense(units=19, activation='relu', input_dim=38, name='encoder1'),
    Dense(units=5, activation='relu', name='encoder2'),
    Dense(units=19, activation='relu'),
    Dense(units=38, activation='relu'),
])
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder1 (Dense)            (None, 19)                741       
                                                                 
 encoder2 (Dense)            (None, 5)                 100       
                                                                 
 dense_10 (Dense)            (None, 19)                114       
                                                                 
 dense_11 (Dense)            (None, 38)                760       
                                                                 
Total params: 1,715
Trainable params: 1,715
Non-trainable params: 0
_________________________________________________________________


In [81]:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
model.fit(x_train, x_train,
          epochs=1, # データセットを使って学習する回数
        batch_size=32,
        shuffle=True, # データをシャッフルすることで，．
        validation_data=(x_train, x_train), # 評価用データ（検証データ）の指定
        verbose=1,
        use_multiprocessing=True
          )
x_predict = model.predict(x_test)
encoder = keras.Sequential([model.get_layer('encoder1'),
                            model.get_layer('encoder2')])
columns = list(map(lambda x: 'feature' + str(x), range(5)))
x_train_encoded = pd.DataFrame(data=encoder.predict(x_train), index=x_train.index, columns=columns)
x_test_encoded = pd.DataFrame(data=encoder.predict(x_test), index=x_test.index, columns=columns)
x_train_new_feature = x_train.merge(x_train_encoded, right_index=True, left_index=True)
x_test_new_feature = x_test.merge(x_test_encoded, right_index=True, left_index=True)



## カテゴリデータを除いた３８次元の特徴量を入力に用いたロジスティック回帰分類
              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       0.99      0.99      0.99     32102
       probe       0.93      0.86      0.89      1355
         r2l       0.87      0.86      0.86       372
         u2r       0.54      0.41      0.47        17

    accuracy                           1.00    163027
   macro avg       0.86      0.83      0.84    163027
weighted avg       1.00      1.00      1.00    163027


## カテゴリデータを除いた３８次元の特徴量と，オートエンコーダで学習したエンコーダー部分の5次元の特徴量を合わせた４3次元を入力に用いたロジスティック回帰分類

              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       0.99      1.00      0.99     32102
       probe       0.98      0.91      0.94      1355
         r2l       0.87      0.77      0.82       372
         u2r       0.88      0.41      0.56        17

    accuracy                           1.00    163027
   macro avg       0.94      0.82      0.86    163027
weighted avg       1.00      1.00      1.00    163027



In [82]:
model = LogisticRegression(random_state=2018, solver='liblinear')
model.fit(x_train_new_feature, y_train)
y_predict = model.predict(x_test_new_feature)
print(classification_report(y_test, y_predict))




              precision    recall  f1-score   support

         dos       1.00      1.00      1.00    129181
      normal       0.99      1.00      0.99     32102
       probe       0.98      0.91      0.94      1355
         r2l       0.87      0.77      0.82       372
         u2r       0.88      0.41      0.56        17

    accuracy                           1.00    163027
   macro avg       0.94      0.82      0.86    163027
weighted avg       1.00      1.00      1.00    163027


In [83]:
model = LogisticRegression(random_state=2018, solver='liblinear')

model.fit(x_train_encoded, y_train)
y_predict = model.predict(x_test_encoded)
print(classification_report(y_test, y_predict))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         dos       0.96      0.99      0.98    129181
      normal       0.94      0.84      0.89     32102
       probe       1.00      0.56      0.72      1355
         r2l       0.00      0.00      0.00       372
         u2r       0.00      0.00      0.00        17

    accuracy                           0.96    163027
   macro avg       0.58      0.48      0.52    163027
weighted avg       0.96      0.96      0.96    163027


  _warn_prf(average, modifier, msg_start, len(result))


## エンコーダー部分の５次元の出力を入力としたロジスティック回帰分類

              precision    recall  f1-score   support

         dos       0.96      0.99      0.98    129181
      normal       0.94      0.84      0.89     32102
       probe       1.00      0.56      0.72      1355
         r2l       0.00      0.00      0.00       372
         u2r       0.00      0.00      0.00        17

    accuracy                           0.96    163027
   macro avg       0.58      0.48      0.52    163027
weighted avg       0.96      0.96      0.96    163027
