## カテゴリデータを削除して実装する．
## 全てのデータを含むデータセットで試す．
## Accuracyや混同行列など他の指標で評価する．
## macro, microについても理解する．

In [2]:
# Star-import the project's shared helpers.
# NOTE(review): names used later without an explicit import (pd, np, lgb,
# train_test_split, StratifiedKFold, log_loss, RANDOM_SEED) presumably come
# from this module -- confirm in utils.
from utils import *
# Print the versions of the interpreter and the main libraries in use.
print_version()

python:      3.10.11
sklearn:     1.2.2
tensorflow:  2.13.0-rc0
keras:       2.13.1rc0
numpy:       1.24.3
pandas:      1.5.3


In [3]:
# Load the feature names from the KDD Cup names file.
with open("../dataset/kddcup.names") as fp:
    # The first line is not a feature definition, so read it and discard it.
    _ = fp.readline()
    # Each remaining line looks like "<name>: <type>."; keep only the part
    # in front of the first colon.
    names = [raw_line.partition(':')[0] for raw_line in fp]
print(f"特徴量の数：{len(names)}")
print(f"各特徴量の名前：{', '.join(names)}")

特徴量の数：41
各特徴量の名前：duration, protocol_type, service, flag, src_bytes, dst_bytes, land, wrong_fragment, urgent, hot, num_failed_logins, logged_in, num_compromised, root_shell, su_attempted, num_root, num_file_creations, num_shells, num_access_files, num_outbound_cmds, is_host_login, is_guest_login, count, srv_count, serror_rate, srv_serror_rate, rerror_rate, srv_rerror_rate, same_srv_rate, diff_srv_rate, srv_diff_host_rate, dst_host_count, dst_host_srv_count, dst_host_same_srv_rate, dst_host_diff_srv_rate, dst_host_same_src_port_rate, dst_host_srv_diff_host_rate, dst_host_serror_rate, dst_host_srv_serror_rate, dst_host_rerror_rate, dst_host_srv_rerror_rate


In [4]:
# Load the dataset.
# The CSV has no header row; its last column is the ground-truth label, so
# add a name for it after the feature names. The membership check keeps
# this cell idempotent: without it, re-running the cell would append a second
# "true_label" and read_csv would reject the duplicated column names.
if "true_label" not in names:
    names.append("true_label")
data = pd.read_csv("../dataset/kddcup.data", names=names, index_col=False)
# Split the label column off so that `data` holds only the features.
true_label = data.pop('true_label')

In [5]:
# Count how many rows carry each raw label.
true_label_counts = true_label.value_counts()
# Bare expression so the notebook displays the counts as the cell output.
true_label_counts

smurf.              2807886
neptune.            1072017
normal.              972781
satan.                15892
ipsweep.              12481
portsweep.            10413
nmap.                  2316
back.                  2203
warezclient.           1020
teardrop.               979
pod.                    264
guess_passwd.            53
buffer_overflow.         30
land.                    21
warezmaster.             20
imap.                    12
rootkit.                 10
loadmodule.               9
ftp_write.                8
multihop.                 7
phf.                      4
perl.                     3
spy.                      2
Name: true_label, dtype: int64

In [6]:
# Remove the period from the ground-truth labels (e.g. "normal." -> "normal").
# The vectorized string accessor avoids calling a Python lambda once per row;
# regex=False makes '.' a literal character rather than a regex wildcard.
true_label = true_label.str.replace('.', '', regex=False)
true_label

0          normal
1          normal
2          normal
3          normal
4          normal
            ...  
4898426    normal
4898427    normal
4898428    normal
4898429    normal
4898430    normal
Name: true_label, Length: 4898431, dtype: object

### 攻撃の種類は，大まかに４種類(`DoS`, `U2R`, `R2L`, `Probe`)に分けられる．
### 後の学習プログラムのために，予め各クラスを定義しておく．
### class -> `DoS`, `U2R`, `R2L`, `Probe`
### feature（特徴量） -> `duration`, `protocol_type`, `service`, `flag`, `src_bytes`, `dst_bytes`, `land`, `wrong_fragment`, `urgent`, `hot`, `num_failed_logins`, `logged_in`, `num_compromised`, `root_shell`, `su_attempted`, `num_root`, `num_file_creations`, `num_shells`, `num_access_files`, `num_outbound_cmds`, `is_host_login`, `is_guest_login`, `count`, `srv_count`, `serror_rate`, `srv_serror_rate`, `rerror_rate`, `srv_rerror_rate`, `same_srv_rate`, `diff_srv_rate`, `srv_diff_host_rate`, `dst_host_count`, `dst_host_srv_count`, `dst_host_same_srv_rate`, `dst_host_diff_srv_rate`, `dst_host_same_src_port_rate`, `dst_host_srv_diff_host_rate`, `dst_host_serror_rate`, `dst_host_srv_serror_rate`, `dst_host_rerror_rate`, `dst_host_srv_rerror_rate`

In [7]:
# attack_class_labels -> key: attack class, value: attack names in that class
attack_class_labels = {
    'normal': ['normal'],
    'dos': ['back', 'land', 'neptune', 'pod', 'smurf', 'teardrop'],
    'u2r': ['buffer_overflow', 'loadmodule', 'perl', 'rootkit'],
    'r2l': [
        'ftp_write', 'guess_passwd', 'imap', 'multihop',
        'phf', 'spy', 'warezclient', 'warezmaster',
    ],
    'probe': ['ipsweep', 'nmap', 'portsweep', 'satan'],
}
# attack_label_class -> key: attack name, value: attack class (reverse lookup)
attack_label_class = {}
for c, labels in attack_class_labels.items():
    # Register every attack name of this class in one step.
    attack_label_class.update(dict.fromkeys(labels, c))
attack_label_class

{'normal': 'normal',
 'back': 'dos',
 'land': 'dos',
 'neptune': 'dos',
 'pod': 'dos',
 'smurf': 'dos',
 'teardrop': 'dos',
 'buffer_overflow': 'u2r',
 'loadmodule': 'u2r',
 'perl': 'u2r',
 'rootkit': 'u2r',
 'ftp_write': 'r2l',
 'guess_passwd': 'r2l',
 'imap': 'r2l',
 'multihop': 'r2l',
 'phf': 'r2l',
 'spy': 'r2l',
 'warezclient': 'r2l',
 'warezmaster': 'r2l',
 'ipsweep': 'probe',
 'nmap': 'probe',
 'portsweep': 'probe',
 'satan': 'probe'}

In [8]:
# Define the feature matrix and the label table.
data_x = data.copy()

# Look up the attack class of every row in one vectorized pass instead of a
# Python-level loop over all rows.
attack_classes = true_label.map(attack_label_class)
# Series.map yields NaN for names missing from the lookup table; fail loudly
# (as a plain dict lookup would) rather than carrying NaN classes forward.
if attack_classes.isna().any():
    unknown_names = list(true_label[attack_classes.isna()].unique())
    raise KeyError(f"no attack class defined for: {unknown_names}")

true_labels = pd.DataFrame({
    'class': attack_classes,
    'name': true_label,
    # Binary target: 0 for 'normal', 1 for every other label.
    'label': (true_label != 'normal').astype('int64'),
})
# The binary column is the training target.
data_y = true_labels['label'].copy()

### カテゴリデータを削除
カテゴリデータは，ワンホットエンコーディングすると学習に大きく影響してしまうため，特徴量から外す．

In [9]:
# Remove the three categorical columns so that only numeric features remain.
data_x = data_x.drop(
    columns=['protocol_type', 'service', 'flag'],
)

In [10]:
print(f"データの総数：{len(data)}")
# Count rows per class and per attack name once, instead of rebuilding a
# full-length boolean mask for every class and every attack name.
class_counts = true_labels['class'].value_counts()
name_counts = true_labels['name'].value_counts()
# Print each class followed by the attack names that belong to it.
for c, labels in attack_class_labels.items():
    # .get(..., 0) keeps the output at 0 for entries that never occur.
    print(f"{c}: {class_counts.get(c, 0)}")
    for attack_name in labels:
        print(f"   {attack_name}: {name_counts.get(attack_name, 0)}")

データの総数：4898431
normal: 972781
   normal: 972781
dos: 3883370
   back: 2203
   land: 21
   neptune: 1072017
   pod: 264
   smurf: 2807886
   teardrop: 979
u2r: 52
   buffer_overflow: 30
   loadmodule: 9
   perl: 3
   rootkit: 10
r2l: 1126
   ftp_write: 8
   guess_passwd: 53
   imap: 12
   multihop: 7
   phf: 4
   spy: 2
   warezclient: 1020
   warezmaster: 20
probe: 41102
   ipsweep: 12481
   nmap: 2316
   portsweep: 10413
   satan: 15892


In [11]:
# Number of rows with binary label 0 (normal) and 1 (attack).
binary_label = true_labels['label']
print(f"正常ラベル：{(binary_label == 0).sum()}")
print(f"異常ラベル：{(binary_label == 1).sum()}")

正常ラベル：972781
異常ラベル：3925650


In [12]:
# Standardize the features.
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split further down, so test rows contribute to the statistics.
scaler = StandardScaler()
# NOTE(review): wrapping the scaled result in a fresh DataFrame presumably
# replaces the original column names with 0..n-1 -- confirm if names matter.
data_x = pd.DataFrame(scaler.fit_transform(data_x))

In [13]:
# Split into training and test data at roughly 2:1 (33% held out), keeping
# the normal/attack ratio equal in both parts via stratification.
x_train, x_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.33,
    stratify=data_y,
    random_state=2018,
)

In [14]:
# Report how many normal (0) and attack (1) rows ended up in each split.
print(f"学習・正常ラベル：{(y_train == 0).sum()}，学習・不正ラベル：{(y_train == 1).sum()}")
print(f"検証・正常ラベル：{(y_test == 0).sum()}，検証・不正ラベル：{(y_test == 1).sum()}")

学習・正常ラベル：651763，学習・不正ラベル：2630185
検証・正常ラベル：321018，検証・不正ラベル：1295465


In [15]:
# LightGBM training parameters.
params = {
    "task": "train",  # library default
    "boosting": "gbdt",  # library default
    "objective": "binary",  # binary classification ("multiclass" for multi-class)
    "metric": "binary_logloss",  # log loss (cross-entropy) for binary targets
    "learning_rate": 0.1,
    "num_leaves": 31,  # maximum number of leaves per tree
    "verbose": -1,
    "seed": RANDOM_SEED,
}

In [16]:
# k-fold cross-validation setup.
# StratifiedKFold splits so that every fold keeps a label distribution close
# to that of y_train.
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
# Holds the out-of-fold prediction for every training row. Allocate it as
# float64 (NaN-filled) up front; without an explicit dtype the empty column
# is created as object dtype and the stored predictions stay boxed objects.
predictions_based_on_k_folds = pd.DataFrame(
    index=y_train.index, columns=['prediction'], dtype='float64'
)
# Per-fold scores collected during training.
training_scores = []  # log loss on each fold's training part
validation_scores = []  # log loss on each fold's validation part


In [17]:
# Train one LightGBM model per fold and record its training/validation log loss.
# Only y is needed to build stratified splits, so X is a zero placeholder.
# to_numpy() replaces Series.ravel(), which is deprecated in pandas >= 2.2.
for t_idx, v_idx in k_fold.split(np.zeros(len(x_train)), y_train.to_numpy()):
    # The split yields positional indices, hence .iloc.
    x_train_fold, x_validation_fold = x_train.iloc[t_idx, :], x_train.iloc[v_idx, :]  # 2-D frames
    y_train_fold, y_validation_fold = y_train.iloc[t_idx], y_train.iloc[v_idx]  # 1-D series

    lgb_train = lgb.Dataset(x_train_fold, y_train_fold)
    lgb_eval = lgb.Dataset(x_validation_fold, y_validation_fold, reference=lgb_train)
    # Train for at most 2000 rounds, stopping once the validation metric has
    # not improved for 200 rounds.
    # NOTE: `gbm` is rebound on every iteration, so after the loop it refers
    # to the model of the last fold only.
    gbm = lgb.train(params, lgb_train,
                    num_boost_round=2000,
                    valid_sets=lgb_eval,
                    callbacks=[lgb.early_stopping(200)]
                    )
    # Score the fold's training part at the best iteration.
    y_train_pred = gbm.predict(x_train_fold, num_iteration=gbm.best_iteration)
    training_score = log_loss(y_train_fold, y_train_pred)

    # Score the fold's validation part and store the out-of-fold predictions
    # under the original row labels.
    y_validation_predict = gbm.predict(x_validation_fold, num_iteration=gbm.best_iteration)
    predictions_based_on_k_folds.loc[x_validation_fold.index, 'prediction'] = y_validation_predict
    validation_score = log_loss(y_validation_fold, y_validation_predict)

    training_scores.append(training_score)
    validation_scores.append(validation_score)

Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[100]	valid_0's binary_logloss: 0.000257404
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[284]	valid_0's binary_logloss: 0.000170135
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[95]	valid_0's binary_logloss: 0.000251176
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[93]	valid_0's binary_logloss: 0.000248926
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[93]	valid_0's binary_logloss: 0.000293408


In [18]:
# Predict on the held-out test set.
# NOTE: `gbm` here is whatever model the cross-validation loop assigned last,
# i.e. the model trained on the final fold.
predictions = pd.Series(
    gbm.predict(x_test, num_iteration=gbm.best_iteration),
    index=x_test.index,
)

In [19]:
# Threshold the predicted scores at 0.5: below -> 0 (normal), otherwise -> 1 (attack).
y_pred = np.where(predictions.lt(0.5), 0, 1)
y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [20]:
from sklearn.metrics import accuracy_score, precision_score, roc_auc_score

# Evaluate the model on the test set.
# Accuracy: share of thresholded predictions that equal the true label.
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy score: {acc}")

# Log loss, computed from the raw predicted scores rather than the 0/1 labels.
log_loss_light_gbm_gradient_boosting = log_loss(y_test, predictions)
print(f'Light GBM Gradient Boosting Log loss: {log_loss_light_gbm_gradient_boosting}')

# Area under the ROC curve, also computed from the raw predicted scores.
auc = roc_auc_score(y_test, predictions)
print(f"AUC: {auc}")

# Precision under both averaging schemes.
for averaging in ('micro', 'macro'):
    precision = precision_score(y_test, y_pred, average=averaging)
    print(f"precision score ({averaging}): {precision}")

Accuracy score: 0.9999300951510162
Light GBM Gradient Boosting Log loss: 0.00028903618874890204
AUC: 0.9999981129618384
precision score (micro): 0.9999300951510162
precision score (macro): 0.9998685306157578


In [21]:
# Per-class precision / recall / F1 report (this is not a confusion matrix).
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred, digits=6)
print(report)

              precision    recall  f1-score   support

           0   0.999766  0.999882  0.999824    321018
           1   0.999971  0.999942  0.999956   1295465

    accuracy                       0.999930   1616483
   macro avg   0.999869  0.999912  0.999890   1616483
weighted avg   0.999930  0.999930  0.999930   1616483



In [22]:
# Shape of the standardized feature matrix: (rows, feature columns).
data_x.shape

(4898431, 38)

In [23]:
# Break the misclassified test rows down by attack class.
# y_test.index holds row *labels*, so look them up with .loc; .iloc is
# positional and only gave the same rows because the index happens to be the
# default 0..n-1 range.
misclassified_index = y_test.index[y_test != y_pred]
true_labels.loc[misclassified_index, 'class'].value_counts()

normal    38
probe     38
r2l       20
u2r        9
dos        8
Name: class, dtype: int64

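### Accuracy(正解率)が異常に高い
### Accuracyがこれほど高くなる原因はわからない．
### 誤分類は，r2lが20件，u2rが9件．ただし，この件数は検証データ（全体の約33%）だけで数えたものなので，分母は全データの件数（r2l: 1126件，u2r: 52件）ではなく，検証データに含まれる件数（r2l: およそ370件，u2r: およそ17件）で考える必要がある．
### 全体のデータ量が489万件で，そのうちDoSの異常データが388万件のため，これによって正解率が上がっている可能性がある．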