In [54]:
# NOTE(review): this wildcard import is presumably what brings `pd`, `pickle`, `lgb`,
# `LogisticRegression`, `classification_report` and the project helpers / lookup tables
# used below into scope -- no other import is visible in this notebook. Confirm against
# two_step_classification and consider importing the needed names explicitly.
from two_step_classification import *
# Print the interpreter and library versions used for this run.
print_version()

python:      3.10.11
sklearn:     1.2.2
tensorflow:  2.12.0
keras:       2.12.0
numpy:       1.23.5
pandas:      1.5.3


In [55]:
# Load the pickled KDD'99 test-set artefacts.
# Feature matrix that both stages of the classifier predict on.
x_test: pd.DataFrame = pd.read_pickle("models/kdd99_features/x_test-drop_25_df.pkl")
# Multiclass ground-truth labels.
y_test: pd.Series = pd.read_pickle("models/kdd99_features/y_test_df.pkl")
# NOTE(review): the next two objects are not referenced again in the visible cells.
y_test_anomaly: pd.Series = pd.read_pickle("models/kdd99_features/y_test_dropped_mapped_series.pkl")
x_test_ae_35: pd.DataFrame = pd.read_pickle("models/kdd99_features/x_test-drop+ae_35_df&activation=relu&epochs=5&batch_size=32.pkl")

# Binary ground truth for stage 1: class 1 becomes 0, every other class becomes 1.
y_test_binary: pd.Series = y_test.map(lambda label: int(label != 1))

In [56]:
# Un-pickle the logistic-regression model for each stage.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted model files.
with open("models/logistic_regression_binary/kdd99-drop_25&penalty=l1&solver=liblinear&C=0.1.pkl", 'rb') as binary_model_file:
    # Stage 1 model: passed to classification_normal_and_anomaly below.
    model_1st: LogisticRegression = pickle.load(binary_model_file)
with open("models/logistic_regression_anomaly/kdd99-dropped_mapped&penalty=l1&solver=liblinear&C=1.pkl", 'rb') as anomaly_model_file:
    # Stage 2 model: passed to classification_anomalies below.
    model_2nd: LogisticRegression = pickle.load(anomaly_model_file)

In [57]:
# Stage 1: binary classification of normal vs. anomaly.
y_pred_binary: pd.Series = classification_normal_and_anomaly(x_test, model_1st)
# Index labels of the rows predicted as 1; these rows are handed to stage 2.
predicted_indexes = y_pred_binary.loc[y_pred_binary.eq(1)].index
y_pred_binary.value_counts()

1    129863
0     33164
dtype: int64

In [58]:
# Rows stage 1 predicted as 0 get the multiclass label 1 as their final prediction
# (the same label that y_test_binary collapses to 0).
y_pred_normal: pd.Series = y_pred_binary.loc[y_pred_binary.eq(0)].map(lambda _: 1)

In [59]:
# Stage-1 confusion matrix; both series are sorted by index before being compared.
cm_1st = confusion_matrix_df(
    y_test_binary.sort_index(),
    y_pred_binary.sort_index(),
    labels=['normal', 'anomaly'],
)

In [60]:
# Stage 2: classify only the rows that stage 1 predicted as 1.
x_anomalies: pd.DataFrame = x_test.loc[predicted_indexes]
# Each raw stage-2 label is looked up in `wrapper`.
# NOTE(review): presumably this converts to the label ids used by y_test -- confirm.
y_pred_anomalies: pd.Series = classification_anomalies(x_anomalies, model_2nd).map(
    lambda raw_label: wrapper[raw_label]
)
y_pred_anomalies.value_counts()

0    128719
2       929
3       197
4        18
dtype: int64

In [61]:
# Final prediction = stage-1 "0" rows (relabelled) stacked on top of the stage-2 predictions.
y_pred = pd.concat((y_pred_normal, y_pred_anomalies), axis=0)
# concat leaves the rows out of their original order, hence the sort_index on both sides.
cm_2nd = confusion_matrix_df(
    y_test.sort_index(),
    y_pred.sort_index(),
)

In [62]:
# Invert `correspondences` (key -> value becomes value -> key) so the numeric
# predictions can be displayed by class name.
swapped_correspondences = dict(
    (class_id, class_name) for class_name, class_id in correspondences.items()
)
y_pred_value_counts = y_pred.value_counts()
# Replace each numeric label in the index with its name.
y_pred_value_counts.index = y_pred_value_counts.index.map(
    lambda class_id: swapped_correspondences[class_id]
)
y_pred_value_counts

dos       128719
normal     33164
probe        929
r2l          197
u2r           18
dtype: int64

In [63]:
# Display the stage-1 confusion matrix built from y_test_binary and y_pred_binary.
cm_1st

Unnamed: 0,normal,anomaly
true_normal,31666,436
true_anomaly,1498,129427


In [64]:
# Display the overall confusion matrix built from y_test and the combined y_pred.
cm_2nd

Unnamed: 0,dos,normal,probe,r2l,u2r
true_dos,128429,750,0,2,0
true_normal,285,31666,36,106,9
true_probe,2,459,892,1,1
true_r2l,3,280,1,88,0
true_u2r,0,9,0,0,8


In [65]:
# Per-class precision / recall / F1 for the combined two-stage prediction.
print(
    classification_report(
        y_test.sort_index(),
        y_pred.sort_index(),
    )
)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    129181
           1       0.95      0.99      0.97     32102
           2       0.96      0.66      0.78      1355
           3       0.45      0.24      0.31       372
           4       0.44      0.47      0.46        17

    accuracy                           0.99    163027
   macro avg       0.76      0.67      0.70    163027
weighted avg       0.99      0.99      0.99    163027


In [66]:
# u2r recall: the (true_u2r, u2r) cell divided by the total of the true_u2r row.
cm_2nd.loc['true_u2r', 'u2r'] / cm_2nd.loc['true_u2r'].sum()

0.47058823529411764

In [67]:
# Rebind both stage models to LightGBM boosters loaded from their model files.
# From here on model_1st / model_2nd no longer refer to the logistic-regression models.
model_1st: lgb.Booster = lgb.Booster(
    model_file="models/lightgbm/lgb_dropped_binary_tuned_booster.model",
)
model_2nd: lgb.Booster = lgb.Booster(
    model_file="models/lightgbm/lgb_dropped_mapped_anomaly_tuned_booster.model",
)

In [68]:
# Stage 1 again, this time with the model_1st that was just rebound.
y_pred_binary = classification_normal_and_anomaly(x_test, model_1st)
# Index labels of the rows predicted as 1; these rows are handed to stage 2.
predicted_indexes = y_pred_binary.loc[y_pred_binary.eq(1)].index

In [69]:
# NOTE(review): at this point `y_pred` has not been rebuilt for the LightGBM models --
# it is only reassigned in a later cell (the pd.concat of y_pred_normal and
# y_pred_anomalies). These counts are therefore still the logistic-regression
# pipeline's predictions, which is why they match the counts shown earlier.
# Run this cell after that concat to see the LightGBM counts.
y_pred_value_counts = y_pred.value_counts()
y_pred_value_counts.index = y_pred_value_counts.index.map(lambda x: swapped_correspondences[x])
y_pred_value_counts

dos       128719
normal     33164
probe        929
r2l          197
u2r           18
dtype: int64

In [70]:
# How many rows stage 1 assigned to each binary label (0 / 1).
y_pred_binary.value_counts()

1    130920
0     32107
dtype: int64

In [71]:
# Stage 2 on the rows that stage 1 predicted as 1.
x_anomalies = x_test.loc[predicted_indexes]
# Each raw stage-2 label is looked up in `wrapper`.
# NOTE(review): presumably this converts to the label ids used by y_test -- confirm.
y_pred_anomalies: pd.Series = classification_anomalies(x_anomalies, model_2nd).map(
    lambda raw_label: wrapper[raw_label]
)
y_pred_anomalies.value_counts()

0    129186
2      1354
3       364
4        16
dtype: int64

In [72]:
# Rows stage 1 predicted as 0 get the multiclass label 1 as their final prediction.
y_pred_normal: pd.Series = y_pred_binary.loc[y_pred_binary.eq(0)].map(lambda _: 1)
# Final prediction = those rows stacked on top of the stage-2 predictions.
y_pred = pd.concat((y_pred_normal, y_pred_anomalies), axis=0)

# Binary ground truth for stage 1: class 1 becomes 0, every other class becomes 1.
y_test_binary = y_test.map(lambda label: int(label != 1))

# Stage-1 (binary) confusion matrix.
cm_1st = confusion_matrix_df(
    y_test_binary.sort_index(),
    y_pred_binary.sort_index(),
    labels=['normal', 'anomaly'],
)

# Confusion matrix of the combined two-stage prediction.
cm_2nd = confusion_matrix_df(
    y_test.sort_index(),
    y_pred.sort_index(),
)

In [73]:
# Display the stage-1 confusion matrix built from y_test_binary and y_pred_binary.
cm_1st

Unnamed: 0,normal,anomaly
true_normal,32084,18
true_anomaly,23,130902


In [74]:
# Display the overall confusion matrix built from y_test and the combined y_pred.
cm_2nd

Unnamed: 0,dos,normal,probe,r2l,u2r
true_dos,129176,5,0,0,0
true_normal,7,32084,6,3,2
true_probe,2,6,1347,0,0
true_r2l,1,8,1,361,1
true_u2r,0,4,0,0,13


In [75]:
# Per-class precision / recall / F1 for the combined two-stage prediction.
print(
    classification_report(
        y_test.sort_index(),
        y_pred.sort_index(),
    )
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    129181
           1       1.00      1.00      1.00     32102
           2       0.99      0.99      0.99      1355
           3       0.99      0.97      0.98       372
           4       0.81      0.76      0.79        17

    accuracy                           1.00    163027
   macro avg       0.96      0.95      0.95    163027
weighted avg       1.00      1.00      1.00    163027


In [76]:
# u2r recall: the (true_u2r, u2r) cell divided by the total of the true_u2r row.
# A single .loc lookup replaces the chained cm_2nd['u2r']['true_u2r'] indexing.
recall_u2r = cm_2nd.loc['true_u2r', 'u2r'] / cm_2nd.loc['true_u2r'].sum()
# A bare assignment produces no cell output; end on the name so the value is displayed.
recall_u2r

0.7647058823529411

In [77]:
# Disabled (fully commented-out) single-cell copy of the LightGBM two-stage pipeline.
# It differs from the cells that actually run in two ways: the stage-2 predictions are
# not looked up in `wrapper`, and cm_2nd is built with labels=correspondences_anomaly.keys().
# NOTE(review): nothing here executes; remove once it is no longer needed for reference.
# model_1st: lgb.Booster = lgb.Booster(model_file="models/lightgbm/lgb_dropped_binary_tuned_booster.model")
# model_2nd: lgb.Booster = lgb.Booster(model_file="models/lightgbm/lgb_dropped_mapped_anomaly_tuned_booster.model")
# y_pred_binary = classification_normal_and_anomaly(x_test, model_1st)
# predicted_indexes = y_pred_binary[y_pred_binary == 1].index
# x_anomalies = x_test.loc[predicted_indexes]
# y_pred_anomalies: pd.Series = classification_anomalies(x_anomalies, model_2nd)
# print(y_pred_anomalies.value_counts())
# y_pred_normal = y_pred_binary[y_pred_binary == 0].apply(lambda _: 1)
# y_pred = pd.concat([y_pred_normal, y_pred_anomalies])
#
# y_test_binary = y_test.apply(lambda x: 0 if x == 1 else 1)
#
# cm_1st = confusion_matrix_df(y_test_binary.sort_index(), y_pred_binary.sort_index(), labels=['normal', 'anomaly'])
#
# cm_2nd = confusion_matrix_df(y_test.sort_index(), y_pred.sort_index(), labels=correspondences_anomaly.keys())
