
## ロジスティック回帰モデルとlightGBMモデルを組み合わせる．

### 背景
- ロジスティック回帰モデルが高精度だった
- 一般的により精度が高いとされるLightGBMモデルと組み合わせることで，より高い精度を出せる可能性がある．
### 使用するデータセット
- KDD99 10%
### 手法
### 結果
### 考察

In [33]:
from utils_kdd99 import *



python:      3.10.11
sklearn:     1.2.2
tensorflow:  2.12.0
keras:       2.12.0
numpy:       1.23.5
pandas:      1.5.3


In [34]:
# load data
with open("models/kdd99_features/x_train_df.pkl", 'rb') as f:
    x_train: pd.DataFrame = pickle.load(f)
with open("models/kdd99_features/x_test_df.pkl", 'rb') as f:
    x_test: pd.DataFrame = pickle.load(f)
with open("models/kdd99_features/y_train_df.pkl", 'rb') as f:
    y_train: pd.DataFrame = pickle.load(f)
with open("models/kdd99_features/y_test_df.pkl", 'rb') as f:
    y_test: pd.DataFrame = pickle.load(f)
with open("models/kdd99_features/x_train+ae_43_df&activation=relu&epochs=5&batch_size=32.pkl", 'rb') as f:
    x_train_with_ae: pd.DataFrame = pickle.load(f)
with open("models/kdd99_features/x_test+ae_43_df&activation=relu&epochs=5&batch_size=32.pkl", 'rb') as f:
    x_test_with_ae: pd.DataFrame = pickle.load(f)


In [35]:
# load models
from sklearn.linear_model import LogisticRegression
lgb_model_with_ae: lgb.Booster = lgb.Booster(model_file="models/lightgbm/lgb+ae_tuned_booster.model")
with open(f"models/logistic_regression/kdd99+ae_43&penalty=l2&solver=liblinear&C=1e-05.pkl", 'rb') as f:
    lr_model_with_ae: LogisticRegression = pickle.load(f)

In [36]:
y_pred_by_lgb_prob = lgb_model_with_ae.predict(x_test_with_ae)
y_pred_by_lr = lr_model_with_ae.predict(x_test_with_ae)
y_pred_by_lgb = np.argmax(y_pred_by_lgb_prob, axis=1)
y_pred_by_lgb = pd.Series(y_pred_by_lgb)
y_pred_by_lr = pd.Series(y_pred_by_lr)


In [37]:
correspondences

{'dos': 0, 'normal': 1, 'probe': 2, 'r2l': 3, 'u2r': 4}

In [38]:
# normal = 1, abnormal = 0
convert_to_binary = lambda x: 1 if x == 1 else 0
# y_pred_by_lr_b = pd.Series(y_pred_by_lr.apply(convert_to_binary))
# y_pred_by_lgb_b = pd.Series(y_pred_by_lgb.apply(convert_to_binary))
y_pred_b = (y_pred_by_lr == 1) & (y_pred_by_lgb == 1)
y_pred_b = y_pred_b.astype(int)
y_test_b = y_test.apply(convert_to_binary)
y_pred_b

0         1
1         0
2         0
3         0
4         1
         ..
163022    1
163023    0
163024    0
163025    0
163026    0
Length: 163027, dtype: int64

In [39]:
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_test_b, y_pred_b) )


[[130856     69]
 [  4335  27767]]


In [40]:
print(classification_report(y_test_b, y_pred_b))


              precision    recall  f1-score   support

           0       0.97      1.00      0.98    130925
           1       1.00      0.86      0.93     32102

    accuracy                           0.97    163027
   macro avg       0.98      0.93      0.95    163027
weighted avg       0.97      0.97      0.97    163027



In [41]:
y_pred_u2r_by_lgb = y_pred_by_lgb[y_pred_by_lgb == correspondences['u2r']]
y_pred_u2r_by_lgb

208       4
1280      4
41104     4
45839     4
48731     4
70692     4
77039     4
78627     4
112153    4
121474    4
126200    4
135644    4
137414    4
dtype: int64

In [42]:
y_pred_u2r_by_lr = y_pred_by_lr[y_pred_by_lr==correspondences['u2r']]
y_pred_u2r_by_lr

1073      4
2453      4
3115      4
6592      4
10313     4
         ..
160312    4
160490    4
160795    4
161319    4
161640    4
Length: 84, dtype: int64

In [43]:
tmp = y_pred_u2r_by_lgb.index.tolist()
tmp.extend(y_pred_u2r_by_lr.tolist())
y_test_pred_as_u2r: pd.Series = y_test.iloc[tmp]

y_test_pred_as_u2r

149011    4
344468    4
147606    4
485174    4
19385     1
         ..
17533     1
17533     1
17533     1
17533     1
17533     1
Name: true_label, Length: 97, dtype: int64

In [44]:
# true classes that lightGBM and Logistic Regression predicted u2r
num_to_class = {v: k for k, v in correspondences.items()}
value_counts = y_test_pred_as_u2r.value_counts()
value_counts

1    86
4    10
3     1
Name: true_label, dtype: int64

In [45]:
y_test_new_index = y_test.reset_index()
y_test_u2r_idx = y_test_new_index[y_test_new_index['true_label'] == correspondences['u2r']].index

In [46]:
y_test_u2r_idx

Int64Index([   208,   1280,  27499,  31123,  41104,  45839,  70692,  70731,
             78627,  85337,  94452,  99037, 112153, 121474, 126200, 137414,
            147744],
           dtype='int64')

In [47]:
y_pred_by_lgb.iloc[y_test_u2r_idx].apply(lambda x: num_to_class[x])

208          u2r
1280         u2r
27499     normal
31123     normal
41104        u2r
45839        u2r
70692        u2r
70731     normal
78627        u2r
85337     normal
94452     normal
99037     normal
112153       u2r
121474       u2r
126200       u2r
137414       u2r
147744    normal
dtype: object

In [48]:
y_pred_by_lr.iloc[y_test_u2r_idx].apply(lambda x: num_to_class[x])


208          r2l
1280       probe
27499     normal
31123        r2l
41104      probe
45839      probe
70692      probe
70731      probe
78627      probe
85337     normal
94452     normal
99037     normal
112153     probe
121474       r2l
126200     probe
137414       r2l
147744    normal
dtype: object