In [2]:
import sys
print(sys.version)
import pandas as pd
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import IsolationForest
from src.detectors import LSHiForest
import data_loader as dl

3.13.7 (main, Aug 14 2025, 11:12:11) [Clang 17.0.0 (clang-1700.0.13.3)]


In [None]:
# Load the CIC-IDS splits (plus the per-row category labels in y_cat) and
# print the length of the training and test feature sets, one per line.
# NOTE(review): the output saved with this cell is a ValueError about feature
# names (`index` seen at fit time but missing later); it is raised by code that
# is not shown in this notebook — check dl.load_cic_ids.
X_train, y_train, X_test, y_test, y_cat = dl.load_cic_ids()
print(len(X_train), len(X_test), sep="\n")

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- index


In [None]:
# Baseline experiment (prior-work detectors): fit every detector on X_train,
# score X_test, and report ROC AUC together with fit / scoring time.
num_ensemblers = 100
classifiers = [
    ("sklearn.ISO", IsolationForest(n_estimators=num_ensemblers)),
    ("ALSH", LSHiForest('ALSH', num_ensemblers)),
    ("L2SH", LSHiForest('L2SH', num_ensemblers)),
    ("L1SH", LSHiForest('L1SH', num_ensemblers)),
]

for clf_name, clf in classifiers:
    print("\n" + clf_name + ":")

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed time; each phase gets its own start so post-processing below is
    # not billed to the scoring time.
    fit_start = time.perf_counter()
    clf.fit(X_train)
    train_time = time.perf_counter() - fit_start

    score_start = time.perf_counter()
    y_pred = clf.decision_function(X_test)
    test_time = time.perf_counter() - score_start

    # sklearn's IsolationForest.decision_function is "lower = more abnormal",
    # while roc_auc_score expects larger scores for the positive class, so the
    # sklearn score is flipped before computing AUC.
    # Assumes y_test marks attacks as 1 — TODO confirm against dl.load_cic_ids.
    # NOTE(review): the LSHiForest scores are left untouched because their
    # orientation is defined in src.detectors, which is not visible here.
    if clf_name == "sklearn.ISO":
        y_pred = -y_pred

    auc = roc_auc_score(y_test, y_pred)

    print("\tAUC score:\t", auc)
    print("\tTraining time:\t", train_time)
    print("\tTesting time:\t", test_time)

In [None]:
# Load the UNSW-NB15 splits and print the length of the training and test
# feature sets, one per line.
X_train, y_train, X_test, y_test = dl.load_unsw_nb15()
print(len(X_train), len(X_test), sep="\n")

In [None]:
# Baseline experiment (prior-work detectors) on the currently loaded splits:
# fit every detector on X_train, score X_test, and report ROC AUC together
# with fit / scoring time.
num_ensemblers = 100
classifiers = [
    ("sklearn.ISO", IsolationForest(n_estimators=num_ensemblers)),
    ("ALSH", LSHiForest('ALSH', num_ensemblers)),
    ("L2SH", LSHiForest('L2SH', num_ensemblers)),
    ("L1SH", LSHiForest('L1SH', num_ensemblers)),
]

for clf_name, clf in classifiers:
    print("\n" + clf_name + ":")

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed time; each phase gets its own start so post-processing below is
    # not billed to the scoring time.
    fit_start = time.perf_counter()
    clf.fit(X_train)
    train_time = time.perf_counter() - fit_start

    score_start = time.perf_counter()
    y_pred = clf.decision_function(X_test)
    test_time = time.perf_counter() - score_start

    # sklearn's IsolationForest.decision_function is "lower = more abnormal",
    # while roc_auc_score expects larger scores for the positive class (this
    # notebook treats label 1 as the attack class), so the sklearn score is
    # flipped before computing AUC.
    # NOTE(review): the LSHiForest scores are left untouched because their
    # orientation is defined in src.detectors, which is not visible here.
    if clf_name == "sklearn.ISO":
        y_pred = -y_pred

    auc = roc_auc_score(y_test, y_pred)

    print("\tAUC score:\t", auc)
    print("\tTraining time:\t", train_time)
    print("\tTesting time:\t", test_time)

In [None]:
# Per-attack-category evaluation on UNSW-NB15: score the test split with every
# detector, then compute ROC AUC for "normal rows vs. one attack category" at
# a time, plot the result as grouped bars and print it as a table.
num_ensemblers = 100
classifiers = [
    ("sklearn.ISO", IsolationForest(n_estimators=num_ensemblers)),
    ("ALSH", LSHiForest('ALSH', num_ensemblers)),
    ("L2SH", LSHiForest('L2SH', num_ensemblers)),
    ("L1SH", LSHiForest('L1SH', num_ensemblers)),
]

# NOTE(review): the categories come from the *training-set* CSV but are paired
# row-by-row with y_test below — presumably this matches the file that
# dl.load_unsw_nb15 uses for the test split; confirm the rows really line up.
attack_cat_series = pd.read_csv(
    'data/unsw_nb15/UNSW_NB15_training-set.csv', usecols=['attack_cat']
)['attack_cat']

# Fail with an explicit message instead of a generic DataFrame length error.
if len(attack_cat_series) != len(y_test):
    raise ValueError(
        f"attack_cat has {len(attack_cat_series)} rows but y_test has "
        f"{len(y_test)}; the category file does not match the loaded test split."
    )

results_df = pd.DataFrame({
    'attack_cat': attack_cat_series.values,
    'label': y_test.values
}, index=y_test.index)


# --- 3. Fit the models, score the test split, store the scores ---
# X_test holds numeric data only, so it is passed to the models as-is.
for clf_name, clf in classifiers:
    print(f"\nTraining and Predicting with {clf_name}...")
    clf.fit(X_train)
    y_pred = clf.decision_function(X_test)
    # sklearn's IsolationForest.decision_function is "lower = more abnormal";
    # flip it so that larger means more anomalous, which is what
    # roc_auc_score expects for the positive (label == 1, attack) class.
    # NOTE(review): the LSHiForest scores are stored unchanged because their
    # orientation is defined in src.detectors, which is not visible here.
    if clf_name == "sklearn.ISO":
        y_pred = -y_pred
    results_df[f'pred_{clf_name}'] = y_pred

normal_df = results_df[results_df['label'] == 0]
attack_df = results_df[results_df['label'] == 1]

# The category list does not depend on the classifier, so build it once.
unique_attack_cats = sorted(attack_df['attack_cat'].unique())

corrected_auc_scores = {}
for clf_name, _ in classifiers:
    scores = {}
    for cat in unique_attack_cats:
        current_attack_df = attack_df[attack_df['attack_cat'] == cat]
        eval_df = pd.concat([normal_df, current_attack_df])
        eval_labels = eval_df['label']
        if eval_labels.nunique() > 1:
            scores[cat] = roc_auc_score(eval_labels, eval_df[f'pred_{clf_name}'])
        else:
            # roc_auc_score raises when only one class is present; record NaN
            # (keeps the column numeric) and let dropna() below skip it.
            scores[cat] = float('nan')
    corrected_auc_scores[clf_name] = scores

# Plot and print the results.
plot_df = pd.DataFrame(corrected_auc_scores).T.rename_axis('Classifier').reset_index()
plot_df_melted = plot_df.melt(id_vars='Classifier', var_name='Attack Category', value_name='AUC Score').dropna()
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(18, 10))
sns.barplot(data=plot_df_melted, x='Attack Category', y='AUC Score', hue='Classifier', ax=ax)
ax.set_title('AUC Score by Attack Category (vs. Normal)', fontsize=20)
ax.set_xlabel('Attack Category', fontsize=14)
ax.set_ylabel('AUC Score', fontsize=14)
ax.set_ylim(0, 1.05)
ax.tick_params(axis='x', labelrotation=45, labelsize=12)
ax.tick_params(axis='y', labelsize=12)
ax.legend(title='Classifier', fontsize=12)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

print("Corrected AUC Scores (Each Attack vs. Normal):")
pd.options.display.float_format = '{:.4f}'.format
results_table = pd.DataFrame(corrected_auc_scores)
print(results_table)

In [None]:
# Prepare the data: load the NSL-KDD splits and print the length of the
# training and test feature sets.
X_train, y_train, X_test, y_test = dl.load_nsl_kdd()
print(f"Train data size :{len(X_train)}", f"Test data size :{len(X_test)}", sep="\n")

In [None]:
# Baseline experiment (prior-work detectors) on the currently loaded splits:
# fit every detector on X_train, score X_test, and report ROC AUC together
# with fit / scoring time.
num_ensemblers = 100
classifiers = [
    ("sklearn.ISO", IsolationForest(n_estimators=num_ensemblers)),
    ("ALSH", LSHiForest('ALSH', num_ensemblers)),
    ("L2SH", LSHiForest('L2SH', num_ensemblers)),
    ("L1SH", LSHiForest('L1SH', num_ensemblers)),
]

for clf_name, clf in classifiers:
    print("\n" + clf_name + ":")

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed time; each phase gets its own start so the sign flip below is
    # not billed to the scoring time.
    fit_start = time.perf_counter()
    clf.fit(X_train)
    train_time = time.perf_counter() - fit_start

    score_start = time.perf_counter()
    y_pred = clf.decision_function(X_test)
    test_time = time.perf_counter() - score_start

    # sklearn's IsolationForest.decision_function is "lower = more abnormal";
    # flip it so larger means more anomalous, as roc_auc_score expects for the
    # positive class.
    # NOTE(review): the LSHiForest scores are used as returned; their
    # orientation is defined in src.detectors, which is not visible here.
    if clf_name == "sklearn.ISO":
        y_pred = -y_pred

    auc = roc_auc_score(y_test, y_pred)

    print("\tAUC score:\t", auc)
    print("\tTraining time:\t", train_time)
    print("\tTesting time:\t", test_time)

In [None]:
# Per-attack-category evaluation on NSL-KDD: score the test split with every
# detector, then compute ROC AUC for "normal rows vs. one attack name" at a
# time, plot the result as grouped bars (saved to a PNG) and print the table.
num_ensemblers = 100
classifiers = [
    ("sklearn.ISO", IsolationForest(n_estimators=num_ensemblers)),
    ("ALSH", LSHiForest('ALSH', num_ensemblers)),
    ("L2SH", LSHiForest('L2SH', num_ensemblers)),
    ("L1SH", LSHiForest('L1SH', num_ensemblers))
]

# --- 2. [Important] Read the NSL-KDD attack names and prepare the frame that
#        will hold the prediction scores ---
# Single source of truth for the path, so the error message below cannot
# drift away from the file that is actually opened.
nsl_test_path = 'data/nsl-kdd/KDDTest+.txt'

# Column names follow the definition in data_loader.py.
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'attack',
    'difficulty'
]

results_df_created = False
try:
    test_df_for_cat = pd.read_csv(nsl_test_path, header=None)
    # Assigning the names raises if the file does not have exactly these columns.
    test_df_for_cat.columns = columns
    # Strip surrounding whitespace from the attack names.
    attack_cat_series = test_df_for_cat['attack'].str.strip()
    results_df_created = True
except FileNotFoundError:
    print(f"Error: '{nsl_test_path}' が見つかりません。ファイルパスを確認してください。")


if results_df_created:
    # Fail with an explicit message instead of a generic DataFrame length error
    # when the raw file and the loaded test split do not have the same rows.
    if len(attack_cat_series) != len(y_test):
        raise ValueError(
            f"'{nsl_test_path}' has {len(attack_cat_series)} rows but y_test has "
            f"{len(y_test)}; the category file does not match the loaded test split."
        )

    # Combine y_test (0/1 label) with the attack name of each row.
    results_df = pd.DataFrame({
        'attack_cat': attack_cat_series.values,
        'label': y_test.values
    }, index=y_test.index)

    # --- 3. Fit the models, score the test split, store the raw scores ---
    for clf_name, clf in classifiers:
        print(f"\nTraining and Predicting with {clf_name}...")
        clf.fit(X_train)
        y_pred = clf.decision_function(X_test)
        results_df[f'pred_{clf_name}'] = y_pred
    print("\n--- All predictions are stored in results_df ---")

    # --- 4. Per-category AUC ---
    normal_df = results_df[results_df['label'] == 0]
    attack_df = results_df[results_df['label'] == 1]

    # The category list does not depend on the classifier, so build it once.
    unique_attack_cats = sorted(
        cat for cat in attack_df['attack_cat'].unique() if cat != 'normal'
    )

    corrected_auc_scores = {}
    for clf_name, _ in classifiers:
        scores = {}
        for cat in unique_attack_cats:
            current_attack_df = attack_df[attack_df['attack_cat'] == cat]
            eval_df = pd.concat([normal_df, current_attack_df])
            eval_preds = eval_df[f'pred_{clf_name}']
            eval_labels = eval_df['label']

            if eval_labels.nunique() > 1:
                # Scores are negated so that larger means more anomalous.
                # NOTE(review): this negates every detector, which is correct
                # for sklearn's IsolationForest (lower = more abnormal) and
                # assumes LSHiForest.decision_function uses the same
                # orientation — confirm against src.detectors.
                scores[cat] = roc_auc_score(eval_labels, -eval_preds)
            else:
                # Only one class present: AUC is undefined. NaN (rather than
                # None) keeps the score column numeric; dropna() skips it below.
                scores[cat] = float('nan')
        corrected_auc_scores[clf_name] = scores

    # --- 5. Plot and print the results ---
    plot_df = pd.DataFrame(corrected_auc_scores).T.rename_axis('Classifier').reset_index()
    plot_df_melted = plot_df.melt(id_vars='Classifier', var_name='Attack Category', value_name='AUC Score').dropna()

    # Order the categories by their mean AUC across classifiers, descending.
    category_order = plot_df_melted.groupby('Attack Category')['AUC Score'].mean().sort_values(ascending=False).index

    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(18, 10))
    # Pass the computed order to seaborn so the bars follow it.
    sns.barplot(data=plot_df_melted, x='Attack Category', y='AUC Score', hue='Classifier', ax=ax, order=category_order)

    ax.set_title('NSL-KDD: AUC Score by Attack Category (vs. Normal)', fontsize=20)
    ax.set_xlabel('Attack Category', fontsize=14)
    ax.set_ylabel('AUC Score', fontsize=14)
    ax.set_ylim(0, 1.05)
    ax.tick_params(axis='x', labelrotation=90, labelsize=12)
    ax.tick_params(axis='y', labelsize=12)
    ax.legend(title='Classifier', fontsize=12)
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.savefig('nsl_kdd_auc_by_category.png')
    plt.show()

    print("\nCorrected AUC Scores for NSL-KDD (Each Attack vs. Normal):")
    pd.options.display.float_format = '{:.4f}'.format
    # Show the table in the same category order as the chart.
    results_table = pd.DataFrame(corrected_auc_scores).reindex(category_order, axis=1)
    print(results_table)