In [1]:
# CICIDS2017数据集特征工程笔记本
# 用于网络异常检测的深度学习模型

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from collections import Counter
import os
import time
import warnings
import joblib
from sklearn.utils import resample
warnings.filterwarnings('ignore')

# 设置随机种子以确保可重复性
np.random.seed(42)

# 设置更好的可视化样式
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

# 定义路径
INPUT_PATH = '/root/autodl-tmp/projects/DL/dataset/preprocessed/CICIDS2017_merged_preprocessed.csv'
OUTPUT_DIR = '/root/autodl-tmp/projects/DL/dataset/feature_engineering'

# 确保输出目录存在
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("CICIDS2017特征工程与分层数据准备")
print("="*80)

CICIDS2017特征工程与分层数据准备


In [2]:
# 1. 数据加载与初步探索
# --------------------------------------------------
print("\n1. 加载预处理数据")
print("-"*50)

start_time = time.time()
print(f"正在加载数据: {INPUT_PATH}")

# 由于数据集较大，使用pandas的chunksize参数分批读取
# 但首先读取小样本来确定数据类型
sample_df = pd.read_csv(INPUT_PATH, nrows=10000)
dtypes = sample_df.dtypes
numeric_columns = sample_df.select_dtypes(include=['float64']).columns
int_columns = sample_df.select_dtypes(include=['int64']).columns

# 设置优化的数据类型
optimized_dtypes = {}
for col in numeric_columns:
    optimized_dtypes[col] = 'float32'  # 降低精度以节省内存
for col in int_columns:
    optimized_dtypes[col] = 'int32'  # 降低精度以节省内存

# 分批读取并合并
chunk_size = 500000  # 每批读取的行数
chunks = []
for chunk in pd.read_csv(INPUT_PATH, chunksize=chunk_size, dtype=optimized_dtypes):
    chunks.append(chunk)
df = pd.concat(chunks, ignore_index=True)

load_time = time.time() - start_time
print(f"数据加载完成，耗时: {load_time:.2f}秒")
print(f"数据集形状: {df.shape}")

# 显示内存使用情况
memory_usage = df.memory_usage().sum() / (1024 ** 2)
print(f"内存使用: {memory_usage:.2f} MB")


1. 加载预处理数据
--------------------------------------------------
正在加载数据: /root/autodl-tmp/projects/DL/dataset/preprocessed/CICIDS2017_merged_preprocessed.csv
数据加载完成，耗时: 19.11秒
数据集形状: (2830743, 81)
内存使用: 907.07 MB


In [3]:
# 2. 数据探索性分析
# --------------------------------------------------
print("\n2. 数据探索性分析")
print("-"*50)

# 检查标签分布
print("\n标签分布:")
label_counts = df['Label'].value_counts()
print(label_counts)

# 可视化标签分布
plt.figure(figsize=(14, 8))
label_counts.plot(kind='bar', color='skyblue')
plt.title('Attack Types Distribution')
plt.xlabel('Attack Type')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'attack_distribution.png'), dpi=300)
plt.close()

# 获取每种攻击类型的百分比
attack_percentages = (label_counts / len(df)) * 100
print("\n各攻击类型百分比:")
for attack_type, percentage in attack_percentages.items():
    print(f"{attack_type}: {percentage:.4f}%")

# 检查数据类型和缺失值
print("\n数据类型:")
print(df.dtypes.value_counts())

print("\n检查缺失值:")
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
if len(missing_values) > 0:
    print(missing_values)
else:
    print("没有缺失值")

# 统计分析
print("\n数值特征的统计摘要:")
numeric_df = df.select_dtypes(include=['float32', 'float64', 'int32', 'int64'])
summary_stats = numeric_df.describe().T
summary_stats['range'] = summary_stats['max'] - summary_stats['min']
summary_stats['coefficient_of_variation'] = summary_stats['std'] / summary_stats['mean']
print(summary_stats[['mean', 'std', 'min', 'max', 'range', 'coefficient_of_variation']].head())

# 保存完整统计摘要
summary_stats.to_csv(os.path.join(OUTPUT_DIR, 'feature_statistics.csv'))
print(f"完整统计摘要已保存至: {os.path.join(OUTPUT_DIR, 'feature_statistics.csv')}")


2. 数据探索性分析
--------------------------------------------------

标签分布:
BENIGN                        2273097
DOS HULK                       231073
PORTSCAN                       158930
DDOS                           128027
DOS GOLDENEYE                   10293
FTP-PATATOR                      7938
SSH-PATATOR                      5897
DOS SLOWLORIS                    5796
DOS SLOWHTTPTEST                 5499
BOT                              1966
WEB ATTACK � BRUTE FORCE         1507
WEB ATTACK � XSS                  652
INFILTRATION                       36
WEB ATTACK � SQL INJECTION         21
HEARTBLEED                         11
Name: Label, dtype: int64

各攻击类型百分比:
BENIGN: 80.3004%
DOS HULK: 8.1630%
PORTSCAN: 5.6144%
DDOS: 4.5227%
DOS GOLDENEYE: 0.3636%
FTP-PATATOR: 0.2804%
SSH-PATATOR: 0.2083%
DOS SLOWLORIS: 0.2048%
DOS SLOWHTTPTEST: 0.1943%
BOT: 0.0695%
WEB ATTACK � BRUTE FORCE: 0.0532%
WEB ATTACK � XSS: 0.0230%
INFILTRATION: 0.0013%
WEB ATTACK � SQL INJECTION: 0.0007%
HEARTBLEED:

In [4]:
# 3. 创建二分类和多分类标签
# --------------------------------------------------
print("\n3. 创建二分类和多分类标签")
print("-"*50)

# 创建二分类标签 (Normal vs Attack)
df['binary_label'] = df['Label'].apply(lambda x: 0 if x == 'BENIGN' else 1)
print("二分类标签创建完成，分布如下:")
print(df['binary_label'].value_counts())

# 创建多分类标签 - 将罕见攻击类型分组
# 定义分组策略 - 将样本量少于1000的攻击类型归为"Other Attacks"
THRESHOLD = 1000
attack_counts = label_counts[label_counts.index != 'BENIGN']
rare_attacks = attack_counts[attack_counts < THRESHOLD].index.tolist()

print(f"\n以下罕见攻击类型将被归为'Other Attacks'组（阈值：{THRESHOLD}）:")
for attack in rare_attacks:
    print(f"- {attack}: {label_counts[attack]}")

# 创建多分类标签
def create_multiclass_label(label):
    if label == 'BENIGN':
        return 'Normal'
    elif label in rare_attacks:
        return 'Other Attacks'
    else:
        # 将DOS攻击类型归为一组
        if 'DOS' in label:
            return 'DOS'
        # 将Web攻击类型归为一组
        elif 'WEB ATTACK' in label:
            return 'Web Attack'
        else:
            return label

df['multiclass_label'] = df['Label'].apply(create_multiclass_label)
print("\n多分类标签创建完成，分布如下:")
print(df['multiclass_label'].value_counts())

# 将多分类标签编码为数字
label_mapping = {label: idx for idx, label in enumerate(df['multiclass_label'].unique())}
df['multiclass_encoded'] = df['multiclass_label'].map(label_mapping)

print("\n标签映射:")
for label, code in label_mapping.items():
    print(f"{label}: {code}")

# 保存标签映射以便后续使用
joblib.dump(label_mapping, os.path.join(OUTPUT_DIR, 'label_mapping.joblib'))


3. 创建二分类和多分类标签
--------------------------------------------------
二分类标签创建完成，分布如下:
0    2273097
1     557646
Name: binary_label, dtype: int64

以下罕见攻击类型将被归为'Other Attacks'组（阈值：1000）:
- WEB ATTACK � XSS: 652
- INFILTRATION: 36
- WEB ATTACK � SQL INJECTION: 21
- HEARTBLEED: 11

多分类标签创建完成，分布如下:
Normal           2273097
DOS               380688
PORTSCAN          158930
FTP-PATATOR         7938
SSH-PATATOR         5897
BOT                 1966
Web Attack          1507
Other Attacks        720
Name: multiclass_label, dtype: int64

标签映射:
Normal: 0
DOS: 1
Other Attacks: 2
FTP-PATATOR: 3
SSH-PATATOR: 4
Web Attack: 5
BOT: 6
PORTSCAN: 7


['/root/autodl-tmp/projects/DL/dataset/feature_engineering/label_mapping.joblib']

In [5]:
# 4. 特征工程与选择
# --------------------------------------------------
print("\n4. 特征工程与选择")
print("-"*50)

# 移除标签列和可能导致数据泄露的列
features_to_drop = ['Label', 'binary_label', 'multiclass_label', 'multiclass_encoded', 'Day', 'Scenario']
X = df.drop(columns=features_to_drop, errors='ignore')
y_binary = df['binary_label']
y_multi = df['multiclass_encoded']

print(f"特征矩阵形状: {X.shape}")

# 检查特征的相关性
print("\n计算特征相关性矩阵...")
correlation_time = time.time()

# 为了节省内存，只计算部分特征的相关性
# 4.1 选择数值列
numeric_cols = X.select_dtypes(include=['float32', 'float64', 'int32', 'int64']).columns

# 4.2 如果特征太多，选择前N个
MAX_CORRELATION_FEATURES = 30  # 限制相关性分析的特征数量
if len(numeric_cols) > MAX_CORRELATION_FEATURES:
    # 选择方差最大的特征
    variances = X[numeric_cols].var().sort_values(ascending=False)
    selected_cols = variances.index[:MAX_CORRELATION_FEATURES].tolist()
    print(f"为相关性分析选择了{MAX_CORRELATION_FEATURES}个高方差特征")
else:
    selected_cols = numeric_cols.tolist()

# 4.3 计算相关性
corr_matrix = X[selected_cols].corr()
correlation_time = time.time() - correlation_time
print(f"相关性计算完成，耗时: {correlation_time:.2f}秒")

# 保存相关性矩阵热图
plt.figure(figsize=(16, 14))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'correlation_heatmap.png'), dpi=300)
plt.close()

# 找出高度相关的特征对
corr_pairs = []
for i in range(len(selected_cols)):
    for j in range(i+1, len(selected_cols)):
        if abs(corr_matrix.iloc[i, j]) > 0.9:  # 高度相关阈值设为0.9
            corr_pairs.append((selected_cols[i], selected_cols[j], corr_matrix.iloc[i, j]))

print(f"\n发现{len(corr_pairs)}对高度相关的特征对 (|correlation| > 0.9):")
for feat1, feat2, corr in corr_pairs[:5]:  # 只显示前5对
    print(f"- {feat1} 和 {feat2}: {corr:.4f}")

if len(corr_pairs) > 5:
    print(f"...以及{len(corr_pairs)-5}对其他高度相关的特征对")

# 检测常数和近常数特征
const_features = [col for col in X.columns if X[col].nunique() <= 1]
near_const_features = [col for col in X.columns if X[col].nunique() <= 2 and X[col].nunique() > 1]

print(f"\n发现{len(const_features)}个常数特征，{len(near_const_features)}个近常数特征")
if const_features:
    print("常数特征:")
    for feat in const_features:
        print(f"- {feat}")
    
    # 移除常数特征
    X = X.drop(columns=const_features)
    print(f"已移除常数特征，特征矩阵新形状: {X.shape}")

# 识别高基数特征（唯一值比例>80%的特征）
high_cardinality_cols = []
for col in X.columns:
    unique_ratio = X[col].nunique() / len(X)
    if unique_ratio > 0.8:
        high_cardinality_cols.append((col, unique_ratio))

print(f"\n发现{len(high_cardinality_cols)}个高基数特征 (唯一值比例>80%):")
for col, ratio in high_cardinality_cols[:5]:  # 只显示前5个
    print(f"- {col}: {ratio:.4f}")


4. 特征工程与选择
--------------------------------------------------
特征矩阵形状: (2830743, 78)

计算特征相关性矩阵...
为相关性分析选择了30个高方差特征
相关性计算完成，耗时: 15.04秒

发现25对高度相关的特征对 (|correlation| > 0.9):
- Flow Duration 和 Fwd IAT Total: 0.9986
- Fwd IAT Max 和 Flow IAT Max: 0.9981
- Fwd IAT Max 和 Idle Max: 0.9884
- Fwd IAT Max 和 Idle Mean: 0.9781
- Fwd IAT Max 和 Idle Min: 0.9491
...以及20对其他高度相关的特征对

发现8个常数特征，10个近常数特征
常数特征:
- Bwd PSH Flags
- Bwd URG Flags
- Fwd Avg Bytes/Bulk
- Fwd Avg Packets/Bulk
- Fwd Avg Bulk Rate
- Bwd Avg Bytes/Bulk
- Bwd Avg Packets/Bulk
- Bwd Avg Bulk Rate
已移除常数特征，特征矩阵新形状: (2830743, 70)

发现0个高基数特征 (唯一值比例>80%):


In [6]:
# 5. 分层数据集划分
# --------------------------------------------------
print("\n5. 分层数据集划分")
print("-"*50)

# 进行分层划分：训练集、验证集和测试集 (70-15-15)
print("执行分层数据集划分...")
split_time = time.time()

# 先划分为训练集和临时集
X_train, X_temp, y_binary_train, y_binary_temp, y_multi_train, y_multi_temp = train_test_split(
    X, y_binary, y_multi, test_size=0.3, stratify=y_multi, random_state=42
)

# 再将临时集划分为验证集和测试集
X_val, X_test, y_binary_val, y_binary_test, y_multi_val, y_multi_test = train_test_split(
    X_temp, y_binary_temp, y_multi_temp, test_size=0.5, stratify=y_multi_temp, random_state=42
)

split_time = time.time() - split_time
print(f"数据集划分完成，耗时: {split_time:.2f}秒")

# 显示不同集合的大小和分布
print(f"\n训练集: {X_train.shape[0]} 样本")
print(f"验证集: {X_val.shape[0]} 样本")
print(f"测试集: {X_test.shape[0]} 样本")

print("\n二分类标签分布:")
print(f"训练集: {Counter(y_binary_train)}")
print(f"验证集: {Counter(y_binary_val)}")
print(f"测试集: {Counter(y_binary_test)}")

print("\n多分类标签分布:")
print(f"训练集: {Counter(y_multi_train)}")
print(f"验证集: {Counter(y_multi_val)}")
print(f"测试集: {Counter(y_multi_test)}")


5. 分层数据集划分
--------------------------------------------------
执行分层数据集划分...
数据集划分完成，耗时: 4.19秒

训练集: 1981520 样本
验证集: 424611 样本
测试集: 424612 样本

二分类标签分布:
训练集: Counter({0: 1591168, 1: 390352})
验证集: Counter({0: 340964, 1: 83647})
测试集: Counter({0: 340965, 1: 83647})

多分类标签分布:
训练集: Counter({0: 1591168, 1: 266481, 7: 111251, 3: 5557, 4: 4128, 6: 1376, 5: 1055, 2: 504})
验证集: Counter({0: 340964, 1: 57103, 7: 23839, 3: 1191, 4: 885, 6: 295, 5: 226, 2: 108})
测试集: Counter({0: 340965, 1: 57104, 7: 23840, 3: 1190, 4: 884, 6: 295, 5: 226, 2: 108})


In [7]:
# 6. 特征缩放
# --------------------------------------------------
print("\n6. 特征缩放")
print("-"*50)

# 使用RobustScaler进行特征缩放以处理异常值
print("使用RobustScaler进行特征缩放...")
scaling_time = time.time()

# 创建并拟合缩放器
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# 将缩放后的数据转换回DataFrame以保留列名和索引
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_val_scaled = pd.DataFrame(X_val_scaled, columns=X_val.columns, index=X_val.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

scaling_time = time.time() - scaling_time
print(f"特征缩放完成，耗时: {scaling_time:.2f}秒")

# 保存缩放器以便后续使用
joblib.dump(scaler, os.path.join(OUTPUT_DIR, 'robust_scaler.joblib'))


6. 特征缩放
--------------------------------------------------
使用RobustScaler进行特征缩放...
特征缩放完成，耗时: 9.19秒


['/root/autodl-tmp/projects/DL/dataset/feature_engineering/robust_scaler.joblib']

In [8]:
# 7. 类别不平衡处理
# --------------------------------------------------
print("\n7. 类别不平衡处理")
print("-"*50)

# 仅对训练集进行类别平衡处理

# 7.1 二分类任务的平衡处理
print("\n7.1 处理二分类任务的类别不平衡...")
binary_balance_time = time.time()

# 使用SMOTEENN组合过采样和清洗
print("应用SMOTEENN平衡二分类数据...")
smote_enn = SMOTEENN(random_state=42)
X_train_binary_balanced, y_binary_train_balanced = smote_enn.fit_resample(X_train_scaled, y_binary_train)

binary_balance_time = time.time() - binary_balance_time
print(f"二分类平衡处理完成，耗时: {binary_balance_time:.2f}秒")
print(f"平衡前: {Counter(y_binary_train)}")
print(f"平衡后: {Counter(y_binary_train_balanced)}")

# 7.2 多分类任务的平衡处理
print("\n7.2 处理多分类任务的类别不平衡...")
multi_balance_time = time.time()

# 对于多分类，使用分层策略 - 保持大类样本数量不变，对小类进行上采样
# 确定最少需要多少样本
min_samples_per_class = 5000  # 每个类别的最小样本数

# 获取各类别当前的样本数
class_counts = Counter(y_multi_train)
print("多分类任务的原始类别分布:")
print(class_counts)

# 进行分层采样
X_multi_resampled = pd.DataFrame()
y_multi_resampled = pd.Series()

for class_label, count in class_counts.items():
    # 获取当前类别的样本
    class_indices = y_multi_train[y_multi_train == class_label].index
    X_class = X_train_scaled.loc[class_indices]
    y_class = y_multi_train.loc[class_indices]
    
    # 如果样本量少于阈值，进行过采样
    if count < min_samples_per_class:
        # 计算需要合成的样本数量
        n_samples = min_samples_per_class
        print(f"对类别 {class_label} 进行过采样: {count} -> {n_samples}")
        
        # 使用带替换的随机采样进行过采样
        X_resampled, y_resampled = resample(
            X_class, y_class, 
            replace=True,
            n_samples=n_samples,
            random_state=42
        )
    else:
        # 对于大类别，保持原样
        X_resampled, y_resampled = X_class, y_class
    
    # 合并到结果中
    X_multi_resampled = pd.concat([X_multi_resampled, X_resampled])
    y_multi_resampled = pd.concat([y_multi_resampled, y_resampled])

multi_balance_time = time.time() - multi_balance_time
print(f"多分类平衡处理完成，耗时: {multi_balance_time:.2f}秒")
print(f"平衡后的类别分布: {Counter(y_multi_resampled)}")


7. 类别不平衡处理
--------------------------------------------------

7.1 处理二分类任务的类别不平衡...
应用SMOTEENN平衡二分类数据...
二分类平衡处理完成，耗时: 9426.92秒
平衡前: Counter({0: 1591168, 1: 390352})
平衡后: Counter({1: 1579962, 0: 1576401})

7.2 处理多分类任务的类别不平衡...
多分类任务的原始类别分布:
Counter({0: 1591168, 1: 266481, 7: 111251, 3: 5557, 4: 4128, 6: 1376, 5: 1055, 2: 504})
对类别 5 进行过采样: 1055 -> 5000
对类别 2 进行过采样: 504 -> 5000
对类别 6 进行过采样: 1376 -> 5000
对类别 4 进行过采样: 4128 -> 5000
多分类平衡处理完成，耗时: 7.92秒
平衡后的类别分布: Counter({0: 1591168, 1: 266481, 7: 111251, 3: 5557, 5: 5000, 2: 5000, 6: 5000, 4: 5000})


In [9]:
# 8. 保存处理后的数据集
# --------------------------------------------------
print("\n8. 保存处理后的数据集")
print("-"*50)

# 8.1 保存二分类数据集
print("保存二分类数据集...")
binary_save_time = time.time()

# 训练集（平衡后）
binary_train_data = {
    'X_train': X_train_binary_balanced,
    'y_train': y_binary_train_balanced
}
joblib.dump(binary_train_data, os.path.join(OUTPUT_DIR, 'binary_train_balanced.joblib'))

# 验证集和测试集（保持原始分布）
binary_val_data = {
    'X_val': X_val_scaled,
    'y_val': y_binary_val
}
joblib.dump(binary_val_data, os.path.join(OUTPUT_DIR, 'binary_val.joblib'))

binary_test_data = {
    'X_test': X_test_scaled,
    'y_test': y_binary_test
}
joblib.dump(binary_test_data, os.path.join(OUTPUT_DIR, 'binary_test.joblib'))

binary_save_time = time.time() - binary_save_time
print(f"二分类数据集保存完成，耗时: {binary_save_time:.2f}秒")

# 8.2 保存多分类数据集
print("保存多分类数据集...")
multi_save_time = time.time()

# 训练集（平衡后）
multi_train_data = {
    'X_train': X_multi_resampled,
    'y_train': y_multi_resampled
}
joblib.dump(multi_train_data, os.path.join(OUTPUT_DIR, 'multi_train_balanced.joblib'))

# 验证集和测试集（保持原始分布）
multi_val_data = {
    'X_val': X_val_scaled, 
    'y_val': y_multi_val
}
joblib.dump(multi_val_data, os.path.join(OUTPUT_DIR, 'multi_val.joblib'))

multi_test_data = {
    'X_test': X_test_scaled,
    'y_test': y_multi_test
}
joblib.dump(multi_test_data, os.path.join(OUTPUT_DIR, 'multi_test.joblib'))

multi_save_time = time.time() - multi_save_time
print(f"多分类数据集保存完成，耗时: {multi_save_time:.2f}秒")

# 8.3 保存特征列表
feature_list = X_train.columns.tolist()
joblib.dump(feature_list, os.path.join(OUTPUT_DIR, 'feature_list.joblib'))
print(f"特征列表已保存，共 {len(feature_list)} 个特征")


8. 保存处理后的数据集
--------------------------------------------------
保存二分类数据集...
二分类数据集保存完成，耗时: 1.78秒
保存多分类数据集...
多分类数据集保存完成，耗时: 1.25秒
特征列表已保存，共 70 个特征


In [10]:
# 9. 总结
# --------------------------------------------------
print("\n9. 数据处理总结")
print("-"*50)

total_time = time.time() - start_time
print(f"总处理时间: {total_time:.2f}秒")

print("\n数据集统计:")
print(f"原始数据集: {df.shape}")
print(f"特征数量: {X.shape[1]}")
print(f"二分类类别: {len(np.unique(y_binary))}")
print(f"多分类类别: {len(np.unique(y_multi))}")

print("\n已完成以下处理:")
print("- 数据加载与探索")
print("- 创建二分类和多分类标签")
print("- 特征工程与选择")
print("- 分层数据集划分")
print("- 特征缩放")
print("- 类别不平衡处理")
print("- 保存处理后的数据集")

print("\n处理后的文件已保存至:")
print(OUTPUT_DIR)


9. 数据处理总结
--------------------------------------------------
总处理时间: 9510.28秒

数据集统计:
原始数据集: (2830743, 84)
特征数量: 70
二分类类别: 2
多分类类别: 8

已完成以下处理:
- 数据加载与探索
- 创建二分类和多分类标签
- 特征工程与选择
- 分层数据集划分
- 特征缩放
- 类别不平衡处理
- 保存处理后的数据集

处理后的文件已保存至:
/root/autodl-tmp/projects/DL/dataset/feature_engineering

特征工程和数据准备完成！现在可以继续进行深度学习模型构建。
