In [1]:
# 读入数据集
DATA = '../web_trace.csv'

'''
Raw packets
'''
packets = []
with open(DATA, 'r') as f:
    for line in f:
        key, p_bits, global_time, label = line.strip().split(',')
        key = int(key)
        global_time = int(global_time)
        label = int(label)
        p_bits = int(p_bits)
        packets.append((key, p_bits, global_time, label))

In [2]:
# 分为不同滑动窗口
features = [] # 总和, 均值, 最大值, 方差
windowLabel = [] # 标签
for i in range(0, len(packets)):
    _sqsum = 0
    _sum = 0
    flag = 0
    _max = 0
    for p in range(i, i+8):
        if p >= len(packets) : break
        key, p_bits, global_time, label = packets[p]
        if label == 1: flag = 1
        _sum += p_bits
        _max = max(_max, p_bits)
        _sqsum += p_bits ** 2

    windowLabel.append(flag)
    _mean = _sum >> 3
    _sqsum = _sqsum >> 3
    _var =  _sqsum - _mean ** 2
    features.append([_sum, _mean, _max, _var])



In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# 将其转为 NumPy 数组（便于训练）
X = np.array(features)
y = np.array(windowLabel)

# 划分训练集和测试集（例如 80% 训练，20% 测试）
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 创建决策树分类器（可指定 criterion 为 "gini" 或 "entropy"）
clf = DecisionTreeClassifier(criterion='gini', max_depth=10, random_state=42)

# 拟合训练数据
clf.fit(X_train, y_train)

# 在测试集上预测
y_pred = clf.predict(X_test)

# 输出评估结果
print("准确率: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\n分类报告:")
print(classification_report(y_test, y_pred))


准确率: 89.55%

分类报告:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94    100337
           1       0.94      0.58      0.72     30091

    accuracy                           0.90    130428
   macro avg       0.92      0.79      0.83    130428
weighted avg       0.90      0.90      0.89    130428



In [4]:
from sklearn.tree import export_text
print(export_text(clf, feature_names=["sum", "mean", "max", "var"]))

|--- max <= 723.50
|   |--- sum <= 450.50
|   |   |--- var <= 19.50
|   |   |   |--- sum <= 420.00
|   |   |   |   |--- class: 1
|   |   |   |--- sum >  420.00
|   |   |   |   |--- var <= 5.50
|   |   |   |   |   |--- max <= 56.50
|   |   |   |   |   |   |--- max <= 55.00
|   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |--- max >  55.00
|   |   |   |   |   |   |   |--- sum <= 436.00
|   |   |   |   |   |   |   |   |--- var <= 0.50
|   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |--- var >  0.50
|   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |--- sum >  436.00
|   |   |   |   |   |   |   |   |--- var <= 0.50
|   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |   |   |   |--- var >  0.50
|   |   |   |   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- max >  56.50
|   |   |   |   |   |   |--- mean <= 54.50
|   |   |   |   |   |   |   |--- var <= 2.00
|   |   |   |   |   |   |  