In [2]:
# 读入数据集
DATA = './web_trace.csv'

'''
Raw packets
'''
packets = []
with open(DATA, 'r') as f:
    for line in f:
        key, p_bits, global_time, label = line.strip().split(',')
        key = int(key)
        global_time = int(global_time)
        label = int(label)
        p_bits = int(p_bits)
        packets.append((key, p_bits, global_time, label))

In [3]:
# 分为不同滑动窗口
features = [] # 总和, 均值, 最大值, 方差
windowLabel = [] # 标签
for i in range(0, len(packets)):
    _sqsum = 0
    _sum = 0
    flag = 0
    _max = 0
    for p in range(i, i+8):
        if p >= len(packets) : break
        key, p_bits, global_time, label = packets[p]
        if label == 1: flag = 1
        _sum += p_bits
        _max = max(_max, p_bits)
        _sqsum += p_bits ** 2

    windowLabel.append(flag)
    _mean = _sum >> 3
    _sqsum = _sqsum >> 3
    _var =  _sqsum - _mean ** 2
    features.append([_sum, _mean, _max, _var])



In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# 将其转为 NumPy 数组（便于训练）
X = np.array(features)
y = np.array(windowLabel)

# 划分训练集和测试集（例如 80% 训练，20% 测试）
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 创建决策树分类器（可指定 criterion 为 "gini" 或 "entropy"）
clf = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)

# 拟合训练数据
clf.fit(X_train, y_train)

# 在测试集上预测
y_pred = clf.predict(X_test)

# 输出评估结果
print("准确率: {:.2f}%".format(accuracy_score(y_test, y_pred) * 100))
print("\n分类报告:")
print(classification_report(y_test, y_pred))


准确率: 99.89%

分类报告:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    529921
           1       1.00      0.96      0.98     15228

    accuracy                           1.00    545149
   macro avg       1.00      0.98      0.99    545149
weighted avg       1.00      1.00      1.00    545149



In [5]:
from sklearn.tree import export_text
print(export_text(clf, feature_names=["sum", "mean", "max", "var"]))

|--- sum <= 8950.00
|   |--- sum <= 8550.00
|   |   |--- mean <= 956.00
|   |   |   |--- class: 0
|   |   |--- mean >  956.00
|   |   |   |--- class: 0
|   |--- sum >  8550.00
|   |   |--- var <= 254540.50
|   |   |   |--- class: 1
|   |   |--- var >  254540.50
|   |   |   |--- class: 0
|--- sum >  8950.00
|   |--- var <= 148978.00
|   |   |--- var <= 89943.50
|   |   |   |--- class: 1
|   |   |--- var >  89943.50
|   |   |   |--- class: 1
|   |--- var >  148978.00
|   |   |--- mean <= 1156.00
|   |   |   |--- class: 1
|   |   |--- mean >  1156.00
|   |   |   |--- class: 0

