### 程序功能：对前三步处理后的数据集KDDCUP99，使用支持向量机SVM实现分类并输出评价指标

In [1]:
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import svm
from time import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

### 输出python和package的版本

In [2]:
import platform 
print("platform  version: {}".format(platform .__version__))
print("python version: {}".format(platform.python_version()))
import csv
print("csv version: {}".format(csv.__version__))
import numpy as np
print("numpy version: {}".format(np.__version__))
import pandas as pd
print("pandas version: {}".format(pd.__version__))
import sklearn
print("scikit-learn version: {}".format(sklearn.__version__))
import matplotlib
print("matplotlib version: {}".format(matplotlib.__version__))
import IPython
print("IPython version: {}".format(IPython.__version__))

platform  version: 1.0.8
python version: 3.6.12
csv version: 1.0
numpy version: 1.19.2
pandas version: 1.1.5
scikit-learn version: 0.23.2
matplotlib version: 3.3.2
IPython version: 7.16.1


### 读取、处理数据集

In [3]:
fr= open("kddcup.data.numerization_corrected_normalizing_StandardScaler.txt")  # 打开数值化、修正后的数据集
data = fr.readlines()                     # 读取所有行(直到结束符EOF)并返回列表
line_nums = len(data)
# data_feature = np.zeros((line_nums, 41))   # 创建line_nums行 41列的矩阵
data_feature = np.zeros((line_nums, 10))   # 创建line_nums行 10列的矩阵
data_labels = []
for i in range(line_nums):                 # 依次读取每行
    line = data[i].strip().split(',')      # 去掉每行头尾空白，分隔符对字符串进行切片
#     data_feature[i, :] = line[0:41]        # 选择前41个特征  划分数据集特征和标签
    feature = [3,4,5,6,8,10,13,23,24,37]   # 选择第3,4,5,6,8,10,13,23,24,37这10个特征分类
    for j in feature:
        data_feature[i,feature.index(j)] = line[j]
    data_labels.append(line[-1])           # 标签
fr.close()                                 # 关闭文件

data_feature = StandardScaler().fit_transform(data_feature)   # 标准化，利用Sklearn库的StandardScaler实现数据标准化
data_labels = StandardScaler().fit_transform(np.array(data_labels).reshape(-1, 1) )
# data_feature = MinMaxScaler().fit_transform(data_feature)   # 归一化：利用Sklearn库的MinMaxScaler实现数据归一化，返回[0,1]区间的数据
# data_labels = MinMaxScaler().fit_transform(np.array(data_labels).reshape(-1, 1) )

print('数据集特征大小：',data_feature.shape)
print('数据集标签大小：',len(data_labels))

数据集特征大小： (4898430, 10)
数据集标签大小： 4898430


### 划分训练集和测试集

In [4]:
data_label = []
for i in data_labels: 
    data_label.append(int(float(i)))
data_label =  np.array(data_label, dtype = int)       # list转换数组
train_feature, test_feature, train_label, test_label = train_test_split(data_feature, data_label,test_size=0.4,random_state=4)# 测试集40%
print('训练集特征大小：{}，训练集标签大小：{}'.format(train_feature.shape, train_label.shape))
print('测试集特征大小：{}，测试集标签大小：{}'.format(test_feature.shape, test_label.shape))

训练集特征大小：(2939058, 10)，训练集标签大小：(2939058,)
测试集特征大小：(1959372, 10)，测试集标签大小：(1959372,)


### 模型训练、预测

#### 方法一：决策树DT

In [5]:
begin_time = time()                     # 训练预测开始时间
if __name__ == '__main__':
    print('Start training DT：',end='')
    dt = sklearn.tree.DecisionTreeClassifier(criterion='gini',splitter='best', max_depth=20, min_samples_split=2, min_samples_leaf =1)
    dt.fit(train_feature, train_label)
    print(dt)
    print('Training done！')

    print('Start prediction DT：')
    test_predict = dt.predict(test_feature)
    print('Prediction done！')

    print('预测结果：',test_predict)
    print('实际结果：',test_label)
    print('正确预测的数量：',sum(test_predict==test_label)) 
end_time = time()                        # 训练预测结束时间
total_time = end_time - begin_time
print('训练预测耗时：',total_time,'s')

Start training DT：DecisionTreeClassifier(max_depth=20)
Training done！
Start prediction DT：
Prediction done！
预测结果： [0 0 0 ... 0 0 0]
实际结果： [0 0 0 ... 0 0 0]
正确预测的数量： 1948126
训练预测耗时： 23.319088220596313 s


#### 方法二：支持向量机SVM

In [6]:
# # begin_time = time()                         # 训练预测开始时间
# if __name__ == '__main__':
#     print('Start training SVM：',end='')
#     svm = sklearn.svm.SVC(kernel='rbf', C=1.0, gamma=1.5, tol=1e-2)  # 选择高斯核函数rbf，正则化系数为1，核函数系数0.5，SMO迭代精度1e-2
#     svm.fit(train_feature, train_label)      # 开始训练SVM
#     print(svm)
#     print('Training done！')
    
#     print('Start prediction SVM：')
#     test_predict = svm.predict(test_feature) # 对测试集进行类别预测
#     print('Prediction done！')
    
#     print('预测结果：',test_predict)
#     print('实际结果：',test_label)
#     print('正确预测的数量：',sum(test_predict==test_label)) 
# # end_time = time                             # 训练预测结束时间
# # total_time = end_time - begin_time
# # print('训练预测耗时：',total_time,'s')

### 输出分类报告

In [7]:
print('准确率:', metrics.accuracy_score(test_label, test_predict))                         # 预测准确率输出
print('宏平均精确率:',metrics.precision_score(test_label,test_predict,average='macro'))    # 预测宏平均精确率输出
print('微平均精确率:', metrics.precision_score(test_label, test_predict, average='micro')) # 预测微平均精确率输出
print('宏平均召回率:',metrics.recall_score(test_label,test_predict,average='macro'))       # 预测宏平均召回率输出
print('平均F1-score:',metrics.f1_score(test_label,test_predict,average='weighted'))        # 预测平均f1-score输出
print('混淆矩阵输出:')
print(metrics.confusion_matrix(test_label,test_predict))                                   # 混淆矩阵输出
# 从精确率:precision、召回率:recall、 调和平均f1值:f1-score和支持度:support四个维度进行衡量
print('分类报告:')
print(metrics.classification_report(test_label, test_predict))                             # 分类报告输出

准确率: 0.994260405885151
宏平均精确率: 0.8369072347251401
微平均精确率: 0.994260405885151
宏平均召回率: 0.7903197567844534
平均F1-score: 0.9942064323051741
混淆矩阵输出:
[[ 383254    5307       3      37       5       2      12      11       6
        3]
 [   3995 1548599       0       1       0       0       0       0       0
        0]
 [      5       0     530       0       0       0       0       0       0
        0]
 [   1199      19       1    7844       0       0       8       0       1
        0]
 [      2       1       0       0       5       0       0       0       0
        0]
 [      8       0       0       0       0     870       0       0       0
        0]
 [    137       0       0      11       0       0    6148       0       0
        0]
 [      9       1       0     436       0       0       1     487       1
        0]
 [     20       0       0       0       0       0       0       0     389
        0]
 [      4       0       0       0       0       0       0       0       0
        0]]
分类报告:
 