### 程序功能：对前三步处理后的数据集KDDCUP99，使用支持向量机SVM实现分类并输出评价指标

In [1]:
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import svm
from time import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

### 输出python和package的版本

In [3]:
import platform 
print("platform  version: {}".format(platform .__version__))
print("python version: {}".format(platform.python_version()))
import csv
print("csv version: {}".format(csv.__version__))
import numpy as np
print("numpy version: {}".format(np.__version__))
import pandas as pd
print("pandas version: {}".format(pd.__version__))
import sklearn
print("scikit-learn version: {}".format(sklearn.__version__))
import matplotlib
print("matplotlib version: {}".format(matplotlib.__version__))
import IPython
print("IPython version: {}".format(IPython.__version__))

platform  version: 1.0.8
python version: 3.6.12
csv version: 1.0
numpy version: 1.19.2
pandas version: 1.1.5
scikit-learn version: 0.23.2
matplotlib version: 3.3.2
IPython version: 7.16.1


### 读取、处理数据集

In [8]:
fr = pd.read_csv("kddcup.data.txt", encoding='utf-8',error_bad_lines=False)
data = np.array(fr)
print('数据集大小：',data.shape)

data[:,-1] = LabelEncoder().fit_transform(data[:,-1])        # 标签的编码
data[:,0:-1] = OrdinalEncoder().fit_transform(data[:,0:-1])  # 特征的分类编码
data = StandardScaler().fit_transform(data)                  # 标准化：利用Sklearn库的StandardScaler对数据标准化
# data = MinMaxScaler().fit_transform(data)                  # 归一化：利用Sklearn库的MinMaxScaler对数据归一化，返回[0,1]区间的数据

# 划分特征和标签
line_nums = len(data)
# data_feature = np.zeros((line_nums, 41))   # 创建line_nums行 41列的矩阵
data_feature = np.zeros((line_nums, 10))   # 创建line_nums行 10列的矩阵
data_labels = []
for i in range(line_nums):                 # 依次读取每行
#     data_feature[i,:] = data[i][0:41]     # 选择前41个特征  划分数据集特征和标签
    feature = [3,4,5,6,8,10,13,23,24,37]   # 选择第3,4,5,6,8,10,13,23,24,37这10个特征分类
    for j in feature:
        data_feature[i,feature.index(j)] = data[i][j]
        
    data_labels.append(data[i][-1])       # 标签
    
print('数据集特征大小：',data_feature.shape)
print('数据集标签大小：',len(data_labels))

b'Skipping line 4817100: expected 42 fields, saw 56\n'


数据集大小： (4898429, 42)
数据集特征大小： (4898429, 10)
数据集标签大小： 4898429


### 划分训练集和测试集

In [9]:
data_label = []
for i in data_labels: 
    data_label.append(int(float(i)))
data_label =  np.array(data_label, dtype = int)       # list转换数组
train_feature, test_feature, train_label, test_label = train_test_split(data_feature, data_label,test_size=0.4,random_state=4)# 测试集40%
print('训练集特征大小：{}，训练集标签大小：{}'.format(train_feature.shape, train_label.shape))
print('测试集特征大小：{}，测试集标签大小：{}'.format(test_feature.shape, test_label.shape))

训练集特征大小：(2939057, 10)，训练集标签大小：(2939057,)
测试集特征大小：(1959372, 10)，测试集标签大小：(1959372,)


### 模型训练、预测

#### 方法一：决策树DT

In [10]:
begin_time = time()                     # 训练预测开始时间
if __name__ == '__main__':
    print('Start training DT：',end='')
    dt = sklearn.tree.DecisionTreeClassifier(criterion='gini',splitter='best', max_depth=20, min_samples_split=2, min_samples_leaf =1)
    dt.fit(train_feature, train_label)
    print(dt)
    print('Training done！')

    print('Start prediction DT：')
    test_predict = dt.predict(test_feature)
    print('Prediction done！')

    print('预测结果：',test_predict)
    print('实际结果：',test_label)
    print('正确预测的数量：',sum(test_predict==test_label)) 
end_time = time()                        # 训练预测结束时间
total_time = end_time - begin_time
print('训练预测耗时：',total_time,'s')

Start training DT：DecisionTreeClassifier(max_depth=20)
Training done！
Start prediction DT：
Prediction done！
预测结果： [0 0 0 ... 0 0 0]
实际结果： [0 0 0 ... 0 0 0]
正确预测的数量： 1949172
训练预测耗时： 24.315146446228027 s


#### 方法二：支持向量机SVM

In [None]:
# # begin_time = time()                         # 训练预测开始时间
# if __name__ == '__main__':
#     print('Start training SVM：',end='')
#     svm = sklearn.svm.SVC(kernel='rbf', C=1.0, gamma=1.5, tol=1e-2)  # 选择高斯核函数rbf，正则化系数为1，核函数系数0.5，SMO迭代精度1e-2
#     svm.fit(train_feature, train_label)      # 开始训练SVM
#     print(svm)
#     print('Training done！')
    
#     print('Start prediction SVM：')
#     test_predict = svm.predict(test_feature) # 对测试集进行类别预测
#     print('Prediction done！')
    
#     print('预测结果：',test_predict)
#     print('实际结果：',test_label)
#     print('正确预测的数量：',sum(test_predict==test_label)) 
# # end_time = time                             # 训练预测结束时间
# # total_time = end_time - begin_time
# # print('训练预测耗时：',total_time,'s')

### 输出分类报告

In [11]:
print('准确率:', metrics.accuracy_score(test_label, test_predict))                         # 预测准确率输出
print('宏平均精确率:',metrics.precision_score(test_label,test_predict,average='macro'))    # 预测宏平均精确率输出
print('微平均精确率:', metrics.precision_score(test_label, test_predict, average='micro')) # 预测微平均精确率输出
print('宏平均召回率:',metrics.recall_score(test_label,test_predict,average='macro'))       # 预测宏平均召回率输出
print('平均F1-score:',metrics.f1_score(test_label,test_predict,average='weighted'))        # 预测平均f1-score输出
print('混淆矩阵输出:')
print(metrics.confusion_matrix(test_label,test_predict))                                   # 混淆矩阵输出
# 从精确率:precision、召回率:recall、 调和平均f1值:f1-score和支持度:support四个维度进行衡量
print('分类报告:')
print(metrics.classification_report(test_label, test_predict))                             # 分类报告输出

准确率: 0.99479425040268
宏平均精确率: 0.9757850958567713
微平均精确率: 0.99479425040268
宏平均召回率: 0.9802354835751075
平均F1-score: 0.9947985471535458
混淆矩阵输出:
[[    883       0       0       8       0]
 [      0    4705       3     315       0]
 [      0     434  425276    4120       0]
 [      4      33    5265 1517515       8]
 [      1       0       1       8     793]]
分类报告:
              precision    recall  f1-score   support

          -3       0.99      0.99      0.99       891
          -2       0.91      0.94      0.92      5023
          -1       0.99      0.99      0.99    429830
           0       1.00      1.00      1.00   1522825
           1       0.99      0.99      0.99       803

    accuracy                           0.99   1959372
   macro avg       0.98      0.98      0.98   1959372
weighted avg       0.99      0.99      0.99   1959372

