In [1]:
import torch.nn as nn
import numpy as np
import torch as th
import torch.nn.functional as F
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from models.GraphSAGE import SAGE
from models.MLP import MLPPredictor
from utils.tools import generate_ton_iot_graph, load_ton_iot_train_test, compute_accuracy, save_model
from sklearn.metrics import accuracy_score
import time

# Hyper-parameters read by the model-construction and training cells.
params = dict(
    ndim_out=128,
    dropout=0.2,
    epochs=500,  # 2000
)

# Use the GPU when one is available, otherwise fall back to the CPU.
if th.cuda.is_available():
    device = th.device('cuda')
else:
    device = th.device('cpu')
print('Using device:', device)

# Task switch: True selects the binary task, False the multi-class one.
# NOTE: this name shadows the builtin `bin` for the rest of the notebook.
bin = False

class Model(nn.Module):
    """A SAGE encoder followed by an MLPPredictor head.

    The predictor is built with 2 as its last argument when ``bin`` is truthy
    and with 10 otherwise (presumably the number of output classes -- confirm
    against ``models.MLP``).
    """

    def __init__(self, ndim_in, ndim_out, edim, activation, dropout, bin=False):
        super().__init__()
        n_outputs = 2 if bin else 10
        self.gnn = SAGE(ndim_in, ndim_out, edim, activation, dropout)
        self.pred = MLPPredictor(ndim_out, edim, n_outputs)

    def forward(self, g, nfeats, efeats):
        """Encode ``g`` with the GNN and hand the result to the predictor."""
        encoded = self.gnn(g, nfeats, efeats)
        return self.pred(g, encoded)

# Load the train / test splits
print('Loading data...')
train_csv = 'datasets/TON-IoT/ton_iot_train.csv'
test_csv = 'datasets/TON-IoT/ton_iot_test.csv'
X_train, X_test, y_train, y_test = load_ton_iot_train_test(train_csv, test_csv, bin)
# Tag every row with its split so the origin survives the later concatenation.
X_train['is_train'], X_test['is_train'] = 1, 0

Using device: cuda
Loading data...


In [2]:
import pandas as pd
# Merge the train and test splits into one frame ordered by ascending ID
X = pd.concat([X_train, X_test]).sort_values(by='ID', ascending=True)

In [3]:
# Build the graphs: one from all rows, one per split, each moved to `device`
print('Generating graph...')
G, G_train, G_test = (
    generate_ton_iot_graph(frame, False).to(device)
    for frame in (X, X_train, X_test)
)

Generating graph...


In [4]:
import dgl
# Persist the three graphs to one file, in the order full / train / test.
graph_bundle = [G, G_train, G_test]
dgl.save_graphs('datasets/TON-IoT/ton_iot_graphs.dgl', graph_bundle)

In [5]:
import dgl
# Reload the stored graphs; entries 0 / 1 / 2 become G / G_train / G_test.
graphs, _ = dgl.load_graphs('datasets/TON-IoT/ton_iot_graphs.dgl')
G, G_train, G_test = (graphs[idx].to(device) for idx in range(3))

In [6]:
# Name of the edge-data field holding the ground truth:
# 'label' for the binary task, 'type' for the multi-class task.
label_key = 'label' if bin else 'type'

# Load the model
print('Loading model...')
model = Model(G.ndata['h'].shape[1], params['ndim_out'], G.edata['h'].shape[1], F.relu, params['dropout'], bin)
# Deserialize onto the CPU first: without map_location a checkpoint that holds
# CUDA tensors cannot be loaded on a host where `device` fell back to 'cpu'.
# The model is moved to the selected device right after.
state_dict = th.load('pts/TON-IoT-p-ndim_out_128dropout_0.2epochs_500_multi(0.8653).pt', map_location='cpu')
model.load_state_dict(state_dict)
model = model.to(device)

Loading model...


In [4]:
# # Train the model (disabled: a saved checkpoint is loaded instead; the log below is from an earlier run)
# class_weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(G_train.edata[label_key].cpu().numpy()), y=G_train.edata[label_key].cpu().numpy())
# class_weights = th.FloatTensor(class_weights).to(device)
# criterion = nn.CrossEntropyLoss(weight = class_weights)
# optim = th.optim.Adam(model.parameters())

# print('Training...')
# model.train()
# for epoch in range(1, params['epochs']+1):
#     pred = model(G_train, G_train.ndata['h'], G_train.edata['h'])
#     loss = criterion(pred, G_train.edata[label_key])
#     optim.zero_grad()
#     loss.backward()
#     optim.step()
#     if epoch % 10 == 0:
#         print('Epoch:', epoch ,' Training acc:', compute_accuracy(pred, G_train.edata[label_key]))

Training...
Epoch: 10  Training acc: 0.3720617890357971
Epoch: 20  Training acc: 0.47440585494041443
Epoch: 30  Training acc: 0.5361633896827698
Epoch: 40  Training acc: 0.629041314125061
Epoch: 50  Training acc: 0.6795773506164551
Epoch: 60  Training acc: 0.6571980118751526
Epoch: 70  Training acc: 0.6107644438743591
Epoch: 80  Training acc: 0.7317246794700623
Epoch: 90  Training acc: 0.7654804587364197
Epoch: 100  Training acc: 0.778218686580658
Epoch: 110  Training acc: 0.7334102988243103
Epoch: 120  Training acc: 0.7284014821052551
Epoch: 130  Training acc: 0.6860564947128296
Epoch: 140  Training acc: 0.6742912530899048
Epoch: 150  Training acc: 0.6839308738708496
Epoch: 160  Training acc: 0.8289607763290405
Epoch: 170  Training acc: 0.7594553232192993
Epoch: 180  Training acc: 0.7830679416656494
Epoch: 190  Training acc: 0.8499225378036499
Epoch: 200  Training acc: 0.8276128768920898
Epoch: 210  Training acc: 0.8496003150939941
Epoch: 220  Training acc: 0.8210145235061646
Epoch: 2

In [7]:
# Evaluate the loaded model on the full graph
import torch.nn.functional as F
print('Testing...')
model.eval()
graph = G
# Inference only: skip autograd bookkeeping so no graph of activations is kept.
with th.no_grad():
    logits = model(graph, graph.ndata['h'], graph.edata['h'])
    # Row-wise softmax; `pred_origin` is reused by the cell that stores results.
    pred_origin = F.softmax(logits, dim=1)

# Index of the largest score per row, as a NumPy array on the host.
pred = pred_origin.argmax(dim=1).cpu().numpy()
# Convert the stored labels directly; wrapping an existing tensor in
# th.tensor(...) copies it and emits a UserWarning.
label = graph.edata[label_key].detach().cpu().numpy()

print(classification_report(label, pred, digits=4))

Testing...


  label = th.tensor(graph.edata[label_key]).cpu().detach().numpy()


              precision    recall  f1-score   support

           0     0.9911    0.9337    0.9615    600000
           1     0.4775    0.9927    0.6448     40000
           2     0.7655    0.9005    0.8276     40000
           3     0.8169    0.1956    0.3157     40000
           4     0.6391    0.5229    0.5752     40000
           5     0.6254    0.6491    0.6370     40000
           6     0.8862    0.8790    0.8826     40000
           7     0.7787    0.8114    0.7947     40000
           8     0.7703    0.8734    0.8186     40000
           9     0.1207    0.9329    0.2138      2086

    accuracy                         0.8623    922086
   macro avg     0.6871    0.7691    0.6671    922086
weighted avg     0.8950    0.8623    0.8646    922086



In [15]:
# Save the model (disabled; the output below is from an earlier run)
# print('Saving model...')
# acc=accuracy_score(th.tensor(G_test.edata[label_key]).cpu().detach().numpy(), model(G_test, G_test.ndata['h'], G_test.edata['h']).argmax(dim=1).cpu().detach().numpy())
# model = model.to('cpu')
# save_model(model, params, acc, 'TON-IoT', bin)

Saving model...


  acc=accuracy_score(th.tensor(G_test.edata[label_key]).cpu().detach().numpy(), model(G_test, G_test.ndata['h'], G_test.edata['h']).argmax(dim=1).cpu().detach().numpy())


Model saved as pts/TON-IoT-p-ndim_out_128dropout_0.2epochs_500_multi(0.8495).pt


In [8]:
# Collect the per-edge results in one table
import pandas as pd

# Build the frame in a single step (column order: ID, type, pred, pred_origin).
# The ID tensor is converted directly; wrapping an existing tensor in
# th.tensor(...) copies it and emits a UserWarning.
df = pd.DataFrame({
    'ID': G.edata['ID'].detach().cpu().numpy(),
    'type': label,
    'pred': pred,
    # One Python list of softmax scores per row.
    'pred_origin': pred_origin.cpu().detach().numpy().tolist(),
})

# Order rows by ID; the following cell splits this frame by row position.
df = df.sort_values(by='ID', ascending=True)

  df['ID'] = th.tensor(G.edata['ID']).cpu().detach().numpy()


In [9]:
# Split the ID-sorted rows by position: even positions vs. odd positions.
# NOTE(review): presumably every ID occurs exactly twice (one row per edge
# direction), so each half holds one row per ID -- confirm against the graph builder.
df_even = df.iloc[0::2]
df_odd = df.iloc[1::2]

In [10]:
# Display the rows taken from the odd positions of the ID-sorted result frame.
df_odd

Unnamed: 0,ID,type,pred,pred_origin
1,0,0,0,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,1,0,0,"[0.9381766319274902, 2.5866864916679333e-07, 3..."
22304,2,0,0,"[0.7484839558601379, 1.740892846546771e-10, 2...."
27677,3,0,0,"[0.7653656601905823, 1.6738192776255545e-10, 1..."
51996,4,0,0,"[0.9984911680221558, 8.606618195905469e-11, 2...."
...,...,...,...,...
922079,461038,8,8,"[0.007000509183853865, 5.765279573188842e-11, ..."
556195,461039,6,6,"[0.04750542715191841, 1.279949479737752e-08, 1..."
922082,461040,1,9,"[0.06039558723568916, 3.823271526925964e-06, 6..."
922083,461041,9,9,"[0.007658644113689661, 0.001143360510468483, 2..."


In [11]:
# Save the detailed predictions (ID, true type, predicted class, score vector)
detail_columns = ['ID', 'type', 'pred', 'pred_origin']
df_odd[detail_columns].to_csv('./datasets/TON-IoT/predictdetail_ton_iot.csv', index=False)

In [12]:
# Per-class precision / recall / F1 computed on the odd-position rows only.
print(classification_report(df_odd['type'], df_odd['pred'], digits=4))

              precision    recall  f1-score   support

           0     0.9911    0.9337    0.9615    300000
           1     0.4773    0.9927    0.6446     20000
           2     0.7656    0.9005    0.8276     20000
           3     0.8164    0.1945    0.3142     20000
           4     0.6391    0.5229    0.5752     20000
           5     0.6254    0.6491    0.6370     20000
           6     0.8860    0.8789    0.8824     20000
           7     0.7784    0.8115    0.7946     20000
           8     0.7705    0.8731    0.8186     20000
           9     0.1207    0.9329    0.2138      1043

    accuracy                         0.8623    461043
   macro avg     0.6870    0.7690    0.6670    461043
weighted avg     0.8950    0.8623    0.8645    461043



In [13]:
# Assemble every record into one large table
data = pd.read_csv('./datasets/TON-IoT/train_test_ton_iot.csv')
# Attack-type name -> integer class id.
type_map = {'normal': 0, 'backdoor': 1, 'ddos': 2, 'dos': 3, 'injection': 4, 'password': 5, 'ransomware': 6, 'scanning': 7, 'xss': 8, 'mitm': 9}

import socket
import struct
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import category_encoders as ce

# Prepare the attributes needed for graph construction:
# each endpoint becomes the string "<ip>:<port>".
# data['src_ip'] = data.src_ip.apply(lambda x: socket.inet_ntoa(struct.pack('>I', random.randint(0xac100001, 0xac1f0001))))
for side in ('src', 'dst'):
    data[f'{side}_ip'] = data[f'{side}_ip'].astype(str) + ':' + data[f'{side}_port'].astype(str)

# Fail fast on a type name missing from type_map (Series.map would silently
# produce NaN for it instead of raising).
unknown_types = set(data['type'].unique()) - set(type_map)
if unknown_types:
    raise KeyError(f'unmapped attack types: {sorted(map(str, unknown_types))}')
data['type'] = data['type'].map(type_map).astype('int64')

# Drop redundant attributes
data = data.drop(columns=['ts','src_port','dst_port','http_uri','weird_name','weird_addl','weird_notice','dns_query','ssl_subject','ssl_issuer','http_user_agent'])
data['label'] = data['label'].astype('int64')
# The former `label = data.type` alias was removed: nothing reads it afterwards,
# and it overwrote the `label` array that the result-table cell depends on.

# Add an ID attribute as the first column (0 .. len(data) - 1)
data.insert(0, 'ID', range(len(data)))

In [14]:
# Take the endpoint identifiers and the split flag from X. `.values` drops X's
# index, so rows are matched purely by position.
for column in ('src_ip', 'dst_ip', 'is_train'):
    data[column] = X[column].values

In [15]:
# Display the table after src_ip, dst_ip and is_train were taken over from X.
data

Unnamed: 0,ID,src_ip,dst_ip,proto,service,duration,src_bytes,dst_bytes,conn_state,missed_bytes,...,http_method,http_version,http_request_body_len,http_response_body_len,http_status_code,http_orig_mime_types,http_resp_mime_types,label,type,is_train
0,0,172.27.226.140:1883,192.168.1.152:52976,tcp,-,80549.530260,1762852,41933215,OTH,0,...,-,-,0,0,0,-,-,0,0,1
1,1,172.24.99.246:47260,192.168.1.255:15600,udp,-,0.000000,0,0,S0,0,...,-,-,0,0,0,-,-,0,0,1
2,2,172.18.57.119:1880,192.168.1.152:51782,tcp,-,0.000000,0,0,OTH,0,...,-,-,0,0,0,-,-,0,0,0
3,3,172.18.164.17:34296,192.168.1.152:10502,tcp,-,0.000000,0,0,OTH,0,...,-,-,0,0,0,-,-,0,0,0
4,4,172.29.91.92:46608,192.168.1.190:53,udp,dns,0.000549,0,298,SHR,0,...,-,-,0,0,0,-,-,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461038,461038,172.20.31.166:33108,176.28.50.165:80,tcp,-,0.000000,0,0,S0,0,...,-,-,0,0,0,-,-,1,8,0
461039,461039,172.20.166.189:37242,34.230.157.88:443,tcp,-,0.000000,0,0,OTH,0,...,-,-,0,0,0,-,-,1,6,1
461040,461040,172.29.243.207:4444,192.168.1.193:49178,tcp,-,290.371539,101568,2592,OTH,0,...,-,-,0,0,0,-,-,1,1,1
461041,461041,172.30.171.98:60816,104.98.29.100:443,tcp,-,23.190902,32,31,SF,0,...,-,-,0,0,0,-,-,1,9,1


In [16]:
# Column names from position 3 up to, but not including, the last three.
norm_cols = data.columns[3:-3].tolist()
# One Python list per row holding that row's values for those columns.
data['h'] = data[norm_cols].to_numpy().tolist()

In [18]:
# data = data[['ID', 'src_ip', 'dst_ip', 'label', 'type', 'is_train', 'h']]
# Keep only the columns that are exported. The explicit .copy() makes the
# narrowed table an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
data = data[['ID', 'src_ip', 'dst_ip', 'label', 'type', 'is_train']].copy()
data

Unnamed: 0,ID,src_ip,dst_ip,label,type,is_train
0,0,172.27.226.140:1883,192.168.1.152:52976,0,0,1
1,1,172.24.99.246:47260,192.168.1.255:15600,0,0,1
2,2,172.18.57.119:1880,192.168.1.152:51782,0,0,0
3,3,172.18.164.17:34296,192.168.1.152:10502,0,0,0
4,4,172.29.91.92:46608,192.168.1.190:53,0,0,0
...,...,...,...,...,...,...
461038,461038,172.20.31.166:33108,176.28.50.165:80,1,8,0
461039,461039,172.20.166.189:37242,34.230.157.88:443,1,6,1
461040,461040,172.29.243.207:4444,192.168.1.193:49178,1,1,1
461041,461041,172.30.171.98:60816,104.98.29.100:443,1,9,1


In [19]:
# Attach the predicted class at column position 5. Rows are matched purely by
# position, so first verify that both tables list the same IDs in the same order.
if not np.array_equal(df_odd['ID'].to_numpy(), data['ID'].to_numpy()):
    raise ValueError('df_odd and data are not aligned on ID; cannot attach predictions by position')
if 'pred' in data.columns:
    # Cell is being re-run: overwrite in place, since insert() raises on an
    # already existing column name.
    data['pred'] = df_odd['pred'].to_numpy()
else:
    data.insert(5, 'pred', df_odd['pred'].to_numpy())
data

Unnamed: 0,ID,src_ip,dst_ip,label,type,pred,is_train
0,0,172.27.226.140:1883,192.168.1.152:52976,0,0,0,1
1,1,172.24.99.246:47260,192.168.1.255:15600,0,0,0,1
2,2,172.18.57.119:1880,192.168.1.152:51782,0,0,0,0
3,3,172.18.164.17:34296,192.168.1.152:10502,0,0,0,0
4,4,172.29.91.92:46608,192.168.1.190:53,0,0,0,0
...,...,...,...,...,...,...,...
461038,461038,172.20.31.166:33108,176.28.50.165:80,1,8,8,0
461039,461039,172.20.166.189:37242,34.230.157.88:443,1,6,6,1
461040,461040,172.29.243.207:4444,192.168.1.193:49178,1,1,9,1
461041,461041,172.30.171.98:60816,104.98.29.100:443,1,9,9,1


In [18]:
# Disabled: attaching the raw score vectors to the table (output below is from an earlier run).
# data['pred_origin'] = df_odd['pred_origin'].values
# data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pred_origin'] = df_odd['pred_origin'].values


Unnamed: 0,ID,src_ip,dst_ip,label,type,pred,is_train,h,pred_origin
0,0,172.27.226.140:1883,192.168.1.152:52976,0,0,0,1,"[tcp, -, 80549.53026, 1762852, 41933215, OTH, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,172.24.99.246:47260,192.168.1.255:15600,0,0,0,1,"[udp, -, 0.0, 0, 0, S0, 0, 1, 63, 0, 0, 0, 0, ...","[0.9447329640388489, 2.729832193626436e-10, 1...."
2,2,172.18.57.119:1880,192.168.1.152:51782,0,0,0,0,"[tcp, -, 0.0, 0, 0, OTH, 0, 0, 0, 0, 0, 0, 0, ...","[0.7342113256454468, 1.394823790972985e-13, 9...."
3,3,172.18.164.17:34296,192.168.1.152:10502,0,0,0,0,"[tcp, -, 0.0, 0, 0, OTH, 0, 0, 0, 0, 0, 0, 0, ...","[0.751929521560669, 1.339806222780751e-13, 8.6..."
4,4,172.29.91.92:46608,192.168.1.190:53,0,0,0,0,"[udp, dns, 0.000549, 0, 298, SHR, 0, 0, 0, 2, ...","[0.9995991587638855, 1.9404616802276564e-14, 6..."
...,...,...,...,...,...,...,...,...,...
461038,461038,172.20.31.166:33108,176.28.50.165:80,1,8,8,0,"[tcp, -, 0.0, 0, 0, S0, 0, 1, 60, 0, 0, 0, 0, ...","[0.0032821244094520807, 1.3965890076350629e-11..."
461039,461039,172.20.166.189:37242,34.230.157.88:443,1,6,6,1,"[tcp, -, 0.0, 0, 0, OTH, 0, 0, 0, 1, 103, 0, 0...","[0.10491644591093063, 2.5225233360459676e-11, ..."
461040,461040,172.29.243.207:4444,192.168.1.193:49178,1,1,9,1,"[tcp, -, 290.371539, 101568, 2592, OTH, 0, 108...","[0.07980495691299438, 2.2610829830682633e-07, ..."
461041,461041,172.30.171.98:60816,104.98.29.100:443,1,9,7,1,"[tcp, -, 23.190902, 32, 31, SF, 0, 8, 411, 7, ...","[0.0034932619892060757, 5.0928379096149e-06, 6..."


In [20]:
# Write the assembled table to CSV without the pandas index column.
data.to_csv('datasets/TON-IoT/edgelist_ton_iot.csv', index=False)

In [21]:
# Path of the JSON file to write
json_file_path = '../frontend/src/data/edgelist_ton_iot.json'
# Serialize the table as a JSON array with one object per row
json_str = data.to_json(orient='records')

# Write the JSON string to the file
with open(json_file_path, mode='w') as out_file:
    out_file.write(json_str)

In [22]:
# Compute the confusion matrix of the true `type` against the predicted `pred`
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=data['type'], y_pred=data['pred'])

array([[280105,    108,   4053,    545,   2869,      5,   2110,   3859,
          3039,   3307],
       [     0,  19853,      0,      0,      0,      0,    132,      0,
             0,     15],
       [    12,      0,  18010,      1,    350,    558,      0,     28,
           694,    347],
       [    26,  14061,    324,   3890,     58,      1,      1,    290,
            34,   1315],
       [    16,      0,    534,      0,  10458,   6942,      1,     67,
          1404,    578],
       [     0,   4550,     11,      1,   1923,  12982,      6,    281,
             2,    244],
       [  2421,      0,      0,      0,      0,      0,  17577,      2,
             0,      0],
       [     4,   3022,    252,    328,      0,      0,      1,  16230,
            19,    144],
       [     0,      0,    337,      0,    702,    270,      0,     91,
         17462,   1138],
       [    42,      0,      3,      0,      3,      0,     10,      2,
            10,    973]], dtype=int64)

In [23]:
# Per-class metrics on the assembled table (true `type` vs. predicted `pred`).
print(classification_report(data['type'], data['pred'], digits=4))

              precision    recall  f1-score   support

           0     0.9911    0.9337    0.9615    300000
           1     0.4773    0.9927    0.6446     20000
           2     0.7656    0.9005    0.8276     20000
           3     0.8164    0.1945    0.3142     20000
           4     0.6391    0.5229    0.5752     20000
           5     0.6254    0.6491    0.6370     20000
           6     0.8860    0.8789    0.8824     20000
           7     0.7784    0.8115    0.7946     20000
           8     0.7705    0.8731    0.8186     20000
           9     0.1207    0.9329    0.2138      1043

    accuracy                         0.8623    461043
   macro avg     0.6870    0.7690    0.6670    461043
weighted avg     0.8950    0.8623    0.8645    461043



In [24]:
# Attack-type name -> integer class id, numbered in listing order.
_type_names = ('normal', 'backdoor', 'ddos', 'dos', 'injection', 'password', 'ransomware', 'scanning', 'xss', 'mitm')
type_map = {name: idx for idx, name in enumerate(_type_names)}

In [25]:
# Payload with the class names and the confusion matrix.
# The names are ordered by class id and the ids are passed as `labels`, so
# row/column i of `cm` always corresponds to types[i] -- even when a class is
# absent from both `type` and `pred`, which would otherwise shrink the matrix.
ordered_types = sorted(type_map, key=type_map.get)
cm_json = {
    'types': ordered_types,
    'cm': confusion_matrix(
        data['type'],
        data['pred'],
        labels=[type_map[name] for name in ordered_types],
    ).tolist(),
}

In [26]:
import json
# Serialize the confusion-matrix payload, then write it to the JSON file
cm_payload = json.dumps(cm_json)
with open('../frontend/src/data/cm_ton_iot.json', 'w') as json_file:
    json_file.write(cm_payload)