In [1]:
import torch
from transformers import  BertTokenizer, BertModel
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

import random




In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")   

In [3]:
# 加载Bert模型和tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [4]:
ISCX = pd.read_csv('./ISCX-URL2016.csv')
# 准备数据
# 假设urls和labels分别为URL列表和对应的标签列表
urls = ISCX['URL'].values
labels = [0 if i=='no'  else 1 for i in ISCX['Target'].values]  # 0代表正常，1代表恶意

In [5]:
def extract_features(url):
    input_ids = tokenizer.encode(url, add_special_tokens=True, max_length=512, truncation=True, padding='max_length', return_tensors='pt')
    with torch.no_grad():
        outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state
    features = torch.mean(last_hidden_states, dim=1).squeeze().numpy()
    return features

In [6]:
# 获取URL的特征
urls = [extract_features(i) for i in urls[:20]]
labels = labels[:20]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [8]:

num_nodes = len(urls)

In [9]:
# 生成trainmask和valmask
train_mask = np.random.choice([True, False], num_nodes, p=[0.8, 0.2])
val_mask = ~train_mask

In [10]:

edges = []

for i in range(num_nodes):
    for j in range(i+1, num_nodes):
        if random.random() < 0.05:  # randomly generate edges with probability 0.1
            edges.append([i, j])
            edges.append([j, i])

edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()

data = Data(x=torch.tensor(np.array(urls),dtype=torch.float32),edge_index=edge_index,y=torch.tensor(labels,dtype=torch.long))

data.train_mask = torch.tensor(train_mask, dtype=torch.bool)
data.test_mask = torch.tensor(val_mask, dtype=torch.bool)
print(data)

Data(x=[20, 768], edge_index=[2, 20], y=[20], train_mask=[20], test_mask=[20])


In [11]:

class GCN_BERT(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN_BERT, self).__init__()
        self.feature_reduction = nn.Linear(input_dim, 64)
        self.conv1 = GCNConv(64, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)
        self.out0 = nn.Linear(output_dim, 32)
        self.out1 = nn.Linear(output_dim, 2)

    def forward(self, x, edge_index):
        x = self.feature_reduction(x)
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        x = F.relu(self.out0(x))
        x = self.out1(x)
        return x

In [12]:
# 创建GCN模型
model = GCN_BERT(input_dim=768, hidden_dim=128, output_dim=32)

In [13]:
model

GCN_BERT(
  (feature_reduction): Linear(in_features=768, out_features=64, bias=True)
  (conv1): GCNConv(64, 128)
  (conv2): GCNConv(128, 32)
  (out0): Linear(in_features=32, out_features=32, bias=True)
  (out1): Linear(in_features=32, out_features=2, bias=True)
)

In [14]:
from mertrics import calculate_metrics

In [17]:
# 定义优化器和损失函数
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
# 训练模型
model.train()
for epoch in range(1000):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

# 在测试集上评估模型
model.eval()
with torch.no_grad():
    pred = model(data.x, data.edge_index)
    pred = pred.argmax(dim=1)
    calculate_metrics(pred, data.y, data.test_mask)

Accuracy: 0.5
Recall: 0.5
Precision: 1.0
F1 Score: 0.6666666666666666
Confusion Matrix:
 [[ 0  0]
 [10 10]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.50      0.67        20

    accuracy                           0.50        20
   macro avg       0.50      0.25      0.33        20
weighted avg       1.00      0.50      0.67        20

AUC: nan


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
pred