In [1]:
import pandas as pd
import torch
import numpy as np
from sklearn.preprocessing import StandardScaler
from torch_geometric.data import Data
from torch_geometric.data import InMemoryDataset
import pickle
from typing import Optional, Callable, List
import os.path as osp
import torch_geometric.transforms as T

In [2]:
# Load the .npz archive and list the names of the arrays it contains.
# NOTE(review): allow_pickle=True lets numpy unpickle object arrays, which can run
# arbitrary code -- only use it on a trusted data.npz.
d = np.load('data.npz', allow_pickle=True)
print(list(d.keys()))

['x', 'y', 'edge_type', 'edge_index', 'train_mask', 'test_mask']


In [3]:
# Label distribution: number of nodes with y == 1, y == 0 and y == -100.
# NOTE(review): -100 presumably marks unlabeled nodes -- confirm against the dataset description.
print(np.sum(d['y'] == 1), ' ', np.sum(d['y'] == 0), ' ', np.sum(d['y'] == -100))

4492   381612   193053


In [4]:
# Pull the individual arrays out of the archive into notebook-level names.
x = d['x']
y = d['y']
edge_index = d['edge_index']
edge_type = d['edge_type']
# edge_timestamp = data['edge_timestamp']
train_mask = d['train_mask']
test_mask = d['test_mask']

# Print the shape of every array as a sanity check.
print("x shape:", x.shape)
print("y shape:", y.shape)
print("edge_index shape:", edge_index.shape)
print("edge_type shape:", edge_type.shape)
# print("edge_timestamp shape:", edge_timestamp.shape)
print("train_mask shape:", train_mask.shape)
print("test_mask shape:", test_mask.shape)

x shape: (579157, 17)
y shape: (579157,)
edge_index shape: (167559, 2)
edge_type shape: (167559,)
train_mask shape: (386104,)
test_mask shape: (193053,)


In [5]:
from scipy.sparse import csr_matrix

def degree_feat(edge_index, x):
    """Compute per-node out-degree and in-degree from an edge list.

    Args:
        edge_index: array indexed as ``edge_index[:, 0]`` (source node) and
            ``edge_index[:, 1]`` (target node), one row per edge.
        x: node feature array; only ``x.shape[0]`` (the node count) is used.

    Returns:
        ``(out_degree, in_degree)``: the row sums and the transposed column
        sums of the sparse adjacency matrix, each with one row per node.
        Repeated edges each contribute 1 to the degree.
    """
    num_nodes = x.shape[0]
    sources = edge_index[:, 0]
    targets = edge_index[:, 1]
    unit_weights = np.ones(edge_index.shape[0])

    adjacency = csr_matrix(
        (unit_weights, (sources, targets)), shape=(num_nodes, num_nodes)
    )

    # Row sums count outgoing edges; column sums (transposed into a column
    # vector) count incoming edges.
    out_degree = adjacency.sum(axis=1)
    in_degree = adjacency.sum(axis=0).T
    return out_degree, in_degree

In [6]:
def edge_type_feat(edge_type, edge_index, x, num_edge_types=11):
    """Count, for every node, how many outgoing edges it has of each edge type.

    The counts are accumulated straight from the edge list. An earlier version
    went through ``csr_matrix((edge_type, (src, dst)))``, which *sums* the
    values of duplicate ``(src, dst)`` pairs: two parallel edges of types 3 and
    5 turned into a single edge of "type 8". Scatter-adding per edge avoids
    that and also removes the per-node Python loop.

    Args:
        edge_type: 1-D integer array with one type id per edge. Type ``t`` is
            counted in column ``t - 1``, so ids are expected to be 1-based
            (``1..num_edge_types``).
        edge_index: array whose column 0 holds the source node of each edge.
        x: node feature array; only ``x.shape[0]`` (the node count) is used.
        num_edge_types: number of feature columns; defaults to 11, the value
            that used to be hard-coded.

    Returns:
        Float array of shape ``(x.shape[0], num_edge_types)`` where entry
        ``[i, t - 1]`` is the number of edges of type ``t`` leaving node ``i``.

    Raises:
        IndexError: if a source node id or a type column is out of range.
    """
    num_nodes = x.shape[0]
    sources = np.asarray(edge_index)[:, 0]
    type_columns = np.asarray(edge_type).astype(np.int64) - 1

    counts = np.zeros((num_nodes, num_edge_types))
    # np.add.at is an unbuffered scatter-add, so repeated (node, type) pairs
    # are each counted instead of overwriting one another.
    np.add.at(counts, (sources, type_columns), 1)
    return counts

In [7]:
# Per-node out-/in-degree vectors, computed once from the full edge list.
out_degree, in_degree = degree_feat(edge_index, x)

In [8]:
def read_xygraphp1(downsample_ratio, path='data.npz', seed=42):
    """Load the graph archive and build a class-rebalanced PyG ``Data`` object.

    Node features are the raw ``x`` columns, followed by the per-node edge-type
    counts from ``edge_type_feat`` and the in-degree from ``degree_feat``.
    Label-0 nodes are down-sampled so that there are ``downsample_ratio``
    times as many of them as label-1 nodes, and the resulting set is split
    80/20 into train and validation.

    Changes from the previous implementation:
      * Negatives are drawn WITHOUT replacement, so a node can no longer appear
        twice or end up in both the train and the validation split.
      * All sampling goes through one seeded numpy ``Generator``; previously
        numpy was seeded but the sampling used the unseeded torch RNG, so the
        split changed on every run. The global numpy/torch RNG state is no
        longer touched.
      * The in-degree is computed here instead of being read from a
        notebook-level ``in_degree`` variable.
      * ``valid_mask`` is a ``torch.long`` tensor like the other masks.
      * The archive is closed after reading.

    Args:
        downsample_ratio: desired number of label-0 nodes per label-1 node.
            Capped at the number of label-0 nodes available.
        path: location of the ``.npz`` archive holding ``x``, ``y``,
            ``edge_index``, ``edge_type`` and ``test_mask``.
        seed: seed for the sampling and the train/validation shuffle.

    Returns:
        ``Data`` with ``x``, ``edge_index`` (transposed to ``[2, E]``),
        ``edge_attr`` (edge types), ``y`` (shape ``[N, 1]``) and the
        attributes ``train_mask`` / ``valid_mask`` / ``test_mask``.
        ``train_mask`` and ``valid_mask`` hold node indices, not booleans;
        ``test_mask`` is taken from the archive as-is.
    """
    # Indexing an NpzFile materialises the array, so everything is read
    # before the context manager closes the file.
    with np.load(path) as archive:
        x = archive['x']
        y = archive['y'].reshape(-1, 1)
        edge_index = archive['edge_index']
        edge_type = archive['edge_type']
        test_mask = archive['test_mask']

    # Feature engineering: raw features + edge-type counts + in-degree.
    _, in_degree = degree_feat(edge_index, x)
    x = np.concatenate(
        (x, edge_type_feat(edge_type, edge_index, x), np.asarray(in_degree)),
        axis=1,
    )

    # Class imbalance: keep every positive, down-sample the negatives.
    rng = np.random.default_rng(seed)
    labels = y[:, 0]
    positive_indices = np.flatnonzero(labels == 1)
    negative_indices = np.flatnonzero(labels == 0)
    num_negative_samples = min(
        int(len(positive_indices) * downsample_ratio), len(negative_indices)
    )
    sampled_negatives = rng.choice(
        negative_indices, size=num_negative_samples, replace=False
    )
    downsampled_indices = np.concatenate([positive_indices, sampled_negatives])

    print("***", len(downsampled_indices))

    # 80/20 train/validation split over the shuffled, duplicate-free node ids.
    shuffled_indices = rng.permutation(downsampled_indices)
    num_train_samples = int(len(shuffled_indices) * 0.8)
    train_mask = shuffled_indices[:num_train_samples]
    valid_mask = shuffled_indices[num_train_samples:]

    x = torch.tensor(x, dtype=torch.float).contiguous()
    y = torch.tensor(y, dtype=torch.int64)
    # The archive stores edges as [E, 2]; PyG expects [2, E].
    edge_index = torch.tensor(edge_index.transpose(), dtype=torch.long).contiguous()
    edge_type = torch.tensor(edge_type, dtype=torch.long)

    data = Data(x=x, edge_index=edge_index, edge_attr=edge_type, y=y)
    data.train_mask = torch.tensor(train_mask, dtype=torch.long)
    data.valid_mask = torch.tensor(valid_mask, dtype=torch.long)
    data.test_mask = torch.tensor(test_mask, dtype=torch.long)

    return data

In [9]:
class XYGraphP1(InMemoryDataset):
    """In-memory dataset that caches the graph built by ``read_xygraphp1``.

    ``process`` builds the graph and stores the collated result in the
    processed file ``data.pt``; ``__init__`` then loads that file into
    ``self.data`` / ``self.slices``.
    """

    # No download location is configured for this dataset.
    url = ''

    def __init__(self, root: str, name: str,
                 transform: Optional[Callable] = None,
                 pre_transform: Optional[Callable] = None):
        """Store ``name``, run the base-class setup, then load the processed file.

        Args:
            root: dataset root directory passed to ``InMemoryDataset``.
            name: dataset name; stored on the instance before the base-class
                initialiser runs.
            transform: optional transform forwarded to the base class.
            pre_transform: optional transform applied once in ``process``.
        """
        self.name = name
        super().__init__(root, transform, pre_transform)
        processed_path = self.processed_paths[0]
        self.data, self.slices = torch.load(processed_path)

    @property
    def processed_file_names(self) -> str:
        """Name of the single processed file."""
        return 'data.pt'

    def process(self):
        """Build the graph, apply ``pre_transform`` if set, and save it."""
        # Change the down-sampling ratio here.
        graph = read_xygraphp1(5)
        if self.pre_transform is not None:
            graph = self.pre_transform(graph)
        collated = self.collate([graph])
        torch.save(collated, self.processed_paths[0])

In [10]:
# Build the graph directly (negative:positive ratio of 8) instead of going through
# the cached XYGraphP1 dataset class.
# dataset = XYGraphP1(root='', name='', transform=T.ToSparseTensor())
data = read_xygraphp1(8)
# The triple-quoted block below is disabled post-processing kept as a string literal;
# it is never executed. Because it is the last expression in the cell, its repr is
# echoed as the cell output. It refers to `dataset`, which is not defined while the
# line above stays commented out.
'''
data = T.ToSparseTensor()(data)
data.adj_t = data.adj_t.to_symmetric()
# data.edge_index = edge_index

if dataset in ['XYGraphP1']:
    x = data.x
    x = (x - x.mean(0)) / x.std(0)
    data.x = x

if data.y.dim() == 2:
    data.y = data.y.squeeze(1)
'''

*** 40428


"\ndata = T.ToSparseTensor()(data)\ndata.adj_t = data.adj_t.to_symmetric()\n# data.edge_index = edge_index\n\nif dataset in ['XYGraphP1']:\n    x = data.x\n    x = (x - x.mean(0)) / x.std(0)\n    data.x = x\n\nif data.y.dim() == 2:\n    data.y = data.y.squeeze(1)\n"

In [11]:
# Summary of the assembled graph object: attribute names with their shapes.
print(data)

Data(x=[579157, 29], edge_index=[2, 167559], edge_attr=[167559], y=[579157, 1], train_mask=[32342], valid_mask=[8086], test_mask=[193053])


In [12]:
# Class balance check: number of label-1 and label-0 nodes in the train split,
# then the same two counts for the validation split.
print(np.sum(data.y[data.train_mask].cpu().numpy() == 1))
print(np.sum(data.y[data.train_mask].cpu().numpy() == 0))
print(np.sum(data.y[data.valid_mask].cpu().numpy() == 1))
print(np.sum(data.y[data.valid_mask].cpu().numpy() == 0))

3616
28726
876
7210


In [13]:
import pickle

# Persist the assembled graph object for later use.
# Alternative output name used previously: "graphs_data2.pkl".
# The `with` block closes the file on exit, so no explicit close() is needed.
with open("graphs_balance_data.pkl", "wb") as file:
    pickle.dump(data, file)

In [14]:
# Disabled experiment: the GCN model below is wrapped in a string literal, so it is
# never executed; as the only expression in the cell, its repr is echoed as output.
# It uses names (gnn, nn, F, Union, Tensor, SparseTensor) that are not imported in
# the import cell at the top of this notebook.
'''
class GCN(torch.nn.Module):
    def __init__(self
                 , in_channels
                 , hidden_channels
                 , out_channels
                 , num_layers
                 , dropout
                 , batchnorm=True):
        super(GCN, self).__init__()

        self.convs = torch.nn.ModuleList()
        self.convs.append(gnn.SAGEConv(256, hidden_channels, cached=True))
        self.batchnorm = batchnorm
        if self.batchnorm:
            self.bns = torch.nn.ModuleList()
            self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
        for _ in range(num_layers - 2):
            self.convs.append(
                gnn.SAGEConv(hidden_channels, hidden_channels, cached=True))
            if self.batchnorm: 
                self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
        self.convs.append(gnn.SAGEConv(hidden_channels, out_channels, cached=True))
        
        self.conv1 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool1d(kernel_size=1, stride=1)
        self.drop1 = nn.Dropout(p=0.1)
        
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.pool2 = nn.MaxPool1d(kernel_size=1, stride=1)
        self.drop2 = nn.Dropout(p=0.1)
        
        self.regression = nn.Sequential(
            nn.Linear(128, 64),
            nn.Dropout(0.2),
            nn.ELU(),
            nn.Linear(64, 16),
            # nn.ELU(),
            # nn.Linear(16, out_channels),
        )
        
        # self.convs.append(gnn.SAGEConv(16, out_channels, cached=True))
        
        self.dropout = dropout

    def reset_parameters(self):
        for conv in self.convs:
            conv.reset_parameters()
        if self.batchnorm:
            for bn in self.bns:
                bn.reset_parameters()

    def forward(self, x, edge_index: Union[Tensor, SparseTensor]):

        for i, conv in enumerate(self.convs[:-1]):
            x = conv(x, edge_index)
            if self.batchnorm: 
                x = self.bns[i](x)
            x = F.elu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.convs[-1](x, edge_index)
        
        x = x.view(x.size(0), x.size(1), -1)
        x = self.drop1(self.pool1(F.elu(self.conv1(x))))
        x = self.drop2(self.pool2(F.relu(self.conv2(x))))
        x = x.view(x.size(0), -1)
        
        x = self.regression(x)
        x = self.convs[-1](x, edge_index)
        # print(x.shape)
        
        return x.log_softmax(dim=-1)
'''

'\nclass GCN(torch.nn.Module):\n    def __init__(self\n                 , in_channels\n                 , hidden_channels\n                 , out_channels\n                 , num_layers\n                 , dropout\n                 , batchnorm=True):\n        super(GCN, self).__init__()\n\n        self.convs = torch.nn.ModuleList()\n        self.convs.append(gnn.SAGEConv(256, hidden_channels, cached=True))\n        self.batchnorm = batchnorm\n        if self.batchnorm:\n            self.bns = torch.nn.ModuleList()\n            self.bns.append(torch.nn.BatchNorm1d(hidden_channels))\n        for _ in range(num_layers - 2):\n            self.convs.append(\n                gnn.SAGEConv(hidden_channels, hidden_channels, cached=True))\n            if self.batchnorm: \n                self.bns.append(torch.nn.BatchNorm1d(hidden_channels))\n        self.convs.append(gnn.SAGEConv(hidden_channels, out_channels, cached=True))\n        \n        self.conv1 = nn.Conv1d(in_channels=128, out_channels