In [38]:
import os
from datetime import datetime
import shutil
from typing import Optional, Callable, List, Union
import os.path as osp
import numpy as np
import pandas as pd
import argparse
import torch
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
from torch_sparse import SparseTensor
from torch_geometric.data import InMemoryDataset
from torch_geometric.data import Data
import torch_geometric.transforms as T
from torch_geometric.utils import to_undirected
from torch_geometric.nn import SAGEConv

In [105]:
!pwd

/home/featurize/work/whl


In [2]:
#从npz文件里挨个读出需要的数据
def read_xygraphp1(folder, seed=42):
    """Load ``phase1_gdata.npz`` from ``folder`` and wrap it in a PyG ``Data`` object.

    The stored ``train_mask`` array is shuffled with a fixed seed and split
    60/40 into ``train_mask`` / ``valid_mask``; ``test_mask`` is used as stored.

    Args:
        folder: Directory that contains ``phase1_gdata.npz``.
        seed: Seed of the shuffle that precedes the train/valid split. The
            default (42) yields the same permutation as calling
            ``np.random.seed(42)`` followed by ``np.random.shuffle``.

    Returns:
        A ``Data`` object with ``x`` (float), ``y`` (int64, reshaped to one
        column), ``edge_index`` (int64, transposed from the stored layout),
        ``edge_attr`` (the stored ``edge_type`` cast to float) and the
        ``train_mask`` / ``valid_mask`` / ``test_mask`` attributes as int64
        tensors.
    """
    print('read_xygraphp1')

    # Read every array inside a context manager so the archive's file handle
    # is released as soon as the data is in memory.
    with np.load(osp.join(folder, 'phase1_gdata.npz')) as archive:
        x = archive['x']
        y = archive['y'].reshape(-1, 1)
        edge_index = archive['edge_index']
        edge_type = archive['edge_type']
        train_valid = archive['train_mask']
        test_mask = archive['test_mask']

    # Shuffle with a private generator: the split stays reproducible without
    # reseeding the process-wide NumPy RNG as a side effect.
    # NOTE(review): the "masks" are cast to int64 and used for indexing, so
    # they presumably hold node indices rather than boolean flags - confirm.
    np.random.RandomState(seed).shuffle(train_valid)
    n_train = int(len(train_valid) / 10 * 6)
    train_mask = train_valid[:n_train]
    valid_mask = train_valid[n_train:]

    x = torch.tensor(x, dtype=torch.float).contiguous()
    y = torch.tensor(y, dtype=torch.int64)
    # The edge index must be transposed and made contiguous.
    edge_index = torch.tensor(edge_index.transpose(), dtype=torch.int64).contiguous()
    edge_type = torch.tensor(edge_type, dtype=torch.float)
    train_mask = torch.tensor(train_mask, dtype=torch.int64)
    valid_mask = torch.tensor(valid_mask, dtype=torch.int64)
    test_mask = torch.tensor(test_mask, dtype=torch.int64)

    # Important: PyG expects the graph to be packed into a Data object.
    data = Data(x=x, edge_index=edge_index, edge_attr=edge_type, y=y)
    data.train_mask = train_mask
    data.valid_mask = valid_mask
    data.test_mask = test_mask

    return data

class XYGraphP1(InMemoryDataset):
    r"""In-memory dataset built from a single ``phase1_gdata.npz`` archive.

    Raw and processed files live under ``<root>/<name>/raw`` and
    ``<root>/<name>/processed`` respectively.

    Args:
        root (string): Root directory where the dataset should be saved.
        name (string): The name of the dataset (:obj:`"xygraphp1"`).
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)
    """
    # Download location, should one ever be needed (currently empty).
    url = ''

    def __init__(self, root: str, name: str, 
                 transform: Optional[Callable] = None,
                 pre_transform: Optional[Callable] = None):
        # `name` is assigned before the base constructor runs because the
        # directory properties below read it.
        self.name = name
        super().__init__(root, transform, pre_transform)
        processed_file = self.processed_paths[0]
        self.data, self.slices = torch.load(processed_file)

    @property
    def raw_dir(self) -> str:
        """Directory that holds the raw ``.npz`` archive."""
        dataset_home = osp.join(self.root, self.name)
        return osp.join(dataset_home, 'raw')

    @property
    def processed_dir(self) -> str:
        """Directory that holds the processed ``data.pt`` file."""
        dataset_home = osp.join(self.root, self.name)
        return osp.join(dataset_home, 'processed')

    @property
    def raw_file_names(self) -> List[str]:
        """Names of the raw files expected inside ``raw_dir``."""
        return ['phase1_gdata.npz']

    @property
    def processed_file_names(self) -> str:
        """Name of the processed file inside ``processed_dir``."""
        return 'data.pt'

    def download(self):
        """Do nothing: no download is implemented and ``url`` is empty."""

    def process(self):
        """Read the raw archive, apply ``pre_transform`` if set, and save the result."""
        graph = read_xygraphp1(self.raw_dir)
        if self.pre_transform is not None:
            graph = self.pre_transform(graph)
        collated = self.collate([graph])
        torch.save(collated, self.processed_paths[0])

    def __repr__(self) -> str:
        return '{}()'.format(self.name)

用于建立文件夹，按（数据集，模型）的格式填写

In [3]:
def prepare_folder(name, model_name):
    """Create an empty ``./model_files/<name>/<model_name>/`` directory.

    Anything already present at that path is removed first, so files saved
    by an earlier run for the same (dataset, model) pair are discarded.

    Args:
        name: Dataset name, used as the first path component.
        model_name: Model name, used as the second path component.

    Returns:
        The directory path as a string, including the trailing slash.
    """
    target = './model_files/{}/{}/'.format(name, model_name)

    # Start from a clean slate: wipe whatever a previous run left behind.
    already_there = os.path.exists(target)
    if already_there:
        shutil.rmtree(target)

    os.makedirs(target)
    return target

In [22]:
from sklearn.metrics import roc_auc_score
class Evaluator:
    """Score predictions with either accuracy or ROC-AUC.

    Args:
        eval_metric: ``'acc'`` or ``'auc'``.

    Raises:
        ValueError: If ``eval_metric`` is neither ``'acc'`` nor ``'auc'``.
    """

    def __init__(self, eval_metric):
        if eval_metric not in ['acc', 'auc']:
            raise ValueError('eval_metric should be acc or auc')

        self.eval_metric = eval_metric

    def _check_input(self, y_true, y_pred):
        '''
            Convert both inputs to numpy arrays and validate them.

            y_true: numpy ndarray or torch tensor of shape (num_node);
                a (num_node, 1) column is flattened to (num_node)
            y_pred: numpy ndarray or torch tensor of shape (num_node, num_tasks)

            Returns the pair (y_true, y_pred) as numpy arrays.
            Raises RuntimeError for unsupported types or a y_pred that is
            not 2-dimensional.
        '''

        # Convert torch tensors to numpy arrays on the CPU
        if isinstance(y_true, torch.Tensor):
            y_true = y_true.detach().cpu().numpy()

        if isinstance(y_pred, torch.Tensor):
            y_pred = y_pred.detach().cpu().numpy()

        # Both arguments must be arrays by now; checking y_pred here keeps an
        # unsupported type from failing later with an unrelated AttributeError.
        if not (isinstance(y_true, np.ndarray) and isinstance(y_pred, np.ndarray)):
            raise RuntimeError('Arguments to Evaluator need to be either numpy ndarray or torch tensor')

        if not y_pred.ndim == 2:
            # Report the dimensionality of the array that actually failed.
            raise RuntimeError('y_pred must to 2-dim arrray, {}-dim array given'.format(y_pred.ndim))

        # A (num_node, 1) label column would broadcast against the
        # (num_node,) argmax in _eval_acc and give a meaningless score.
        if y_true.ndim == 2 and y_true.shape[1] == 1:
            y_true = y_true.reshape(-1)

        return y_true, y_pred

    def eval(self, y_true, y_pred):
        """Validate the inputs and return ``{'auc': ...}`` or ``{'acc': ...}``."""
        y_true, y_pred = self._check_input(y_true, y_pred)
        if self.eval_metric == 'auc':
            return self._eval_rocauc(y_true, y_pred)
        if self.eval_metric == 'acc':
            return self._eval_acc(y_true, y_pred)

    def _eval_rocauc(self, y_true, y_pred):
        '''
            Compute ROC-AUC.

            With two score columns, column 1 is scored against y_true;
            otherwise y_true is one-hot encoded and scored against all columns.
        '''

        if y_pred.shape[1] == 2:
            auc = roc_auc_score(y_true, y_pred[:, 1])
        else:
            onehot_code = np.eye(y_pred.shape[1])
            y_true_onehot = onehot_code[y_true]
            auc = roc_auc_score(y_true_onehot, y_pred)

        return {'auc': auc}

    def _eval_acc(self, y_true, y_pred):
        '''
            Compute the fraction of rows whose arg-max column equals y_true.
        '''
        y_pred = y_pred.argmax(axis=-1)

        correct = y_true == y_pred
        acc = float(np.sum(correct))/len(correct)

        return {'acc': acc}



简简单单一个MLP和SAGE做对比

In [140]:
class MLP(torch.nn.Module):
    """Feed-forward classifier.

    Each hidden block is Linear -> (optional BatchNorm1d) -> ReLU -> Dropout;
    a final Linear layer followed by log-softmax produces the output.

    Args:
        in_channels: Size of each input feature vector.
        hidden_channels: Width of every hidden layer.
        out_channels: Number of output classes.
        num_layers: Total number of Linear layers. Values below 2 still
            build one hidden layer plus the output layer.
        dropout: Dropout probability applied after every hidden activation.
        batchnorm: When true, a BatchNorm1d follows every hidden Linear layer.
    """

    def __init__(self, in_channels, hidden_channels, out_channels,
                 num_layers, dropout, batchnorm=True):
        super().__init__()

        # One hidden block for the input layer plus (num_layers - 2) more.
        n_hidden = 1 + max(num_layers - 2, 0)
        fan_in = [in_channels] + [hidden_channels] * (n_hidden - 1)
        layers = [torch.nn.Linear(width, hidden_channels) for width in fan_in]
        layers.append(torch.nn.Linear(hidden_channels, out_channels))
        self.lins = torch.nn.ModuleList(layers)

        self.batchnorm = batchnorm
        if self.batchnorm:
            # `bns` only exists when batch normalisation is enabled.
            self.bns = torch.nn.ModuleList(
                [torch.nn.BatchNorm1d(hidden_channels) for _ in range(n_hidden)]
            )

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise every Linear layer and, if present, every BatchNorm layer."""
        resettable = list(self.lins)
        if self.batchnorm:
            resettable.extend(self.bns)
        for module in resettable:
            module.reset_parameters()

    def forward(self, x):
        """Return per-class log-probabilities for the rows of ``x``."""
        *hidden, head = self.lins
        for depth, lin in enumerate(hidden):
            x = lin(x)
            if self.batchnorm:
                x = self.bns[depth](x)
            x = F.dropout(F.relu(x), p=self.dropout, training=self.training)
        return F.log_softmax(head(x), dim=-1)

    
class SAGE(torch.nn.Module):
    """GraphSAGE classifier.

    Each hidden block is SAGEConv -> (optional BatchNorm1d) -> ReLU -> Dropout;
    a final SAGEConv followed by log-softmax produces the output.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Width of every hidden layer.
        out_channels: Number of output classes.
        num_layers: Total number of SAGEConv layers. Values below 2 still
            build one hidden layer plus the output layer.
        dropout: Dropout probability applied after every hidden activation.
        batchnorm: When true, a BatchNorm1d follows every hidden SAGEConv.
    """

    def __init__(self, in_channels, hidden_channels, out_channels,
                 num_layers, dropout, batchnorm=True):
        super().__init__()

        # One hidden block for the input layer plus (num_layers - 2) more.
        n_hidden = 1 + max(num_layers - 2, 0)
        fan_in = [in_channels] + [hidden_channels] * (n_hidden - 1)
        layers = [SAGEConv(width, hidden_channels) for width in fan_in]
        layers.append(SAGEConv(hidden_channels, out_channels))
        self.convs = torch.nn.ModuleList(layers)

        # `bns` is always registered; it simply stays empty without batch norm.
        self.bns = torch.nn.ModuleList(
            [torch.nn.BatchNorm1d(hidden_channels) for _ in range(n_hidden)]
            if batchnorm else []
        )
        self.batchnorm = batchnorm

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise every convolution and, if enabled, every BatchNorm layer."""
        resettable = list(self.convs)
        if self.batchnorm:
            resettable.extend(self.bns)
        for module in resettable:
            module.reset_parameters()

    def forward(self, x, edge_index: Union[Tensor, SparseTensor]):
        """Return per-class log-probabilities for every row of ``x``."""
        *hidden, head = self.convs
        for depth, conv in enumerate(hidden):
            x = conv(x, edge_index)
            if self.batchnorm:
                x = self.bns[depth](x)
            x = F.dropout(F.relu(x), p=self.dropout, training=self.training)
        return F.log_softmax(head(x, edge_index), dim=-1)


In [31]:
def train(model, data, train_idx, optimizer, no_conv=False):
    """Run one full-batch optimisation step and return the training loss.

    Args:
        model: Network returning log-probabilities.
        data: Object exposing ``x``, ``y`` (labels of shape (N, )) and, when
            ``no_conv`` is false, ``adj_t``.
        train_idx: Index of the training nodes.
        optimizer: Optimizer bound to ``model``'s parameters.
        no_conv: If true the model is called on the training features only;
            otherwise on all features plus ``data.adj_t`` and the training
            rows are selected from its output.

    Returns:
        The NLL loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    if no_conv:
        log_probs = model(data.x[train_idx])
    else:
        all_nodes = model(data.x, data.adj_t)
        log_probs = all_nodes[train_idx]
    targets = data.y[train_idx]

    # The model output is a log-softmax, so NLL is the matching loss to back-propagate.
    step_loss = F.nll_loss(log_probs, targets)
    step_loss.backward()
    optimizer.step()

    return step_loss.item()

@torch.no_grad()
def test(model, data, split_idx, no_conv=False):
    # data.y is labels of shape (N, )
    model.eval()
    
    if no_conv:
        out = model(data.x)
    else:
        out = model(data.x, data.adj_t)
        
    y_pred = out.exp()  # (N,num_classes)
    
    losses = dict()
    for key in ['train', 'valid', 'test']:
        node_id = split_idx[key]
        losses[key] = F.nll_loss(out[node_id], data.y[node_id]).item()
            
    return losses, y_pred 

In [5]:
# Run configuration, declared like a command-line interface. An empty argument
# list is parsed, so inside the notebook every option takes its default value.
parser = argparse.ArgumentParser(description='gnn_models')
for _flag, _type, _default in (
    ('--device', int, 0),
    ('--dataset', str, 'XYGraphP1'),
    ('--log_steps', int, 10),
    ('--model', str, 'mlp'),
    ('--epochs', int, 200),
):
    parser.add_argument(_flag, type=_type, default=_default)
del _flag, _type, _default  # keep the notebook namespace unchanged
args = parser.parse_args(args=[])

In [6]:
# Use the GPU selected by --device when CUDA is available, otherwise the CPU.
device = torch.device(f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [7]:
# Build (or load the cached) dataset; ToSparseTensor is applied on access.
dataset = XYGraphP1(root='./data/', name='xydata', transform=T.ToSparseTensor())

# The class count reported by the dataset is overridden for XYGraphP1.
# NOTE(review): presumably the stored labels contain more than the two
# target classes - confirm.
nlabels = dataset.num_classes
if args.dataset == 'XYGraphP1':
    nlabels = 2

data = dataset[0]
# Symmetrise the adjacency so every edge is usable in both directions.
data.adj_t = data.adj_t.to_symmetric()

In [8]:
if args.dataset in ['XYGraphP1']:
    # Standardise every feature column with statistics taken over all nodes.
    x = data.x
    feat_std = x.std(0)
    # A constant column has zero std; dividing by it would fill the column
    # with NaN/inf, so such columns are only centred (divided by 1).
    feat_std[feat_std == 0] = 1
    x = (x - x.mean(0)) / feat_std
    data.x = x
# Labels are stored as an (N, 1) column; the loss functions expect shape (N,).
if data.y.dim() == 2:
    data.y = data.y.squeeze(1)
split_idx = {'train': data.train_mask, 'valid': data.valid_mask, 'test': data.test_mask}
data = data.to(device)
train_idx = split_idx['train'].to(device)

In [206]:
# Hyper-parameters of the MLP baseline. 'lr' and 'l2' are consumed by the
# optimizer (learning rate and weight decay); the remaining keys are passed
# to the MLP constructor.
mlp_parameters = dict(
    lr=0.01,
    num_layers=3,
    hidden_channels=128,
    dropout=0.15,
    batchnorm=True,
    l2=5e-7,
)

In [208]:
if args.model == 'mlp':
    para_dict = mlp_parameters
    # Everything except the optimizer settings is forwarded to the constructor.
    model_para = {
        key: value for key, value in mlp_parameters.items()
        if key not in ('lr', 'l2')
    }
    model = MLP(
        in_channels=data.x.size(-1),
        out_channels=nlabels,
        **model_para,
    ).to(device)

print(f'Model {args.model} initialized')
print(sum(p.numel() for p in model.parameters()))

# Fresh weights and a fresh optimizer for this run.
model.reset_parameters()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=para_dict['lr'],
    weight_decay=para_dict['l2'],
)
# Sentinel for the best validation loss seen so far (used for checkpointing).
min_valid_loss = 1e8

Model mlp initialized
19586


In [209]:
# The MLP takes node features only; every other model also receives the adjacency.
no_conv = args.model in ['mlp']

In [71]:
# Create the checkpoint directory for this (dataset, model) pair.
# prepare_folder removes the directory first if it already exists, so
# checkpoints saved there by an earlier run are discarded.
model_dir = prepare_folder(args.dataset, args.model)
print('model_dir:', model_dir)

In [214]:
# Full-batch training with best-on-validation checkpointing.
# NOTE: the figures logged with a '%' sign are the NLL losses multiplied by
# 100, not accuracies.
for epoch in range(1, args.epochs + 1):
    loss = train(model, data, train_idx, optimizer, no_conv)
    losses, out = test(model, data, split_idx, no_conv)
    train_loss = losses['train']
    valid_loss = losses['valid']
    test_loss = losses['test']

    # Keep the weights of the epoch with the lowest validation loss so far.
    if valid_loss < min_valid_loss:
        min_valid_loss = valid_loss
        torch.save(model.state_dict(), model_dir + 'model.pt')

    if epoch % args.log_steps != 0:
        continue
    print(f'Epoch: {epoch:02d}, '
          f'Loss: {loss:.4f}, '
          f'Train: {100 * train_loss:.3f}%, '
          f'Valid: {100 * valid_loss:.3f}% '
          f'Test: {100 * test_loss:.3f}%')

Epoch: 10, Loss: 0.0655, Train: 6.797%, Valid: 6.960% Test: 0.000%
Epoch: 20, Loss: 0.0627, Train: 6.657%, Valid: 6.850% Test: 0.000%
Epoch: 30, Loss: 0.0613, Train: 6.106%, Valid: 6.324% Test: 0.000%
Epoch: 40, Loss: 0.0608, Train: 5.985%, Valid: 6.181% Test: 0.000%
Epoch: 50, Loss: 0.0606, Train: 5.974%, Valid: 6.169% Test: 0.000%
Epoch: 60, Loss: 0.0603, Train: 5.961%, Valid: 6.149% Test: 0.000%
Epoch: 70, Loss: 0.0602, Train: 5.951%, Valid: 6.135% Test: 0.000%
Epoch: 80, Loss: 0.0600, Train: 5.943%, Valid: 6.124% Test: 0.000%
Epoch: 90, Loss: 0.0600, Train: 5.934%, Valid: 6.113% Test: 0.000%
Epoch: 100, Loss: 0.0599, Train: 5.926%, Valid: 6.103% Test: 0.000%
Epoch: 110, Loss: 0.0597, Train: 5.919%, Valid: 6.094% Test: 0.000%
Epoch: 120, Loss: 0.0597, Train: 5.915%, Valid: 6.089% Test: 0.000%
Epoch: 130, Loss: 0.0597, Train: 5.908%, Valid: 6.082% Test: 0.000%
Epoch: 140, Loss: 0.0596, Train: 5.904%, Valid: 6.078% Test: 0.000%
Epoch: 150, Loss: 0.0596, Train: 5.900%, Valid: 6.074% Te

In [215]:
# Reload the checkpoint written by the training loop (lowest validation loss).
model_file = f'./model_files/{args.dataset}/{args.model}/model.pt'
print('model_file:', model_file)
model.load_state_dict(torch.load(model_file))

model_file: ./model_files/XYGraphP1/mlp/model.pt


<All keys matched successfully>

In [244]:
@torch.no_grad()
def test(model, data, no_conv=False):
    # data.y is labels of shape (N, )
    model.eval()
    
    if no_conv:
        out = model(data.x)
    else:
        out = model(data.x, data.adj_t)
        
    y_pred = out.exp()  # (N,num_classes)
                
    return y_pred

In [217]:
# Class probabilities for every node, computed with the reloaded checkpoint.
out = test(model, data, no_conv)

In [219]:
# Select the predictions and labels of the training / validation nodes.
preds_train = out[data.train_mask]
preds_valid = out[data.valid_mask]
y_train = data.y[data.train_mask]
y_valid = data.y[data.valid_mask]

In [225]:
# Score the reloaded model with ROC-AUC on the training and validation nodes.
evaluator = Evaluator('auc')
train_auc = evaluator.eval(y_train, preds_train)['auc']
valid_auc = evaluator.eval(y_valid, preds_valid)['auc']
print('train_auc:',train_auc)
print('valid_auc:',valid_auc)

train_auc: 0.7373539001379924
valid_auc: 0.7363658261568118


In [None]:
# Class probabilities of the test nodes, written out for submission.
preds = out[data.test_mask].cpu().numpy()
# np.save does not create missing directories, so make sure the target exists.
os.makedirs('./submit', exist_ok=True)
np.save('./submit/preds.npy', preds)

In [9]:
from typing import Union

from torch import Tensor
from torch_sparse import SparseTensor
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

# NOTE(review): an identical SAGE class is defined earlier in this notebook;
# this definition shadows it.
class SAGE(torch.nn.Module):
    """GraphSAGE classifier.

    Each hidden block is SAGEConv -> (optional BatchNorm1d) -> ReLU -> Dropout;
    a final SAGEConv followed by log-softmax produces the output.

    Args:
        in_channels: Size of each input node feature vector.
        hidden_channels: Width of every hidden layer.
        out_channels: Number of output classes.
        num_layers: Total number of SAGEConv layers. Values below 2 still
            build one hidden layer plus the output layer.
        dropout: Dropout probability applied after every hidden activation.
        batchnorm: When true, a BatchNorm1d follows every hidden SAGEConv.
    """

    def __init__(self, in_channels, hidden_channels, out_channels,
                 num_layers, dropout, batchnorm=True):
        super().__init__()

        # One hidden block for the input layer plus (num_layers - 2) more.
        n_hidden = 1 + max(num_layers - 2, 0)
        fan_in = [in_channels] + [hidden_channels] * (n_hidden - 1)
        layers = [SAGEConv(width, hidden_channels) for width in fan_in]
        layers.append(SAGEConv(hidden_channels, out_channels))
        self.convs = torch.nn.ModuleList(layers)

        # `bns` is always registered; it simply stays empty without batch norm.
        self.bns = torch.nn.ModuleList(
            [torch.nn.BatchNorm1d(hidden_channels) for _ in range(n_hidden)]
            if batchnorm else []
        )
        self.batchnorm = batchnorm

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise every convolution and, if enabled, every BatchNorm layer."""
        resettable = list(self.convs)
        if self.batchnorm:
            resettable.extend(self.bns)
        for module in resettable:
            module.reset_parameters()

    def forward(self, x, edge_index: Union[Tensor, SparseTensor]):
        """Return per-class log-probabilities for every row of ``x``."""
        *hidden, head = self.convs
        for depth, conv in enumerate(hidden):
            x = conv(x, edge_index)
            if self.batchnorm:
                x = self.bns[depth](x)
            x = F.dropout(F.relu(x), p=self.dropout, training=self.training)
        return F.log_softmax(head(x, edge_index), dim=-1)


In [25]:
# Hyper-parameters of the GraphSAGE model. 'lr' and 'l2' are consumed by the
# optimizer (learning rate and weight decay); the remaining keys are passed
# to the SAGE constructor.
sage_parameters = dict(
    lr=0.01,
    num_layers=2,
    hidden_channels=128,
    dropout=0.1,
    batchnorm=False,
    l2=5e-7,
)


In [26]:
# Switch the run configuration to GraphSAGE; the following cells branch on args.model.
args.model='sage'

In [27]:
if args.model == 'sage':
    para_dict = sage_parameters
    # Everything except the optimizer settings is forwarded to the constructor.
    model_para = {
        key: value for key, value in sage_parameters.items()
        if key not in ('lr', 'l2')
    }
    model = SAGE(
        in_channels=data.x.size(-1),
        out_channels=nlabels,
        **model_para,
    ).to(device)

In [28]:
# Report the parameter count, re-initialise the weights and create the Adam
# optimizer with the learning rate / weight decay taken from para_dict.
print(f'Model {args.model} initialized')
print(sum(p.numel() for p in model.parameters()))
model.reset_parameters()
optimizer = torch.optim.Adam(model.parameters(), lr=para_dict['lr'], weight_decay=para_dict['l2'])
# Sentinel for the best validation loss seen so far (used for checkpointing).
min_valid_loss = 1e8

Model sage initialized
4994


In [29]:
# The MLP takes node features only; every other model also receives the adjacency.
no_conv = args.model in ['mlp']

# Fresh checkpoint directory for this (dataset, model) pair; an existing
# directory at that path is removed by prepare_folder.
model_dir = prepare_folder(args.dataset, args.model)
print('model_dir:', model_dir)

model_dir: ./model_files/XYGraphP1/sage/


In [32]:
# Full-batch training with best-on-validation checkpointing.
# NOTE: the figures logged with a '%' sign are the NLL losses multiplied by
# 100, not accuracies.
for epoch in range(1, args.epochs + 1):
    loss = train(model, data, train_idx, optimizer, no_conv)
    losses, out = test(model, data, split_idx, no_conv)
    train_loss = losses['train']
    valid_loss = losses['valid']
    test_loss = losses['test']

    # Keep the weights of the epoch with the lowest validation loss so far.
    if valid_loss < min_valid_loss:
        min_valid_loss = valid_loss
        torch.save(model.state_dict(), model_dir + 'model.pt')

    if epoch % args.log_steps != 0:
        continue
    print(f'Epoch: {epoch:02d}, '
          f'Loss: {loss:.4f}, '
          f'Train: {100 * train_loss:.3f}%, '
          f'Valid: {100 * valid_loss:.3f}% '
          f'Test: {100 * test_loss:.3f}%')

Epoch: 10, Loss: 0.1176, Train: 11.729%, Valid: 12.032% Test: 0.000%
Epoch: 20, Loss: 0.0814, Train: 7.728%, Valid: 7.895% Test: 0.000%
Epoch: 30, Loss: 0.0668, Train: 6.548%, Valid: 6.673% Test: 0.000%
Epoch: 40, Loss: 0.0626, Train: 6.208%, Valid: 6.356% Test: 0.000%
Epoch: 50, Loss: 0.0606, Train: 6.022%, Valid: 6.167% Test: 0.000%
Epoch: 60, Loss: 0.0599, Train: 5.954%, Valid: 6.114% Test: 0.000%
Epoch: 70, Loss: 0.0594, Train: 5.912%, Valid: 6.070% Test: 0.000%
Epoch: 80, Loss: 0.0591, Train: 5.882%, Valid: 6.044% Test: 0.000%
Epoch: 90, Loss: 0.0589, Train: 5.860%, Valid: 6.021% Test: 0.000%
Epoch: 100, Loss: 0.0587, Train: 5.842%, Valid: 6.004% Test: 0.000%
Epoch: 110, Loss: 0.0585, Train: 5.828%, Valid: 5.988% Test: 0.000%
Epoch: 120, Loss: 0.0585, Train: 5.817%, Valid: 5.977% Test: 0.000%
Epoch: 130, Loss: 0.0583, Train: 5.806%, Valid: 5.966% Test: 0.000%
Epoch: 140, Loss: 0.0582, Train: 5.798%, Valid: 5.956% Test: 0.000%
Epoch: 150, Loss: 0.0581, Train: 5.790%, Valid: 5.948% 

In [33]:
# Reload the checkpoint written by the training loop (lowest validation loss).
model_file = f'./model_files/{args.dataset}/{args.model}/model.pt'
print('model_file:', model_file)
model.load_state_dict(torch.load(model_file))

model_file: ./model_files/XYGraphP1/sage/model.pt


<All keys matched successfully>

In [34]:
@torch.no_grad()
def test(model, data, no_conv=False):
    # data.y is labels of shape (N, )
    model.eval()
    
    if no_conv:
        out = model(data.x)
    else:
        out = model(data.x, data.adj_t)
        
    y_pred = out.exp()  # (N,num_classes)
                
    return y_pred

In [35]:
# Class probabilities for every node, computed with the reloaded checkpoint.
out = test(model, data, no_conv)

# Select the predictions and labels of the training / validation nodes.
preds_train = out[data.train_mask]
preds_valid = out[data.valid_mask]
y_train = data.y[data.train_mask]
y_valid = data.y[data.valid_mask]

In [37]:
# Score the reloaded model with ROC-AUC on the training and validation nodes.
evaluator = Evaluator('auc')
train_auc = evaluator.eval(y_train, preds_train)['auc']
valid_auc = evaluator.eval(y_valid, preds_valid)['auc']
print('train_auc:',train_auc)
print('valid_auc:',valid_auc)

train_auc: 0.7727542935825387
valid_auc: 0.7719490989150097


In [None]:
# Class probabilities of the test nodes, written out for submission.
# NOTE: this is the same path the MLP cell writes to, so that file is overwritten.
preds = out[data.test_mask].cpu().numpy()
# np.save does not create missing directories, so make sure the target exists.
os.makedirs('./submit', exist_ok=True)
np.save('./submit/preds.npy', preds)