# Heterogeneous graph

- Node(s) add + delete
- Edge(s) add + delete
- Feature(s) add + delete in some nodes and edges (the graph is heterogeneous)

In [170]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from torch_geometric.datasets import AMiner

import random
import torch

In [171]:
"""
The heterogeneous AMiner dataset from the “metapath2vec: Scalable Representation Learning for Heterogeneous Networks” paper, 
consisting of nodes from type "paper", "author" and "venue". Venue categories and author research interests are available as 
ground truth labels for a subset of nodes.

Class https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/datasets/aminer.html?highlight=y_index#
"""

dataset_het = AMiner(root=r"./AMiner2")
data_het = dataset_het[0]

data_het.keys #['y_index', 'y', 'edge_index', 'num_nodes']
print(data_het)

pp = data_het.edge_index_dict
num_edges = data_het.num_edges
node_store = data_het.get_node_store('paper')
node_types = data_het.node_types
edge_store = data_het.get_edge_store('author', 'writes', 'paper')
edge_types = data_het.edge_types


HeteroData(
  author={
    y=[246678],
    y_index=[246678],
    num_nodes=1693531
  },
  venue={
    y=[134],
    y_index=[134],
    num_nodes=3883
  },
  paper={ num_nodes=3194405 },
  (paper, written_by, author)={ edge_index=[2, 9323605] },
  (author, writes, paper)={ edge_index=[2, 9323605] },
  (paper, published_in, venue)={ edge_index=[2, 3194405] },
  (venue, publishes, paper)={ edge_index=[2, 3194405] }
)


In [172]:
data_het["author"].y_index # author id
len(data_het["author"].y) # author features

246678

In [173]:
data_het["venue"].y_index # venue id
data_het["venue"].y # venue features

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7])

In [174]:
def delType(data, type):
    """
    Remove one node type or edge type from the graph and print the type
    lists before and after the removal.

    input: data - heterogeneous data object; must provide ``metadata()``
                  returning ``(node_types, edge_types)`` and support
                  ``del data[key]``
           type - key of the node type or edge type to remove, dtype=string

           delType(data, 'venue')    # removes the 'venue' node type
           delType(data, 'writes')   # removes the 'writes' edge type
    output: None, ``data`` is modified in place
    """
    def _show_types(header):
        # Query the types first so that nothing is printed if the lookup fails.
        nodes, edges = data.metadata()
        print(header)
        print("Node types: ", nodes)
        print("Edge types: ", edges)

    _show_types("Before: ")
    del data[type]
    _show_types("After: ")

# delType(data_het, "venue")

In [175]:
def _add_node_append_edge(store, src, dst):
    """Append the edge (src, dst) as a new last column of ``store.edge_index``."""
    edge_index = torch.as_tensor(store.edge_index)
    column = torch.tensor([[src], [dst]], dtype=edge_index.dtype,
                          device=edge_index.device)
    store.edge_index = torch.cat([edge_index, column], dim=1)


def addNode(data, type, add_edge=True):
    """
    Add one labelled node of the given type, optionally linked to a random paper.

    :input:
        data - het data; ``data[type]`` must expose ``y``, ``y_index`` and
               ``num_nodes``, and ``data["paper"]`` must expose ``num_nodes``
        type - "venue" or "author", dtype=string
        add_edge=True if you want to connect the new node to a randomly
               chosen paper (both edge directions are added)
    :output: None, ``data`` is modified in place
    :raises ValueError: for any other node type, or when an edge is requested
               but there are no paper nodes
    """
    # (new node -> paper, paper -> new node) edge types per node type.
    edge_types = {
        "author": (("author", "writes", "paper"),
                   ("paper", "written_by", "author")),
        "venue": (("venue", "publishes", "paper"),
                  ("paper", "published_in", "venue")),
    }
    if type not in edge_types:
        raise ValueError(
            f"addNode supports 'author' and 'venue' nodes, got {type!r}")

    store = data[type]
    # The next free index is the id of the new node. A random id could
    # collide with an existing node or point past the end of the node set.
    new_id = int(store.num_nodes)
    new_label = random.randint(0, 7)  # labels seen in this dataset are 0..7

    y_index = torch.as_tensor(store.y_index).reshape(-1)
    new_index = torch.tensor([new_id], dtype=y_index.dtype,
                             device=y_index.device)
    store.y_index = torch.cat([y_index, new_index])

    labels = torch.as_tensor(store.y)
    if labels.dim() <= 1:
        labels = labels.reshape(-1)
        new_row = [new_label]
    else:
        # y already has several feature columns: add a full random row so
        # the [num_nodes, num_features] layout is kept.
        extra = [random.randint(0, 7) for _ in range(labels.shape[1] - 1)]
        new_row = [[new_label] + extra]
    store.y = torch.cat([
        labels,
        torch.tensor(new_row, dtype=labels.dtype, device=labels.device),
    ])
    store.num_nodes = new_id + 1

    if not add_edge:
        return

    num_papers = int(data["paper"].num_nodes)
    if num_papers <= 0:
        raise ValueError("cannot attach an edge: there are no paper nodes")
    paper_id = random.randrange(num_papers)

    to_paper, from_paper = edge_types[type]
    _add_node_append_edge(data[to_paper], new_id, paper_id)
    _add_node_append_edge(data[from_paper], paper_id, new_id)
# print(len(data_het["venue"].y))    
# addNode(data_het, type="venue")
# print(len(data_het["venue"].y))

In [176]:
def _del_node_drop_edges(data, edge_type, row, node_id):
    """Drop every edge of ``edge_type`` whose endpoint in ``row`` equals ``node_id``."""
    store = data[edge_type]
    edge_index = torch.as_tensor(store.edge_index)
    store.edge_index = edge_index[:, edge_index[row] != node_id]


def delNode(data, type, del_edge=True):
    """
    Delete a random node of a certain type
    :input:
        data - het data
        type - "venue", "paper" or "author", dtype=string
        del_edge=True if you want to delete the edges attached to the
               removed author/venue as well
    :output: None, data object modified in place
    :raises ValueError: for an unknown node type or when there is no node
               to pick from

    For "author" and "venue" the node is picked among the labelled nodes
    (``y_index``) and its entry is removed from ``y_index`` and ``y``.
    "paper" nodes store no attributes here, so deleting a paper means removing
    all of its edges; this is done regardless of ``del_edge``. Node ids are
    not renumbered and ``num_nodes`` is left unchanged.
    """
    awp = ("author", "writes", "paper")
    pwa = ("paper", "written_by", "author")
    ppv = ("paper", "published_in", "venue")
    vpp = ("venue", "publishes", "paper")
    # (edge type, row of edge_index that holds the node of the given type)
    incident = {
        "author": ((awp, 0), (pwa, 1)),
        "venue": ((ppv, 1), (vpp, 0)),
        "paper": ((awp, 1), (pwa, 0), (ppv, 0), (vpp, 1)),
    }
    if type not in incident:
        raise ValueError(
            f"delNode supports 'author', 'venue' and 'paper', got {type!r}")

    if type == "paper":
        num_papers = int(data["paper"].num_nodes)
        if num_papers <= 0:
            raise ValueError("there are no paper nodes to delete")
        node_id = random.randrange(num_papers)
    else:
        store = data[type]
        y_index = torch.as_tensor(store.y_index)
        labels = torch.as_tensor(store.y)
        count = y_index.shape[0]
        if count == 0:
            raise ValueError(f"there are no labelled {type!r} nodes to delete")
        position = random.randrange(count)
        # Read the id BEFORE removing the entry, otherwise the edges of a
        # different node would be deleted.
        node_id = int(y_index[position])
        keep = torch.ones(count, dtype=torch.bool, device=y_index.device)
        keep[position] = False
        store.y_index = y_index[keep]
        # Row-wise removal also works when y is [num_nodes, num_features].
        store.y = labels[keep.to(labels.device)]
        if not del_edge:
            return

    for edge_type, row in incident[type]:
        _del_node_drop_edges(data, edge_type, row, node_id)

# print(len(data_het["author"].y_index))
# print(len(data_het["paper", "written_by", "author"].edge_index[0]))
# print(len(data_het["paper", "written_by", "author"].edge_index[1]))
# delNode(data_het, type="author")
# print("After")
# print(len(data_het["author"].y_index))
# print(len(data_het["paper", "written_by", "author"].edge_index[0]))
# print(len(data_het["paper", "written_by", "author"].edge_index[1]))

In [177]:
data_het["author", "writes", "paper"].edge_index

tensor([[      0,       0,       0,  ..., 1693528, 1693529, 1693530],
        [      0,   45988,  124807,  ..., 3194371, 3194387, 3194389]])

In [178]:
def _add_edge_parse_type(type):
    """Turn '"a", "rel", "b"' (or an (a, rel, b) sequence) into a tuple key."""
    if isinstance(type, str):
        return tuple(part.strip().strip("\"'") for part in type.split(","))
    return tuple(type)


def _add_edge_append(store, src, dst):
    """Append the edge (src, dst) as a new last column of ``store.edge_index``."""
    edge_index = torch.as_tensor(store.edge_index)
    column = torch.tensor([[src], [dst]], dtype=edge_index.dtype,
                          device=edge_index.device)
    store.edge_index = torch.cat([edge_index, column], dim=1)


def addEdge(data, type):
    """
    Add an edge of a certain type
    :input:
        data - het data
        type - one of edge types of AMiner, either as a string,
               ex: '"paper", "written_by", "author"', or as the tuple
               ("paper", "written_by", "author")
    :output: None, data edges of a certain type modified in place
    :raises ValueError: for an unknown edge type or when there is no paper /
               labelled node to connect

    A random paper is connected to a random labelled author (or venue,
    depending on ``type``). The edge is added in both directions, i.e. to the
    requested edge type and to its reverse, so the two stay consistent.
    """
    awp = ("author", "writes", "paper")
    pwa = ("paper", "written_by", "author")
    ppv = ("paper", "published_in", "venue")
    vpp = ("venue", "publishes", "paper")
    # requested edge type -> (other node type, other -> paper, paper -> other)
    plans = {
        awp: ("author", awp, pwa),
        pwa: ("author", awp, pwa),
        ppv: ("venue", vpp, ppv),
        vpp: ("venue", vpp, ppv),
    }
    key = _add_edge_parse_type(type)
    if key not in plans:
        raise ValueError(f"addEdge does not know the edge type {type!r}")
    other, to_paper, from_paper = plans[key]

    # Use the graph that was passed in, and an exclusive upper bound so the
    # paper id is always a valid node index.
    num_papers = int(data["paper"].num_nodes)
    if num_papers <= 0:
        raise ValueError("cannot add an edge: there are no paper nodes")
    id_paper = random.randrange(num_papers)

    # Pick among ALL labelled nodes; len(data[other]) would be the number of
    # stored attributes, not the number of nodes.
    candidates = torch.as_tensor(data[other].y_index).reshape(-1)
    if candidates.shape[0] == 0:
        raise ValueError(f"cannot add an edge: no labelled {other!r} nodes")
    node_id = int(candidates[random.randrange(candidates.shape[0])])

    _add_edge_append(data[to_paper], node_id, id_paper)
    _add_edge_append(data[from_paper], id_paper, node_id)

In [179]:
def _del_edge_parse_type(type):
    """Turn '"a", "rel", "b"' (or an (a, rel, b) sequence) into a tuple key."""
    if isinstance(type, str):
        return tuple(part.strip().strip("\"'") for part in type.split(","))
    return tuple(type)


def delEdge(data, type):
    """
    Delete the last edge in edges of a certain type
    :input:
        data - het data
        type - one of edge types of AMiner, either as a string,
               ex: '"paper", "written_by", "author"', or as the tuple
               ("paper", "written_by", "author")
    :output: None, data edges of a certain type modified in place
    :raises ValueError: for an unknown edge type

    Only the requested edge type is changed; the reverse edge type is left
    untouched. Nothing happens when the edge type has no edges.
    """
    supported = {
        ("author", "writes", "paper"),
        ("paper", "written_by", "author"),
        ("paper", "published_in", "venue"),
        ("venue", "publishes", "paper"),
    }
    key = _del_edge_parse_type(type)
    if key not in supported:
        raise ValueError(f"delEdge does not know the edge type {type!r}")

    store = data[key]
    edge_index = torch.as_tensor(store.edge_index)
    if edge_index.shape[1] == 0:
        return  # nothing to delete
    # clone() so the stored tensor does not share memory with the old one.
    store.edge_index = edge_index[:, :-1].clone()

In [200]:
def addFeature(data, type):
    """
    Add feature to .y tensor. New tensor [num_nodes, num_node_features]
    :input:
        data - het data
        type - one of the labelled node types of AMiner, "author" or "venue",
               dtype=string
    :output: None, data nodes .y of a certain type modified in place
    :raises ValueError: for any other node type

    The new feature is one random int64 per labelled node, drawn from 0..6
    (NumPy's upper bound is exclusive). A 1-D ``y`` becomes a two-column
    tensor; a 2-D ``y`` gains one more column.
    """
    if type not in ("author", "venue"):
        raise ValueError(
            f"addFeature supports 'author' and 'venue' nodes, got {type!r}")

    features = data[type].y.cpu().detach().numpy()
    new_feature_vec = np.random.randint(0, 7, features.shape[0], dtype='int64')
    # column_stack treats 1-D inputs as columns and keeps 2-D inputs as they
    # are, so both layouts of y are handled without a try/except fallback.
    new_y = np.column_stack((features, new_feature_vec))
    data[type].y = torch.tensor(new_y)
    
# print(data_het["author"].y)
# print(data_het["author"].y.shape)
# addFeature(data_het, type="author")
# print(data_het["author"].y)
# print(data_het["author"].y.shape)



tensor([[0, 3, 2, 6],
        [2, 5, 6, 4],
        [5, 0, 5, 6],
        ...,
        [0, 6, 5, 5],
        [1, 4, 4, 1],
        [5, 2, 0, 0]])
torch.Size([246678, 4])
tensor([[0, 3, 2, 6, 4],
        [2, 5, 6, 4, 1],
        [5, 0, 5, 6, 0],
        ...,
        [0, 6, 5, 5, 0],
        [1, 4, 4, 1, 6],
        [5, 2, 0, 0, 3]])
torch.Size([246678, 5])


In [183]:
def delFeature(data, type=None):
    """
    Delete the last feature column of the .y tensor. If .y has only one
    feature, it is left as it is so every labelled node keeps a value.
    :input:
        data - het data
        type - one of the labelled node types of AMiner, "author" or "venue",
               dtype=string. Optional only to keep the old one-argument call
               working: without it there is nothing to act on, so a warning
               is issued and ``data`` is returned unchanged.
    :output: None, data features of a certain type modified in place
    :raises ValueError: for any other node type

    When a single column is left after the deletion, ``y`` is stored as a
    1-D tensor again, i.e. the layout it had before the first feature was
    added.
    """
    if type is None:
        import warnings
        warnings.warn("delFeature called without a node type; nothing was changed")
        return
    if type not in ("author", "venue"):
        raise ValueError(
            f"delFeature supports 'author' and 'venue' nodes, got {type!r}")

    store = data[type]
    features = torch.as_tensor(store.y)
    if features.dim() < 2 or features.shape[1] <= 1:
        return  # only one feature: keep it

    trimmed = features[:, :-1]
    if trimmed.shape[1] == 1:
        trimmed = trimmed[:, 0]
    # clone() so the stored tensor does not share memory with the old one.
    store.y = trimmed.clone()