# Heterogeneous graph

- Node(s) add + delete
- Edge(s) add + delete
- Feature(s) add + delete in some nodes and edges
  (the graph is heterogeneous)

In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from torch_geometric.datasets import AMiner

import random
import torch

In [2]:
"""
The heterogeneous AMiner dataset from the “metapath2vec: Scalable Representation Learning for Heterogeneous Networks” paper, 
consisting of nodes from type "paper", "author" and "venue". Venue categories and author research interests are available as 
ground truth labels for a subset of nodes.

Class https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/datasets/aminer.html?highlight=y_index#
"""

# Load the AMiner dataset rooted at ./AMiner2 and take its first (index 0) graph object.
dataset_het = AMiner(root=r"./AMiner2")
data_het = dataset_het[0]

# Bare attribute access: nothing is printed or stored here.
data_het.keys #['y_index', 'y', 'edge_index', 'num_nodes']
print(data_het)

# Node-level handles.
node_types = data_het.node_types
node_store = data_het.get_node_store('paper')

# Edge-level handles.
edge_types = data_het.edge_types
edge_store = data_het.get_edge_store('author', 'writes', 'paper')
num_edges = data_het.num_edges
pp = data_het.edge_index_dict


HeteroData(
  [1mauthor[0m={
    y=[246678],
    y_index=[246678],
    num_nodes=1693531
  },
  [1mvenue[0m={
    y=[134],
    y_index=[134],
    num_nodes=3883
  },
  [1mpaper[0m={ num_nodes=3194405 },
  [1m(paper, written_by, author)[0m={ edge_index=[2, 9323605] },
  [1m(author, writes, paper)[0m={ edge_index=[2, 9323605] },
  [1m(paper, published_in, venue)[0m={ edge_index=[2, 3194405] },
  [1m(venue, publishes, paper)[0m={ edge_index=[2, 3194405] }
)


In [3]:
# Only the last expression of the cell is displayed, i.e. the number of author labels.
data_het["author"].y_index # presumably the ids of the author nodes that carry a label - TODO confirm
len(data_het["author"].y) # number of author labels (ground truth for a subset of authors)

246678

In [4]:
# Only the last expression of the cell is displayed, i.e. the venue label tensor.
data_het["venue"].y_index # presumably the ids of the venue nodes that carry a label - TODO confirm
data_het["venue"].y # venue category labels (the recorded output shows values 0..7)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7])

In [5]:
def _append_value(values, value):
    """Return *values* with *value* appended, keeping tensors as tensors."""
    if torch.is_tensor(values):
        extra = torch.tensor([value], dtype=values.dtype, device=values.device)
        return torch.cat([values, extra])
    return np.append(values, value)


def _append_edge(data, edge_type, source, target):
    """Append the single edge ``source -> target`` to ``data[edge_type].edge_index``."""
    edge_index = torch.as_tensor(data[edge_type].edge_index)
    column = torch.tensor(
        [[source], [target]], dtype=edge_index.dtype, device=edge_index.device
    )
    data[edge_type].edge_index = torch.cat([edge_index, column], dim=1)


def addNode(data, type, add_edge=True, node_id=None, label=None, paper_id=None):
    """
    Add one labelled node of the given type and, optionally, connect it to a paper.

    data - het data (indexable by node type and by (src, relation, dst) edge type)
    type - "venue" or "author", dtype=string
    add_edge=True if you want to add edge; with False only y_index / y are extended
    node_id - id of the new node; a random id in [900000, 10000000] when None
    label - label stored in y; a random class in [0, 7] when None
    paper_id - paper the new node is linked to; a random id in [0, 3194404] when None

    The new edge is added in both directions:
      author: (author, writes, paper) and (paper, written_by, author)
      venue:  (venue, publishes, paper) and (paper, published_in, venue)

    The data object is modified in place; nothing is returned.
    NOTE: num_nodes of the node type is not changed by this function.
    """
    # The three values are always drawn, in this order, so that a seeded run
    # consumes the random stream the same way whether or not overrides are given.
    drawn_id = random.randint(900000, 10000000)  # goes to y_index
    drawn_label = random.randint(0, 7)  # goes to y
    drawn_paper = random.randint(0, 3194404)

    new_id = drawn_id if node_id is None else node_id
    new_label = drawn_label if label is None else label
    linked_paper = drawn_paper if paper_id is None else paper_id

    data[type].y_index = _append_value(data[type].y_index, new_id)
    data[type].y = _append_value(data[type].y, new_label)

    if not add_edge:
        return

    if type == "author":
        _append_edge(data, ("author", "writes", "paper"), new_id, linked_paper)
        _append_edge(data, ("paper", "written_by", "author"), linked_paper, new_id)
    elif type == "venue":
        _append_edge(data, ("venue", "publishes", "paper"), new_id, linked_paper)
        # The reverse direction belongs to its own relation; it must not
        # overwrite (venue, publishes, paper).
        _append_edge(data, ("paper", "published_in", "venue"), linked_paper, new_id)
    return
# print(len(data_het["venue"].y))    
# addNode(data_het, type="venue")
# print(len(data_het["venue"].y))

In [30]:
def _drop_position(values, position):
    """Return *values* without the element at *position*, keeping tensors as tensors."""
    if torch.is_tensor(values):
        keep = torch.ones(values.shape[0], dtype=torch.bool, device=values.device)
        keep[position] = False
        return values[keep]
    return np.delete(np.asarray(values), position)


def _drop_incident_edges(data, edge_type, row, node_id):
    """Remove every edge of *edge_type* whose endpoint in *row* equals *node_id*."""
    edge_index = torch.as_tensor(data[edge_type].edge_index)
    data[edge_type].edge_index = edge_index[:, edge_index[row] != node_id]


def delNode(data, type, del_edge=True):
    """
    Delete a random node of a certain type
    :input:
        data - het data
        type - "venue", "paper" or "author", dtype=string
        del_edge=True if you want to delete the edges touching the node
                 (only consulted for "author" and "venue"; a paper has no
                 y / y_index here, so its deletion always removes its edges)
    :output: None, the data object is modified in place

    For "author" / "venue" a random entry of y_index / y is removed and the
    id stored in y_index is the one whose edges are dropped. For "paper" a
    random id below data["paper"]["num_nodes"] is chosen. Any other type is a
    no-op, and so is a node type with nothing left to delete.
    NOTE: num_nodes is not changed and remaining node ids are not renumbered.
    """
    # For each node type: the relations touching it and the edge_index row
    # (0 = source, 1 = target) that holds ids of that node type.
    incident = {
        "author": [
            (("author", "writes", "paper"), 0),
            (("paper", "written_by", "author"), 1),
        ],
        "venue": [
            (("paper", "published_in", "venue"), 1),
            (("venue", "publishes", "paper"), 0),
        ],
        "paper": [
            (("author", "writes", "paper"), 1),
            (("paper", "written_by", "author"), 0),
            (("paper", "published_in", "venue"), 0),
            (("venue", "publishes", "paper"), 1),
        ],
    }

    if type in ("author", "venue"):
        labelled = len(data[type].y)
        if labelled == 0:
            return
        # randrange excludes the upper bound, so the position is always valid.
        node_index = random.randrange(labelled)
        # Read the id BEFORE removing the entry; afterwards this position
        # would hold the next node's id (or be out of range).
        node_id = int(data[type].y_index[node_index])
        data[type].y_index = _drop_position(data[type].y_index, node_index)
        data[type].y = _drop_position(data[type].y, node_index)
        if not del_edge:
            return
    elif type == "paper":
        # Use the graph that was passed in, not a module-level one.
        paper_count = data[type]["num_nodes"]
        if paper_count <= 0:
            return
        node_id = random.randrange(paper_count)
    else:
        return

    for edge_type, row in incident[type]:
        _drop_incident_edges(data, edge_type, row, node_id)
    return

# Sizes before the deletion: labelled authors, then each row of the
# (paper, written_by, author) edge index.
print(len(data_het["author"].y_index))
for endpoint_row in data_het["paper", "written_by", "author"].edge_index:
    print(len(endpoint_row))

delNode(data_het, type="author")

# The same three sizes after one random author has been removed.
print("After")
print(len(data_het["author"].y_index))
for endpoint_row in data_het["paper", "written_by", "author"].edge_index:
    print(len(endpoint_row))

246672
9323571
9323571
After
246671
9323559
9323559


In [22]:
# Display the number of paper nodes stored under the "paper" node type.
data_het["paper"]["num_nodes"]

3194405

In [7]:
# Only the last expression of the cell is displayed, i.e. the number of author labels.
data_het["author"].y_index # presumably the ids of the author nodes that carry a label - TODO confirm
data_het["author"].y # author labels (ground truth for a subset of authors)
len(data_het["author"].y)

246678

In [8]:
# Display the summary of the heterogeneous graph object.
data_het

HeteroData(
  [1mauthor[0m={
    y=[246678],
    y_index=[246678],
    num_nodes=1693531
  },
  [1mvenue[0m={
    y=[134],
    y_index=[134],
    num_nodes=3883
  },
  [1mpaper[0m={ num_nodes=3194405 },
  [1m(paper, written_by, author)[0m={ edge_index=[2, 9323605] },
  [1m(author, writes, paper)[0m={ edge_index=[2, 9323605] },
  [1m(paper, published_in, venue)[0m={ edge_index=[2, 3194405] },
  [1m(venue, publishes, paper)[0m={ edge_index=[2, 3194405] }
)

In [9]:
# Display y_index of the venue nodes (one entry per venue label).
data_het["venue"].y_index

tensor([1741, 2245,  111,  837, 2588, 2116, 2696, 3648, 3784,  313, 3414,  598,
        2995, 2716, 1423,  783, 1902, 3132, 1753, 2748, 2660, 3182,  775, 3339,
        1601, 3589,  156, 1145,  692, 3048,  925, 1587,  820, 1374, 3719,  819,
         492, 3830, 2777, 3001, 3693,  517, 1808, 2353, 3499, 1763, 2372, 1030,
         721, 2680, 3355, 1217, 3400, 1271, 1970, 1127,  407,  353, 1471, 1095,
         477, 3701,   65, 1009, 1899, 1442, 2073, 3143, 2466,  289, 1996, 1070,
        3871, 3695,  281, 3633,   50, 2642, 1925, 1285, 2587, 3814, 3582, 1873,
        1339, 3450,  271, 2966,  453, 2638, 1354, 3211,  391, 1588, 3875, 2216,
        2146, 3765, 2486,  661, 3367,  426,  750, 2158,  519,  230, 1677,  839,
        2945, 1313, 1037, 2879, 2225, 3523, 1247,  448,  227, 3385,  529, 2849,
        1584, 1229,  373, 2235, 1819, 1764, 3155, 2852, 2789, 3474, 1571, 2088,
         208,  462])

In [10]:
# Display the author label tensor.
data_het["author"].y

tensor([0, 2, 5,  ..., 0, 1, 5])

In [11]:
# metadata() is unpacked into the node types and the edge types; both are printed.
node_types, edge_types = data_het.metadata()
print("Node types: ", node_types)
print("Edge types: ", edge_types)

Node types:  ['author', 'venue', 'paper']
Edge types:  [('paper', 'written_by', 'author'), ('author', 'writes', 'paper'), ('paper', 'published_in', 'venue'), ('venue', 'publishes', 'paper')]


In [12]:
def delType(data, type):
    """
    Remove a whole node type or edge type from a heterogeneous graph.

    The node types and edge types reported by data.metadata() are printed
    once before and once after the removal.

    input: data - heterog data, modified in place
           type - key of the node type or edge type to remove, dtype=string
                  (e.g. 'venue'); it is passed unchanged to ``del data[type]``
    """
    def _report(heading):
        # Fetch the metadata fresh on every call so the second report
        # reflects the deletion.
        kinds_of_nodes, kinds_of_edges = data.metadata()
        print(heading)
        print("Node types: ", kinds_of_nodes)
        print("Edge types: ", kinds_of_edges)

    _report("Before: ")
    del data[type]
    _report("After: ")
    return

# delType(data_het, "venue")

In [14]:
# Keep the edge stores in a variable; only the last expression (their count) is displayed.
edge_stores = data_het.edge_stores
edge_stores
len(edge_stores)

4

In [17]:
# Re-read and print the metadata under an "After: " heading.
# NOTE(review): presumably meant to follow a delType call - the recorded output
# still lists every node and edge type, so confirm a deletion ran first.
node_types, edge_types = data_het.metadata()
print("After: ")
print("Node types: ", node_types)
print("Edge types: ", edge_types)

After: 
Node types:  ['author', 'venue', 'paper']
Edge types:  [('paper', 'written_by', 'author'), ('author', 'writes', 'paper'), ('paper', 'published_in', 'venue'), ('venue', 'publishes', 'paper')]
