# Heterorogeneous graph

- Node(s) add + delete
- Edge(s) add + delete
- Feature(s) add + delete in some nodes and edges -
graph is heterogeneous

In [144]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from torch_geometric.datasets import AMiner

import random
import torch

In [164]:
"""
The heterogeneous AMiner dataset from the “metapath2vec: Scalable Representation Learning for Heterogeneous Networks” paper, 
consisting of nodes from type "paper", "author" and "venue". Venue categories and author research interests are available as 
ground truth labels for a subset of nodes.

Class https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/datasets/aminer.html?highlight=y_index#
"""

dataset_het = AMiner(root=r"./AMiner2")
data_het = dataset_het[0]

data_het.keys #['y_index', 'y', 'edge_index', 'num_nodes']
print(data_het)

pp = data_het.edge_index_dict
num_edges = data_het.num_edges
node_store = data_het.get_node_store('paper')
node_types = data_het.node_types
edge_store = data_het.get_edge_store('author', 'writes', 'paper')
edge_types = data_het.edge_types


HeteroData(
  [1mauthor[0m={
    y=[246678],
    y_index=[246678],
    num_nodes=1693531
  },
  [1mvenue[0m={
    y=[134],
    y_index=[134],
    num_nodes=3883
  },
  [1mpaper[0m={ num_nodes=3194405 },
  [1m(paper, written_by, author)[0m={ edge_index=[2, 9323605] },
  [1m(author, writes, paper)[0m={ edge_index=[2, 9323605] },
  [1m(paper, published_in, venue)[0m={ edge_index=[2, 3194405] },
  [1m(venue, publishes, paper)[0m={ edge_index=[2, 3194405] }
)


In [186]:
data_het["author"].y_index # author id
len(data_het["author"].y) # author features

246678

In [184]:
data_het["venue"].y_index # venue id
data_het["venue"].y # venue features

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7])

In [232]:
def addNode(data, type, add_edge=True):
    """
    data - het data
    type - "venue" or "author", dtype=string
    add_edge=True if you want to add edge
    """
    new_id = random.randint(900000, 10000000) # to y_index
    new_type = random.randint(0, 7) # to y
    paper_id = random.randint(0, 3194404)

    # data[type].y_index = data[type].y_index.cpu().detach().numpy();
    # data[type].y = data[type].y.cpu().detach().numpy();
    
    data[type].y_index = np.append(data[type].y_index, new_id)
    data[type].y = np.append(data[type].y, new_type)

    if type == "author":
        data['author', 'writes', 'paper'].edge_index.cpu().detach().numpy();
        data["paper", "written_by", "author"].edge_index.cpu().detach().numpy();

        left = np.append(data['author', 'writes', 'paper'].edge_index[0], new_id)
        right = np.append(data['author', 'writes', 'paper'].edge_index[1], paper_id)
        new = torch.tensor(np.stack([left, right]))
        data['author', 'writes', 'paper'].edge_index = new

        left = np.append(data["paper", "written_by", "author"].edge_index[0], paper_id)
        right = np.append(data["paper", "written_by", "author"].edge_index[1], new_id)
        new = torch.tensor(np.stack([left, right]))
        data["paper", "written_by", "author"].edge_index = new

    if type == "venue":
        data["paper", "published_in", "venue"].edge_index.cpu().detach().numpy();
        data["venue", "publishes", "paper"].edge_index.cpu().detach().numpy();

        left = np.append(data["venue", "publishes", "paper"].edge_index[0], new_id)
        right = np.append(data["venue", "publishes", "paper"].edge_index[1], paper_id)
        new = torch.tensor(np.stack([left, right]))
        data["venue", "publishes", "paper"].edge_index = new

        left = np.append(data["paper", "published_in", "venue"].edge_index[0], paper_id)
        right = np.append(data["paper", "published_in", "venue"].edge_index[1], new_id)
        new = torch.tensor(np.stack([left, right]))
        data["venue", "publishes", "paper"].edge_index = new
    return
# print(len(data_het["venue"].y))    
# addNode(data_het, type="venue")
# print(len(data_het["venue"].y))

150
151


In [None]:
def delRandomNode(data, ):
    return

In [215]:
data_het["author"].y_index # author id
data_het["author"].y # author features
len(data_het["author"].y)

246691

In [None]:
data_het

In [157]:
data_het["venue"].y_index

tensor([1741, 2245,  111,  837, 2588, 2116, 2696, 3648, 3784,  313, 3414,  598,
        2995, 2716, 1423,  783, 1902, 3132, 1753, 2748, 2660, 3182,  775, 3339,
        1601, 3589,  156, 1145,  692, 3048,  925, 1587,  820, 1374, 3719,  819,
         492, 3830, 2777, 3001, 3693,  517, 1808, 2353, 3499, 1763, 2372, 1030,
         721, 2680, 3355, 1217, 3400, 1271, 1970, 1127,  407,  353, 1471, 1095,
         477, 3701,   65, 1009, 1899, 1442, 2073, 3143, 2466,  289, 1996, 1070,
        3871, 3695,  281, 3633,   50, 2642, 1925, 1285, 2587, 3814, 3582, 1873,
        1339, 3450,  271, 2966,  453, 2638, 1354, 3211,  391, 1588, 3875, 2216,
        2146, 3765, 2486,  661, 3367,  426,  750, 2158,  519,  230, 1677,  839,
        2945, 1313, 1037, 2879, 2225, 3523, 1247,  448,  227, 3385,  529, 2849,
        1584, 1229,  373, 2235, 1819, 1764, 3155, 2852, 2789, 3474, 1571, 2088,
         208,  462])

In [163]:
data_het["author"].y

tensor([0, 2, 5,  ..., 0, 1, 5])

In [None]:
node_types, edge_types = data_het.metadata()
print("Node types: ", node_types)
print("Edge types: ", edge_types)

Node types:  ['author', 'venue', 'paper']
Edge types:  [('paper', 'written_by', 'author'), ('author', 'writes', 'paper'), ('paper', 'published_in', 'venue'), ('venue', 'publishes', 'paper')]


In [None]:
def delType(data, type):
    """
    input: data - heterog data
           type - node type or edge type dtype=string

           del data['venue']  # Deleting 'field_of_study' node type
           del data['writes']       # Deleting 'has_topic' edge type    
    """
    node_types, edge_types = data.metadata()
    print("Before: ")
    print("Node types: ", node_types)
    print("Edge types: ", edge_types)
    del data[type]       # Deleting edge type
    node_types, edge_types = data.metadata()
    print("After: ")
    print("Node types: ", node_types)
    print("Edge types: ", edge_types)
    return

# delType(data_het, "venue")

In [None]:
def delSetOfEdges(data, index):
    """
    does not work
    data - hetero data
    index - index of type of edge to delete, dtype=int
    """
    print("N types of edges: ", len(data.edge_stores))
    del data.edge_stores[index]
    print(data.edge_stores)
    return
    
#delSetOfEdges(data_het, 1)



N types of edges:  4
[{'edge_index': tensor([[      0,       1,       2,  ..., 3194404, 3194404, 3194404],
        [      0,       1,       2,  ...,    4393,   21681,  317436]])}, {'edge_index': tensor([[      0,       0,       0,  ..., 1693528, 1693529, 1693530],
        [      0,   45988,  124807,  ..., 3194371, 3194387, 3194389]])}, {'edge_index': tensor([[      0,       1,       2,  ..., 3194402, 3194403, 3194404],
        [   2190,    2190,    2190,  ...,    3148,    3148,    3148]])}, {'edge_index': tensor([[      0,       0,       0,  ...,    3882,    3882,    3882],
        [2203069, 2203070, 2203071,  ...,  952391,  952392,  952393]])}]


In [None]:
edge_stores = data_het.edge_stores
edge_stores
len(edge_stores)

4

In [None]:
venue = data_het["venue"]
author = data_het["author"]
paper = data_het["paper"]
gh = data_het['author']

data_het.node_stores[0]


{'y': tensor([0, 2, 5,  ..., 0, 1, 5]), 'y_index': tensor([ 168866, 1327323,     870,  ...,  168759,  254769,  264374]), 'num_nodes': 1693531}

In [None]:
data_het['paper', 'published_in', 'venue'].edge_index
# data_het['author']
# data_het['author'].y_index # all indexes of authors

tensor([[      0,       1,       2,  ..., 3194402, 3194403, 3194404],
        [   2190,    2190,    2190,  ...,    3148,    3148,    3148]])

In [None]:
node_types, edge_types = data_het.metadata()
print("After: ")
print("Node types: ", node_types)
print("Edge types: ", edge_types)

After: 
Node types:  ['author', 'venue', 'paper']
Edge types:  [('paper', 'written_by', 'author'), ('author', 'writes', 'paper'), ('paper', 'published_in', 'venue'), ('venue', 'publishes', 'paper')]
