In [1]:
# !pip uninstall pyTigerGraph -y
# !pip install git+https://github.com/tigergraph/pyTigerGraph.git --no-cache

### Connect to TigerGraph

The `TigerGraphConnection` class represents a connection to the TigerGraph database. Under the hood, it stores the necessary information to communicate with the database. It is able to perform quite a few database tasks. Please see its [documentation](https://docs.tigergraph.com/pytigergraph/current/intro/) for details.

To connect to your database, modify the `config.json` file accompanying this notebook. Set the value of `getToken` based on whether token auth is enabled for your database. Token auth is always enabled for tgcloud databases.

In [2]:
from pyTigerGraph import TigerGraphConnection
import json

# Load the connection settings that accompany this notebook.
with open('../../config.json', "r") as config_file:
    config = json.load(config_file)

# Only the host and the credentials are handed to the connection object.
connection_settings = {
    field: config[field] for field in ("host", "username", "password")
}
conn = TigerGraphConnection(**connection_settings)

### Ingest Data

In [3]:
from pyTigerGraph.datasets import Datasets

# Select the "imdb" stock dataset and ingest it through the open connection.
dataset = Datasets("imdb")

# Whether a token is requested comes from the notebook's config file.
use_token = config["getToken"]
conn.ingestDataset(dataset, getToken=use_token)

A folder with name imdb already exists in ./tmp. Skip downloading.
---- Checking database ----
A graph with name imdb already exists in the database. Skip ingestion.
Graph name is set to imdb for this connection.


### Visualize Schema

In [4]:
from pyTigerGraph.visualization import drawSchema

# force=True is passed so the schema is requested again rather than reused.
schema = conn.getSchema(force=True)
drawSchema(schema)

CytoscapeWidget(cytoscape_layout={'name': 'circle', 'animate': True, 'padding': 1}, cytoscape_style=[{'selecto…

## NodePiece Algorithm <a name="nodepiece_algorithm"></a>

The [NodePiece algorithm](https://arxiv.org/abs/2106.12144) was introduced as a way both to reduce the memory cost of vertex embeddings and to generalize to unseen vertices during the testing process. This makes NodePiece a much more scalable approach for large, real-world graphs compared to other transductive techniques such as FastRP or Node2Vec. For more information about the algorithm, check out the author's [Medium post](https://towardsdatascience.com/nodepiece-tokenizing-knowledge-graphs-6dd2b91847aa).

We implement the NodePiece dataloader, which will allow us to iterate through batches of vertices. We take advantage of the callback functionality to process the batch into PyTorch tensors for less data manipulation in the training loop.

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd

# Testcase1: v_feats = None.
## Results: all vertex types will be used, but no vertex attributes will be loaded.

In [6]:
# Testcase 1: build the loader with v_feats=None.
np_loader1 = conn.gds.nodepieceLoader(
    filter_by="train_mask",
    batch_size=128,
    compute_anchors=True,
    clear_cache=True,
    anchor_percentage=0.1,
    v_feats=None,
    target_vertex_types=["Movie"],
    max_anchors=5,
    max_relational_context=5,
    e_types=conn.getEdgeTypes(),
    timeout=204_800_000,
)

# Print every entry of every batch the loader yields.
for batch_idx, batch in enumerate(np_loader1):
    print("----Batch {}----".format(batch_idx))
    for key in batch:
        print("batch type:", key)
        print("batch type content:", batch[key])

Number of Anchors: 1161


Exception in thread Thread-5:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.9/site-packages/pyTigerGraph/gds/dataloaders.py", line 587, in _read_data
    data = BaseLoader._parse_data(
  File "/opt/conda/lib/python3.9/site-packages/pyTigerGraph/gds/dataloaders.py", line 965, in _parse_data
    return callback_fn(data)
  File "/opt/conda/lib/python3.9/site-packages/pyTigerGraph/gds/dataloaders.py", line 3278, in nodepiece_process
    ancs = data["closest_anchors"].apply(lambda x: processAnchors(x))
  File "/opt/conda/lib/python3.9/site-packages/pandas/core/series.py", line 4774, in apply
    return SeriesApply(self, func, convert_dtype, args, kwargs).apply()
  File "/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py", line 1100, in apply
    return self.appl

KeyboardInterrupt: 

# Testcase2: v_feats = ["Movie", "Actor","Director"].
## Results: fail; a list passed as `v_feats` is treated as attribute names of a single vertex type, so a list of vertex type names is rejected.

In [7]:
# Testcase 2: v_feats is a flat list of vertex type names.
# This configuration is expected to be rejected with a ValueError. The error is
# caught and printed so the failure is documented without aborting a
# "Restart & Run All" of the notebook.
try:
    np_loader2 = conn.gds.nodepieceLoader(
        filter_by="train_mask",
        batch_size=128,
        compute_anchors=True,
        clear_cache=True,
        anchor_percentage=0.1,
        v_feats=["Movie", "Actor", "Director"],
        target_vertex_types=["Movie"],
        max_anchors=5,
        max_relational_context=5,
        e_types=conn.getEdgeTypes(),
        timeout=204_800_000,
    )
    for i, batch in enumerate(np_loader2):
        print("----Batch {}----".format(i))
        for batch_key in batch:
            print("batch type:", batch_key)
            print("batch type content:", batch[batch_key])
except ValueError as err:
    # Expected outcome of this test case.
    print("ValueError:", err)

ValueError: Attributes {'Actor', 'Director', 'Movie'} are not available for vertex type Movie.

# Testcase3: v_feats ={'movie_director':[]}.
## Results: fail, prompt friendly.

In [8]:
# Testcase 3: v_feats names 'movie_director' as a vertex type.
# This configuration is expected to be rejected with a ValueError. The error is
# caught and printed so the failure is documented without aborting a
# "Restart & Run All" of the notebook.
try:
    np_loader3 = conn.gds.nodepieceLoader(
        filter_by="train_mask",
        batch_size=128,
        compute_anchors=True,
        clear_cache=True,
        anchor_percentage=0.1,
        v_feats={'movie_director': []},
        target_vertex_types=["Movie"],
        max_anchors=5,
        max_relational_context=5,
        e_types=conn.getEdgeTypes(),
        timeout=204_800_000,
    )
    for i, batch in enumerate(np_loader3):
        print("----Batch {}----".format(i))
        for batch_key in batch:
            print("batch type:", batch_key)
            print("batch type content:", batch[batch_key])
except ValueError as err:
    # Expected outcome of this test case.
    print("ValueError:", err)

ValueError: vertex type movie_director is not available in the database.

# Testcase4: target_vertex_types not included in v_feats.
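## Results: fail, but not prompt friendly: no validation error is raised; a worker thread crashes with a pandas `AssertionError` and the cell has to be interrupted manually.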

In [12]:
# Testcase 4: the target vertex type "Movie" is absent from the v_feats keys.
np_loader4 = conn.gds.nodepieceLoader(
    filter_by="train_mask",
    batch_size=128,
    compute_anchors=True,
    clear_cache=True,
    anchor_percentage=0.1,
    v_feats={"Actor": [], "Director": []},
    target_vertex_types=["Movie"],
    max_anchors=5,
    max_relational_context=5,
    e_types=conn.getEdgeTypes(),
    timeout=204_800_000,
)

# Print every entry of every batch the loader yields.
for batch_idx, batch in enumerate(np_loader4):
    print("----Batch {}----".format(batch_idx))
    for key in batch:
        print("batch type:", key)
        print("batch type content:", batch[key])

Number of Anchors: 733


Exception in thread Thread-11:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 970, in _finalize_columns_and_data
    columns = _validate_or_indexify_columns(contents, columns)
  File "/opt/conda/lib/python3.9/site-packages/pandas/core/internals/construction.py", line 1018, in _validate_or_indexify_columns
    raise AssertionError(
AssertionError: 1 columns passed, passed data had 3 columns

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.9/site-packages/pyTigerGraph/gds/dataloaders.py", line 587, in _read_data
    data = BaseLoader._parse_data(
  File "/opt/conda/lib/python3.9/site-packages/pyTigerGraph/gds/dataloaders.p

KeyboardInterrupt: 

# Testcase5: anchor_method equal to unsupported method.
## Results: fail, prompt friendly.

In [14]:
# Testcase 5: anchor_method is set to the unsupported value 'un-random'.
# This configuration is expected to be rejected with a NotImplementedError. The
# error is caught and printed so the failure is documented without aborting a
# "Restart & Run All" of the notebook.
try:
    np_loader5 = conn.gds.nodepieceLoader(
        filter_by="train_mask",
        batch_size=128,
        compute_anchors=True,
        clear_cache=True,
        anchor_percentage=0.1,
        v_feats={"Movie": ["y", "x"], "Actor": [], "Director": []},
        target_vertex_types=["Movie"],
        max_anchors=5,
        max_relational_context=5,
        e_types=conn.getEdgeTypes(),
        timeout=204_800_000,
        anchor_method='un-random',
    )
    for i, batch in enumerate(np_loader5):
        print("----Batch {}----".format(i))
        for batch_key in batch:
            print("batch type:", batch_key)
            print("batch type content:", batch[batch_key])
except NotImplementedError as err:
    # Expected outcome of this test case.
    print("NotImplementedError:", err)

NotImplementedError: un-random anchor selection method is not supported. Please try 'random' anchor selection method

# Testcase6: anchor_percentage = 0.
## Results: run successfully(anchors equal to zeros)

In [15]:
# Testcase 6: anchor_percentage is 0.
np_loader6 = conn.gds.nodepieceLoader(
    filter_by="train_mask",
    batch_size=128,
    compute_anchors=True,
    clear_cache=True,
    anchor_percentage=0,
    v_feats={"Movie": ["y", "x"], "Actor": [], "Director": []},
    target_vertex_types=["Movie"],
    max_anchors=5,
    max_relational_context=5,
    e_types=conn.getEdgeTypes(),
    timeout=204_800_000,
)

# Print every entry of every batch the loader yields.
for batch_idx, batch in enumerate(np_loader6):
    print("----Batch {}----".format(batch_idx))
    for key in batch:
        print("batch type:", key)
        print("batch type content:", batch[key])

Number of Anchors: 0
----Batch 0----
batch type: Movie
batch type content:           vid   relational_context  y  \
0     1048608  [12, 12, 12, 13, 0]  0   
1     1048644  [12, 12, 12, 13, 0]  2   
2     1048680  [12, 12, 12, 13, 0]  1   
3     1048688  [12, 12, 12, 13, 0]  1   
4     1048696  [12, 12, 12, 13, 0]  2   
..        ...                  ... ..   
100  31457308  [12, 12, 12, 13, 0]  1   
101  32505880  [12, 12, 12, 13, 0]  0   
102  32505932  [12, 12, 12, 13, 0]  2   
103  32505964  [12, 12, 12, 13, 0]  1   
104  32505972  [12, 12, 12, 13, 0]  1   

                                                     x          anchors  \
0    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...  [0, 0, 0, 0, 0]   
1    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...  [0, 0, 0, 0, 0]   
2    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...  [0, 0, 0, 0, 0]   
3    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...  [0, 0, 0, 0, 0]   
4    0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 ...  [

# Testcase7: e_types = []
## Results: run successfully

In [21]:
# Testcase 7: e_types is an empty list.
np_loader7 = conn.gds.nodepieceLoader(
    filter_by="train_mask",
    batch_size=128,
    compute_anchors=True,
    clear_cache=True,
    anchor_percentage=0.1,
    v_feats={"Movie": ["y", "x"], "Actor": [], "Director": []},
    target_vertex_types=["Movie"],
    max_anchors=5,
    max_relational_context=5,
    e_types=[],
    timeout=204_800_000,
)

# Print every entry of every batch the loader yields.
for batch_idx, batch in enumerate(np_loader7):
    print("----Batch {}----".format(batch_idx))
    for key in batch:
        print("batch type:", key)
        print("batch type content:", batch[key])

Number of Anchors: 1161
----Batch 0----
batch type: Movie
batch type content:           vid   relational_context  y  \
0     1048608  [12, 12, 12, 13, 0]  0   
1     1048644  [12, 12, 12, 13, 0]  2   
2     1048680  [12, 12, 12, 13, 0]  1   
3     1048688  [12, 12, 12, 13, 0]  1   
4     1048696  [12, 12, 12, 13, 0]  2   
..        ...                  ... ..   
100  31457308  [12, 12, 12, 13, 0]  1   
101  32505880  [12, 12, 12, 13, 0]  0   
102  32505932  [12, 12, 12, 13, 0]  2   
103  32505964  [12, 12, 12, 13, 0]  1   
104  32505972  [12, 12, 12, 13, 0]  1   

                                                     x  \
0    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
1    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
2    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
3    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
4    0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 ...   
..                                                 ...   
100  0 0 0 0 0 0 0 0 0 

# Testcase8: e_types = ['unexist_edgeType']
## Results: no error is raised; the loader runs and yields batches without any content.

In [22]:
# Testcase 8: e_types contains only the made-up edge type 'unexist_edgeType'.
np_loader8 = conn.gds.nodepieceLoader(
    filter_by="train_mask",
    batch_size=128,
    compute_anchors=True,
    clear_cache=True,
    anchor_percentage=0.1,
    v_feats={"Movie": ["y", "x"], "Actor": [], "Director": []},
    target_vertex_types=["Movie"],
    max_anchors=5,
    max_relational_context=5,
    e_types=['unexist_edgeType'],
    timeout=204_800_000,
)

# Print every entry of every batch the loader yields.
for batch_idx, batch in enumerate(np_loader8):
    print("----Batch {}----".format(batch_idx))
    for key in batch:
        print("batch type:", key)
        print("batch type content:", batch[key])

Number of Anchors: 1161
----Batch 0----
----Batch 1----
----Batch 2----
----Batch 3----


# Testcase9: reverse_edge = True, while e_types doesn’t have reverse edge.
## Results: run successfully

In [25]:
# Testcase 9: reverse_edge=True while e_types lists only 'director_movie'.
np_loader9 = conn.gds.nodepieceLoader(
    filter_by="train_mask",
    batch_size=128,
    compute_anchors=True,
    clear_cache=True,
    anchor_percentage=0.1,
    v_feats={"Movie": ["y", "x"], "Actor": [], "Director": []},
    target_vertex_types=["Movie"],
    max_anchors=5,
    max_relational_context=5,
    e_types=['director_movie'],
    reverse_edge=True,
    timeout=204_800_000,
)

# Print every entry of every batch the loader yields.
for batch_idx, batch in enumerate(np_loader9):
    print("----Batch {}----".format(batch_idx))
    for key in batch:
        print("batch type:", key)
        print("batch type content:", batch[key])

Number of Anchors: 1161
----Batch 0----
----Batch 1----
----Batch 2----
----Batch 3----


# Testcase10: buffer_size = 0
## Results: run successfully

In [30]:
# Testcase 10: buffer_size is 0.
np_loader10 = conn.gds.nodepieceLoader(
    filter_by="train_mask",
    batch_size=128,
    compute_anchors=True,
    clear_cache=True,
    anchor_percentage=0.1,
    v_feats={"Movie": ["y", "x"], "Actor": [], "Director": []},
    target_vertex_types=["Movie"],
    max_anchors=5,
    max_relational_context=5,
    e_types=conn.getEdgeTypes(),
    timeout=204_800_000,
    buffer_size=0,
)

# Print every entry of every batch the loader yields.
for batch_idx, batch in enumerate(np_loader10):
    print("----Batch {}----".format(batch_idx))
    for key in batch:
        print("batch type:", key)
        print("batch type content:", batch[key])

Number of Anchors: 1161
----Batch 0----
batch type: Movie
batch type content:           vid   relational_context  y  \
0          12  [12, 12, 12, 13, 0]  1   
1          68  [12, 12, 12, 13, 0]  1   
2     3145732  [12, 12, 12, 13, 0]  1   
3     3145736  [12, 12, 12, 13, 0]  2   
4     3145768  [12, 12, 12, 13, 0]  0   
..        ...                  ... ..   
100  31457308  [12, 12, 12, 13, 0]  1   
101  32505880  [12, 12, 12, 13, 0]  0   
102  32505932  [12, 12, 12, 13, 0]  2   
103  32505964  [12, 12, 12, 13, 0]  1   
104  32505972  [12, 12, 12, 13, 0]  1   

                                                     x  \
0    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
1    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
2    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
3    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
4    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
..                                                 ...   
100  0 0 0 0 0 0 0 0 0 

# Testcase11: buffer_size = 2e31-1
## Results: run successfully

In [31]:
# Testcase 11: buffer_size at the largest signed 32-bit integer.
# NOTE: the literal `2e31-1` is the float 2 * 10**31 minus one, not an integer
# boundary value; `2**31 - 1` is the int 2147483647 this test is meant to exercise.
np_loader11 = conn.gds.nodepieceLoader(
    filter_by="train_mask",
    batch_size=128,
    compute_anchors=True,
    clear_cache=True,
    anchor_percentage=0.1,
    v_feats={"Movie": ["y", "x"], "Actor": [], "Director": []},
    target_vertex_types=["Movie"],
    max_anchors=5,
    max_relational_context=5,
    e_types=conn.getEdgeTypes(),
    timeout=204_800_000,
    buffer_size=2**31 - 1,
)

# Print every entry of every batch the loader yields.
for i, batch in enumerate(np_loader11):
    print("----Batch {}----".format(i))
    for batch_key in batch:
        print("batch type:", batch_key)
        print("batch type content:", batch[batch_key])

Number of Anchors: 1161
----Batch 0----
batch type: Movie
batch type content:           vid   relational_context  y  \
0     2097156  [12, 12, 12, 13, 0]  1   
1     2097212  [12, 12, 12, 13, 0]  2   
2     2097216  [12, 12, 12, 13, 0]  1   
3     2097232  [12, 12, 12, 13, 0]  1   
4     2097264  [12, 12, 12, 13, 0]  2   
..        ...                  ... ..   
100  16777324  [12, 12, 12, 13, 0]  2   
101  25165828  [12, 12, 12, 13, 0]  1   
102  26214416  [12, 12, 12, 13, 0]  0   
103  26214444  [12, 12, 12, 13, 0]  1   
104  26214488  [12, 12, 12, 13, 0]  0   

                                                     x  \
0    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
1    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
2    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
3    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
4    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
..                                                 ...   
100  0 0 0 0 0 0 0 0 0 

# Testcase12: anchor_attribute equal to None
## Results: fail, prompt friendly

In [34]:
# Testcase 12: anchor_attribute is explicitly None.
np_loader12 = conn.gds.nodepieceLoader(
    filter_by="train_mask",
    batch_size=128,
    compute_anchors=True,
    anchor_attribute=None,
    clear_cache=True,
    anchor_percentage=0.1,
    v_feats={"Movie": ["y", "x"], "Actor": [], "Director": []},
    target_vertex_types=["Movie"],
    max_anchors=5,
    max_relational_context=5,
    e_types=conn.getEdgeTypes(),
    timeout=204_800_000,
)

# Print every entry of every batch the loader yields.
for batch_idx, batch in enumerate(np_loader12):
    print("----Batch {}----".format(batch_idx))
    for key in batch:
        print("batch type:", key)
        print("batch type content:", batch[key])

Adding anchor attribute
Attribute already exists


TigerGraphException: ("Runtime Error: Attribute 'None' does not exist.", None)

# Testcase13: max_distance = 0
## Results: run successfully (no error is raised)

In [39]:
# Testcase 13: max_distance is 0.
np_loader13 = conn.gds.nodepieceLoader(
    filter_by="train_mask",
    batch_size=128,
    compute_anchors=True,
    clear_cache=True,
    anchor_percentage=0.1,
    v_feats={"Movie": ["y", "x"], "Actor": [], "Director": []},
    target_vertex_types=["Movie"],
    max_anchors=5,
    max_relational_context=5,
    max_distance=0,
    e_types=conn.getEdgeTypes(),
    timeout=204_800_000,
)

# Print every entry of every batch the loader yields.
for batch_idx, batch in enumerate(np_loader13):
    print("----Batch {}----".format(batch_idx))
    for key in batch:
        print("batch type:", key)
        print("batch type content:", batch[key])

Number of Anchors: 1161
----Batch 0----
batch type: Movie
batch type content:           vid relational_context  y  \
0     2097156    [7, 7, 7, 8, 0]  1   
1     2097212    [7, 7, 7, 8, 0]  2   
2     2097216    [7, 7, 7, 8, 0]  1   
3     2097232    [7, 7, 7, 8, 0]  1   
4     2097264    [7, 7, 7, 8, 0]  2   
..        ...                ... ..   
100  26214444    [7, 7, 7, 8, 0]  1   
101  26214488    [7, 7, 7, 8, 0]  0   
102  29360208    [7, 7, 7, 8, 0]  1   
103  29360216    [7, 7, 7, 8, 0]  0   
104  29360236    [7, 7, 7, 8, 0]  1   

                                                     x           anchors  \
0    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   [0, 0, 0, 0, 0]   
1    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...  [75, 0, 0, 0, 0]   
2    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   [0, 0, 0, 0, 0]   
3    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   [0, 0, 0, 0, 0]   
4    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   [0, 0, 0, 0, 0] 

# Testcase14:max_anchors = 0
## Results: run successfully

In [41]:
# Testcase 14: max_anchors is 0.
np_loader14 = conn.gds.nodepieceLoader(
    filter_by="train_mask",
    batch_size=128,
    compute_anchors=True,
    clear_cache=True,
    anchor_percentage=0.1,
    v_feats={"Movie": ["y", "x"], "Actor": [], "Director": []},
    target_vertex_types=["Movie"],
    max_anchors=0,
    max_relational_context=5,
    e_types=conn.getEdgeTypes(),
    timeout=204_800_000,
)

# Print every entry of every batch the loader yields.
for batch_idx, batch in enumerate(np_loader14):
    print("----Batch {}----".format(batch_idx))
    for key in batch:
        print("batch type:", key)
        print("batch type content:", batch[key])

Number of Anchors: 1161
----Batch 0----
batch type: Movie
batch type content:           vid   relational_context  y  \
0     4194368  [12, 12, 12, 13, 0]  2   
1     4194388  [12, 12, 12, 13, 0]  1   
2    13631524  [12, 12, 12, 13, 0]  0   
3    13631628  [12, 12, 12, 13, 0]  2   
4    15728640  [12, 12, 12, 13, 0]  1   
..        ...                  ... ..   
100  28311676  [12, 12, 12, 13, 0]  0   
101  32505880  [12, 12, 12, 13, 0]  0   
102  32505932  [12, 12, 12, 13, 0]  2   
103  32505964  [12, 12, 12, 13, 0]  1   
104  32505972  [12, 12, 12, 13, 0]  1   

                                                     x anchors  \
0    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...      []   
1    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...      []   
2    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...      []   
3    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...      []   
4    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...      []   
..                               

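# Testcase15: max_relational_context = 0
## Results: fail, not prompt friendly; a worker thread raises while running the installed query and the cell has to be interrupted manually.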

In [42]:
# Testcase 15: max_relational_context is 0.
np_loader15 = conn.gds.nodepieceLoader(
    filter_by="train_mask",
    batch_size=128,
    compute_anchors=True,
    clear_cache=True,
    anchor_percentage=0.1,
    v_feats={"Movie": ["y", "x"], "Actor": [], "Director": []},
    target_vertex_types=["Movie"],
    max_anchors=5,
    max_relational_context=0,
    e_types=conn.getEdgeTypes(),
    timeout=204_800_000,
)

# Print every entry of every batch the loader yields.
for batch_idx, batch in enumerate(np_loader15):
    print("----Batch {}----".format(batch_idx))
    for key in batch:
        print("batch type:", key)
        print("batch type content:", batch[key])

Number of Anchors: 1161


Exception in thread Thread-58:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.9/site-packages/pyTigerGraph/gds/dataloaders.py", line 500, in _request_rest
    resp = tgraph.runInstalledQuery(
  File "/opt/conda/lib/python3.9/site-packages/pyTigerGraph/pyTigerGraphQuery.py", line 207, in runInstalledQuery
    ret = self._post(self.restppUrl + "/query/" + self.graphname + "/" + queryName,
  File "/opt/conda/lib/python3.9/site-packages/pyTigerGraph/pyTigerGraphBase.py", line 360, in _post
    res = self._req("POST", url, authMode, headers, data, resKey, skipCheck, params, jsonData=jsonData)
  File "/opt/conda/lib/python3.9/site-packages/pyTigerGraph/pyTigerGraphBase.py", line 283, in _req
    self._errorCheck(res)
  File "/opt/conda/lib/python3.9/site-packages/pyTigerGr

KeyboardInterrupt: 

# Testcase16:clear_cache and use_cache at the same time
## Results: run successfully

In [44]:
# Testcase 16: clear_cache and use_cache are both True.
np_loader16 = conn.gds.nodepieceLoader(
    filter_by="train_mask",
    batch_size=128,
    compute_anchors=True,
    clear_cache=True,
    anchor_percentage=0.1,
    v_feats={"Movie": ["y", "x"], "Actor": [], "Director": []},
    target_vertex_types=["Movie"],
    max_anchors=5,
    max_relational_context=5,
    e_types=conn.getEdgeTypes(),
    timeout=204_800_000,
    use_cache=True,
)

# Print every entry of every batch the loader yields.
for batch_idx, batch in enumerate(np_loader16):
    print("----Batch {}----".format(batch_idx))
    for key in batch:
        print("batch type:", key)
        print("batch type content:", batch[key])

Number of Anchors: 1161
----Batch 0----
batch type: Movie
batch type content:           vid   relational_context  y  \
0     2097156  [12, 12, 12, 13, 0]  1   
1     2097212  [12, 12, 12, 13, 0]  2   
2     2097216  [12, 12, 12, 13, 0]  1   
3     2097232  [12, 12, 12, 13, 0]  1   
4     2097264  [12, 12, 12, 13, 0]  2   
..        ...                  ... ..   
100  27263072  [12, 12, 12, 13, 0]  1   
101  27263116  [12, 12, 12, 13, 0]  1   
102  29360208  [12, 12, 12, 13, 0]  1   
103  29360216  [12, 12, 12, 13, 0]  0   
104  29360236  [12, 12, 12, 13, 0]  1   

                                                     x  \
0    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
1    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
2    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
3    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
4    0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...   
..                                                 ...   
100  0 0 0 0 0 0 0 0 0 

# Testcase17:tokenMap equal to a nonexistent path
## Results: fail, prompt friendly

In [50]:
# Testcase 17: tokenMap points at a file that does not exist.
# This configuration is expected to fail with a FileNotFoundError. The error is
# caught and printed so the failure is documented without aborting a
# "Restart & Run All" of the notebook.
try:
    valid_loader = conn.gds.nodepieceLoader(
        anchor_cache_attr="anchors",
        filter_by="val_mask",
        batch_size=8192,
        v_feats={"Movie": ["y", "x"], "Actor": [], "Director": []},
        target_vertex_types=["Movie"],
        compute_anchors=False,
        max_anchors=5,
        max_relational_context=5,
        use_cache=False,
        e_types=conn.getEdgeTypes(),
        timeout=204_800_000,
        tokenMap="./npAncs1.pkl",
    )
    for i, batch in enumerate(valid_loader):
        print("----Batch {}----".format(i))
        for batch_key in batch:
            print("batch type:", batch_key)
            print("batch type content:", batch[batch_key])
except FileNotFoundError as err:
    # Expected outcome of this test case.
    print("FileNotFoundError:", err)

FileNotFoundError: [Errno 2] No such file or directory: './npAncs1.pkl'

# Testcase18: using nodepieceLoader with callback_fn to load data (via Kafka).
## Results: run successfully, data loaded completely

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
def process_batch(batch):
    """Convert the "Movie" frame of a loader batch into a dict of PyTorch tensors.

    Args:
        batch: Mapping whose "Movie" entry is a pandas DataFrame with the columns
            "relational_context", "anchors" and "anchor_distances" (one
            equal-length sequence of integers per row), "x" (a string of
            space-separated numbers per row) and "y" (integer-convertible labels).

    Returns:
        dict with the keys "relational_context", "anchors" and "distance"
        (torch.long, one row per vertex), "feats" (torch.float, one row per
        vertex) and "y" (torch.long, one label per vertex).
    """
    movies = batch["Movie"]

    def as_long_matrix(column):
        # .tolist() hands torch plain nested sequences, so the conversion does
        # not rely on integer positions matching the DataFrame's index labels.
        return torch.tensor(movies[column].tolist(), dtype=torch.long)

    # Each "x" cell is a whitespace-separated string; parse it into one row.
    feats = np.stack([np.fromstring(row, sep=" ") for row in movies["x"]])

    return {
        "relational_context": as_long_matrix("relational_context"),
        "anchors": as_long_matrix("anchors"),
        "distance": as_long_matrix("anchor_distances"),
        "feats": torch.tensor(feats, dtype=torch.float),
        "y": torch.tensor(movies["y"].astype(int).to_numpy(), dtype=torch.long),
    }

# Testcase 18: set the Kafka address (placeholder value) and attach a callback
# that post-processes every batch.
conn.gds.configureKafka(kafka_address="your_Kafka_address")
np_loader_test18 = conn.gds.nodepieceLoader(
    filter_by="train_mask",
    batch_size=128,
    compute_anchors=True,
    clear_cache=True,
    anchor_percentage=0.1,
    v_feats={"Movie": ["y", "x"], "Actor": [], "Director": []},
    target_vertex_types=["Movie"],
    max_anchors=5,
    max_relational_context=5,
    e_types=conn.getEdgeTypes(),
    timeout=204_800_000,
    callback_fn=lambda raw_batch: process_batch(raw_batch),
)

# For every entry of every batch, report its size and its last element.
for batch_idx, batch in enumerate(np_loader_test18):
    print("----Batch {}----".format(batch_idx))
    for key in batch:
        entry = batch[key]
        print("batch type:", key)
        print("batch type dim:", entry.size())
        print("sample lastone in batch:{}\n".format(entry[-1]))



Number of Anchors: 1434
----Batch 0----
batch type: relational_context
batch type dim: torch.Size([105, 5])
sample lastone in batch:tensor([12, 12, 12, 13,  0])

batch type: anchors
batch type dim: torch.Size([105, 5])
sample lastone in batch:tensor([ 790, 1160,  790,  790, 1292])

batch type: distance
batch type dim: torch.Size([105, 5])
sample lastone in batch:tensor([7, 8, 8, 8, 8])

batch type: feats
batch type dim: torch.Size([105, 3066])
sample lastone in batch:tensor([0., 0., 0.,  ..., 0., 0., 0.])

batch type: y
batch type dim: torch.Size([105])
sample lastone in batch:1

----Batch 1----
batch type: relational_context
batch type dim: torch.Size([110, 5])
sample lastone in batch:tensor([12, 12, 12, 13,  0])

batch type: anchors
batch type dim: torch.Size([110, 5])
sample lastone in batch:tensor([1294,  637, 1297, 1294,  637])

batch type: distance
batch type dim: torch.Size([110, 5])
sample lastone in batch:tensor([6, 6, 7, 7, 7])

batch type: feats
batch type dim: torch.Size([1