In [None]:
# Change the working directory to the root of the GiGL repo so that the modules/scripts
# used below can be imported.
# NOTE: the path is relative, so re-running this cell moves up two more directories;
# run it only once per kernel session.
%cd ../..

# Setting up GCP Project and configs
Assuming you already have a GCP project set up:

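1. Open up `examples/distributed/configs/example_resource_config.yaml` (path relative to the repo root; this is the file loaded as `RESOURCE_CONFIG_URI` below) and fill in all relevant fields under `common_compute_config`: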
  - project
  - region
  - temp_assets_bucket
  - temp_regional_assets_bucket
  - perm_assets_bucket
  - temp_assets_bq_dataset_name
  - embedding_bq_dataset_name
  - gcp_service_account_email

2. Ensure your service account has the relevant permissions (a non-exhaustive list):
  - roles/bigquery.user
  - roles/cloudprofiler.user
  - roles/compute.admin
  - roles/dataflow.admin
  - roles/dataflow.worker
  - roles/dataproc.editor
  - roles/logging.logWriter
  - roles/monitoring.metricWriter
  - roles/notebooks.legacyViewer
  - roles/aiplatform.user
  - roles/dataproc.worker
  - roles/storage.objectAdmin : on relevant buckets
  - roles/artifactregistry.reader
  - roles/artifactregistry.writer


In [None]:
import datetime
import getpass

from gigl.common import LocalUri
from gigl.env.pipelines_config import get_resource_config
from gigl.src.common.types.pb_wrappers.gigl_resource_config import GiglResourceConfigWrapper
from gigl.src.common.types.pb_wrappers.gbml_config import GbmlConfigPbWrapper

# Timestamp captured once so that the job name below and the docker image tags built
# later in the notebook share the same suffix.
curr_datetime = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Firstly, let's give your job a name and ensure that the resource and task configs exist and can be loaded.
# The config paths are relative, so they resolve against the working directory set by the `%cd` cell above.
JOB_NAME = f"{getpass.getuser()}_cora_{curr_datetime}"
TEMPLATE_TASK_CONFIG_URI = LocalUri("examples/distributed/configs/e2e_cora_udl_glt_task_config.yaml")
RESOURCE_CONFIG_URI = LocalUri("examples/distributed/configs/example_resource_config.yaml")

# Loading both configs here surfaces a missing or unparsable file before any job is launched.
TEMPLATE_TASK_CONFIG: GbmlConfigPbWrapper = GbmlConfigPbWrapper.get_gbml_config_pb_wrapper_from_uri(
    gbml_config_uri=TEMPLATE_TASK_CONFIG_URI
)
RESOURCE_CONFIG: GiglResourceConfigWrapper = get_resource_config(resource_config_uri=RESOURCE_CONFIG_URI)

# The GCP project comes from the resource config; later cells use it for image names and BigQuery access.
PROJECT = RESOURCE_CONFIG.project

print(f"Successfully found task config and resource config. Script will help execute job: {JOB_NAME} on project: {PROJECT}")

In [None]:
# Sanity-check the task and resource configs before launching anything.
from gigl.src.validation_check.config_validator import kfp_validation_checks

# config_populator is the first step in the pipeline: it populates the template task config
# specified above and generates a frozen config, so that is the step we start at.
kfp_validation_checks(
    start_at="config_populator",
    resource_config_uri=RESOURCE_CONFIG_URI,
    task_config_uri=TEMPLATE_TASK_CONFIG_URI,
    job_name=JOB_NAME,
)

## Compiling Src Docker images

You will need to build and push docker images with your custom code so that individual GiGL components can leverage your code.
For this experiment we will consider the MAG240M specs and code to be "custom code", and we will guide you how to build a docker image with the code.

We will make use of `scripts/build_and_push_docker_image.py` for this.

Note that this builds `containers/Dockerfile.src` and `containers/Dockerfile.dataflow.src`, both of which have instructions to `COPY` the `examples` folder (which contains all the source code for MAG240M) as well as all of the GiGL source code.

In [None]:
from scripts.build_and_push_docker_image import (
    build_and_push_cpu_image,
    build_and_push_cuda_image,
    build_and_push_dataflow_image,
)

# Every image is tagged with this run's timestamp, so the three images can be matched to the job.
DOCKER_IMAGE_DATAFLOW_RUNTIME_NAME_WITH_TAG = f"gcr.io/{PROJECT}/gigl_dataflow_runtime:{curr_datetime}"
DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG = f"gcr.io/{PROJECT}/gigl_cuda:{curr_datetime}"
DOCKER_IMAGE_MAIN_CPU_NAME_WITH_TAG = f"gcr.io/{PROJECT}/gigl_cpu:{curr_datetime}"

# Build and push, in order: the Dataflow runtime image, the CUDA image, then the CPU image.
build_and_push_dataflow_image(image_name=DOCKER_IMAGE_DATAFLOW_RUNTIME_NAME_WITH_TAG)
build_and_push_cuda_image(image_name=DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG)
build_and_push_cpu_image(image_name=DOCKER_IMAGE_MAIN_CPU_NAME_WITH_TAG)

# Summarise the image names so they can be copied from the cell output.
print(
    "We built and pushed the following docker images:\n"
    f"- {DOCKER_IMAGE_DATAFLOW_RUNTIME_NAME_WITH_TAG}\n"
    f"- {DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG}\n"
    f"- {DOCKER_IMAGE_MAIN_CPU_NAME_WITH_TAG}\n"
)

## We will instantiate a local runner to help orchestrate the test pipeline

In [None]:
from gigl.orchestration.local.runner import PipelineConfig, Runner

# The runner is used in the cells below to execute individual pipeline components.
runner = Runner()

# Bundle the job name, both config URIs and the three docker images built above
# into the single config object that the runner methods accept.
pipeline_config = PipelineConfig(
    applied_task_identifier=JOB_NAME,
    resource_config_uri=RESOURCE_CONFIG_URI,
    task_config_uri=TEMPLATE_TASK_CONFIG_URI,
    dataflow_docker_uri=DOCKER_IMAGE_DATAFLOW_RUNTIME_NAME_WITH_TAG,
    custom_cpu_docker_uri=DOCKER_IMAGE_MAIN_CPU_NAME_WITH_TAG,
    custom_cuda_docker_uri=DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG,
)


### First we will run config populator
The config populator takes in a template `GbmlConfig` and outputs a frozen `GbmlConfig` by populating all job-related metadata paths in `sharedConfig`. These are mostly GCS paths which the following components read from and write to, and use as an intermediary data communication medium. For example, the field `sharedConfig.trainedModelMetadata` is populated with a GCS URI, which tells the Trainer to write the trained model to this path, and the Inferencer to read the model from this path.

In [None]:
from gigl.src.common.utils.file_loader import FileLoader

# Run the config populator on the template config; the returned URI is then loaded
# back as the frozen config so it can be inspected here and reused by later cells.
frozen_config_uri = runner.run_config_populator(pipeline_config=pipeline_config)
frozen_config = GbmlConfigPbWrapper.get_gbml_config_pb_wrapper_from_uri(
    gbml_config_uri=frozen_config_uri
)
file_loader = FileLoader()

print(
    f"Config Populator has successfully generated the following frozen config from the template ({TEMPLATE_TASK_CONFIG_URI}) :"
)
print(frozen_config.gbml_config_pb)

# From here on the pipeline config must reference the frozen config, not the template.
# NOTE: this mutates `pipeline_config` in place, so re-running this cell would pass the
# frozen config (rather than the template) to the config populator.
pipeline_config.task_config_uri = frozen_config_uri

### Next we run the preprocessor
The Data Preprocessor reads node, edge and respective feature data from a data source, and produces preprocessed / transformed versions of all this data for subsequent components to use. It uses TensorFlow Transform to achieve data transformation in a distributed fashion, and allows for transformations like categorical encoding, scaling, normalization, casting and more.

In this case we are using preprocessing spec defined in `python/gigl/src/mocking/mocking_assets/passthrough_preprocessor_config_for_mocked_assets.py` - take a look for more details.

You will note that the preprocessor will create a few BQ jobs to prepare the node and edge tables; subsequently it will kick off TFT (Dataflow) jobs to do the actual preprocessing. The preprocessor will: (1) create a preprocessing spec and dump it to the path specified in the frozen config's `sharedConfig.preprocessedMetadataUri`; (2) the respective Dataflow jobs will dump the preprocessed assets as `.tfrecord` files to the paths specified inside the preprocessing spec at `preprocessedMetadataUri`.

In [None]:
# WARN: There is an issue when trying to run Dataflow jobs from inside a Jupyter kernel, so we cannot use the
# line below to run the preprocessor as you normally would in a Python script.
# runner.run_data_preprocessor(pipeline_config=pipeline_config)

# Instead, we run the preprocessor from the command line.
# Note: You can actually do this with every component; we just make use of the runner to make it easier to run the components.
# The `$NAME` references below are expanded by IPython from the Python variables defined in earlier cells:
# the job name, the frozen task config URI, the resource config URI and the Dataflow runtime image.
!python -m gigl.src.data_preprocessor.data_preprocessor \
--job_name=$JOB_NAME \
--task_config_uri=$frozen_config_uri \
--resource_config_uri=$RESOURCE_CONFIG_URI \
--custom_worker_image_uri=$DOCKER_IMAGE_DATAFLOW_RUNTIME_NAME_WITH_TAG

In [None]:
# TODO(mkolodner): Add trainer

In [None]:
# WARN: There is an issue when trying to run Dataflow jobs from inside a Jupyter kernel, so we cannot use the
# line below to run the inferencer as you normally would in a Python script.
# runner.run_inferencer(pipeline_config=pipeline_config)

# Instead, we run the inferencer from the command line.
# Note: You can actually do this with every component; we just make use of the runner to make it easier to run the components.
# The `$NAME` references below are expanded by IPython from the Python variables defined in earlier cells:
# the job name, the frozen task config URI, the resource config URI and the three docker images built above.
!python -m gigl.src.inference.inferencer \
--job_name=$JOB_NAME \
--task_config_uri=$frozen_config_uri \
--resource_config_uri=$RESOURCE_CONFIG_URI \
--custom_worker_image_uri=$DOCKER_IMAGE_DATAFLOW_RUNTIME_NAME_WITH_TAG \
--cpu_docker_uri=$DOCKER_IMAGE_MAIN_CPU_NAME_WITH_TAG \
--cuda_docker_uri=$DOCKER_IMAGE_MAIN_CUDA_NAME_WITH_TAG

### Inference
The Inferencer component is responsible for running inference of a trained model on samples generated by the Subgraph Sampler component.  At a high level, it works by applying a trained model in an embarrassingly parallel and distributed fashion across these samples, and persisting the output embeddings and/or predictions.

In [None]:
# Inspect the inference output
from gigl.src.common.utils.bq import BqUtils
from gigl.src.inference.lib.assets import InferenceAssets

# Look up the embedding table path for the "paper" node type from the frozen config.
bq_emb_out_table = InferenceAssets.get_enumerated_embedding_table_path(
    node_type="paper",
    gbml_config_pb_wrapper=frozen_config,
)
print(f"Embeddings should be successfully stored in the following location: {bq_emb_out_table}")

# Fetch only a handful of rows: enough to confirm the table is populated without dumping it.
query = f"SELECT * FROM {bq_emb_out_table} LIMIT 5"
bq_utils = BqUtils(project=PROJECT)
result = list(bq_utils.run_query(query=query, labels={}))

print(f"Query result: {result}")