In [1]:
import os

# Staging directory for everything that gets packaged for deployment.
source_directory = "gat_cora"

# makedirs creates the parent and the code/ subfolder in one call and, with
# exist_ok=True, does not raise FileExistsError when the cell is re-run
# (os.mkdir did, which broke Restart & Run All on a non-clean checkout).
os.makedirs("./{}/code".format(source_directory), exist_ok=True)

In [2]:
%%writefile $source_directory/code/model.py

import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv

class GAT(torch.nn.Module):
    """Stack of ``GATConv`` layers.

    Every layer except the last is built with ``num_heads`` heads and
    ``hidden_dim`` output channels, and the following layer takes
    ``hidden_dim * num_heads`` input channels. The last layer is built with a
    single head and ``out_dim`` output channels.

    Args:
        num_features: Input channels of the first layer.
        num_layers: Number of ``GATConv`` layers to stack.
        out_dim: Output channels of the last layer.
        dropout: Probability passed to every ``GATConv`` and used for the
            dropout applied after each non-final layer.
        hidden_dim: Output channels (per head) of every non-final layer.
        num_heads: Number of heads of every non-final layer.
    """

    def __init__(
        self, num_features, num_layers, out_dim, dropout, hidden_dim, num_heads
    ):
        super().__init__()
        self.dropout = dropout
        last = num_layers - 1
        convs = []
        for depth in range(num_layers):
            is_first = depth == 0
            is_last = depth == last
            convs.append(
                GATConv(
                    num_features if is_first else hidden_dim * num_heads,
                    out_dim if is_last else hidden_dim,
                    heads=1 if is_last else num_heads,
                    dropout=dropout,
                )
            )
        self.layers = torch.nn.ModuleList(convs)

    def reset_parameters(self):
        """Call ``reset_parameters`` on every stacked layer."""
        for conv in self.layers:
            conv.reset_parameters()

    def forward(self, data):
        """Run the stack on ``data.x`` (cast to float) and ``data.edge_index``.

        ELU followed by dropout is applied after every layer except the last;
        the last layer's output is returned unchanged.
        """
        h = data.x.float()
        edge_index = data.edge_index
        for conv in self.layers[:-1]:
            h = F.dropout(
                F.elu(conv(h, edge_index)), p=self.dropout, training=self.training
            )
        return self.layers[-1](h, edge_index)

Writing gat_cora/code/model.py


In [3]:
# Single configuration object shared by training (this notebook) and by
# inference.py, which reads it back from code/config.json.
parameters = {
    "model_name": "GAT",
    "model_config": {
        "num_features": 1433,  # Number of features on Cora vertices
        "out_dim": 7,          # Number of classes in Cora
        "num_heads": 8,        # Number of attention heads in GAT model
        "hidden_dim": 8,       # Number of hidden units in GAT model
        "num_layers": 2,       # Number of GAT layers in GAT model
        "dropout": 0.6         # Dropout probability in GAT model
    },
    "infer_loader_config": {
        "v_in_feats": ["x"],     # List of vertex features to be loaded
        "v_out_labels": ["y"],   # List of vertex labels to be loaded
        "v_extra_feats": [],     # Don't need any extra features for inference
        "output_format": "PyG",  # Using Pytorch Geometric format
        "batch_size": 64,        # Batch size for inference
        "num_neighbors": 10,     # Number of neighbors per vertex
        "num_hops": 2,           # How deep to go in the graph
        "shuffle": False         # Don't shuffle the data
    },
    "training_loader_config": {
        "v_in_feats": ["x"],
        "v_out_labels": ["y"],
        "v_extra_feats": ["train_mask", "val_mask", "test_mask"],
        "output_format": "PyG",
        "batch_size": 64,
        "num_neighbors": 10,
        "num_hops": 2,
        "shuffle": True
    },
    "optimizer_config": {
        "lr": 0.01,
        "weight_decay": 5e-4,
    },
    # Connection settings can be overridden through environment variables so
    # that real hosts/credentials never have to be typed into the notebook.
    # The fallbacks are the demo values this notebook was written with.
    # NOTE(review): these values are also written to code/config.json and
    # shipped inside the model archive — treat that archive as sensitive.
    "connection_config": {
        "host": os.environ.get("TG_HOST", "http://35.230.92.92"),
        "graphname": "Cora",
        "username": os.environ.get("TG_USERNAME", "tigergraph"),
        "password": os.environ.get("TG_PASSWORD", "tigergraph")
    }
}

In [4]:
import json

# Persist the configuration next to the inference code. The context manager
# closes (and therefore flushes) the file before it is packaged later on; the
# previous bare open() left the handle to the garbage collector.
with open("{}/code/config.json".format(source_directory), "w") as config_file:
    json.dump(parameters, config_file)

In [5]:
import sys
# Make the generated code/ folder importable so model.py can be loaded here.
sys.path.append(source_directory+"/code")

import model
# Resolve the model class by the name stored in the config; inference.py
# performs the same getattr lookup.
GAT = getattr(model, parameters["model_name"])

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Build the network from the shared model_config and display its layer summary.
gat = GAT(**parameters["model_config"])
gat

GAT(
  (layers): ModuleList(
    (0): GATConv(1433, 8, heads=8)
    (1): GATConv(64, 7, heads=1)
  )
)

In [7]:
from pyTigerGraph import TigerGraphConnection

# Connect to TigerGraph with the host, graph name and credentials from the config.
conn = TigerGraphConnection(**parameters["connection_config"])

In [8]:
# Neighbor loader for the training batches, built from training_loader_config.
# NOTE(review): filter_by="train_mask" presumably restricts the seed vertices
# to those flagged in train_mask — confirm against the pyTigerGraph docs.
train_loader = conn.gds.neighborLoader(
    **parameters["training_loader_config"],
    filter_by="train_mask"
)

In [9]:
# Neighbor loader for the validation batches. It reuses training_loader_config
# (so shuffle is True here as well) and differs only in filter_by.
valid_loader = conn.gds.neighborLoader(
    **parameters["training_loader_config"],
    filter_by="val_mask"
)

In [10]:
# Neighbor loader for the test batches. It reuses training_loader_config
# (so shuffle is True here as well) and differs only in filter_by.
test_loader = conn.gds.neighborLoader(
    **parameters["training_loader_config"],
    filter_by="test_mask"
)

In [11]:
import torch
import torch.nn.functional as F

# Use a GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Place the model on the selected device before the optimizer is created.
gat.to(device)

# Adam over all model parameters, with lr / weight_decay from optimizer_config.
optimizer = torch.optim.Adam(
    gat.parameters(), **parameters["optimizer_config"]
)

In [12]:
from datetime import datetime
from pyTigerGraph.gds.metrics import Accumulator, Accuracy

In [13]:
# Train for a fixed 10 epochs; after each epoch, evaluate on the validation loader.
# global_steps counts processed training batches (it is only incremented here);
# logs holds the most recent running metrics.
global_steps = 0
logs = {}
for epoch in range(10):
    # Train
    gat.train()
    epoch_train_loss = Accumulator()
    epoch_train_acc = Accuracy()
    for bid, batch in enumerate(train_loader):
        # Number of vertices in this batch (all rows of x, not only masked ones).
        batchsize = batch.x.shape[0]
        batch.to(device)
        # Forward pass
        out = gat(batch)
        # Calculate loss on the vertices flagged by train_mask only
        loss = F.cross_entropy(out[batch.train_mask], batch.y[batch.train_mask])
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # NOTE(review): the loss is averaged over masked vertices but weighted
        # here by the full batch vertex count — confirm this is the intended
        # weighting for the running mean.
        epoch_train_loss.update(loss.item() * batchsize, batchsize)
        # Predict on training data
        with torch.no_grad():
            pred = out.argmax(dim=1)
            epoch_train_acc.update(pred[batch.train_mask], batch.y[batch.train_mask])
        # Log training status after each batch
        logs["loss"] = epoch_train_loss.mean
        logs["acc"] = epoch_train_acc.value
        print(
            "Epoch {}, Train Batch {}, Loss {:.4f}, Accuracy {:.4f}".format(
                epoch, bid, logs["loss"], logs["acc"]
            )
        )
        global_steps += 1
    # Evaluate
    gat.eval()
    epoch_val_loss = Accumulator()
    epoch_val_acc = Accuracy()
    for batch in valid_loader:
        batchsize = batch.x.shape[0]
        batch.to(device)
        with torch.no_grad():
            # Forward pass
            out = gat(batch)
            # Calculate loss on the vertices flagged by val_mask only
            valid_loss = F.cross_entropy(out[batch.val_mask], batch.y[batch.val_mask])
            epoch_val_loss.update(valid_loss.item() * batchsize, batchsize)
            # Prediction
            pred = out.argmax(dim=1)
            epoch_val_acc.update(pred[batch.val_mask], batch.y[batch.val_mask])
    # Log validation result after each epoch
    logs["val_loss"] = epoch_val_loss.mean
    logs["val_acc"] = epoch_val_acc.value
    print(
        "Epoch {}, Valid Loss {:.4f}, Valid Accuracy {:.4f}".format(
            epoch, logs["val_loss"], logs["val_acc"]
        )
    )

Epoch 0, Train Batch 0, Loss 1.9899, Accuracy 0.2174
Epoch 0, Train Batch 1, Loss 1.9629, Accuracy 0.2231
Epoch 0, Train Batch 2, Loss 1.8993, Accuracy 0.2290
Epoch 0, Valid Loss 1.6847, Valid Accuracy 0.5180
Epoch 1, Train Batch 0, Loss 1.3743, Accuracy 0.6471
Epoch 1, Train Batch 1, Loss 1.4057, Accuracy 0.5385
Epoch 1, Train Batch 2, Loss 1.3765, Accuracy 0.5634
Epoch 1, Valid Loss 1.4639, Valid Accuracy 0.6568
Epoch 2, Train Batch 0, Loss 1.2153, Accuracy 0.7015
Epoch 2, Train Batch 1, Loss 1.1691, Accuracy 0.6644
Epoch 2, Train Batch 2, Loss 1.2200, Accuracy 0.6381
Epoch 2, Valid Loss 1.3022, Valid Accuracy 0.6548
Epoch 3, Train Batch 0, Loss 1.2570, Accuracy 0.6389
Epoch 3, Train Batch 1, Loss 1.0519, Accuracy 0.6923
Epoch 3, Train Batch 2, Loss 1.0468, Accuracy 0.6863
Epoch 3, Valid Loss 1.1991, Valid Accuracy 0.6653
Epoch 4, Train Batch 0, Loss 0.8330, Accuracy 0.7164
Epoch 4, Train Batch 1, Loss 0.8263, Accuracy 0.7333
Epoch 4, Train Batch 2, Loss 0.8301, Accuracy 0.7404
Epoch

In [14]:
# Final evaluation on the test loader: switch to eval mode, accumulate accuracy
# over the vertices flagged by test_mask in every batch, then print the result.
gat.eval()
acc = Accuracy()
for batch in test_loader:
    batch.to(device)
    # No gradients are needed for evaluation.
    with torch.no_grad():
        pred = gat(batch).argmax(dim=1)
        acc.update(pred[batch.test_mask], batch.y[batch.test_mask])
print("Accuracy: {:.4f}".format(acc.value))

Accuracy: 0.7204


In [15]:
# Save only the weights (state_dict); inference.py rebuilds the architecture
# from config.json and loads this file as model.pth.
torch.save(gat.state_dict(), "{}/model.pth".format(source_directory))

In [24]:
%%writefile $source_directory/code/inference.py
import torch
import pyTigerGraph as tg
import json
import model
import os

def model_fn(model_dir):
    """Load the trained model and build the inference data loader.

    Reads ``code/config.json`` and ``model.pth`` from ``model_dir``, opens a
    TigerGraph connection, instantiates the model class named by
    ``model_name`` and restores its weights.

    Args:
        model_dir: Directory containing ``model.pth`` and ``code/config.json``.

    Returns:
        dict with keys ``"model"`` (the network, on the selected device and in
        eval mode) and ``"loader"`` (the neighbor loader used for inference).
    """
    with open(os.path.join(model_dir, "code/config.json")) as json_file:
        config = json.load(json_file)
    connection_config = config["connection_config"]
    model_config = config["model_config"]
    loader_config = config["infer_loader_config"]
    model_name = config["model_name"]

    mdl = getattr(model, model_name)

    conn = tg.TigerGraphConnection(**connection_config)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    gnn_model = mdl(**model_config)
    with open(os.path.join(model_dir, "model.pth"), 'rb') as f:
        # map_location remaps tensors saved from a GPU onto the device that is
        # actually available here; without it, loading GPU-trained weights on a
        # CPU-only host fails during deserialization.
        state_dict = torch.load(f, map_location=device)
    gnn_model.load_state_dict(state_dict)
    gnn_model.to(device).eval()

    infer_loader = conn.gds.neighborLoader(**loader_config)

    model_loader_dict = {"model": gnn_model, "loader": infer_loader}

    return model_loader_dict
    
def input_fn(request_body, content_type="application/json"):
    """Decode a JSON request body and return its ``"vertices"`` entry.

    Args:
        request_body: JSON document containing a ``"vertices"`` key.
        content_type: Content type of the request; only
            ``"application/json"`` is accepted.

    Returns:
        The value stored under ``"vertices"``.

    Raises:
        Exception: If ``content_type`` is not ``"application/json"``.
    """
    if content_type != "application/json":
        raise Exception("Requested unsupported ContentType in content_type: {}".format(content_type))
    return json.loads(request_body)["vertices"]

def predict_fn(input_data, model):
    """Fetch the sub-graph for the requested vertices and run the network on it.

    Args:
        input_data: Vertices to score, as returned by ``input_fn``.
        model: The dict returned by ``model_fn``, with keys ``"loader"`` and
            ``"model"``.

    Returns:
        Tuple ``(input_data, output)`` where ``output`` is the network output
        moved to the CPU.
    """
    loader = model["loader"]
    gnn = model["model"]
    sub_graphs = loader.fetch(input_data)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # `model` is the dict produced by model_fn, so it has no .to(); the network
    # itself is what must be moved to the device.
    gnn.to(device)
    sub_graphs.to(device)
    with torch.no_grad():
        output = gnn(sub_graphs)
    return (input_data, output.cpu())

def output_fn(prediction, content_type):
    """Serialize predictions as a JSON object keyed by vertex ``primary_id``.

    Args:
        prediction: Pair ``(vertices, output)`` as returned by ``predict_fn``;
            row ``i`` of ``output`` is reported for ``vertices[i]``.
        content_type: Requested response type; only ``"application/json"`` is
            supported.

    Returns:
        JSON string mapping each ``primary_id`` to that row converted to a list.

    Raises:
        Exception: If ``content_type`` is not ``"application/json"``.
    """
    if content_type != "application/json":
        raise Exception("Requested unsupported ContentType in content_type: {}".format(content_type))
    # NOTE(review): assumes the first len(vertices) output rows line up with the
    # requested vertices in order — confirm against the loader's fetch contract.
    return json.dumps(
        {
            vertex["primary_id"]: list(prediction[1][row].tolist())
            for row, vertex in enumerate(prediction[0])
        }
    )

Overwriting gat_cora/code/inference.py


In [17]:
%%writefile $source_directory/code/requirements.txt
pyTigerDriver==1.0.14
pyTigerGraph[gds]==0.9
torch-geometric==2.0.4
torch-scatter==2.0.9
torch-sparse==0.6.13


Writing gat_cora/code/requirements.txt


In [25]:
import tarfile

zipped_model_path = "./model.tar.gz"

# inference.py looks for model.pth and code/config.json directly under the
# extracted model directory, and the deployment uses entry_point
# 'code/inference.py'. arcname therefore strips the local "gat_cora/" prefix so
# both members sit at the root of the archive instead of under gat_cora/.
with tarfile.open(zipped_model_path, "w:gz") as tar:
    tar.add("{}/model.pth".format(source_directory), arcname="model.pth")
    tar.add("{}/code".format(source_directory), arcname="code")

In [26]:
from sagemaker.pytorch.model import PyTorchModel

# Execution role passed to the SageMaker model (hard-coded for this account).
role = "AmazonSageMaker-ExecutionRole-20211022T101209"

# model_data and source_dir both point at the same archive in S3, and
# entry_point is given relative to that archive.
# NOTE(review): nothing in this notebook uploads ./model.tar.gz to this S3
# location — presumably that is done out of band; confirm before deploying.
pytorch_model = PyTorchModel(model_data="s3://tg-mlworkbench/sagemaker_inference/model.tar.gz", 
                             role=role,
                             source_dir="s3://tg-mlworkbench/sagemaker_inference/model.tar.gz",
                             entry_point='code/inference.py', 
                             py_version='py38', 
                             framework_version='1.11.0')

In [27]:
# Deploy the model to a single ml.m5.xlarge instance behind the named endpoint.
predictor = pytorch_model.deploy(initial_instance_count=1, 
                         instance_type='ml.m5.xlarge', 
                         endpoint_name="gat-cora-4")

-----!

In [28]:
# Display the name of the endpoint that was just deployed.
predictor.endpoint_name

'gat-cora-4'

In [29]:
from sagemaker.predictor import Predictor
import json

from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer

# Vertices to score; input_fn in inference.py reads them from the "vertices" key.
payload = {"vertices": [{"primary_id": "1", "type": "Paper"}]}

# input_fn / output_fn in inference.py only handle "application/json". A bare
# Predictor does not send or request that content type by default, so the JSON
# serializer and deserializer are set explicitly; the serializer encodes the
# dict itself, which is why the payload is no longer pre-encoded with json.dumps.
predictor = Predictor(
    predictor.endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)
inference_response = predictor.predict(data=payload)
print(inference_response)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received server error (0) from primary with message "Your invocation timed out while waiting for a response from container primary. Review the latency metrics for each container in Amazon CloudWatch, resolve the issue, and try again.". See https://us-east-2.console.aws.amazon.com/cloudwatch/home?region=us-east-2#logEventViewer:group=/aws/sagemaker/Endpoints/gat-cora-4 in account 090082397457 for more information.