# Training a GDS Pipeline and Making Predictions using node embedding algorithms

In [1]:
import sys
import os
# Add the parent of the current working directory to the import path before
# importing `utils` below (presumably that is where `utils` lives -- confirm).
# The path is relative, so it depends on where the kernel was started.
sys.path.append(os.path.join(os.curdir, ".."))

from utils import create_projected_graph

In [2]:
from graphdatascience import GraphDataScience

# Connection settings are read from the environment so that real credentials
# never have to be committed with the notebook. The fallbacks are the values
# this notebook originally hardcoded for a local development database.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "admin")

gds = GraphDataScience(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

## Training the GraphSAGE embedding model

In [3]:
# Graph used to train the embedding model: "Person" entries taken from the
# "MainTrain" label, with "nbMovies" as the only node property.
embedding_person_spec = {
    "label": "MainTrain",
    "properties": {"nbMovies": {"defaultValue": 0}},
}
# "KNOWS" relationships are requested as undirected with SINGLE aggregation.
embedding_knows_spec = {"orientation": "UNDIRECTED", "aggregation": "SINGLE"}

projected_graph_object = create_projected_graph(
    gds,
    graph_name="graph-clf-emb-gs",
    node_spec={"Person": embedding_person_spec},
    relationship_spec={"KNOWS": embedding_knows_spec},
)

In [4]:
# Remove a "model-gs-clf-emb" model left over from a previous run, if any,
# so the name is free before training.
try:
    stale_gs_model = gds.model.get("model-gs-clf-emb")
except ValueError:
    # Nothing registered under that name: nothing to drop.
    pass
else:
    # Kept outside the `try` so that an error raised while dropping the model
    # is reported instead of being silently swallowed.
    gds.beta.model.drop(stale_gs_model)

In [5]:
# Train the GraphSAGE model on the projected graph, using "nbMovies" as the
# only input feature. The second return value (training result) is not needed.
# `randomSeed` pins the random number generator so that re-running the
# notebook gives comparable embeddings (NOTE(review): runs with concurrency > 1
# may still show small differences -- confirm against the GDS manual).
gs_model, _ = gds.beta.graphSage.train(
    projected_graph_object,
    modelName="model-gs-clf-emb",
    featureProperties=["nbMovies"],
    learningRate=0.001,
    randomSeed=42,
)
gs_model

GraphSageModel({'modelInfo': {0: {'modelName': 'model-gs-clf-emb', 'modelType': 'graphSage', 'metrics': {'ranIterationsPerEpoch': [10], 'iterationLossesPerEpoch': [[26.577317408162536, 26.57703766469665, 26.576728840484463, 26.57607278617486, 26.575803606768364, 26.575693675103572, 26.575299107590205, 26.57480522101637, 26.573829529093054, 26.57258457550475]], 'didConverge': False, 'ranEpochs': 1, 'epochLosses': [26.57258457550475]}}}, 'trainConfig': {0: {'maxIterations': 10, 'negativeSampleWeight': 20, 'searchDepth': 5, 'aggregator': 'MEAN', 'activationFunction': 'SIGMOID', 'penaltyL2': 0.0, 'learningRate': 0.001, 'concurrency': 4, 'jobId': '6031a7ff-8efa-49c2-b065-04472959b031', 'modelName': 'model-gs-clf-emb', 'embeddingDimension': 64, 'nodeLabels': ['*'], 'sudo': False, 'featureProperties': ['nbMovies'], 'sampleSizes': [25, 10], 'relationshipTypes': ['*'], 'batchSize': 100, 'epochs': 1, 'tolerance': 0.0001}}, 'graphSchema': {0: {'graphProperties': {}, 'relationships': {'KNOWS': {}}

## Training the node classification pipeline

In [6]:
# Graph used to train the classifier: "Person" entries taken from the
# "MainTrainDownSampled" label, carrying the "nbMovies" feature and the
# "isUSCitizen" property that is later used as the training target.
pipeline_person_spec = {
    "label": "MainTrainDownSampled",
    "properties": {
        "nbMovies": {"defaultValue": 0},
        "isUSCitizen": {},
    },
}
# "KNOWS" relationships are requested as undirected with SINGLE aggregation.
pipeline_knows_spec = {"orientation": "UNDIRECTED", "aggregation": "SINGLE"}

projected_graph_object = create_projected_graph(
    gds,
    graph_name="graph-clf-emb",
    node_spec={"Person": pipeline_person_spec},
    relationship_spec={"KNOWS": pipeline_knows_spec},
)

In [7]:
# Remove a "pipe-clf-emb" pipeline left over from a previous run, if any,
# so the name is free before the pipeline is created again.
try:
    stale_pipe = gds.pipeline.get("pipe-clf-emb")
except ValueError:
    # Nothing registered under that name: nothing to drop.
    pass
else:
    # Kept outside the `try` so that an error raised while dropping the
    # pipeline is reported instead of being silently swallowed.
    gds.beta.pipeline.drop(stale_pipe)

In [8]:
# Create the node classification training pipeline "pipe-clf-emb". The call
# returns the pipeline object together with a summary of its configuration.
pipe, pipe_info = gds.beta.pipeline.nodeClassification.create("pipe-clf-emb")
# Show both; at this point the pipeline has no node property steps or features.
pipe, pipe_info

(NCTrainingPipeline({'pipelineInfo': {0: {'featurePipeline': {'nodePropertySteps': [], 'featureProperties': []}, 'splitConfig': {'testFraction': 0.3, 'validationFolds': 3}, 'trainingParameterSpace': {'MultilayerPerceptron': [], 'RandomForest': [], 'LogisticRegression': []}, 'autoTuningConfig': {'maxTrials': 10}}}, 'pipelineName': {0: 'pipe-clf-emb'}, 'pipelineType': {0: 'Node classification training pipeline'}, 'creationTime': {0: neo4j.time.DateTime(2023, 1, 10, 15, 57, 43, 179417352, tzinfo=<DstTzInfo 'Europe/Paris' CET+1:00:00 STD>)}}),
 name                                                      pipe-clf-emb
 nodePropertySteps                                                   []
 featureProperties                                                   []
 splitConfig                {'testFraction': 0.3, 'validationFolds': 3}
 autoTuningConfig                                     {'maxTrials': 10}
 parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
 Name: 0, dtype: objec

In [9]:
# Name of the node property that holds the GraphSAGE embedding; the same
# property is written by the node property step and selected as the feature.
embedding_property = "embedding"

# Step 1: compute embeddings with the previously trained "model-gs-clf-emb".
pipe.addNodeProperty(
    "beta.graphSage",
    modelName="model-gs-clf-emb",
    mutateProperty=embedding_property,
)
# Step 2: use that property as the only classifier input.
pipe.selectFeatures([embedding_property])
pipe

NCTrainingPipeline({'pipelineInfo': {0: {'featurePipeline': {'nodePropertySteps': [{'name': 'gds.beta.graphSage.mutate', 'config': {'modelName': 'model-gs-clf-emb', 'contextRelationshipTypes': [], 'contextNodeLabels': [], 'mutateProperty': 'embedding'}}], 'featureProperties': [{'feature': 'embedding'}]}, 'splitConfig': {'testFraction': 0.3, 'validationFolds': 3}, 'trainingParameterSpace': {'MultilayerPerceptron': [], 'RandomForest': [], 'LogisticRegression': []}, 'autoTuningConfig': {'maxTrials': 10}}}, 'pipelineName': {0: 'pipe-clf-emb'}, 'pipelineType': {0: 'Node classification training pipeline'}, 'creationTime': {0: neo4j.time.DateTime(2023, 1, 10, 15, 57, 43, 179417352, tzinfo=<DstTzInfo 'Europe/Paris' CET+1:00:00 STD>)}})

In [10]:
# Register a random forest model candidate with explicit split/leaf size limits;
# the remaining parameters are left to the library defaults.
pipe.addRandomForest(minSplitSize=30, minLeafSize=10)

name                                                      pipe-clf-emb
nodePropertySteps    [{'name': 'gds.beta.graphSage.mutate', 'config...
featureProperties                                          [embedding]
splitConfig                {'testFraction': 0.3, 'validationFolds': 3}
autoTuningConfig                                     {'maxTrials': 10}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

In [11]:
# Drop the "clf-rf-emb" model if it already exists, so the name is free
# before the pipeline is trained.
try:
    stale_clf_model = gds.model.get("clf-rf-emb")
except ValueError:
    # Nothing registered under that name: nothing to drop.
    pass
else:
    # Kept outside the `try` so that an error raised while dropping the model
    # is reported instead of being silently swallowed.
    gds.beta.model.drop(stale_clf_model)

In [12]:
# Train the pipeline on the projected graph to predict "isUSCitizen" and
# store the winning model as "clf-rf-emb".
# `randomSeed` pins the random number generator used during training so the
# reported metrics are comparable between runs of the notebook
# (NOTE(review): confirm the exact scope of the seed against the GDS manual).
model, model_info = pipe.train(
    projected_graph_object,
    modelName="clf-rf-emb",
    targetProperty="isUSCitizen",
    metrics=["PRECISION(class=1)", "F1_WEIGHTED", "ACCURACY"],
    randomSeed=42,
)
model_info

Node Classification Train Pipeline:   0%|          | 0/100 [00:00<?, ?%/s]

modelSelectionStats    {'modelCandidates': [{'metrics': {'F1_WEIGHTED...
trainMillis                                                        17363
modelInfo              {'pipeline': {'nodePropertySteps': [{'name': '...
configuration          {'pipeline': 'pipe-clf-emb', 'jobId': 'd8b96ba...
Name: 0, dtype: object

In [13]:
# Inspect the train/validation metrics of each model candidate and the
# parameters that were selected.
model_info["modelSelectionStats"]

{'modelCandidates': [{'metrics': {'F1_WEIGHTED': {'validation': {'avg': 0.6897930969957505,
      'min': 0.6813331269931655,
      'max': 0.6979569542828127},
     'train': {'avg': 0.7920459528438485,
      'min': 0.7886443915657149,
      'max': 0.7949976143418733}},
    'PRECISION_class_1': {'validation': {'avg': 0.6850865069170586,
      'min': 0.6760651629030322,
      'max': 0.6926461345022461},
     'train': {'avg': 0.7837251418890424,
      'min': 0.7796134663317345,
      'max': 0.7901828681399096}},
    'ACCURACY': {'validation': {'avg': 0.6900972666666666,
      'min': 0.68169762,
      'max': 0.69827587},
     'train': {'avg': 0.79221928, 'min': 0.78879311, 'max': 0.79509284}}},
   'parameters': {'maxDepth': 2147483647,
    'minLeafSize': 10,
    'criterion': 'GINI',
    'minSplitSize': 30,
    'numberOfDecisionTrees': 100,
    'methodName': 'RandomForest',
    'numberOfSamplesRatio': 1.0}}],
 'bestParameters': {'maxDepth': 2147483647,
  'minLeafSize': 10,
  'criterion': 'GI

## Making predictions

In [14]:
# Graph used for prediction: "Person" entries taken from the "MainTest"
# label, with the same properties as the pipeline training graph.
test_person_spec = {
    "label": "MainTest",
    "properties": {
        "nbMovies": {"defaultValue": 0},
        "isUSCitizen": {},
    },
}
# "KNOWS" relationships are requested as undirected with SINGLE aggregation.
test_knows_spec = {"orientation": "UNDIRECTED", "aggregation": "SINGLE"}

projected_graph_object = create_projected_graph(
    gds,
    graph_name="graph-clf-emb-test",
    node_spec={"Person": test_person_spec},
    relationship_spec={"KNOWS": test_knows_spec},
)

In [16]:
# Stream predictions for the test graph, asking for the per-class
# probabilities in addition to the predicted class.
prediction_options = {"includePredictedProbabilities": True}
predictions = model.predict_stream(projected_graph_object, **prediction_options)

# Preview the first rows only.
predictions.head()

Unnamed: 0,nodeId,predictedClass,predictedProbabilities
0,9074,0,"[0.72, 0.28]"
1,9076,0,"[0.95, 0.05]"
2,9089,0,"[0.72, 0.28]"
3,9126,0,"[0.93, 0.07]"
4,9137,0,"[0.87, 0.13]"
