# Training a GDS Pipeline and Making Predictions

Build, configure, and train a node classification pipeline with the Neo4j Graph Data Science (GDS) library, then use the trained model to make predictions.

In [None]:
import sys
import os
# Extend the import path with the parent of the current working directory
# before importing the project-local helper below (presumably `utils` lives
# there -- confirm against the repository layout).
sys.path.append(os.path.join(os.curdir, ".."))
from utils import create_projected_graph

In [None]:
import os

from graphdatascience import GraphDataScience

# Connection settings can be overridden through environment variables so that
# credentials do not need to be edited into the notebook. The defaults target
# a local instance reachable at bolt://localhost:7687.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "admin")

gds = GraphDataScience(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

## Training Pipeline

In [None]:
def _label_projection(db_label):
    """Return the node projection entry for one database label.

    Both projected labels expose the same three node properties. A fresh
    dictionary is built on every call so the entries share no state.
    """
    return {
        "label": db_label,
        "properties": {
            "nbMovies": {"defaultValue": 0},
            "louvain": {},
            "isUSCitizen": {},
        },
    }


# Nodes labelled MainTrainDownSampled are projected as "Train" and nodes
# labelled MainTest as "Test".
node_projection = {
    "Train": _label_projection("MainTrainDownSampled"),
    "Test": _label_projection("MainTest"),
}
# KNOWS relationships are projected as UNDIRECTED with SINGLE aggregation.
relationship_projection = {
    "KNOWS": {"orientation": "UNDIRECTED", "aggregation": "SINGLE"},
}

projected_graph_object = create_projected_graph(
    gds,
    graph_name="graph-clf",
    node_spec=node_projection,
    relationship_spec=relationship_projection,
)
projected_graph_object

In [None]:
# Drop any pipeline still registered as "pipe-clf" so the name is free when
# the pipeline is created again.
try:
    pipe = gds.pipeline.get("pipe-clf")
except ValueError:
    # The lookup failed: there is no such pipeline, so nothing to drop.
    pass
else:
    # Kept outside the try body so that an error raised by the drop itself
    # is reported instead of being silently swallowed.
    gds.beta.pipeline.drop(pipe)

In [None]:
# Create the node classification pipeline under the name "pipe-clf".
# create() returns the pipeline object and an info result, shown below.
pipe, pipe_info = gds.beta.pipeline.nodeClassification.create("pipe-clf")
pipe_info

In [None]:
# Use the nbMovies and louvain node properties as model features.
pipe.selectFeatures(["nbMovies", "louvain"])

In [None]:
# Display the pipeline object as the cell output.
pipe

In [None]:
# Register a random forest model candidate with the given hyper-parameters.
pipe.addRandomForest(minSplitSize=30, minLeafSize=10, maxDepth=50)

In [None]:
# Configure the data split: a test fraction of 0.33 and 2 validation folds.
pipe.configureSplit(testFraction=0.33, validationFolds=2)

In [None]:
# Drop any model still registered as "model-clf-rf" so that training can
# register a model under that name again.
try:
    model = gds.model.get("model-clf-rf")
except ValueError:
    # The lookup failed: there is no such model, so nothing to drop.
    pass
else:
    # Kept outside the try body so that an error raised by the drop itself
    # is reported instead of being silently swallowed.
    gds.beta.model.drop(model)

In [None]:
# Training settings: fit on the nodes projected under "Train", predict the
# isUSCitizen property, evaluate with the two listed metrics, and use a fixed
# random seed.
train_config = {
    "modelName": "model-clf-rf",
    "targetNodeLabels": ["Train"],
    "targetProperty": "isUSCitizen",
    "metrics": ["PRECISION(class=1)", "ACCURACY"],
    "randomSeed": 11,
}
model, model_info = pipe.train(projected_graph_object, **train_config)
model_info

In [None]:
# Show the "modelSelectionStats" entry of the training result.
model_info["modelSelectionStats"]

## Making Predictions

In [None]:
# Predict only for the nodes projected under "Test" and include the predicted
# probabilities in the streamed result.
prediction_options = {
    "targetNodeLabels": ["Test"],
    "includePredictedProbabilities": True,
}
predictions = model.predict_stream(projected_graph_object, **prediction_options)
predictions.head()

In [None]:
# Number of rows in the streamed predictions.
len(predictions)

In [None]:
# Stream the true class (isUSCitizen) from the projected graph. The stream is
# restricted to the nodes projected under "Test", so training nodes are not
# transferred and do not have to be filtered out afterwards.
df_test = gds.graph.nodeProperty.stream(
    projected_graph_object,
    "isUSCitizen",
    node_labels=["Test"],
)
# Index by nodeId so the rows can be aligned with other nodeId-indexed frames.
df_test = df_test.set_index("nodeId")
df_test.head(10)

In [None]:
# Resolve the streamed ids to nodes and keep only the rows whose node carries
# the "MainTest" label.
streamed_nodes = gds.util.asNodes(list(df_test.index))
test_nodes = ["MainTest" in node.labels for node in streamed_nodes]
df_test = df_test[test_nodes]

In [None]:
# Index the predictions by nodeId, the same index df_test uses, so the two
# frames can be aligned row by row.
# NOTE: re-running this cell fails once nodeId is already the index.
predictions = predictions.set_index("nodeId")
predictions.head(5)

In [None]:
# Attach the predicted class to the true class; pandas aligns the values on
# the shared nodeId index. assign() returns a new frame instead of writing
# into the filtered df_test, which avoids pandas' SettingWithCopyWarning.
df_test = df_test.assign(prediction=predictions.predictedClass)
# Rows left without a prediction (nodes that were not part of the prediction
# run) are dropped before evaluation.
df_test = df_test.dropna(subset=["prediction"])
df_test.head()

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the true class (propertyValue) against the predicted
# class; scikit-learn puts true labels on rows and predictions on columns.
mat = confusion_matrix(df_test.propertyValue, df_test.prediction)

In [None]:
# Display the confusion matrix as the cell output.
mat

In [None]:
# Optional cleanup: uncomment to drop the pipeline once it is no longer needed.
# gds.beta.pipeline.drop(pipe)