# Link Prediction for future collaborations

In [1]:
import sys
import os
# Make the directory two levels above the current working directory importable.
# NOTE(review): presumably this is where the project-local `utils` module lives;
# the path is relative to os.curdir, so it depends on where the kernel was started.
sys.path.append(os.path.join(os.curdir, "../.."))
import pandas as pd
from utils import create_projected_graph

In [2]:
from graphdatascience import GraphDataScience

# Connection settings are read from the environment so that real credentials
# do not have to be committed with the notebook. The fallbacks reproduce the
# original local-development values, so the cell behaves exactly as before
# when none of the NEO4J_* variables are set.
gds = GraphDataScience(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "admin"),
    ),
    database=os.environ.get("NEO4J_DATABASE", "neo4j"),
)

## Configuring and training the pipeline

In [3]:
# Node projection: nodes labelled "MainComponent" are exposed in the in-memory
# graph as "Person", carrying the "louvain" property (default 1 when missing).
person_projection = {
    "label": "MainComponent",
    "properties": {"louvain": {"defaultValue": 1}},
}

# Relationship projection: "KNOWS_TRAIN" relationships are exposed as "KNOWS",
# undirected, with a "weight" property defaulting to 1.0.
knows_projection = {
    "type": "KNOWS_TRAIN",
    "orientation": "UNDIRECTED",
    "properties": {"weight": {"defaultValue": 1.0}},
}

projected_graph_object = create_projected_graph(
    gds,
    graph_name="graph-lp-collab",
    node_spec={"Person": person_projection},
    relationship_spec={"KNOWS": knows_projection},
)
projected_graph_object

Graph({'graphName': 'graph-lp-collab', 'nodeCount': 36878, 'relationshipCount': 588862, 'database': 'neo4j', 'configuration': {'relationshipProjection': {'KNOWS': {'orientation': 'UNDIRECTED', 'aggregation': 'DEFAULT', 'type': 'KNOWS_TRAIN', 'properties': {'weight': {'defaultValue': 1.0, 'property': 'weight', 'aggregation': 'DEFAULT'}}}}, 'jobId': 'b58fc93a-cfb8-4d7f-ab17-484fa9efb119', 'nodeProjection': {'Person': {'label': 'MainComponent', 'properties': {'louvain': {'defaultValue': 1, 'property': 'louvain'}}}}, 'relationshipProperties': {}, 'creationTime': neo4j.time.DateTime(2023, 1, 12, 17, 4, 26, 367889085, tzinfo=<DstTzInfo 'Europe/Paris' CET+1:00:00 STD>), 'validateRelationships': False, 'readConcurrency': 4, 'sudo': False, 'nodeProperties': {}}, 'schema': {'graphProperties': {}, 'relationships': {'KNOWS': {'weight': 'Float (DefaultValue(1.0), PERSISTENT, Aggregation.DEFAULT)'}}, 'nodes': {'Person': {'louvain': 'Integer (DefaultValue(1), PERSISTENT)'}}}, 'memoryUsage': '44 MiB'}

In [4]:
# Compute 32-dimensional node2vec embeddings and store them on the in-memory
# graph under the "embedding" property (used later as a link feature).
# A random seed is supplied so the stochastic embedding step is seeded like the
# pipeline training further down (which also uses 42).
# NOTE(review): depending on the GDS version and concurrency setting, the seed
# may only fix the random-walk sampling and the embeddings can still vary
# between runs - confirm against the installed GDS version.
gds.beta.node2vec.mutate(
    projected_graph_object,
    mutateProperty="embedding",
    embeddingDimension=32,
    randomSeed=42,
)

nodeCount                                                            36878
nodePropertiesWritten                                                36878
lossPerIteration                                      [100484612.35727863]
mutateMillis                                                            -1
postProcessingMillis                                                     0
preProcessingMillis                                                      0
computeMillis                                                        52265
configuration            {'negativeSamplingExponent': 0.75, 'initialLea...
Name: 0, dtype: object

In [5]:
# Best-effort cleanup: look up a pipeline named "pipe-lp-collab" and drop it so
# the notebook can be re-run from the top.
# A ValueError from either call is deliberately ignored.
# NOTE(review): presumably ValueError is what the client raises when no such
# pipeline exists - confirm against the installed graphdatascience version.
try:
    pipe = gds.pipeline.get("pipe-lp-collab")
    gds.beta.pipeline.drop(pipe)
except ValueError:
    pass

In [6]:
# Create an empty link-prediction training pipeline named "pipe-lp-collab".
# `pipe` is the handle used to configure and train it; `pipe_info` is the
# summary returned by the create call, displayed as the cell output.
pipe, pipe_info = gds.beta.pipeline.linkPrediction.create("pipe-lp-collab")
pipe_info

name                                                    pipe-lp-collab
nodePropertySteps                                                   []
featureSteps                                                        []
splitConfig          {'negativeSamplingRatio': 1.0, 'testFraction':...
autoTuningConfig                                     {'maxTrials': 10}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

In [7]:
# Register two link-feature steps on the pipeline:
#   - "cosine" over the "embedding" node property
#   - "same_category" over the "louvain" node property
# Alternative not currently used: a "hadamard" feature over "embedding".
pipe.addFeature("cosine", nodeProperties=["embedding"])
pipe.addFeature("same_category", nodeProperties=["louvain"])

name                                                    pipe-lp-collab
nodePropertySteps                                                   []
featureSteps         [{'name': 'COSINE', 'config': {'nodeProperties...
splitConfig          {'negativeSamplingRatio': 1.0, 'testFraction':...
autoTuningConfig                                     {'maxTrials': 10}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

In [8]:
# Add a single logistic-regression model candidate using the library defaults.
# Explicit hyper-parameters tried earlier and currently disabled:
#   tolerance=0.0001, maxEpochs=400, learningRate=0.01
pipe.addLogisticRegression()

name                                                    pipe-lp-collab
nodePropertySteps                                                   []
featureSteps         [{'name': 'COSINE', 'config': {'nodeProperties...
splitConfig          {'negativeSamplingRatio': 1.0, 'testFraction':...
autoTuningConfig                                     {'maxTrials': 10}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

In [9]:
# Best-effort cleanup: look up a trained model named "model-lp-collab" and drop
# it, so a later training run can reuse the same model name.
# NOTE(review): NameError is swallowed in addition to ValueError; it is not
# obvious which of these calls would raise it - confirm whether it is still needed.
try:
    model = gds.model.get("model-lp-collab")
    gds.beta.model.drop(model)
except (ValueError, NameError):
    pass

In [10]:
# Memory estimate for a training run on the projected graph. The settings
# mirror the ones used for the actual training call in this notebook.
estimate_config = {
    "targetRelationshipType": "KNOWS",
    "modelName": "model-lp-collab",
    "randomSeed": 42,
}
pipe.train_estimate(projected_graph_object, **estimate_config)

requiredMemory                                    [17 MiB ... 229 MiB]
treeView             Memory Estimation: [17 MiB ... 229 MiB]\n|-- a...
mapView              {'components': [{'components': [{'components':...
bytesMin                                                      17901360
bytesMax                                                     240267304
nodeCount                                                        36878
relationshipCount                                               588862
heapPercentageMin                                                  0.1
heapPercentageMax                                                  0.1
Name: 0, dtype: object

In [11]:
# Train the pipeline on the projected graph, predicting "KNOWS" relationships.
# `model` is the trained model handle; `train_result` holds the training summary.
training_config = {
    "targetRelationshipType": "KNOWS",
    "modelName": "model-lp-collab",
    "randomSeed": 42,
}
model, train_result = pipe.train(projected_graph_object, **training_config)

In [12]:
# Show the model-selection section of the training summary: the candidates
# tried, their train/validation metrics, and the winning parameters.
train_result["modelSelectionStats"]

{'modelCandidates': [{'metrics': {'AUCPR': {'validation': {'min': 0.9897480638648336,
      'avg': 0.9901698196424231,
      'max': 0.9909638230168275},
     'train': {'min': 0.9897688881615645,
      'avg': 0.9901689736188725,
      'max': 0.9903785385705357}}},
   'parameters': {'maxEpochs': 100,
    'minEpochs': 1,
    'penalty': 0.0,
    'patience': 1,
    'methodName': 'LogisticRegression',
    'batchSize': 100,
    'tolerance': 0.001,
    'learningRate': 0.001}}],
 'bestParameters': {'maxEpochs': 100,
  'minEpochs': 1,
  'penalty': 0.0,
  'patience': 1,
  'methodName': 'LogisticRegression',
  'batchSize': 100,
  'tolerance': 0.001,
  'learningRate': 0.001},
 'bestTrial': 1}

## Making predictions

In [13]:
# Stream predicted links from the trained model over the projected graph and
# preview the first rows (columns: node1, node2, probability).
# NOTE(review): no randomSeed is passed here, so with sampleRate < 1 the result
# is presumably not reproducible between runs - confirm and seed if needed.
# NOTE(review): the recorded output shows probabilities clustered around 0.50
# even though training reported AUCPR of about 0.99; worth investigating
# before relying on the ranking.
res = model.predict_stream(
    projected_graph_object,
    topK=5,
    # threshold=0.5,
    sampleRate=0.8,
    # Optional tuning knobs, currently left at their defaults:
    # randomJoins=2,
    # maxIterations=3,
    # concurrency=1,
    # randomSeed=42,
)
res.head()

Link Prediction Predict Pipeline:   0%|          | 0/100 [00:00<?, ?%/s]

Unnamed: 0,node1,node2,probability
0,9047,21979,0.500482
1,9047,43523,0.500397
2,9047,43525,0.500395
3,9047,43573,0.500382
4,9047,43524,0.500381
