# Start of Provenance Part of Task D
Imports for Provenance and PageRank

In [None]:
from prov.model import ProvDocument
from prov.dot import prov_to_dot
import random
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from mpl_toolkits.axes_grid1 import make_axes_locatable
from IPython.display import Image
import os

Setup provenance document and NetworkX graph

In [None]:
# A single provenance document collects every agent, entity and activity.
document = ProvDocument()

# Register the prefixes used by the qualified names below (prefix, URI).
for prefix, uri in (("Leeds", "Leeds"), ("Civ", "Civilian"), ("Pol", "Police")):
    document.add_namespace(prefix, uri)

# The police agent; every car crash report is attributed to it.
police = document.agent("Pol:Police")

# Directed graph holding the same relations as edges, used for PageRank.
G = nx.DiGraph()

Functions to create events in the provenance network and the associations between things

In [None]:
def createDriver(name):
    """Register a driver and their car, and link the car to the driver.

    Creates the agent ``Civ:person_<name>`` and the entity ``Civ:car_<name>``
    in the module-level provenance ``document``, attributes the car to the
    driver, and records the same relation as a car -> driver edge in ``G``.

    Args:
        name: String suffix appended to the person and car identifiers.

    Returns:
        A ``(driver, car)`` tuple holding the two new provenance records.
    """
    person = document.agent("Civ:person_" + name)
    vehicle = document.entity("Civ:car_" + name)

    # Keep the provenance document and the NetworkX graph in step.
    document.wasAttributedTo(vehicle, person)
    G.add_edge(vehicle, person)

    return person, vehicle

def createCarCrash(driverOne, driverTwo, street):
    """Record a car crash between two drivers and the police report on it.

    Adds a crash activity and a crash report entity to the module-level
    provenance ``document`` and mirrors every relation as an edge in ``G``:
    the crash uses both cars and the street, is associated with both
    drivers, generates the report, and the report is attributed to the
    module-level ``police`` agent.

    Args:
        driverOne: ``(driver, car)`` pair for the first driver; index 0 is
            the driver and index 1 is the car.
        driverTwo: ``(driver, car)`` pair for the second driver.
        street: Provenance record for the street the crash happened on.

    Returns:
        The car crash report entity.
    """
    # Random id between 0 and 10000; not guaranteed unique across calls.
    crash_id = int(random.uniform(0, 10) * 1000)
    crash = document.activity(f"Civ:crash_car_{crash_id}")
    report = document.entity(f"Pol:car_crash_report_{crash_id}")

    # Things the crash used: both cars, then the street.
    for used_record in (driverOne[1], driverTwo[1], street):
        document.used(crash, used_record)
        G.add_edge(crash, used_record)

    # Agents involved in the crash: both drivers.
    for participant in (driverOne[0], driverTwo[0]):
        document.wasAssociatedWith(crash, participant)
        G.add_edge(crash, participant)

    # The report comes out of the crash and is written by the police.
    document.wasGeneratedBy(report, crash)
    G.add_edge(report, crash)
    document.wasAttributedTo(report, police)
    G.add_edge(report, police)

    return report

In [None]:
# Street entities the crashes take place on.
streets = [
    document.entity(street_id)
    for street_id in ("Leeds:Trent Street", "Leeds:Thames Street", "Leeds:Taf Street")
]
# Disabled alternative: weighted random street selection. The weight list has
# four entries although only three streets are defined.
# random_streets = random.choices(streets, [1,1,25,1], k=30)


# Eight (driver, car) pairs, named after the numbers 0-7.
driver_car_pairs = [createDriver(str(number)) for number in range(8)]

# Four crashes between consecutive driver pairs; the first two crashes happen
# on the same street (streets[0]).
# Disabled alternative: pair the drivers in a loop using the random streets.
# for i in range(0,len(driver_car_pairs) - 1, 2):
#     createCarCrash(driver_car_pairs[i], driver_car_pairs[i+1], random_streets[i])
createCarCrash(driver_car_pairs[0], driver_car_pairs[1], streets[0])
createCarCrash(driver_car_pairs[2], driver_car_pairs[3], streets[0])
createCarCrash(driver_car_pairs[4], driver_car_pairs[5], streets[1])
createCarCrash(driver_car_pairs[6], driver_car_pairs[7], streets[2])

Visualize the prov document and the PageRank diagram

In [None]:
# Code adapted from week 10 labs
# https://github.kcl.ac.uk/k21190656/7CUSMNDA-2024/blob/main/notebooks/week_10.ipynb
def plot_pagerank(graph, pagerank):
    """Draw ``graph`` with nodes coloured by PageRank score, plus a colour bar.

    A new 8x7 inch figure is created and left as the current pyplot figure,
    so the caller can save it with ``plt.savefig``.

    Args:
        graph: NetworkX graph to draw.
        pagerank: Mapping from each node of ``graph`` to its PageRank score,
            for example the result of ``nx.pagerank(graph)``.

    Raises:
        KeyError: If a node of ``graph`` has no entry in ``pagerank``.
    """
    _, ax = plt.subplots(1, 1, figsize=(8, 7))

    # Look the score up per node, in the graph's own node order, instead of
    # relying on the dict's iteration order matching the order nodes are drawn.
    node_color = [pagerank[node] for node in graph.nodes()]
    nx.draw(
        graph,
        ax=ax,
        with_labels=True,
        alpha=0.8,
        arrows=True,
        node_color=node_color,
        cmap=plt.cm.viridis,
    )

    # Colour bar for the PageRank scores, scaled to the observed score range.
    sm = plt.cm.ScalarMappable(
        cmap=plt.get_cmap('viridis'),
        norm=colors.Normalize(vmin=min(node_color), vmax=max(node_color)),
    )
    divider = make_axes_locatable(ax)
    cax = divider.append_axes("right", size="3%", pad=0.05)
    plt.colorbar(sm, cax=cax)

# Delete any images left over from a previous run before regenerating them.
for stale_image in ("leeds-prov.png", "leeds-pagerank.png"):
    if not os.path.isfile(stale_image):
        continue
    os.remove(stale_image)

# Convert the provenance document to a dot graph for rendering.
dot = prov_to_dot(document)

# PageRank over the mirrored NetworkX graph, drawn and saved as an image.
pagerank_scores = nx.pagerank(G, alpha=0.95)
plot_pagerank(G, pagerank_scores)
plt.savefig("leeds-pagerank.png")

# Render the provenance graph to disk and display it as the cell output.
dot.write_png("leeds-prov.png")
Image(filename="leeds-prov.png")


# Start of Knowledge Graph Embeddings
Imports for PyKeen

In [None]:
from pykeen.datasets import CoDExMedium
from pykeen.pipeline import pipeline
from pykeen.hpo import hpo_pipeline
from pykeen.evaluation import RankBasedEvaluator
from pykeen.sampling import BasicNegativeSampler
from pykeen.models import CompGCN
from optuna.samplers import GridSampler


In [None]:
def convert_nx_to_dataset(G):
    """Placeholder for converting a NetworkX graph into a dataset for PyKEEN.

    Not implemented yet: the function currently ignores ``G`` and always
    returns ``None``.
    """
    # TODO: turn each edge (its two endpoints plus its "label" attribute) into
    # a (head, relation, tail) triple.
    # NOTE(review): the edges added to the module-level graph in this file are
    # created without a "label" attribute -- confirm where the relation name
    # should come from.
    return None

Import the CoDExMedium dataset for evaluation and convert the NetworkX graph to triples

In [None]:
# CoDExMedium dataset, created without inverse triples; it is passed to the
# pipeline calls below as their ``validation`` argument.
# NOTE(review): confirm that ``pipeline`` accepts a dataset object for
# ``validation`` and actually uses it when ``dataset`` is also given.
validation  = CoDExMedium(create_inverse_triples=False)
# NOTE(review): convert_nx_to_dataset currently returns None, so ``dataset``
# is None until that function is implemented.
dataset = convert_nx_to_dataset(G) 

TransE

In [None]:
# Run the PyKEEN pipeline with a TransE model: embedding dimension 50,
# learning rate 0.01, 20 epochs, fixed random seed.
TransE_pipeline_result = pipeline(
    # Model
    model='TransE',
    model_kwargs={"embedding_dim": 50},
    # Data
    dataset=dataset,
    validation=validation,
    # Training
    training_loop='sLCWA',
    negative_sampler=BasicNegativeSampler,
    optimizer_kwargs={"lr": 1.0e-2},
    training_kwargs={"num_epochs": 20, "use_tqdm_batch": False},
    # Evaluation
    evaluator=RankBasedEvaluator,
    evaluation_kwargs={"use_tqdm": True},
    # Runtime
    random_seed=1,
    device='gpu',
)

RotatE

In [None]:
# Run the PyKEEN pipeline with a RotatE model: embedding dimension 50,
# learning rate 0.01, 128 epochs, fixed random seed.
RotatE_pipeline_result = pipeline(
    # Model
    model='RotatE',
    model_kwargs={"embedding_dim": 50},
    # Data
    dataset=dataset,
    validation=validation,
    # Training
    training_loop='sLCWA',
    negative_sampler=BasicNegativeSampler,
    optimizer_kwargs={"lr": 1.0e-2},
    training_kwargs={"num_epochs": 128, "use_tqdm_batch": False},
    # Evaluation
    evaluator=RankBasedEvaluator,
    evaluation_kwargs={"use_tqdm": True},
    # Runtime
    random_seed=1,
    device='gpu',
)

GCN

In [None]:
# Run the PyKEEN pipeline with the CompGCN model class: embedding dimension
# 50, learning rate 0.01, 128 epochs, fixed random seed.
# NOTE(review): CompGCN may require a dataset created with inverse triples --
# confirm against the PyKEEN documentation.
GCN_pipeline_result = pipeline(
    # Model
    model=CompGCN,
    model_kwargs={"embedding_dim": 50},
    # Data
    dataset=dataset,
    validation=validation,
    # Training
    training_loop='sLCWA',
    negative_sampler=BasicNegativeSampler,
    optimizer_kwargs={"lr": 1.0e-2},
    training_kwargs={"num_epochs": 128, "use_tqdm_batch": False},
    # Evaluation
    evaluator=RankBasedEvaluator,
    evaluation_kwargs={"use_tqdm": True},
    # Runtime
    random_seed=1,
    device='gpu',
)

Visualise TransE, RotatE, and GCN

In [None]:
# Delete embedding plots left over from a previous run.
for old_plot in (
    "leeds-TransE-embedding.png",
    "leeds-RotatE-embedding.png",
    "leeds-GCN-embedding.png",
):
    if not os.path.isfile(old_plot):
        continue
    os.remove(old_plot)

In [None]:
# Create a 12x5 inch figure, call the TransE result's plot_er with relation
# plotting disabled, then save the current figure to disk.
# NOTE(review): ``ax`` is not passed to plot_er; presumably it draws on the
# current axes -- confirm.
fig, ax = plt.subplots(figsize=(12, 5))
TransE_pipeline_result.plot_er(plot_relations=False)
plt.savefig("leeds-TransE-embedding.png")

In [None]:
# Create a 12x5 inch figure, call the RotatE result's plot_er with relation
# plotting disabled, then save the current figure to disk.
# NOTE(review): ``ax`` is not passed to plot_er; presumably it draws on the
# current axes -- confirm.
fig, ax = plt.subplots(figsize=(12, 5))
RotatE_pipeline_result.plot_er(plot_relations=False)
plt.savefig("leeds-RotatE-embedding.png")

In [None]:
# Create a 12x5 inch figure, call the GCN result's plot_er with relation
# plotting disabled, then save the current figure to disk.
# NOTE(review): ``ax`` is not passed to plot_er; presumably it draws on the
# current axes -- confirm.
fig, ax = plt.subplots(figsize=(12, 5))
GCN_pipeline_result.plot_er(plot_relations=False)
plt.savefig("leeds-GCN-embedding.png")

Evaluate TransE, RotatE, and GCN

In [None]:
# Read the 'mrr' and 'hits@k' metrics from each pipeline result.
# NOTE(review): 'hits@k' leaves k to PyKEEN's default, presumably 10 as the
# variable names suggest -- confirm.
TransE_Emrr = TransE_pipeline_result.get_metric('mrr')
TransE_hits_at_10 = TransE_pipeline_result.get_metric('hits@k')

RotatE_Emrr = RotatE_pipeline_result.get_metric('mrr')
RotatE_hits_at_10 = RotatE_pipeline_result.get_metric('hits@k')

GCN_Emrr = GCN_pipeline_result.get_metric('mrr')
GCN_hits_at_10 = GCN_pipeline_result.get_metric('hits@k')

# Side-by-side comparison of the three models. The second heading now reads
# "Hits at 10" to match the *_hits_at_10 values (it used to say "10k").
print(f"Mean Reciprocal Rank\n\t TransE: {TransE_Emrr} | RotatE: {RotatE_Emrr} | GCN: {GCN_Emrr}")
print(f"Hits at 10\n\t TransE: {TransE_hits_at_10} | RotatE: {RotatE_hits_at_10} | GCN: {GCN_hits_at_10}")

DELETE THIS CODE BEFORE SUBMITTING

In [None]:
# Hyper-parameter search scratch cell: 25 trials on the model chosen by index
# from the list below (model[0] is TransE).
model = ['TransE','RotatE', CompGCN]
hpo_pipeline_result = hpo_pipeline(
    dataset = dataset,
    model= model[0],
    device='gpu',
    training_loop='sLCWA',
    n_trials= 25,
    optimizer_kwargs=dict(lr=1.0e-2),
    training_kwargs=dict(num_epochs=20, use_tqdm_batch=False),
    evaluation_kwargs=dict(use_tqdm=True),

    negative_sampler=BasicNegativeSampler,
    evaluator=RankBasedEvaluator
)

# Save the search result. The result is bound to `hpo_pipeline_result`; the
# previously used name `TransE_hpo_pipeline_result` was never defined and
# raised a NameError.
hpo_pipeline_result.save_to_directory("TransE.HPO")

