# Filtered Shortest Path

Find all the shortest paths between two nodes of the graph, keeping only the edges whose property is in a filtered set of properties.

*outstanding questions:*
- can we order/filter these paths by 'uniqueness'? (e.g. connecting people because they are humans isn't that interesting!)

In [2]:
!pip install networkx

Collecting networkx
  Downloading networkx-2.6.2-py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 4.3 MB/s eta 0:00:01
[?25hInstalling collected packages: networkx
Successfully installed networkx-2.6.2
You should consider upgrading via the '/Users/kalyan/.pyenv/versions/3.9.1/envs/hc/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
# Imports
from rdflib import Graph
from rdflib.extras.external_graph_libs import rdflib_to_networkx_graph
import networkx as nx
from networkx import Graph as NXGraph
from networkx.algorithms.traversal.beamsearch import bfs_beam_edges
from networkx.algorithms.shortest_paths.generic import all_shortest_paths
from itertools import islice
import matplotlib.pyplot as plt
import statistics
import collections
import numpy as np
from rdflib import URIRef, Literal
import requests
import json


In [63]:
url = "https://d0rgkq.deta.dev/labels"

def get_labels(entities, timeout=30):
    """Fetch labels for a batch of entity URIs from the labels API at `url`.

    Args:
        entities: iterable of URIs to look up. They are sent as the `uris`
            field of a JSON request body.
        timeout: seconds to wait for the API before giving up, so that an
            unresponsive service cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON response body. The path-printing cell uses it as a
        mapping from URI string to label. An empty `entities` returns `{}`
        without contacting the API.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
        requests.Timeout: if the API does not answer within `timeout` seconds.
    """
    uris = list(entities)
    if not uris:
        # Nothing to look up: skip the network round trip entirely.
        return {}

    payload = json.dumps({"uris": uris})
    headers = {'Content-Type': 'application/json'}

    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of handing an error body back as labels.
    response.raise_for_status()
    return response.json()


In [5]:
# Load the RDF dump into an rdflib Graph.
# Reading from the remote URL takes a while (10+ minutes). On a local machine
# it is better to download the file at `path` first and point `path` at the
# local copy instead.
path = "https://heritageconnector.s3.eu-west-2.amazonaws.com/rdf/hc_dump_latest.nt"
rg = Graph()
rg.parse(source=path, format="nt")
print(f"rdflib Graph loaded successfully with {len(rg)} triples")

rdflib Graph loaded successfully with 4821010 triples


In [40]:
# Optionally get a subgraph.
# `properties` is an allow-list of predicates: the CONSTRUCT query built from
# it keeps only the triples whose predicate is listed here. Every predicate
# that is not listed is dropped, which includes skos:hasTopConcept (one of
# {OBJECT, PERSON or ORGANISATION}) and sdo:isPartOf (describes collection
# membership for objects).
# Entries that are commented out below are excluded from the subgraph as well;
# their trailing comment gives the property's label.
properties = [
    "hc:entityPERSON",
    "hc:entityORG",
    "hc:entityNORP",
    "hc:entityFAC",
    "hc:entityLOC",
    "hc:entityOBJECT",
    "hc:entityLANGUAGE",
    "hc:entityDATE",
    "sdo:birthDate",
    "sdo:deathDate",
    "sdo:foundingDate",
    "sdo:dissolutionDate",
    "foaf:maker",
    "foaf:made",
    "sdo:mentions",
    "owl:sameAs",
    "skos:related",
    "skos:relatedMatch",
#     "wdt:P101", # field of work
    "wdt:P1056",
#     "wdt:P106", # occupation
    "wdt:P127",
    "wdt:P135",
    "wdt:P136",
    "wdt:P137",
    "wdt:P1535",
#     "wdt:P17", # country
    "wdt:P176",
    "wdt:P18",
    "wdt:P180",
    "wdt:P20",
#     "wdt:P21", # sex or gender
    "wdt:P27",
    "wdt:P279",
    "wdt:P287",
    "wdt:P31",
    "wdt:P3342",
    "wdt:P452",
#     "wdt:P495", # country of origin
    "wdt:P607",
    "wdt:P61",
    "wdt:P710",
    "wdt:P749",
    "wdt:P793",
    "sdo:birthPlace",
    "sdo:deathPlace",
]

# Build a CONSTRUCT query that copies every triple whose predicate appears in
# `properties`. The PREFIX declarations expand the prefixed names used there.
query = """
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX sdo: <https://schema.org/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX smgp: <https://collection.sciencemuseumgroup.org.uk/people/>
PREFIX smgo: <https://collection.sciencemuseumgroup.org.uk/objects/>
PREFIX smgd: <https://collection.sciencemuseumgroup.org.uk/documents/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX hc: <http://www.heritageconnector.org/RDF/>
CONSTRUCT {{ ?s ?p ?o }}
WHERE {{ ?s ?p ?o. FILTER (?p in ({predicates})). }}
""".format(predicates=", ".join(properties))
print(query)

# Run the query against the full rdflib graph; the result holds the kept triples.
subg = rg.query(query)



PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX sdo: <https://schema.org/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX smgp: <https://collection.sciencemuseumgroup.org.uk/people/>
PREFIX smgo: <https://collection.sciencemuseumgroup.org.uk/objects/>
PREFIX smgd: <https://collection.sciencemuseumgroup.org.uk/documents/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX hc: <http://www.heritageconnector.org/RDF/>
CONSTRUCT { ?s ?p ?o }
WHERE { ?s ?p ?o. FILTER (?p in (hc:entityPERSON, hc:entityORG, hc:entityNORP, hc:entityFAC, hc:entityLOC, hc:entityOBJECT, hc:entityLANGUAGE, hc:entityDATE, sdo:birthDate, sdo:deathDate, sdo:foundingDate, sdo:dissolutionDate, foaf:maker, foaf:made, sdo:mentions, owl:sameAs, skos:related,

In [73]:
# Convert the rdflib data to a networkx Graph: use the filtered subgraph when
# `subg` exists in this session, otherwise fall back to the entire rdf graph.
if 'subg' not in locals():
    print("Using entire rdf graph")
    G = rdflib_to_networkx_graph(rg)
else:
    print("Using subgraph generated in last cell")
    G = rdflib_to_networkx_graph(subg)
print(f"networkx Graph loaded successfully with length {len(G)}")

Using subgraph generated in last cell


KeyboardInterrupt: 

## Shortest path

In [74]:
# Endpoints of the path search: Joy Division to Daphne Oram
ent_a = URIRef("http://www.wikidata.org/entity/Q172763")
ent_b = URIRef("https://collection.sciencemuseumgroup.org.uk/people/cp127589")

# Tim Burton to 'Vampire' aircraft
# ent_a, ent_b = URIRef("http://www.wikidata.org/entity/Q56008"), URIRef("https://collection.sciencemuseumgroup.org.uk/objects/co8223281")

In [75]:
# all_shortest_paths returns a one-shot generator. Materialise it so that
# `all_sps` is not left exhausted (silently empty) once the path graphs have
# been built from it, and so any lookup error surfaces in this cell.
all_sps = list(all_shortest_paths(G, ent_a, ent_b))
# One small path-shaped graph per shortest path, preserving the node order.
path_graphs = [nx.path_graph(sp) for sp in all_sps]

In [76]:
# Print each shortest path as a chain of hops: one "- <node> -> <predicate>"
# line per edge, followed by a final line for the node the path ends on.
# Nodes are shown by label when the labels API returns one, else by raw value.
for path_idx, p in enumerate(path_graphs):
    print(f"Path {path_idx+1}")

    path_edges = list(p.edges())
    # One label request per path instead of one per edge: every interior node
    # belongs to two edges, so per-edge requests fetched most labels twice.
    ent_labels = get_labels([node for node in p.nodes() if node.startswith("http")])

    for edge_idx, (subj, obj) in enumerate(path_edges):
        # Each edge of G carries the triples it was built from; element 1 of a
        # triple is its predicate. Only the first predicate is displayed.
        predicates = [triple[1] for triple in G.edges[subj, obj]['triples']]
        print(f"- {ent_labels.get(str(subj)) or subj} -> {predicates[0]}")

        # The last edge also closes the chain with the path's final node.
        if edge_idx + 1 == len(path_edges):
            print(f"- {ent_labels.get(str(obj)) or obj}")


Path 1
- Joy Division -> http://www.wikidata.org/prop/direct/P31
- http://www.wikidata.org/entity/Q215380 -> http://www.wikidata.org/prop/direct/P31
- Kraftwerk -> http://www.heritageconnector.org/RDF/entityORG
- Kraftwerk Uncovered -> http://www.heritageconnector.org/RDF/entityPERSON
- Daphne Oram
Path 2
- Joy Division -> http://www.wikidata.org/prop/direct/P31
- http://www.wikidata.org/entity/Q215380 -> http://www.wikidata.org/prop/direct/P31
- Icebreaker -> http://www.heritageconnector.org/RDF/entityORG
- Kraftwerk Uncovered -> http://www.heritageconnector.org/RDF/entityPERSON
- Daphne Oram
Path 3
- Joy Division -> http://www.wikidata.org/prop/direct/P31
- http://www.wikidata.org/entity/Q215380 -> http://www.wikidata.org/prop/direct/P31
- Boomkat -> http://www.heritageconnector.org/RDF/entityORG
- Oramics To Electronica Phase Two -> http://www.heritageconnector.org/RDF/entityPERSON
- Daphne Oram
Path 4
- Joy Division -> http://www.wikidata.org/prop/direct/P136
- http://www.wikidata.