In [1]:
#!/usr/bin/env python3
# math_graph_prereq_noP361.py  – subclass/instance + MSC only

# —————————————————— imports
import os, time, itertools, pickle, functools, re, requests, networkx as nx
import torch
from transformers import pipeline

# —————————————————— zero‑shot maths gatekeeper
print("loading zero‑shot model … ", end="", flush=True)
zero_shot = pipeline("zero-shot-classification",
                     model="valhalla/distilbart-mnli-12-3",
                     device=-1)          # CPU
print("done.")

CANDIDATES = ["mathematics", "not mathematics"]

@functools.lru_cache(maxsize=8192)
def is_math(label: str) -> bool:
    s = label.lower()
    if re.search(r"\b(algebra|geometry|calculus|number theory|mathem)", s):
        return True
    if re.search(r"\b(music|film|politic|football|chemical|history)", s):
        return False
    return zero_shot(label, CANDIDATES)["labels"][0] == "mathematics"

# —————————————————— crawl params
SPARQL   = "https://query.wikidata.org/sparql"
HEADERS  = {"User-Agent": "MathGraph-NoP361/1.0"}
ROOTS    = ["Q395"]          # mathematics
CHUNK    = 100; PAUSE = 0.1; MAX_NODES = 8000

def sparql(q):  # helper
    r = requests.post(SPARQL, data={"query": q, "format": "json"},
                      headers=HEADERS, timeout=60)
    r.raise_for_status()
    return r.json()["results"]["bindings"]

def batches(seq, n):
    it = iter(seq)
    while (chunk := list(itertools.islice(it, n))):
        yield chunk

  from .autonotebook import tqdm as notebook_tqdm


loading zero‑shot model … 

Device set to use cpu


done.


In [2]:
# —————————————————— Stage 1: BFS on P279/P31
nodes, frontier = set(ROOTS), set(ROOTS)
print("\n‣ BFS crawl …")
while frontier and len(nodes) < MAX_NODES:
    nxt = set()
    for chunk in batches(frontier, CHUNK):
        vals = " ".join(f"wd:{q}" for q in chunk)
        q = f"""
        SELECT ?child ?childLabel WHERE {{
          VALUES ?parent {{ {vals} }}
          ?child (wdt:P279|wdt:P31) ?parent .
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
        }}"""
        for row in sparql(q):
            cid, lab = row["child"]["value"].split("/")[-1], row["childLabel"]["value"]
            if cid not in nodes and is_math(lab):
                nodes.add(cid); nxt.add(cid)
        time.sleep(PAUSE)
    print(f"  +{len(nxt):3}  (total {len(nodes)})"); frontier = nxt
print(f"✔ Stage 1 nodes: {len(nodes)}")


‣ BFS crawl …
  + 51  (total 52)
  + 98  (total 150)
  + 52  (total 202)
  +  9  (total 211)
  +  3  (total 214)
  +  0  (total 214)
✔ Stage 1 nodes: 214


In [4]:
# ───── Stage 2 – collect ALL P279 / P31 edges (no dropping) ──────
raw_edges = set()
for chunk in batches(nodes, CHUNK):
    vals = " ".join(f"wd:{q}" for q in chunk)
    q = f"""
    SELECT ?p ?c ?rel WHERE {{
      VALUES ?p {{ {vals} }}
      ?p ?rel ?c .
      FILTER(?rel IN (wdt:P279, wdt:P31))
    }}"""
    for r in sparql(q):
        parent = r["p"]["value"].rsplit("/", 1)[-1]
        child  = r["c"]["value"].rsplit("/", 1)[-1]
        prop   = r["rel"]["value"].rsplit("/", 1)[-1]   # P279 or P31
        raw_edges.add((parent, child, prop))
    time.sleep(PAUSE)

print(f"✔ Stage 2 raw edges: {len(raw_edges)}")


✔ Stage 2 raw edges: 391


In [5]:
# —————————————————— Stage 3: labels
labels = {}
for chunk in batches(nodes, 200):
    vals = " ".join(f"wd:{q}" for q in chunk)
    q = f"""
    SELECT ?id ?label WHERE {{
      VALUES ?id {{ {vals} }}
      ?id rdfs:label ?label .
      FILTER(lang(?label)="en")
    }}"""
    for r in sparql(q):
        labels[r["id"]["value"].split("/")[-1]] = r["label"]["value"]
    time.sleep(PAUSE)
print(f"✔ labels: {len(labels)}")

✔ labels: 188


In [None]:
# —————————————————— Stage 4: enrichment (P279 both ways)
def enrich(snapshot):
    extra = []
    for chunk in batches(snapshot, CHUNK):
        vals = " ".join(f"wd:{q}" for q in chunk)
        q = f"""
        SELECT ?a ?b ?bLabel WHERE {{
          VALUES ?a {{ {vals} }}
          {{ ?a wdt:P279 ?b. }}
          UNION
          {{ ?b wdt:P279 ?a. }}
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
        }}"""
        for r in sparql(q):
            a = r["a"]["value"].split("/")[-1]
            b = r["b"]["value"].split("/")[-1]
            lab = r.get("bLabel", {}).get("value","")
            if b in nodes or (lab and is_math(lab)):
                if b not in nodes:
                    nodes.add(b); labels[b] = lab or b
                extra.append((a,b,"P279"))
        time.sleep(PAUSE)
    return extra

loop=0
while True:
    loop+=1; snap=list(nodes)
    new = enrich(snap)
    if not new: break
    raw_edges.update(new)
    print(f"  enrich pass {loop}: +{len(new)} edges, {len(nodes)} nodes")

print(f"✔ after enrichment: {len(nodes)} nodes, {len(raw_edges)} edges")

  enrich pass 1: +389 edges, 225 nodes
  enrich pass 2: +572 edges, 393 nodes
  enrich pass 3: +910 edges, 541 nodes
  enrich pass 4: +1427 edges, 848 nodes
  enrich pass 5: +2503 edges, 1500 nodes


In [None]:
# —————————————————— Stage 5: MSC glue edges
msc_edges=[]
for chunk in batches(nodes, CHUNK):
    vals=" ".join(f"wd:{q}" for q in chunk)
    q=f"""
    SELECT ?x ?y WHERE {{
      VALUES ?x {{ {vals} }}
      ?x wdt:P2219 ?code . FILTER(strlen(?code)=2)
      ?y wdt:P2219 ?code . FILTER(?x!=?y)
    }}"""
    for r in sparql(q):
        msc_edges.append((r["x"]["value"].split("/")[-1],
                          r["y"]["value"].split("/")[-1],
                          "MSC"))
    time.sleep(PAUSE)
raw_edges.update(msc_edges)
print(f"✔ MSC edges: {len(msc_edges)}  (total {len(raw_edges)})")

In [None]:
# —————————————————— Stage 6: build weighted graph
WEIGHT = {"P279":1.0, "P31":1.0, "MSC":3.0}
COLOR  = {"P279":"#1f77b4", "P31":"#1f77b4", "MSC":"#2ca02c"}

G = nx.DiGraph()
for n in nodes:
    G.add_node(n, label=labels.get(n,n))
for p,c,prop in raw_edges:
    if p in nodes and c in nodes:
        G.add_edge(p,c,weight=WEIGHT[prop],color=COLOR[prop],prop=prop)

print(f"✔ graph: {G.number_of_nodes()} nodes | {G.number_of_edges()} edges")