In [2]:
import torch                              # import *first*
from transformers import pipeline         # transformers sees torch now
import requests, networkx as nx
import os, time, itertools, pickle, functools, re, sys

print("loading zero‑shot model … ", end="", flush=True)
zero_shot = pipeline(
    "zero-shot-classification",
    model="valhalla/distilbart-mnli-12-3",
)
print("done.")

loading zero‑shot model … 

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


done.


In [3]:
# ───── free local LLM gatekeeper ───────────────────────────────────

CANDIDATES = ["mathematics", "not mathematics"]

@functools.lru_cache(maxsize=8192)
def is_math_topic(label: str) -> bool:
    """Cheap heuristic first, else free MNLI classifier."""
    label_low = label.lower()
    if re.search(r"\b(algebra|geometry|calculus|number theory|mathem)", label_low):
        return True
    if re.search(r"\b(music|film|politic|football|chemical|history)", label_low):
        return False
    out = zero_shot(label, CANDIDATES)
    score = dict(zip(out["labels"], out["scores"]))["mathematics"]
    return score >= 0.5           # tweak threshold if desired

# ───── Wikidata crawl parameters ──────────────────────────────────
SPARQL   = "https://query.wikidata.org/sparql"
HEADERS  = {"User-Agent": "MathGraph-FreeLLM/1.0"}    # ASCII only
ROOTS    = ["Q395"]           # Mathematics
CHUNK    = 100               # Q‑ids per VALUES clause
PAUSE    = 0.1               # polite delay (s) between WDQS calls
MAX_NODES = 8000             # safety cap so you don’t DOS yourself

# ───── helper functions ───────────────────────────────────────────
def sparql(q: str):
    """POST query, return row list, die noisily on error."""
    r = requests.post(SPARQL,
                      data={"query": q, "format": "json"},
                      headers=HEADERS, timeout=60)
    r.raise_for_status()
    return r.json()["results"]["bindings"]

def batches(seq, n):
    it = iter(seq)
    while (chunk := list(itertools.islice(it, n))):
        yield chunk

In [4]:
# ───── Stage 1 – breadth‑first crawl with LLM filter ──────────────
all_nodes   = set(ROOTS)
frontier    = set(ROOTS)
depth       = 0

print("\n‣ crawling Wikidata …")
while frontier and len(all_nodes) < MAX_NODES:
    depth += 1
    next_frontier = set()

    for chunk in batches(frontier, CHUNK):
        vals = " ".join(f"wd:{q}" for q in chunk)
        q = f"""
        SELECT DISTINCT ?child ?childLabel WHERE {{
          VALUES ?parent {{ {vals} }}
          ?child (wdt:P279|wdt:P31) ?parent .
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
        }}
        """
        for row in sparql(q):
            child = row["child"]["value"].rsplit("/", 1)[-1]
            label = row["childLabel"]["value"]
            if child not in all_nodes and is_math_topic(label):
                all_nodes.add(child)
                next_frontier.add(child)
        time.sleep(PAUSE)

    print(f"  depth {depth}: +{len(next_frontier):,} nodes "
          f"(total {len(all_nodes):,})")
    frontier = next_frontier

print(f"✔ Stage 1 done → {len(all_nodes):,} nodes\n")


‣ crawling Wikidata …
  depth 1: +51 nodes (total 52)
  depth 2: +98 nodes (total 150)
  depth 3: +52 nodes (total 202)
  depth 4: +9 nodes (total 211)
  depth 5: +3 nodes (total 214)
  depth 6: +0 nodes (total 214)
✔ Stage 1 done → 214 nodes



In [5]:
# ───── Stage 2 – collect edges within node set ────────────────────
edges = []
node_set = set(all_nodes)
print("‣ fetching intra‑set edges …")

for chunk in batches(all_nodes, CHUNK):
    vals = " ".join(f"wd:{q}" for q in chunk)
    q = f"""
    SELECT ?parent ?child WHERE {{
      VALUES ?child {{ {vals} }}
      {{ ?child (wdt:P279|wdt:P31|wdt:P361) ?parent. }}
      UNION
      {{ ?parent wdt:P361 ?child. }}
    }}
    """
    for row in sparql(q):
        p = row["parent"]["value"].rsplit("/", 1)[-1]
        c = row["child"]["value"].rsplit("/", 1)[-1]
        if p in node_set and c in node_set:
            edges.append((p, c))
    time.sleep(PAUSE)

edges = list(set(edges))
print(f"✔ Stage 2 done → {len(edges):,} edges\n")

‣ fetching intra‑set edges …
✔ Stage 2 done → 259 edges



In [6]:
# ───── Stage 3 – English labels for remaining nodes ───────────────
labels = {}
print("‣ downloading labels …")
for chunk in batches(all_nodes, 200):
    vals = " ".join(f"wd:{q}" for q in chunk)
    q = f"""
    SELECT ?id ?label WHERE {{
      VALUES ?id {{ {vals} }}
      ?id rdfs:label ?label .
      FILTER (lang(?label) = "en")
    }}
    """
    for row in sparql(q):
        qid   = row["id"]["value"].rsplit("/", 1)[-1]
        label = row["label"]["value"]
        labels[qid] = label
    time.sleep(PAUSE)

print(f"✔ labels fetched ({len(labels):,})\n")

‣ downloading labels …
✔ labels fetched (188)



In [7]:
# ───── Stage 4 – build graph & write artifacts ────────────────────
G = nx.DiGraph()
for q in all_nodes:
    G.add_node(q, label=labels.get(q, q))
G.add_edges_from(edges)

print(f"✔ graph built → {G.number_of_nodes():,} nodes | "
      f"{G.number_of_edges():,} edges")

PICKLE = "math_graph.pkl"
with open(PICKLE, "wb") as f:
    pickle.dump(G, f, protocol=pickle.HIGHEST_PROTOCOL)
print(f"✔ pickle saved → {PICKLE}")

✔ graph built → 214 nodes | 259 edges
✔ pickle saved → math_graph.pkl


In [8]:
# ───── Stage 5 – interactive PyVis graph ──────────────────────────
print("‣ generating interactive HTML … ", end="", flush=True)
from pyvis.network import Network
net = Network(height="900px", width="100%", directed=True,
              bgcolor="#ffffff", notebook=False)
net.from_nx(G)
net.toggle_physics(True)
HTML = "math_graph.html"
net.show(HTML, notebook=False)
print(f"done.  Open {HTML} in a browser.\n")

print("🏁 All set – enjoy exploring the mathematics universe!")

‣ generating interactive HTML … math_graph.html
done.  Open math_graph.html in a browser.

🏁 All set – enjoy exploring the mathematics universe!
