In [1]:
import time
import requests
from collections import defaultdict
import numpy as np

BASE_SEARCH = "https://inspirehep.net/api/literature"
BASE_RECORD = "https://inspirehep.net/api/literature/{recid}"

# --- Helpers ---------------------------------------------------------------

def to_recid(identifier):
    """
    Accepts:
      - integer or str recid (e.g., '1767256')
      - arXiv id (e.g., '1707.01621' or 'hep-ph/0603175')
      - DOI (e.g., '10.1103/PhysRevLett.19.1264')
    Returns INSPIRE recid (int).
    """
    # If it's already a recid-like integer, use it
    try:
        return int(identifier)
    except Exception:
        pass

    # Otherwise, resolve via a search (use fields=control_number to reduce payload)
    if identifier.lower().startswith("10."):
        q = f"doi:{identifier}"
    else:
        q = f"arxiv:{identifier}"

    r = requests.get(BASE_SEARCH, params={"q": q, "size": 1, "fields": "control_number"}, timeout=30)
    r.raise_for_status()
    hits = r.json().get("hits", {}).get("hits", [])
    if not hits:
        raise ValueError(f"Could not resolve identifier: {identifier}")
    return int(hits[0]["metadata"]["control_number"])

def get_cited_recids(recid):
    """
    Returns a list of recids that 'recid' cites (as resolved by INSPIRE).
    We request only 'references' field to minimize payload.
    """
    url = BASE_RECORD.format(recid=recid)
    # Although single-record 'fields' is not officially supported, you can emulate via a search:
    r = requests.get(BASE_SEARCH, params={"q": f"recid:{recid}", "size": 1, "fields": "references"}, timeout=30)
    r.raise_for_status()
    hits = r.json().get("hits", {}).get("hits", [])
    if not hits:
        return []
    refs = hits[0]["metadata"].get("references", []) or []

    cited = []
    for ref in refs:
        # When resolved, INSPIRE includes a linked record like:
        # ref["record"] = {"$ref": "https://inspirehep.net/api/literature/XXXXXXX"}
        rec = ref.get("record")
        if isinstance(rec, dict) and "$ref" in rec:
            try:
                cited_recid = int(rec["$ref"].rstrip("/").split("/")[-1])
                cited.append(cited_recid)
            except Exception:
                pass
    return cited

# --- Main builder ----------------------------------------------------------

def citation_matrix(identifiers, binary=True, sleep_every=15, sleep_secs=5):
    """
    identifiers: list of recids, arXiv IDs, or DOIs
    binary=True  -> M[i,j] in {0,1}
    binary=False -> M[i,j] = number of distinct references to j in i (usually 0/1 anyway)

    Returns: (M, recids, index_map) where recids[i] is the recid of row/col i.
    """
    # 1) Normalize to recids
    recids = [to_recid(x) for x in identifiers]

    # 2) Build an index map recid -> row/col index
    idx = {r: i for i, r in enumerate(recids)}

    # 3) Fetch references for each recid (respect INSPIRE rate-limit: 15 req / 5s)
    N = len(recids)
    M = np.zeros((N, N), dtype=int)
    req_counter = 0

    for i, r in enumerate(recids):
        if req_counter and req_counter % sleep_every == 0:
            time.sleep(sleep_secs)  # gentle rate-limit handling
        cited = get_cited_recids(r)
        req_counter += 1

        # 4) Fill matrix
        if binary:
            # Set 1 if j in cited AND j is within our set
            for cj in set(cited):
                j = idx.get(cj)
                if j is not None and j != i:  # typically exclude self-cites in adjacency
                    M[i, j] = 1
        else:
            counts = defaultdict(int)
            for cj in cited:
                counts[cj] += 1
            for cj, c in counts.items():
                j = idx.get(cj)
                if j is not None and j != i:
                    M[i, j] = c

    return M, recids, idx


In [2]:

# --- Example ---------------------------------------------------------------

# Mix of IDs: recid, arXiv, DOI
papers = [
	"1707.01621",                 # arXiv id
	"10.1103/PhysRevLett.19.1264",# DOI
	451647,                        # recid
    "1911.01303",                 # arXiv id (ref i which cites j)
    "10.1016/j.nima.2005.08.106"   # DOI (ref j which is cited by i)
]
M, recids, index_map = citation_matrix(papers, binary=True)
print("Order (rows/cols) by recid:", recids)
print(M)


Order (rows/cols) by recid: [1608879, 51188, 451647, 1762842, 644725]
[[0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 1]
 [0 0 0 0 0]]


In [3]:
#!/usr/bin/env python3
import time, sys, math, json, csv
import requests
from collections import defaultdict
import numpy as np

try:
    # SciPy for efficient sparse matrices + Matrix Market output
    from scipy.sparse import coo_matrix, save_npz
    from scipy.io import mmwrite
    have_scipy = True
except Exception:
    have_scipy = False

BASE = "https://inspirehep.net/api/literature"

# -------- settings --------
TOTAL = 100000
PAGE_SIZE = 1000                      # INSPIRE allows paging; keep payloads manageable
PAGES = math.ceil(TOTAL / PAGE_SIZE)  # 100
QUERY = "Charm"                             # empty query means "all literature"
SORT = "mostrecent"
FIELDS = "control_number,references"  # keep it lean: recid + references only
SLEEP_EVERY = 15                      # polite throttling
SLEEP_SECS = 5
TIMEOUT = 45

OUT_EDGE_CSV = "inspire100k_edges.csv"
OUT_RECID_ORDER = "inspire100k_recids.json"
OUT_MTX_MTX = "inspire100k_adj.mtx"    # Matrix Market (.mtx) if SciPy is present
OUT_NPZ = "inspire100k_adj.npz"        # SciPy .npz (optional)

# -------- helpers --------
def search_page(page):
    params = {
        "q": QUERY,       
        "sort": SORT,
        "size": PAGE_SIZE,
        "page": page,
        "fields": FIELDS
    }
    r = requests.get(BASE, params=params, timeout=TIMEOUT)
    r.raise_for_status()
    return r.json()

def extract_hits(payload):
    return payload.get("hits", {}).get("hits", [])

def cited_recids_from_hit(hit):
    md = hit.get("metadata", {})
    refs = md.get("references", []) or []
    cited = []
    for ref in refs:
        rec = ref.get("record")
        if isinstance(rec, dict) and "$ref" in rec:
            try:
                cited_recid = int(rec["$ref"].rstrip("/").split("/")[-1])
                cited.append(cited_recid)
            except Exception:
                pass
    return cited

# -------- main flow --------
def main():
    # 1) Fetch 10k most recent with pagination
    all_hits = []
    req_counter = 0
    for page in range(1, PAGES + 1):
        if req_counter and req_counter % SLEEP_EVERY == 0:
            time.sleep(SLEEP_SECS)
        data = search_page(page)
        req_counter += 1
        hits = extract_hits(data)
        if not hits:
            break
        all_hits.extend(hits)
        sys.stderr.write(f"Fetched page {page} with {len(hits)} records\n")

    # Trim in case we got >10k due to rounding
    all_hits = all_hits[:TOTAL]

    # 2) Build recid order and map
    recids = [int(h["metadata"]["control_number"]) for h in all_hits]
    idx = {r: i for i, r in enumerate(recids)}

    # 3) Build edges (i -> j) where i cites j and j is in-set
    rows, cols = [], []
    edge_count = 0
    for i, h in enumerate(all_hits):
        cited_js = cited_recids_from_hit(h)
        for cj in set(cited_js):      # binary adjacency; drop duplicates within a bib
            j = idx.get(cj)
            if j is not None and j != i:
                rows.append(i)
                cols.append(j)
                edge_count += 1

    sys.stderr.write(f"Unique in-set edges: {edge_count}\n")

    # 4) Save edge list CSV (recid_i, recid_j)
    with open(OUT_EDGE_CSV, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["source_recid", "target_recid"])
        for r, c in zip(rows, cols):
            w.writerow([recids[r], recids[c]])
    sys.stderr.write(f"Wrote edges to {OUT_EDGE_CSV}\n")

    # 5) Save node order
    with open(OUT_RECID_ORDER, "w") as f:
        json.dump(recids, f)
    sys.stderr.write(f"Wrote recid order to {OUT_RECID_ORDER}\n")

    # 6) Build sparse adjacency and export
    n = len(recids)
    data = np.ones(len(rows), dtype=np.int8)
    if have_scipy:
        A = coo_matrix((data, (rows, cols)), shape=(n, n), dtype=np.int8)
        mmwrite(OUT_MTX_MTX, A)       # Matrix Market format (portable)
        save_npz(OUT_NPZ, A.tocsr())  # fast Python reload
        sys.stderr.write(f"Wrote sparse matrix to {OUT_MTX_MTX} and {OUT_NPZ}\n")
    else:
        # Fallback: dense (warning: ~100MB as bool for 10k^2)
        M = np.zeros((n, n), dtype=np.bool_)
        M[rows, cols] = True
        np.save("inspire10k_adj.npy", M)
        sys.stderr.write("SciPy not found: wrote dense numpy array to inspire10k_adj.npy\n")


In [None]:
# Run main:
main()


Fetched page 1 with 1000 records
Fetched page 2 with 1000 records


In [2]:
import numpy as np
from scipy.sparse import load_npz
from scipy.io import mmread

# Option 1: load from .npz
A = load_npz("inspire10k_adj.npz")   # SciPy CSR/COO etc.
print(A.shape)
print(A.nnz)  # number of nonzero entries

# Convert to dense NumPy (⚠️ 10000×10000 = 100M entries ~100 MB)
M = A.toarray()
print(M.shape)

# Option 2: load from Matrix Market (.mtx)
B = mmread("inspire10k_adj.mtx").tocsr()


(10000, 10000)
2170
(10000, 10000)


In [3]:
A

<Compressed Sparse Row sparse matrix of dtype 'int8'
	with 2170 stored elements and shape (10000, 10000)>

In [4]:
B

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 2170 stored elements and shape (10000, 10000)>

In [5]:
M

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [6]:
import json
import numpy as np
import pandas as pd

# Try sparse first; fall back to dense
A = None
try:
    from scipy.sparse import load_npz
    A = load_npz("inspire10k_adj.npz")   # CSR/COO sparse matrix
except Exception:
    pass

if A is None:
    try:
        from scipy.io import mmread
        A = mmread("inspire10k_adj.mtx").tocsr()
    except Exception:
        pass

if A is None:
    # Dense fallback
    M = np.load("inspire10k_adj.npy")
    # In-citations = column sums; Out-citations = row sums
    in_cits  = M.sum(axis=0).astype(int)
    out_cits = M.sum(axis=1).astype(int)
else:
    # Ensure CSR for fast row/col ops
    A = A.tocsr()
    # In-citations: sum over rows (axis=0) → 1×N; Out-citations: sum over cols (axis=1) → N×1
    in_cits  = np.asarray(A.sum(axis=0)).ravel().astype(int)
    out_cits = np.asarray(A.sum(axis=1)).ravel().astype(int)

# Load recid order (row/col mapping)
with open("inspire10k_recids.json", "r") as f:
    recids = json.load(f)

# Pack results
df = pd.DataFrame({
    "recid": recids,
    "in_citations": in_cits,    # how many of the 10k cite this paper
    "out_citations": out_cits,  # how many references this paper makes (within the 10k set)
})

# Save & show top 20 by in_citations
df.sort_values("in_citations", ascending=False).head(20).to_csv("top20_in_citations.csv", index=False)
df.to_csv("inspire10k_citation_counts.csv", index=False)

print("Wrote:")
print(" - inspire10k_citation_counts.csv (recid,in_citations,out_citations)")
print(" - top20_in_citations.csv")

# Optional: quick peek
print(df.sort_values("in_citations", ascending=False).head(10))


Wrote:
 - inspire10k_citation_counts.csv (recid,in_citations,out_citations)
 - top20_in_citations.csv
        recid  in_citations  out_citations
6043  2963698            51              0
6050  2963711            31              7
6052  2963713            18              2
6055  2963737            16              0
3712  2968387            13              5
6039  2963692            11              1
7392  2960543            11              0
8434  2958533             9              0
7633  2960516             7              2
6603  2963373             7              7


In [11]:
import numpy as np
import networkx as nx
import plotly.graph_objects as go

def interactive_network_from_adjacency(
    adjacency_matrix,
    node_labels=None,
    is_directed=False,
    importance="pagerank",
    layout="spring",          # 'spring' | 'kamada_kawai'
    spread=3.0,               # affects spring spacing
    iterations=600,
    seed=42,
    html_path="network_map.html"
):
    # --- Build graph ---
    A = np.array(adjacency_matrix)
    G = nx.from_numpy_array(A, create_using=nx.DiGraph if is_directed else nx.Graph)

    if node_labels:
        G = nx.relabel_nodes(G, {i: lbl for i, lbl in enumerate(node_labels)})

    # --- Importance metric (same options as before) ---
    if importance == "pagerank":
        imp = nx.pagerank(G, weight="weight")
    elif importance == "eigenvector":
        try:
            imp = nx.eigenvector_centrality(G if not is_directed else G.to_undirected(), weight="weight", max_iter=1000)
        except nx.PowerIterationFailedConvergence:
            imp = nx.degree_centrality(G)
    elif importance == "betweenness":
        imp = nx.betweenness_centrality(G, weight="weight", normalized=True)
    elif importance == "degree":
        imp = nx.degree_centrality(G)
    elif importance == "strength":
        imp = {n: sum(d.get("weight", 1.0) for _, _, d in G.edges(n, data=True)) for n in G.nodes()}
    else:
        raise ValueError(f"Unknown importance metric: {importance}")

    # --- Layout (positions) ---
    if layout == "spring":
        k = spread * (1.0 / max(1, np.sqrt(G.number_of_nodes())))
        pos = nx.spring_layout(G, seed=seed, weight="weight", k=k, iterations=iterations)
    elif layout == "kamada_kawai":
        pos = nx.kamada_kawai_layout(G, weight="weight")
    else:
        raise ValueError("layout must be 'spring' or 'kamada_kawai'")

    nodes = list(G.nodes())
    imp_vals = np.array([imp[n] for n in nodes], dtype=float)
    if np.allclose(imp_vals.min(), imp_vals.max()):
        imp_vals = np.ones_like(imp_vals)

    # Node sizes mapped from importance
    sizes = np.interp(imp_vals, (imp_vals.min(), imp_vals.max()), (8, 28))

    # --- Edge trace(s) ---
    edge_x, edge_y, edge_text = [], [], []
    for u, v, d in G.edges(data=True):
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]
        w = d.get("weight", 1.0)
        edge_text.append(f"{u}–{v}<br>weight={w}")

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.7),
        hoverinfo="none",  # hover on nodes instead; edges can be noisy
        mode="lines"
    )

    # --- Node trace ---
    node_x = [pos[n][0] for n in nodes]
    node_y = [pos[n][1] for n in nodes]
    node_text = [
        f"<b>{n}</b><br>importance={val:.4f}"
        for n, val in zip(nodes, imp_vals)
    ]

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode="markers+text",
        text=[str(n) for n in nodes],  # show numeric labels
        textposition="middle center",
        hovertext=node_text,
        hoverinfo="text",
        marker=dict(
            size=sizes,
            color=imp_vals,
            colorscale="Viridis",
            showscale=True,
            colorbar=dict(title="Node importance"),
            line=dict(width=1)
        )
    )

    title = f"Interactive Network (importance = {importance}, layout = {layout})"
    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        title=title,
                        showlegend=False,
                        hovermode="closest",
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        margin=dict(l=20, r=20, t=50, b=20),
                    ))

    fig.write_html(html_path, include_plotlyjs="cdn")  # creates an interactive HTML
    print(f"Saved interactive map to {html_path}")
    return fig


In [12]:
labels = list(range(1, M.shape[0]+1))  # 1..n labels
interactive_network_from_adjacency(
    M,
    node_labels=labels,
    layout="spring",    # or "kamada_kawai"
    spread=5.0,
    iterations=1000,
    importance="pagerank",
    html_path="network_map.html"
)

Saved interactive map to network_map.html
