# Interactive 2-D Literature Map from a BibTeX File

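This notebook will:
1. Parse a `.bib` file to build a text corpus from titles and abstracts (abstracts are fetched by DOI from Crossref, Semantic Scholar, or OpenAlex).
2. Embed the texts, together with the dissertation abstract, using the Gemini `text-embedding-004` embedding model.
3. Reduce embeddings to 2-D using t-SNE and UMAP.
4. (Optionally) Cluster points with HDBSCAN — not implemented in the cells below.
5. Show interactive Plotly maps that highlight the focal paper (the dissertation), save the similarity network as an HTML file, and rank papers by similarity and centrality.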

In [1]:
# Install dependencies (uncomment to run)
# %pip install bibtexparser sentence-transformers umap-learn hdbscan plotly pandas tqdm requests numpy scikit-learn google-genai networkx matplotlib

In [None]:
import pathlib
import sys
import pandas as pd
from tqdm import tqdm
import bibtexparser
import plotly.express as px

In [3]:
import requests
import re

import pathlib
import pandas as pd
import bibtexparser

def _normalise_doi(raw):
    """Return a bare DOI (resolver URL / ``doi:`` prefix removed), or None if empty."""
    if not raw:
        return None
    doi = str(raw).strip()
    lowered = doi.lower()
    for prefix in (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "doi:",
    ):
        if lowered.startswith(prefix):
            doi = doi[len(prefix):].strip()
            break
    return doi or None


def load_bib(path: pathlib.Path) -> pd.DataFrame:
    """Read a BibTeX file into a DataFrame with one row per titled entry.

    Parameters
    ----------
    path : pathlib.Path
        Location of the ``.bib`` file. It is opened as UTF-8.

    Returns
    -------
    pd.DataFrame
        Columns ``key``, ``title``, ``abstract``, ``year``, ``doi`` and
        ``doi_url``. Entries whose title is missing or blank are dropped.
        ``doi`` is the bare identifier and ``doi_url`` its ``https://doi.org/``
        link; both are None when the entry has no DOI. The columns are present
        even when the file contains no entries.
    """
    columns = ["key", "title", "abstract", "year", "doi", "doi_url"]

    # Explicit encoding: the platform default is not UTF-8 on every system.
    with path.open(encoding="utf-8") as fh:
        bib_db = bibtexparser.load(fh)

    records = []
    for entry in bib_db.entries:
        title = (entry.get("title") or "").strip().rstrip(".")
        # Try common DOI fields
        doi = _normalise_doi(entry.get("doi") or entry.get("DOI"))

        records.append({
            "key": entry.get("ID") or entry.get("citation_key"),
            # A blank title becomes None so that dropna() below really removes it.
            "title": title or None,
            "abstract": entry.get("abstract", ""),
            "year": entry.get("year"),
            "doi": doi,
            "doi_url": f"https://doi.org/{doi}" if doi else None,
        })

    # Passing `columns` keeps the schema stable for an empty bibliography.
    df = pd.DataFrame(records, columns=columns)
    return df.dropna(subset=["title"])
import requests

# Location of the BibTeX library. Set the BIB_PATH environment variable to point
# at your own file; the fallback is the path this notebook was first written with.
import os

bib_path = pathlib.Path(
    os.environ.get("BIB_PATH", "/Users/rodrigo/Documents/dissertation/references.bib")
).expanduser()

# One row per BibTeX entry (key, title, abstract, year, doi, doi_url).
df = load_bib(bib_path)


Entry type dataset not standard. Not considered.


In [None]:
import re
import html
import requests
import pandas as pd
import numpy as np


def clean_xml_cell(text) -> str:
    """
    Turn an abstract that may contain JATS/HTML markup into plain text.

    Inline formatting tags (italic, bold, sub, sup, ...) are removed without
    leaving a gap, so ``H<sub>2</sub>O`` stays ``H2O``. Every other tag is
    replaced by a space so that adjacent blocks such as
    ``<jats:title>Abstract</jats:title><jats:p>Text`` do not fuse into
    ``AbstractText``. HTML entities are then unescaped and runs of whitespace
    collapsed to a single space.

    Safe to call on NaN / None, for which an empty string is returned.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # ensure we have a str
        text = str(text)

    # inline formatting tags, with or without the jats: prefix
    text = re.sub(
        r'</?(?:jats:)?(?:italic|bold|sub|sup|sc|underline|monospace|i|b|u|em|strong)\b[^>]*>',
        '',
        text,
        flags=re.IGNORECASE,
    )
    # any remaining JATS / HTML tag is a block boundary
    text = re.sub(r'<[^>]+>', ' ', text)
    # unescape only after tag removal, so an escaped "&lt;" is never read as a tag
    text = html.unescape(text)
    return re.sub(r'\s+', ' ', text).strip()

def _abstract_from_inverted_index(inverted_index) -> str:
    """Rebuild plain text from an OpenAlex ``abstract_inverted_index`` (word -> positions)."""
    if not isinstance(inverted_index, dict):
        return ""
    positioned = sorted(
        (position, word)
        for word, positions in inverted_index.items()
        for position in positions
    )
    return " ".join(word for _, word in positioned)


def fetch_abstract_cell(doi) -> str:
    """
    Try Crossref → Semantic Scholar → OpenAlex to fetch an abstract for DOI.

    The sources are queried in that order and the first non-empty abstract,
    cleaned with ``clean_xml_cell``, is returned. Each source is tried
    independently: a timeout, HTTP error or malformed payload from one source
    moves on to the next instead of abandoning the lookup.

    Always returns a string (possibly empty) and handles exceptions internally.
    NaN / None / blank DOIs return "" without any network request.
    """
    if pd.isna(doi):
        return ""
    doi = str(doi).strip()
    if not doi:
        return ""

    def from_crossref(payload):
        return (payload.get("message") or {}).get("abstract")

    def from_semantic_scholar(payload):
        return payload.get("abstract")

    def from_openalex(payload):
        # OpenAlex publishes abstracts as an inverted index, not as plain text.
        return payload.get("abstract") or _abstract_from_inverted_index(
            payload.get("abstract_inverted_index")
        )

    sources = (
        (f"https://api.crossref.org/works/{doi}", None, from_crossref),
        (
            f"https://api.semanticscholar.org/graph/v1/paper/DOI:{doi}",
            {"fields": "title,abstract"},
            from_semantic_scholar,
        ),
        (f"https://api.openalex.org/works/https://doi.org/{doi}", None, from_openalex),
    )

    for url, params, extract in sources:
        try:
            resp = requests.get(url, params=params, timeout=10)
            if not resp.ok:
                continue
            abstract = clean_xml_cell(extract(resp.json()))
        except Exception:
            # Deliberately best-effort: any failure here just means
            # "this source has nothing"; fall through to the next one.
            continue
        if abstract:
            return abstract

    # fall-back
    return ""

In [7]:
import numpy as np

# Look up an abstract online for every entry that has a DOI.
fetched_abstracts = df["doi"].map(lambda doi: fetch_abstract_cell(doi) if doi else "")

# When the lookup returns nothing, keep the abstract already stored in the
# .bib file (if any) instead of discarding the entry.
bib_abstracts = df["abstract"].fillna("").astype(str).str.strip()
abstracts = fetched_abstracts.where(fetched_abstracts.astype(bool), bib_abstracts)

# Build the cleaned frame in one chain. Calling replace(..., inplace=True) on
# df['abstract'] works on a temporary under pandas Copy-on-Write and would leave
# the empty strings in place.
df = (
    df.assign(abstract=abstracts.mask(abstracts.eq(""), np.nan))
      .dropna(subset=["abstract"])
      .drop_duplicates(subset="abstract")
      .reset_index(drop=True)   # row i of df == row i of the embeddings built from it
)

In [None]:
from collections import deque
import time
from google import genai
from google.genai import types


class RateLimiter:
    """Decorator that throttles a callable to ``max_calls`` per rolling ``period`` seconds.

    Timestamps of recent calls (taken when each call finishes) are kept in
    ``self.calls``. When the window is full, the next call sleeps until the
    oldest timestamp leaves the window. One instance can decorate several
    functions, which then share the same budget. Not thread-safe.

    Parameters
    ----------
    max_calls : int
        Maximum number of calls allowed inside one window; must be >= 1.
    period : float
        Window length in seconds.

    Raises
    ------
    ValueError
        If ``max_calls`` is smaller than 1.
    """

    def __init__(self, max_calls: int, period: float):
        if max_calls < 1:
            raise ValueError("max_calls must be at least 1")
        self.max_calls = max_calls
        self.period = period
        self.calls = deque()

    def __call__(self, fn):
        import functools

        # Preserve the wrapped function's name and docstring.
        @functools.wraps(fn)
        def wrapped(*args, **kwargs):
            now = time.monotonic()
            # Forget calls that have already left the window.
            while self.calls and now - self.calls[0] > self.period:
                self.calls.popleft()
            if len(self.calls) >= self.max_calls:
                sleep_time = self.period - (now - self.calls[0])
                if sleep_time > 0:
                    time.sleep(sleep_time)
                # The oldest call is out of the window now; free its slot.
                self.calls.popleft()
            try:
                return fn(*args, **kwargs)
            finally:
                # Record the call even when it raised, so failed calls still
                # count against the window.
                self.calls.append(time.monotonic())
        return wrapped

# Shared throttle for the embedding endpoint: at most 1500 calls per rolling 60 s.
rate_limiter = RateLimiter(max_calls=1500, period=60.0)

# ── 0. Third-party sentence-level embedder ────────────────────────────────
# The API key is read from the environment so that it is never stored in the
# notebook itself.
import os

_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable "
        "before running this cell."
    )
client = genai.Client(api_key=_api_key)

@rate_limiter
def gemini_embed_batch(texts: list[str]) -> list[list[float]]:
    """Send a batch of texts to Gemini embedding API and return a list of embedding vectors.

    One request is made for the whole batch, using model
    ``models/text-embedding-004`` with task type ``SEMANTIC_SIMILARITY``.
    An empty batch returns ``[]`` without contacting the API.
    """
    texts = list(texts)
    if not texts:
        return []
    resp = client.models.embed_content(
        model="models/text-embedding-004",
        contents=texts,
        config=types.EmbedContentConfig(task_type="SEMANTIC_SIMILARITY")
    )
    # Treat a response without embeddings as an empty result rather than a TypeError.
    return [emb.values for emb in (resp.embeddings or [])]


In [13]:
# Embed the abstracts chunk by chunk: each request carries at most `batch_size`
# texts, and the vectors are collected in the same order as the rows of `df`.
batch_size = 100
abstract_texts = df['abstract'].tolist()
embeddings = []
for start in tqdm(range(0, len(abstract_texts), batch_size), desc="Embedding abstracts"):
    chunk = abstract_texts[start:start + batch_size]
    embeddings += gemini_embed_batch(chunk)

Embedding abstracts: 100%|██████████| 2/2 [00:04<00:00,  2.50s/it]


In [14]:
# Abstract of the focal document (the dissertation). It is embedded with the same
# model as the paper abstracts and appended as the last row of the embedding matrix.
dissertation = """Neural populations generate emergent activity patterns ranging from microscopic spikes to whole-brain dynamics, representing one of nature’s richest collective phenomena underlying cognition and behavior. Evidence suggests that these neurodynamics operate near a second-order phase transition—known as the critical brain hypothesis—where the system balances order and disorder, thus maximizing information processing and dynamic range. However, how this critical organization changes across conditions such as developmental stages or pharmacological interventions remains unclear. To address this, resting-state fMRI was coupled with a 2D Ising model to quantify distance from criticality. A graph neural network (GNN) was trained on simulated Ising networks to learn the mapping between connectivity patterns and the model’s control parameter—the Ising temperature. When applied to empirical fMRI data, this GNN inferred shifts between ordered and disordered states, detecting critical transitions in large-scale neural dynamics. To explore this approach, two empirical studies were conducted. Study I explored age-related differences in critical dynamics by comparing typically developing children and adolescents with those diagnosed with attention-deficit/hyperactivity disorder (ADHD). Typical maturation was characterized by a monotonic decrease in Ising temperature, indicating a gradual shift from criticality towards a more ordered brain state. Study II quantified pharmacologically induced changes in brain entropy by estimating the Ising temperature during acute and subacute (24 hours post-administration) phases following ayahuasca ingestion, compared with placebo. This analysis examined whether ayahuasca shifts the brain toward a more disordered, paramagnetic regime, and whether such changes persist beyond the immediate drug effects. 
To bridge neural dynamics with subjective experience, subacute temperature changes were correlated with acute-phase subjective ratings from the Hallucinogenic Rating Scale (HRS). Results revealed an increase in Ising temperature during the acute phase, which partially persisted into the subacute period; notably, the persistence correlated with individual subjective experiences, particularly with the affective scores. These findings provide mechanistic support for the entropic-brain hypothesis, demonstrating that the GNN-derived Ising temperature serves as a sensitive, computationally efficient marker for detecting developmental and pharmacologically induced deviations from neural criticality."""

In [15]:
# Embed the dissertation abstract with the same function used for the papers.
# The call returns one vector per input text, so take the single result.
dissertation_embedding = gemini_embed_batch([dissertation])[0]

In [16]:
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from sklearn.decomposition import PCA
import numpy as np

# Stack all embeddings (papers + dissertation); the dissertation is the LAST row.
all_embeddings = np.vstack([embeddings, dissertation_embedding])

# ── 2-D t-SNE (stored in `tsne_2d`) ───────────────────────────────────────
# 1. Linear pre-compression. PCA cannot produce more components than
#    min(n_samples, n_features), so cap the usual 50 for small corpora.
n_pca = min(50, *all_embeddings.shape)
X50 = PCA(n_components=n_pca, random_state=42).fit_transform(all_embeddings)

# 2. Heuristic hyper-parameters
n = len(X50)
perp = max(5, min(100, n // 100))         # tweak as needed
perp = min(perp, max(1, n - 1))           # t-SNE requires perplexity < n_samples

tsne = TSNE(
    n_components=2,
    init="pca",
    perplexity=perp,
    learning_rate="auto",                 # or int ≈ 0.1 × n
    early_exaggeration=8,
    method="barnes_hut",                  # drops to O(n log n)
    angle=0.3,
    # Iteration count is left at the library default of 1000. The keyword was
    # renamed from `n_iter` to `max_iter` in scikit-learn 1.5, so naming either
    # one explicitly breaks on some installed versions.
    random_state=42,
    verbose=1,
)

tsne_2d = tsne.fit_transform(X50)

# ── 2-D UMAP (this is what the figure below shows) ────────────────────────
import umap
umap_2d = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine",
                    random_state=42).fit_transform(all_embeddings)

# Split back: papers and dissertation
umap_papers = umap_2d[:-1]
umap_dissertation = umap_2d[-1]
# Earlier names, kept so code that still refers to them keeps working.
# Despite the prefix they have always held UMAP coordinates.
tsne_papers = umap_papers
tsne_dissertation = umap_dissertation

# Plot
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=umap_papers[:, 0], y=umap_papers[:, 1],
    mode='markers',
    marker=dict(size=7, color='blue', opacity=0.6),
    name='Papers'
))
fig.add_trace(go.Scatter(
    x=[umap_dissertation[0]], y=[umap_dissertation[1]],
    mode='markers+text',
    marker=dict(size=14, color='red', symbol='star'),
    text=["Dissertation"],
    textposition="top center",
    name='Dissertation'
))
# Title and axis labels name the projection that is actually drawn (UMAP).
fig.update_layout(
    title="UMAP Embedding: Papers and Dissertation",
    xaxis_title="UMAP 1",
    yaxis_title="UMAP 2",
    legend=dict(x=0.01, y=0.99)
)
fig.show()

[t-SNE] Computing 16 nearest neighbors...
[t-SNE] Indexed 148 samples in 0.001s...
[t-SNE] Computed neighbors for 148 samples in 0.064s...
[t-SNE] Computed conditional probabilities for sample 148 / 148
[t-SNE] Mean sigma: 0.184549
[t-SNE] KL divergence after 250 iterations with early exaggeration: 39.790840
[t-SNE] KL divergence after 1000 iterations: 0.655223


  warn(


In [19]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
import plotly.graph_objects as go
import plotly.express as px

# ───────────────────────────────────────────────────────────────────────
# 1) Cosine-similarity matrix (diagonal zeroed so no node links to itself)
sim = cosine_similarity(all_embeddings)
np.fill_diagonal(sim, 0)

# ───────────────────────────────────────────────────────────────────────
# 2) Top-k edge list: each node points at its K most similar nodes
K = 5
edges = [
    (i, j)
    for i in range(sim.shape[0])
    for j in np.argsort(sim[i])[::-1][:K]
    if sim[i, j] > 0
]
edges = np.array(edges)

# ───────────────────────────────────────────────────────────────────────
# 3) Map similarity → colour ("jet" colour scale, min-max normalised)
weights = sim[edges[:, 0], edges[:, 1]]
norm = (weights - weights.min()) / (weights.max() - weights.min() + 1e-9)
colours = px.colors.sample_colorscale("jet", norm)

# ── gather edge segments, binned by colour so each trace is uniform ────
edge_bins = defaultdict(lambda: {"x": [], "y": []})
for (i, j), col in zip(edges, colours):
    x0, y0 = tsne_2d[i]
    x1, y1 = tsne_2d[j]
    edge_bins[col]["x"] += [x0, x1, None]     # None breaks the line between edges
    edge_bins[col]["y"] += [y0, y1, None]

edge_traces = [
    go.Scatter(
        x=coords["x"], y=coords["y"],
        mode="lines",
        line=dict(width=1, color=col),
        hoverinfo="none",
        showlegend=False,
    )
    for col, coords in edge_bins.items()
]

# ───────────────────────────────────────────────────────────────────────
# 4) Node trace
# Hover metadata (title, year) is built here so the cell runs on a fresh kernel.
# Assumes the rows of `df` are still in the order in which they were embedded.
customdata = df[["title", "year"]].fillna("").astype(str).to_numpy()
if len(customdata) + 1 != len(tsne_2d):
    raise ValueError(
        f"df has {len(customdata)} rows but the projection has {len(tsne_2d)} points "
        "(expected one row per paper plus the dissertation)."
    )
titles_years = np.vstack([customdata, ["Your dissertation", ""]])
node_colors = ["#1f77b4"] * (len(tsne_2d) - 1) + ["#d62728"]   # dissertation in red

node_trace = go.Scatter(
    x=tsne_2d[:, 0], y=tsne_2d[:, 1],
    mode="markers",
    marker=dict(size=10, color=node_colors, line=dict(width=0.5, color="black")),
    customdata=titles_years,
    hovertemplate="<b>%{customdata[0]}</b><br>Year: %{customdata[1]}<extra></extra>",
    showlegend=False,
)

# ───────────────────────────────────────────────────────────────────────
# 5) Display (edges first so the nodes are drawn on top)
fig = go.Figure(data=edge_traces + [node_trace])
fig.update_layout(
    title=f"Cosine-similarity network (top-{K} neighbours · colour = similarity)",
    margin=dict(l=0, r=0, t=40, b=0),

)
fig.show()

In [None]:
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
import plotly.graph_objects as go
import plotly.express as px

# ───────────────────────────────────────────────────────────────────────
# 1) Cosine-similarity matrix (diagonal zeroed so no node links to itself)
sim = cosine_similarity(all_embeddings)
np.fill_diagonal(sim, 0)

# ───────────────────────────────────────────────────────────────────────
# 2) Top-k edge list: each node points at its K most similar nodes
K = 10
edges = [
    (i, j)
    for i in range(sim.shape[0])
    for j in np.argsort(sim[i])[::-1][:K]
    if sim[i, j] > 0
]
edges = np.asarray(edges)
weights = sim[edges[:, 0], edges[:, 1]]          # similarity per edge
w_min, w_max = weights.min(), weights.max()
norm = (weights - w_min) / (w_max - w_min + 1e-12)

# ───────────────────────────────────────────────────────────────────────
# 3) Build one-colour edge traces ("jet" colour scale)
# The colour scale is sampled once for all edges. The loop variable is not
# called `n`, so the sample count stored under that name is left untouched.
edge_colours = px.colors.sample_colorscale("jet", norm.tolist())
edge_bins = defaultdict(lambda: {"x": [], "y": []})
for (i, j), col in zip(edges, edge_colours):
    x0, y0 = tsne_2d[i]
    x1, y1 = tsne_2d[j]
    edge_bins[col]["x"] += [x0, x1, None]     # None breaks the line between edges
    edge_bins[col]["y"] += [y0, y1, None]

edge_traces = [
    go.Scatter(
        x=coords["x"], y=coords["y"],
        mode="lines",
        line=dict(width=1, color=col),
        hoverinfo="none",
        showlegend=False,
    )
    for col, coords in edge_bins.items()
]

# ───────────────────────────────────────────────────────────────────────
# 4) Dummy trace → owns the colour-bar
dummy_trace = go.Scatter(
    x=[None, None], y=[None, None],
    mode="markers",
    marker=dict(
        color=[w_min, w_max],                # endpoints of the scale
        colorscale="jet",
        cmin=w_min, cmax=w_max,
        showscale=True,
        colorbar=dict(title="cosine<br>similarity", thickness=12),
        size=0, opacity=0                    # fully invisible
    ),
    hoverinfo="skip",
    showlegend=False,
)

# ───────────────────────────────────────────────────────────────────────
# 5) Node trace
# Hover metadata (title, year) is built here so the cell runs on a fresh kernel.
# Assumes the rows of `df` are still in the order in which they were embedded.
customdata = df[["title", "year"]].fillna("").astype(str).to_numpy()
if len(customdata) + 1 != len(tsne_2d):
    raise ValueError(
        f"df has {len(customdata)} rows but the projection has {len(tsne_2d)} points "
        "(expected one row per paper plus the dissertation)."
    )
titles_years = np.vstack([customdata, ["Your dissertation", ""]])
node_colors = ["#1f77b4"] * (len(tsne_2d) - 1) + ["#d62728"]   # dissertation in red

node_trace = go.Scatter(
    x=tsne_2d[:, 0], y=tsne_2d[:, 1],
    mode="markers",
    marker=dict(size=10, color=node_colors,
                line=dict(width=0.5, color="black")),
    customdata=titles_years,
    hovertemplate="<b>%{customdata[0]}</b><br>Year: %{customdata[1]}<extra></extra>",
    showlegend=False,
)

# ───────────────────────────────────────────────────────────────────────
# 6) Display (edges first so the nodes are drawn on top)
fig = go.Figure(edge_traces + [node_trace, dummy_trace])
fig.update_layout(
    title=f"Cosine-similarity network (top-{K} neighbours · colour = similarity)",
    margin=dict(l=0, r=0, t=40, b=0),
)
fig.show()

# Standalone copy of the figure; plotly.js is loaded from the CDN.
fig.write_html("tsne_plot_graph.html", include_plotlyjs="cdn", full_html=True)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import textwrap

def lollipop_gradient(
        df,
        top_n=10,
        title_col='title',
        sim_col='Similarity_to_dissertation',
        title='Top {} papers most similar to the dissertation',
        xmin=0,
        wrap_width=55):
    """
    Plot a horizontal lollipop chart for the top-N rows of `df` ranked by
    `sim_col`, using a colour gradient to encode the value.

    Parameters
    ----------
    df : DataFrame holding at least `title_col` and a numeric `sim_col`.
    top_n : maximum number of rows to show.
    title_col : column used for the y-tick labels.
    sim_col : column that is ranked, plotted on the x-axis and colour-coded.
    title : figure title; `{}` is replaced by the number of rows actually plotted.
    xmin : x position at which every stem starts.
    wrap_width : maximum characters per line in the y-tick labels.

    Raises
    ------
    ValueError
        If there is no row to plot.

    The figure is shown with `plt.show()`; nothing is returned.
    """
    # 1. Take the top-N and order by similarity (highest first)
    top = (df.nlargest(top_n, sim_col)
             .sort_values(sim_col, ascending=False)
             .reset_index(drop=True))
    if top.empty:
        raise ValueError(f"no rows to plot for column {sim_col!r}")

    values = top[sim_col]
    positions = top.index

    # 2. Build a colour map scaled to the similarity range
    norm = mcolors.Normalize(vmin=values.min(), vmax=values.max())
    colors = cm.viridis(norm(values))

    # 3. Wrap long titles for neat y-tick labels (str() tolerates missing titles)
    y_labels = [textwrap.fill(str(t), wrap_width) for t in top[title_col]]

    # 4. Create the plot
    fig, ax = plt.subplots(figsize=(14, 10), dpi=120)

    # Draw stems
    ax.hlines(y=positions, xmin=xmin, xmax=values,
              color='grey', linewidth=1.5, alpha=0.6)
    # Draw dots
    ax.scatter(values, positions,
               s=180,                        # dot size
               c=colors,
               edgecolors='black',
               linewidth=0.7,
               zorder=3)

    # 5. Formatting
    ax.set_yticks(positions)
    ax.set_yticklabels(y_labels, fontsize=10)
    ax.invert_yaxis()                               # highest similarity on top
    ax.set_xlabel('Cosine similarity', fontsize=12, weight='bold')
    # Report the number of rows drawn, which can be smaller than top_n.
    ax.set_title(title.format(len(top)),
                 fontsize=14, weight='bold', pad=12)

    # Annotate dots with numeric similarity
    for x, y in zip(values, positions):
        ax.text(x + 0.01, y, f'{x:.3f}',
                va='center', ha='left', fontsize=9)

    # Clean up spines & add subtle grid
    for spine in ['top', 'right', 'left']:
        ax.spines[spine].set_visible(False)
    ax.grid(axis='x', linestyle='--', alpha=0.3)
    plt.tight_layout()
    plt.show()


In [None]:
import networkx as nx

# Build a weighted graph from the similarity matrix. It contains every node:
# the papers plus the dissertation (last row/col of `sim`).
G = nx.from_numpy_array(sim)

# Shortest-path centralities read the edge attribute as a path LENGTH, so give
# each edge a distance (1 - similarity) instead of passing the similarity itself.
for _, _, attrs in G.edges(data=True):
    attrs["distance"] = 1.0 - attrs["weight"]

# Compute centrality measures
degree_centrality = np.array(list(nx.degree_centrality(G).values()))  # degree_centrality is not weighted
betweenness_centrality = np.array(list(nx.betweenness_centrality(G, weight="distance").values()))
closeness_centrality = np.array(list(nx.closeness_centrality(G, distance="distance").values()))

# Calculate the average centrality for each node
average_centrality = (degree_centrality + betweenness_centrality + closeness_centrality) / 3

# The assignments below are positional: row i of `df` must be paper i of `sim`.
n_papers = sim.shape[0] - 1
if len(df) != n_papers:
    raise ValueError(
        f"df has {len(df)} rows but the similarity matrix describes {n_papers} papers"
    )

df['average_centrality'] = average_centrality[:-1]  # exclude dissertation row
df['average_degree'] = sim[:-1, :-1].mean(axis=1)  # exclude dissertation row
df['Similarity_to_dissertation'] = sim[:-1, -1]  # last column is the dissertation

# Rank in a separate frame. Sorting `df` in place would break the positional
# alignment above the next time this cell is run.
df_ranked = df.sort_values(by='Similarity_to_dissertation', ascending=False)

lollipop_gradient(df_ranked, xmin=0.7)  # run this once `df` is defined

lollipop_gradient(df_ranked, sim_col='average_centrality', title='Top {} papers most central in the network', xmin=1.25)  # run this once `df` is defined