# PrimeKG Case Study — COVID‑19 → Baricitinib

This notebook adapts the Autism case study workflow to **COVID‑19** and traces mechanistic connections to the repurposed drug **Baricitinib**.

**What this notebook does**
- Load the PrimeKG CSV (`kg.csv`).
- Identify COVID‑19 disease nodes and **Baricitinib** drug nodes.
- Summarize connected phenotypes, proteins (e.g., **AAK1, GAK, JAK1/2**), and pathways (e.g., **endocytosis, cytokine signaling**).
- Compute shortest paths between COVID‑19 and Baricitinib and visualize a focused subgraph.
- (Optional) Run a small **permutation test** to compare shortest-path distance to random diseases (mini version of the paper’s approach).

> Tip: Place `kg.csv` in the same directory as this notebook (or change `KG_PATH` below).


In [1]:
# --- Parameters ---
KG_PATH = "kg.csv"
DISEASE_KEYWORDS = ["covid", "sars-cov-2", "coronavirus disease 2019"]
TARGET_DRUG = "baricitinib"
EGO_RADIUS = 2          # radius for ego-subgraph around key nodes
MAX_EDGES = 1_000_000   # cap (informational only)
N_RANDOM = 100          # number of random diseases for permutation test (keep modest)
RANDOM_SEED = 42

import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict, Counter
import random
import time

pd.set_option("display.max_colwidth", 160)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

def t0():
    s = time.time()
    return lambda: f"{time.time()-s:,.1f}s"

print("[INFO] Imports ready.")
timer = t0()

[INFO] Imports ready.


In [None]:
print("[INFO] Loading PrimeKG...")
kg = pd.read_csv(KG_PATH)
print(kg.head())
print("[INFO] Columns:", list(kg.columns))
print("[INFO] Done loading:", timer())

[INFO] Loading PrimeKG...


  kg = pd.read_csv(KG_PATH)


In [None]:
def contains_any(text, keywords):
    text = str(text).lower()
    return any(k in text for k in keywords)

def find_nodes_by_name(df, keywords):
    # Returns unique node names from x_name and y_name that contain any of the keywords
    mask_x = df['x_name'].str.lower().apply(lambda t: contains_any(t, keywords))
    mask_y = df['y_name'].str.lower().apply(lambda t: contains_any(t, keywords))
    nodes = set(df.loc[mask_x, 'x_name']).union(set(df.loc[mask_y, 'y_name']))
    return sorted(nodes)

def rows_touching_keywords(df, keywords):
    return df[
        df['x_name'].str.lower().apply(lambda t: contains_any(t, keywords)) |
        df['y_name'].str.lower().apply(lambda t: contains_any(t, keywords))
    ]

In [None]:
covid_nodes = find_nodes_by_name(kg, DISEASE_KEYWORDS)
baricitinib_nodes = find_nodes_by_name(kg, [TARGET_DRUG])

print(f"[INFO] Candidate COVID-19 nodes found: {len(covid_nodes)}")
print(covid_nodes[:10])
print(f"[INFO] Candidate Baricitinib nodes found: {len(baricitinib_nodes)}")
print(baricitinib_nodes)

In [None]:
print("[INFO] Building graph...")
G = nx.from_pandas_edgelist(kg, source="x_name", target="y_name", edge_attr=True)
print(f"[INFO] Graph built with {G.number_of_nodes():,} nodes and {G.number_of_edges():,} edges.")

In [None]:
def shortest_path_between_sets(G, sources, targets, cutoff=None):
    best = None
    best_pair = (None, None)
    for s in sources:
        if s not in G: 
            continue
        for t in targets:
            if t not in G: 
                continue
            try:
                sp = nx.shortest_path(G, s, t, cutoff=cutoff)
                if best is None or len(sp) < len(best):
                    best = sp
                    best_pair = (s, t)
            except nx.NetworkXNoPath:
                continue
    return best, best_pair

best_path, (best_covid, best_baricitinib) = shortest_path_between_sets(G, covid_nodes, baricitinib_nodes, cutoff=6)
if best_path is None:
    print("[WARN] No path found up to cutoff=6. Try increasing cutoff or broaden keywords.")
else:
    print("[INFO] Best COVID-19 → Baricitinib path (length={}):".format(len(best_path)-1))
    for i, node in enumerate(best_path):
        print(f"  {i}. {node}")

In [None]:
if best_path is not None:
    core_nodes = set(best_path)
    # Expand around the endpoints for context
    if best_covid in G:
        core_nodes.update(nx.ego_graph(G, best_covid, radius=EGO_RADIUS).nodes())
    if best_baricitinib in G:
        core_nodes.update(nx.ego_graph(G, best_baricitinib, radius=EGO_RADIUS).nodes())
    H = G.subgraph(core_nodes).copy()
    print(f"[INFO] Focused subgraph: {H.number_of_nodes()} nodes, {H.number_of_edges()} edges.")
else:
    H = nx.Graph()


In [None]:
def guess_type(name: str):
    # Very rough heuristic using tokens seen in PrimeKG names/types.
    n = str(name).lower()
    if "covid" in n or "sars-cov-2" in n or "disease" in n:
        return "disease"
    if "pathway" in n or "reactome" in n:
        return "pathway"
    if "protein" in n or "gene" in n:
        return "protein"
    if "drug" in n or "db0" in n or "baricitinib" in n:
        return "drug"
    return "other"

if H.number_of_nodes() > 0:
    node_type_counts = Counter(guess_type(n) for n in H.nodes())
    print("[INFO] Node type distribution in focused subgraph:")
    for k, v in node_type_counts.items():
        print(f"  {k}: {v}")

    # Inspect for key proteins frequently mentioned for Baricitinib mechanism
    key_proteins = ["AAK1", "GAK", "JAK1", "JAK2"]
    present = [p for p in key_proteins if any(p.lower() in str(n).lower() for n in H.nodes())]
    print("\n[INFO] Key proteins present among nodes:", present)
else:
    print("[INFO] No focused subgraph constructed.")

In [None]:
if H.number_of_nodes() > 0:
    plt.figure(figsize=(11,9))
    pos = nx.spring_layout(H, k=0.35, seed=RANDOM_SEED)
    nx.draw_networkx_nodes(H, pos, node_size=55)
    nx.draw_networkx_edges(H, pos, alpha=0.25)
    nx.draw_networkx_labels(H, pos, font_size=6)
    plt.title("COVID‑19 → Baricitinib focused subgraph (ego radius = {})".format(EGO_RADIUS))
    plt.axis("off")
    plt.show()

In [None]:
covid_edges = rows_touching_keywords(kg, DISEASE_KEYWORDS)
bari_edges = rows_touching_keywords(kg, [TARGET_DRUG])

print("[INFO] Sample COVID‑19-related edges:")
display(covid_edges[['x_name','x_type','relation','y_name','y_type']].head(15))

print("\n[INFO] Sample Baricitinib-related edges:")
display(bari_edges[['x_name','x_type','relation','y_name','y_type']].head(15))

### Optional: mini permutation test

This approximates the paper’s proximity analysis by comparing the shortest-path distance between **COVID‑19** and **Baricitinib** to distances between **Baricitinib** and random **non‑COVID diseases**.


In [None]:
# Identify candidate disease nodes by heuristic from names touching 'disease' or ontology terms in types if present.
disease_like = set()
for col_name, col_type in [('x_name','x_type'), ('y_name','y_type')]:
    if 'x_type' in kg.columns and 'y_type' in kg.columns:
        mask = kg[col_type].str.lower().str.contains('disease', na=False)
        disease_like.update(kg.loc[mask, col_name].astype(str).tolist())
# Fallback: names that contain 'disease' if types aren't reliable
if not disease_like:
    for col in ['x_name','y_name']:
        disease_like.update(kg[col].astype(str).str.lower().str.contains('disease', na=False).replace({False: None}).dropna().index.tolist())

disease_like = list(disease_like)
print(f"[INFO] Candidate disease nodes (by heuristic): {len(disease_like):,}")

if best_path is not None and disease_like:
    # Exclude COVID-ish nodes from random sampling
    exclude = set(n for n in disease_like if any(k in str(n).lower() for k in DISEASE_KEYWORDS))
    pool = [n for n in disease_like if n not in exclude and n in G]
    sample = random.sample(pool, min(N_RANDOM, len(pool))) if pool else []

    def sp_len(u, v):
        try:
            return len(nx.shortest_path(G, u, v)) - 1
        except nx.NetworkXNoPath:
            return np.inf

    observed = sp_len(best_covid, best_baricitinib) if best_path is not None else np.inf
    rnd = [sp_len(n, best_baricitinib) for n in sample]
    rnd_finite = [x for x in rnd if np.isfinite(x)]

    print(f"[INFO] Observed shortest-path length (COVID → Baricitinib): {observed}")
    if rnd_finite:
        print(f"[INFO] Random diseases — mean ± std (finite only): {np.mean(rnd_finite):.2f} ± {np.std(rnd_finite):.2f} (n={len(rnd_finite)})")
        frac_better = np.mean([observed <= x for x in rnd_finite])
        print(f"[INFO] Fraction of random >= observed (1-sided): {frac_better:.3f}")
    else:
        print("[INFO] No finite random distances found (graph may be disconnected or sampling too small).")
else:
    print("[INFO] Skipping permutation test (no path or no disease pool).")

---

## Next Steps
- Verify specific protein nodes (e.g., **AAK1, GAK, JAK1, JAK2**) and **Reactome** pathways in your graph.
- Increase `cutoff` in path search or `EGO_RADIUS` to reveal more mechanism context.
- Expand permutation test (e.g., `N_RANDOM=1000`) once performance is acceptable on your machine.
- Compare with other candidate JAK inhibitors to contextualize Baricitinib.

