In [1]:
import scanpy as sc
import pandas as pd
import numpy as np


In [3]:
# Read the Xenopus AnnData object from disk and preview the first rows of its
# per-variable annotation table (`.var`).
adata_xeno = sc.read_h5ad("../adata_xeno_with_symbols.h5ad")
adata_xeno.var.head(n=5)


Unnamed: 0,raw_name,xeno_symbol
gene25011|Xelaev18004747m,gene25011|Xelaev18004747m,Xelaev18004747m
gene21250|Xetrov90028798m.L,gene21250|Xetrov90028798m.L,Xetrov90028798m.L
gene27977|Xelaev18004749m,gene27977|Xelaev18004749m,Xelaev18004749m
gene26149|Xelaev18004750m,gene26149|Xelaev18004750m,Xelaev18004750m
gene25611|Xelaev18004751m,gene25611|Xelaev18004751m,Xelaev18004751m


In [12]:
# Dimensions of the Xenopus AnnData object as (n_obs, n_vars)
adata_xeno.shape

(33985, 41560)

In [4]:
# Load the Xenbase ortholog file (tab-separated, no header row) and derive the
# two lookup columns, `xeno_symbol` and `human_symbol`, used by later cells.
orth = pd.read_csv("../data/XenbaseGeneHumanOrthologMapping.txt",
                   sep="\t", header=None)

# Column names are assigned purely by position.
# NOTE(review): the previewed values of column 3 read like descriptive gene
# names rather than gene symbols -- confirm the column semantics against the
# Xenbase documentation for this file.
orth = orth.rename(columns={0: "xeno_entrez",
                            1: "xenbase_id",
                            2: "xeno_symbol_raw",
                            3: "human_desc"})

# Missing values must stay missing. A bare `.astype(str)` turns NaN into the
# literal string "nan", which neither `dropna` nor the `!= ""` filter below
# would remove, so the original NaN positions are restored with `.where(...)`.
orth["xeno_symbol"] = (
    orth["xeno_symbol_raw"].astype(str).where(orth["xeno_symbol_raw"].notna())
)

# Keep the text before the first comma, trimmed of surrounding whitespace.
# NOTE(review): entries that themselves contain a comma are truncated by this
# split -- confirm that taking the first comma-separated piece is intended.
orth["human_symbol"] = (
    orth["human_desc"]
    .astype(str)
    .str.split(",").str[0]
    .str.strip()
    .where(orth["human_desc"].notna())
)

# Discard rows lacking either side of the mapping, then rows whose human entry
# is an empty string.
orth = orth.dropna(subset=["xeno_symbol", "human_symbol"])
orth = orth[orth["human_symbol"] != ""]
orth[["xeno_symbol", "human_symbol"]].head()


Unnamed: 0,xeno_symbol,human_symbol
0,trnt1,tRNA nucleotidyl transferase
1,foxh1.2,forkhead box H1
2,nr5a2,nuclear receptor subfamily 5 group A member 2
3,tbx1,T-box 1
4,nr1d1,nuclear receptor subfamily 1 group D member 1


In [13]:
# Size of the parsed Xenbase ortholog table as (rows, columns)
len(orth.index), len(orth.columns)

(15897, 6)

In [16]:
# Unique values of the `xeno_symbol` column in the scRNA-seq object's `.var` table
xeno_symbols_data = set(adata_xeno.var.loc[:, "xeno_symbol"])
# Unique values of the `xeno_symbol` column in the parsed Xenbase ortholog table
xeno_symbols_xenbase = set(orth.loc[:, "xeno_symbol"])

# Sizes of: the scRNA-seq symbol set, the Xenbase symbol set, and their intersection.
# NOTE(review): the recorded output shows only a very small overlap, and the
# previewed scRNA-seq symbols look like gene-model identifiers (some with a
# ".L" suffix) -- confirm both sources use the same identifier format before
# relying on exact string matching.
tuple(map(len, (xeno_symbols_data,
                xeno_symbols_xenbase,
                xeno_symbols_data.intersection(xeno_symbols_xenbase))))


(41188, 15884, 101)

In [17]:
# Restrict the Xenbase table to entries whose Xenopus symbol also occurs in the
# scRNA-seq symbol set; `.copy()` gives an independent frame to extend later.
xeno_orth_sub = orth.loc[orth["xeno_symbol"].isin(xeno_symbols_data)].copy()
# Size of the filtered table as (rows, columns)
xeno_orth_sub.shape

(101, 6)

In [18]:
# Preview the first Xenopus-symbol / human-symbol pairs of the filtered table
xeno_orth_sub.loc[:, ["xeno_symbol", "human_symbol"]].head()

Unnamed: 0,xeno_symbol,human_symbol
7,csnk1a1,casein kinase 1 alpha 1
133,fgfr3,fibroblast growth factor receptor 3 (achondrop...
223,hspa5,heat shock protein family A (Hsp70) member 5
497,nr2f2,nuclear receptor subfamily 2 group F member 2
576,nr4a1,nuclear receptor subfamily 4 group A member 1


In [21]:
# Number of distinct human symbols recorded for each Xenopus symbol
counts = xeno_orth_sub.groupby("xeno_symbol")["human_symbol"].nunique().rename("n_human")

# Attach the per-symbol count and the derived relationship label.
# The count is looked up with `map` rather than merged in, so this cell can be
# re-run safely: a second `merge` would emit `n_human_x` / `n_human_y` columns
# and the `n_human` lookup would then raise a KeyError.
n_human = xeno_orth_sub["xeno_symbol"].map(counts)
xeno_orth_sub = (
    xeno_orth_sub
    .assign(
        n_human=n_human,
        # exactly one distinct human symbol -> "1to1"; anything else -> "1toMany"
        relationship=np.where(n_human == 1, "1to1", "1toMany"),
    )
    # a merge returns a fresh 0..n-1 index; reproduce that here
    .reset_index(drop=True)
)

# How many rows fall into each relationship class?
xeno_orth_sub["relationship"].value_counts()


relationship
1to1    101
Name: count, dtype: int64

In [22]:
# Retain the rows labelled "1to1", then collapse to a single row per Xenopus
# symbol (the first occurrence is the one kept).
orth_baseline = (
    xeno_orth_sub
    .loc[xeno_orth_sub["relationship"].eq("1to1")]
    .drop_duplicates(subset=["xeno_symbol"])
)

# Lookup table: Xenopus symbol -> human symbol
xeno_to_human_baseline = dict(
    zip(orth_baseline["xeno_symbol"], orth_baseline["human_symbol"])
)

# Size of the mapping together with its first ten entries
len(xeno_to_human_baseline), list(xeno_to_human_baseline.items())[:10]


(101,
 [('csnk1a1', 'casein kinase 1 alpha 1'),
  ('fgfr3', 'fibroblast growth factor receptor 3 (achondroplasia'),
  ('hspa5', 'heat shock protein family A (Hsp70) member 5'),
  ('nr2f2', 'nuclear receptor subfamily 2 group F member 2'),
  ('nr4a1', 'nuclear receptor subfamily 4 group A member 1'),
  ('ruvbl2', 'RuvB like AAA ATPase 2'),
  ('ptf1a', 'pancreas associated transcription factor 1a'),
  ('lef1', 'lymphoid enhancer binding factor 1'),
  ('sox5', 'SRY-box 5'),
  ('smo', 'smoothened')])

In [23]:
# Write the 1-to-1 ortholog table to a tab-separated file without the index column
orth_baseline.to_csv(
    "../data/orthologs_1to1_xeno_human_101.tsv",
    index=False,
    sep="\t",
)


In [24]:
import pickle

# Serialize the Xenopus -> human symbol dictionary to disk (binary write mode)
with open("../data/xeno_to_human_baseline.pkl", mode="wb") as pkl_file:
    pickle.dump(xeno_to_human_baseline, pkl_file)
