# 🔗 Tabula Sapiens ↔ Tabula Muris Senis Ortholog Workflow

### Overview  
Process the human **Tabula Sapiens** and mouse **Tabula Muris Senis** atlases to create matched AnnData objects restricted to protein‑coding, 1‑to‑1 orthologous genes, plus the corresponding scDRS covariate files, keeping only cell types with ≥ 20 cells. All paths (`TS_DIR`, `TMS_DIR`, `REF_DIR`, `OUTPUT_DIR`) are relative to the working directory.

## 🔧 Environment

In [None]:
import scanpy as sc
import anndata as ad
import pandas as pd
from pathlib import Path

## 📂 Paths

In [None]:
# Input directories, all relative to the current working directory.
TS_DIR = Path('data/TabulaSapiens')
TMS_DIR = Path('data/TabulaMurisSenis')
REF_DIR = Path('data')

# Directory for every file this workflow writes; created up front
# (including missing parents) so later write calls cannot fail on it.
OUTPUT_DIR = Path('output/TS_TMS')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Atlas files.
TS_H5AD = TS_DIR.joinpath('TabulaSapiens.h5ad')
TMS_H5AD = TMS_DIR.joinpath('TMS_facs_raw.h5ad')

# Reference tables: mouse-to-human ortholog table and per-gene annotation.
ORTHO_TXT = REF_DIR.joinpath('biomartMouse2HumanOrthos.txt')
GENE_COORDS = REF_DIR.joinpath('geneMatrix.tsv.gz')

## 🧬 Load & Clean Tabula Sapiens

In [None]:
# Read the human atlas and make the 'raw_counts' layer the main matrix.
ts = sc.read_h5ad(TS_H5AD)
ts.X = ts.layers['raw_counts']

# Drop the 'decontXcounts' layer when the file carries one; it is not used
# anywhere below.
if 'decontXcounts' in ts.layers:
    del ts.layers['decontXcounts']


## 🐭 Load & Map Tabula Muris Senis

In [None]:
# Read the mouse atlas and relabel its genes with the identifiers found in
# the `ensgid` column of the ortholog table.
tms = sc.read_h5ad(TMS_H5AD)

# Passing `names=` without `header=` makes pandas treat the file as
# header-less.
# NOTE(review): if the file does contain a header line it will be read as a
# data row -- confirm against the actual file.
# The table is bound to `north` because that is the name the mapping below
# and the ortholog-selection cell read (the original bound it to `orth`,
# which left `north` undefined and raised NameError).
north = pd.read_csv(ORTHO_TXT, sep='\t', names=['ENSMUSG','ensgid','HUMAN','MOUSE'])

# MOUSE -> ensgid lookup; when a MOUSE value occurs on several rows the last
# row wins (dict semantics). Genes absent from the table keep their name.
map_d = dict(zip(north.MOUSE, north.ensgid))
tms.var_names = [map_d.get(g, g) for g in tms.var_names]

# Several mouse genes can end up with the same label. Keep the first column
# for each label, selecting with a boolean mask: label-based selection is
# not reliable while var_names still contains duplicates.
first_occurrence = ~tms.var_names.duplicated()
tms = tms[:, first_occurrence].copy()


## 🔗 Identify 1‑to‑1 Protein‑Coding Orthologs

In [None]:
# Keep only strict 1-to-1 ortholog pairs.
# Exact repeats of the same (MOUSE, HUMAN) pair are collapsed first; then any
# gene that still appears on more than one row -- i.e. maps to several
# partners -- is removed entirely. `drop_duplicates` alone would keep the
# first row of every one-to-many mapping, which is not 1-to-1 and may
# disagree with the (last-row-wins) lookup used to relabel the mouse genes.
pairs = north.drop_duplicates(['MOUSE', 'HUMAN'])
is_one_to_one = (
    ~pairs['MOUSE'].duplicated(keep=False)
    & ~pairs['HUMAN'].duplicated(keep=False)
)
north = pairs[is_one_to_one]

# Genes present in both atlases and in the 1-to-1 table.
# NOTE(review): the mouse var_names were relabelled from the `ensgid` column,
# while this intersection uses the `HUMAN` column -- confirm both columns
# (and ts.var_names) share one identifier namespace, otherwise `common`
# will be empty or too small.
expr_ts = set(ts.var_names)
expr_tms = set(tms.var_names)
common = set(north.HUMAN) & expr_ts & expr_tms

# Restrict to genes whose `gene_type` is 'protein_coding' in the gene table.
coords = pd.read_csv(GENE_COORDS, sep='\t')
pc_set = set(coords.loc[coords['gene_type'] == 'protein_coding', 'Gene'])

# Sorted so both atlases end up with the same, reproducible gene order.
final_genes = sorted(common & pc_set)
print('Genes retained:', len(final_genes))

ts  = ts[:, final_genes]
tms = tms[:, final_genes]


## 🧹 Cell‑Type Cleaning & Min‑Cell Filter (≥20)

In [None]:
def clean(a, label, min_cells=20):
    """Normalise cell-type labels and drop cell types with too few cells.

    Whitespace, commas and hyphens in ``a.obs['cell_ontology_class']`` are
    each replaced by an underscore; cells whose (normalised) cell type has
    fewer than ``min_cells`` cells are removed. Cells with a missing label
    are removed as well, since a missing label never matches a kept type.

    Parameters
    ----------
    a
        AnnData-like object exposing ``obs`` (a DataFrame with a
        ``cell_ontology_class`` column), boolean-mask row indexing and
        ``copy()``. It is not modified.
    label
        Unused; kept so existing calls keep working.
    min_cells
        Minimum number of cells a cell type needs in order to be kept.

    Returns
    -------
    A filtered copy of ``a`` whose ``cell_ontology_class`` column holds the
    normalised labels.
    """
    col = 'cell_ontology_class'
    # Raw string: '\s' and '\-' are not valid escapes in a normal string
    # literal and trigger a SyntaxWarning on recent Python versions.
    labels = a.obs[col].str.replace(r'[\s,\-]', '_', regex=True)

    counts = labels.value_counts()
    keep = counts[counts >= min_cells].index
    mask = labels.isin(keep).to_numpy()

    # Subset first and write the cleaned labels onto the copy, so the
    # caller's object (possibly a view) is never written to.
    out = a[mask].copy()
    out.obs[col] = labels.to_numpy()[mask]
    return out

# Normalise cell-type labels and apply the minimum-cell filter to each
# atlas; both names are rebound to the filtered copies.
ts = clean(ts, label='donor')
tms = clean(tms, label='mouse.id')


## 💾 Save Filtered AnnData Objects

In [None]:
# Destination files for the filtered atlases (inside OUTPUT_DIR).
ts_fp  = OUTPUT_DIR/'TabulaSapiens_pc_ortholog_minCell20.h5ad'
# The original line carried a stray leading space, which is an
# IndentationError at top level.
tms_fp = OUTPUT_DIR/'TMS_pc_ortholog_minCell20.h5ad'

# Write both objects to disk.
ts.write(ts_fp)
tms.write(tms_fp)


## 📊 Generate scDRS Covariates

In [None]:
def make_cov(a, id_col, path):
    """Write a tab-separated covariate table for ``a`` to ``path``.

    The table is indexed like ``a.obs`` and has, in this order:

    * ``const``   -- the constant 1,
    * ``n_genes`` -- per row of ``a.X``, the number of entries greater
      than zero,
    * one 0/1 column ``{id_col}_{value}`` for every distinct value of
      ``a.obs[id_col]``, in sorted order.

    NOTE(review): ``const`` plus a full set of indicator columns is
    linearly dependent -- confirm the consumer of this file tolerates that.

    Returns ``None``; the only effect is the file written to ``path``.
    """
    groups = a.obs[id_col]

    design = pd.DataFrame(index=a.obs.index)
    design['const'] = 1
    design['n_genes'] = (a.X > 0).sum(1)

    # Build every indicator first, then attach them in sorted-level order.
    indicators = {
        f'{id_col}_{level}': (groups == level).astype(int)
        for level in sorted(groups.unique())
    }
    for column_name, flags in indicators.items():
        design[column_name] = flags

    design.to_csv(path, sep='\t')

# One covariate file per atlas, written next to the filtered .h5ad files:
# indicators come from 'donor' for the human data and 'mouse.id' for the
# mouse data.
make_cov(
    ts,
    id_col='donor',
    path=OUTPUT_DIR/'TabulaSapiens_pc_ortholog_minCell20.cov',
)
make_cov(
    tms,
    id_col='mouse.id',
    path=OUTPUT_DIR/'TMS_pc_ortholog_minCell20.cov',
)