This notebook recapitulates the scrnaseq analysis performed by cohen, using their notebook code where possible, with the aim of producing a list of cell-type calls and filtered lists.

First, we need to get their data.

In [1]:
import urllib.request
import subprocess
data_root="/home/mcn26/palmer_scratch/tabula_data"

In [9]:
urllib.request.urlretrieve("https://zenodo.org/records/14907846/files/scMPRA.zip?download=1",f"{data_root}/dl")

('/home/mcn26/palmer_scratch/tabula_data/dl',
 <http.client.HTTPMessage at 0x151125a61be0>)

In [13]:
subprocess.run(["file",f"{data_root}/dl"])

/home/mcn26/palmer_scratch/tabula_data/dl: Zip archive data, at least v2.0 to extract


CompletedProcess(args=['file', '/home/mcn26/palmer_scratch/tabula_data/dl'], returncode=0)

In [16]:
subprocess.run(["unzip","-d",data_root,f"{data_root}/dl"])

Archive:  /home/mcn26/palmer_scratch/tabula_data/dl
   creating: /home/mcn26/palmer_scratch/tabula_data/scMPRA/
  inflating: /home/mcn26/palmer_scratch/tabula_data/__MACOSX/._scMPRA  
 extracting: /home/mcn26/palmer_scratch/tabula_data/scMPRA/.Rhistory  
  inflating: /home/mcn26/palmer_scratch/tabula_data/scMPRA/mascot.yml  
  inflating: /home/mcn26/palmer_scratch/tabula_data/scMPRA/.DS_Store  
  inflating: /home/mcn26/palmer_scratch/tabula_data/__MACOSX/scMPRA/._.DS_Store  
  inflating: /home/mcn26/palmer_scratch/tabula_data/scMPRA/readme.md  
  inflating: /home/mcn26/palmer_scratch/tabula_data/__MACOSX/scMPRA/._readme.md  
  inflating: /home/mcn26/palmer_scratch/tabula_data/scMPRA/.RData  
   creating: /home/mcn26/palmer_scratch/tabula_data/scMPRA/scripts/
   creating: /home/mcn26/palmer_scratch/tabula_data/scMPRA/data/
   creating: /home/mcn26/palmer_scratch/tabula_data/scMPRA/notebooks/
  inflating: /home/mcn26/palmer_scratch/tabula_data/scMPRA/scripts/sc_crs_exp.py  
  inflating: 

CompletedProcess(args=['unzip', '-d', '/home/mcn26/palmer_scratch/tabula_data', '/home/mcn26/palmer_scratch/tabula_data/dl'], returncode=0)

In [2]:
subprocess.run(["rm",f"{data_root}/dl"])

CompletedProcess(args=['rm', '/home/mcn26/palmer_scratch/tabula_data/dl'], returncode=0)

Code below adapted from cohen zenodo, `Part1_section3_analyze_scTranscriptome_for_retina.ipynb`.

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import rc_context

In [4]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

scanpy==1.10.4 anndata==0.11.3 umap==0.5.7 numpy==2.1.3 scipy==1.15.1 pandas==2.2.3 scikit-learn==1.6.1 statsmodels==0.14.4 pynndescent==0.5.13


In [5]:
zenodo_root=f"{data_root}/scMPRA"
zenodo_root

'/home/mcn26/palmer_scratch/tabula_data/scMPRA'

In [6]:
adata_1 = sc.read_10x_mtx(
    f'{zenodo_root}/data/sc_transcriptome_data/retina_matrix_r1/',  # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)  

adata_2 = sc.read_10x_mtx(
    f'{zenodo_root}/data/sc_transcriptome_data/retina_matrix_r2/',  # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)

... writing an h5ad cache file to speedup reading next time
... writing an h5ad cache file to speedup reading next time


In [7]:
adata= adata_1.concatenate(adata_2, batch_key='sample')

  adata= adata_1.concatenate(adata_2, batch_key='sample')


In [8]:
adata.var_names_make_unique()

In [9]:
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

filtered out 4 cells that have less than 200 genes expressed
filtered out 12101 genes that are detected in less than 3 cells


In [10]:
adata.var['mt'] = adata.var_names.str.startswith('mt-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

Seems as though they never filtered on mitochondrial, total_counts, but rather just regressed the effects out. So for our purposes (selecting valid cells) our replication of their analysis ends here.

In [11]:
cell_names = list(adata.obs_names)

replicate_1 = [name for name in cell_names if name.endswith("-0")]
replicate_2 = [name for name in cell_names if name.endswith("-1")]

In [12]:
replicate_1=[bc.split("-")[0] for bc in replicate_1]
replicate_2=[bc.split("-")[0] for bc in replicate_2]

In [33]:
with open(f"{data_root}/raw_recap/cohen_retina/cell_names/retina_rep1_cell_names", "w") as f:
    for name in replicate_1:
        f.write(f"{name}\n")

In [34]:
with open(f"{data_root}/raw_recap/cohen_retina/cell_names/retina_rep2_cell_names", "w") as f:
    for name in replicate_2:
        f.write(f"{name}\n")