In [None]:
%cd /content/drive/MyDrive/GSE210681/
!wget "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE210681&format=file" -O GSE210681_RAW.tar

/content/drive/MyDrive/GSE210681
--2024-07-20 18:21:40--  https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE210681&format=file
Resolving www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)... 130.14.29.110, 2607:f220:41e:4290::110
Connecting to www.ncbi.nlm.nih.gov (www.ncbi.nlm.nih.gov)|130.14.29.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6851092480 (6.4G) [application/x-tar]
Saving to: ‘GSE210681_RAW.tar’


2024-07-20 18:27:13 (19.6 MB/s) - ‘GSE210681_RAW.tar’ saved [6851092480/6851092480]



In [1]:
!tar -xvf GSE210681_RAW.tar

tar: GSE210681_RAW.tar: Cannot open: No such file or directory
tar: Error is not recoverable: exiting now


In [1]:
%cd /content/drive/MyDrive/GSE210681/
%pip install scanpy -q
import os
import anndata as ad
import pandas as pd
import numpy as np
import scanpy as sc

from tqdm import tqdm
from pathlib import Path

/content/drive/MyDrive/GSE210681


In [4]:
sample_list = list(set(["_".join(file.split('_')[:5]) for file in os.listdir()
                            if len(file.split('_')) >= 5 and '.xlsx' not in file
                        ]))
sample_list

['GSM6435418_scRNAseq_2kG_2PMDox2_3',
 'GSM6435408_scRNAseq_2kG_11AMDox_6',
 'GSM6435413_scRNAseq_2kG_2PMDox1_5',
 'GSM6435421_scRNAseq_2kG_2PMDox2_6',
 'GSM6435417_scRNAseq_2kG_2PMDox2_2',
 'GSM6435405_scRNAseq_2kG_11AMDox_3',
 'GSM6435411_scRNAseq_2kG_2PMDox1_3',
 'GSM6435412_scRNAseq_2kG_2PMDox1_4',
 'GSM6435410_scRNAseq_2kG_2PMDox1_2',
 'GSM6435422_scRNAseq_2kG_2PMDox2_7',
 'GSM6435409_scRNAseq_2kG_2PMDox1_1',
 'GSM6435414_scRNAseq_2kG_2PMDox1_6',
 'GSM6435419_scRNAseq_2kG_2PMDox2_4',
 'GSM6435420_scRNAseq_2kG_2PMDox2_5',
 'GSM6435415_scRNAseq_2kG_2PMDox1_7',
 'GSM6435404_scRNAseq_2kG_11AMDox_2',
 'GSM6435407_scRNAseq_2kG_11AMDox_5',
 'GSM6435416_scRNAseq_2kG_2PMDox2_1',
 'GSM6435403_scRNAseq_2kG_11AMDox_1',
 'GSM6435406_scRNAseq_2kG_11AMDox_4']

In [5]:
for sample in sample_list:
    print(f'processing {sample}')
    os.makedirs(sample, exist_ok=True)
    file_list = [file for file in os.listdir() if sample in file and file != sample]
    for file in file_list:
        os.rename(file, os.path.join(sample, file))

processing GSM6435418_scRNAseq_2kG_2PMDox2_3
processing GSM6435408_scRNAseq_2kG_11AMDox_6
processing GSM6435413_scRNAseq_2kG_2PMDox1_5
processing GSM6435421_scRNAseq_2kG_2PMDox2_6
processing GSM6435417_scRNAseq_2kG_2PMDox2_2
processing GSM6435405_scRNAseq_2kG_11AMDox_3
processing GSM6435411_scRNAseq_2kG_2PMDox1_3
processing GSM6435412_scRNAseq_2kG_2PMDox1_4
processing GSM6435410_scRNAseq_2kG_2PMDox1_2
processing GSM6435422_scRNAseq_2kG_2PMDox2_7
processing GSM6435409_scRNAseq_2kG_2PMDox1_1
processing GSM6435414_scRNAseq_2kG_2PMDox1_6
processing GSM6435419_scRNAseq_2kG_2PMDox2_4
processing GSM6435420_scRNAseq_2kG_2PMDox2_5
processing GSM6435415_scRNAseq_2kG_2PMDox1_7
processing GSM6435404_scRNAseq_2kG_11AMDox_2
processing GSM6435407_scRNAseq_2kG_11AMDox_5
processing GSM6435416_scRNAseq_2kG_2PMDox2_1
processing GSM6435403_scRNAseq_2kG_11AMDox_1
processing GSM6435406_scRNAseq_2kG_11AMDox_4


In [8]:
def parse_perturb(item):
    if '-and-' in str(item):
        return '+'.join(item.split('-and-'))
    elif pd.isna(item):
        return 'No_Guide'
    else:
        return item

def parse_guide(item):
    if pd.isna(item):
        return 'No_Guide'
    else:
        return item

In [9]:
adatas = {}
for sample in tqdm(sample_list):
    adata = sc.read_10x_mtx(sample, var_names='gene_symbols', cache=True, prefix=f"{sample}_")
    perturb_file = [file for file in os.listdir(sample) if 'guidemut' in file][0]
    df = pd.read_csv(Path(sample) / perturb_file, sep='\t', index_col=0)
    df.set_index('CBC_10x', drop=True, inplace=True)
    df.index = df.index.to_series() + '-1'

    adata.obs['perturbation_name'] = df['Gene'].apply(parse_perturb)
    adata.obs['guide_sequence'] = df['Guide'].apply(parse_guide)
    adata.obs['n_guides'] = adata.obs['perturbation_name'].apply(lambda x: len(x.split("+")))

    adata.var_names_make_unique()
    adata.obs_names_make_unique()
    adatas[sample] = adata

100%|██████████| 20/20 [02:59<00:00,  8.98s/it]


In [14]:
adata = ad.concat(adatas, merge='unique', join='outer', label='batch')
adata.obs['organism'] = "Humans (Homo sapiens)"
adata.obs['organ'] = 'Endothelial Cells'
adata.obs['celltype'] = 'Endothelial Cells'
adata.obs['condition'] = 'unconditional'
adata.obs['crispr_type'] = 'CRISPRi'

  utils.warn_names_duplicates("obs")


In [18]:
adata.write_h5ad("./GSE210681_merged.h5ad")
adata

AnnData object with n_obs × n_vars = 822156 × 36601
    obs: 'perturbation_name', 'guide_sequence', 'n_guides', 'batch', 'organism', 'organ', 'celltype', 'condition', 'crispr_type'
    var: 'gene_ids', 'feature_types'

In [25]:
adata.obs.head()

Unnamed: 0,perturbation_name,guide_sequence,n_guides,batch,organism,organ,celltype,condition,crispr_type
AAACCCAAGAATCCCT-1,PYROXD1,GCGGCGGAGCAACGGGACTC,1,GSM6435418_scRNAseq_2kG_2PMDox2_3,Humans (Homo sapiens),Endothelial Cells,Endothelial Cells,unconditional,CRISPRi
AAACCCAAGACCATTC-1,EPHA2,GGGCGTTGGTGACGTCACGC,1,GSM6435418_scRNAseq_2kG_2PMDox2_3,Humans (Homo sapiens),Endothelial Cells,Endothelial Cells,unconditional,CRISPRi
AAACCCAAGACGATAT-1,No_Guide,No_Guide,1,GSM6435418_scRNAseq_2kG_2PMDox2_3,Humans (Homo sapiens),Endothelial Cells,Endothelial Cells,unconditional,CRISPRi
AAACCCAAGCAGATAT-1,No_Guide,No_Guide,1,GSM6435418_scRNAseq_2kG_2PMDox2_3,Humans (Homo sapiens),Endothelial Cells,Endothelial Cells,unconditional,CRISPRi
AAACCCAAGCCGCACT-1,No_Guide,No_Guide,1,GSM6435418_scRNAseq_2kG_2PMDox2_3,Humans (Homo sapiens),Endothelial Cells,Endothelial Cells,unconditional,CRISPRi


In [22]:
%pip install openpyxl -q
gene_perturb_info = pd.read_excel("./41586_2024_7022_MOESM3_ESM.xlsx", sheet_name='Suppl.Table.10')
gene_perturb_info

Unnamed: 0,genes,perturbation,log2fc,p.value,FDR
0,AC022154.1:ENSG00000268093,AL354709.1,0.777980,0.000289,1.0
1,AC080162.1:ENSG00000239300,AL354709.1,0.461002,0.000960,1.0
2,AC096586.1:ENSG00000272936,AL354709.1,1.036129,0.000129,1.0
3,AL358473.1:ENSG00000229191,AL354709.1,0.461002,0.000960,1.0
4,AL359643.2:ENSG00000271978,AL354709.1,1.104281,0.000768,1.0
...,...,...,...,...,...
65198,PEX1:ENSG00000127980,ZWINT,-0.866241,0.000959,1.0
65199,SP140:ENSG00000079263,ZWINT,0.146615,0.000673,1.0
65200,TRAPPC8:ENSG00000153339,ZWINT,0.806033,0.000241,1.0
65201,USP7:ENSG00000187555,ZWINT,-0.905943,0.000741,1.0
