In [None]:
## env:base
import os
import pandas as pd
import scanpy as sc

import matplotlib.pyplot as plt  # For plotting
import seaborn as sns            # For plotting
sns.set_style("white")
import numpy as np
import anndata

from scipy.stats import mannwhitneyu
import statsmodels.stats.multitest as smm

## plot
import seaborn as sns
import matplotlib.pyplot as plt

os.chdir("/home/wangjing/wangj/codebase/HUSI/")

In [None]:
# data downloaded from https://pubmed.ncbi.nlm.nih.gov/38279009/

# data preprocessing
dd='ReplogleWeissman2022_rpe1'
adata=sc.read_h5ad(dd+".h5ad")
adata

# remove low quality cells
pt_gene=adata.obs.perturbation.unique()
pt_gene=[x for x in pt_gene if 'control' not in x]
pt_gene=set(pt_gene) & set(adata.var_names)
print(len(pt_gene))

non_targeted_mask=adata.obs['perturbation']=='control'
non_targeted_means = {}
for gene in pt_gene:
    non_targeted_means[gene] = np.mean(adata[non_targeted_mask, :].obs_vector(gene))

# non_targeted_means
filtered_indices = []
for gene in pt_gene:
    perturbed_mask = adata.obs['perturbation'] == f'{gene}'
    gene_expression = adata[perturbed_mask, :].obs_vector(gene)
    threshold = non_targeted_means[gene] / 4

    valid_indices = np.where(gene_expression <= threshold)[0]
    global_indices = np.where(perturbed_mask)[0][valid_indices]
    filtered_indices.extend(global_indices)

adata_filtered = adata[filtered_indices, :]
adata_filtered 

pt_gene=adata.obs.perturbation.unique()
# pt_gene=[x for x in pt_gene if 'control' not in x]
print(len(pt_gene))

vn=set(adata.var_names)
noExp=set(pt_gene) - vn
print(len(noExp))

adata_sub=adata[adata.obs['perturbation'].isin(noExp)]
len(adata_sub.obs.perturbation.unique())

adatafinal=anndata.concat([adata_filtered,adata_sub],axis=0)
adatafinal

len(adatafinal.obs.perturbation.unique())

sc.pp.filter_cells(adatafinal,min_genes=200)
sc.pp.filter_genes(adatafinal,min_cells=3)
adatafinal


2106
2394
288


AnnData object with n_obs × n_vars = 228790 × 8749
    obs: 'batch', 'gene', 'gene_id', 'transcript', 'gene_transcript', 'guide_id', 'percent_mito', 'UMI_count', 'z_gemgroup_UMI', 'core_scale_factor', 'core_adjusted_UMI_count', 'disease', 'cancer', 'cell_line', 'sex', 'age', 'perturbation', 'organism', 'perturbation_type', 'tissue_type', 'ncounts', 'ngenes', 'nperts', 'percent_ribo', 'celltype', 'n_genes'
    var: 'n_cells'

In [None]:
adata=adatafinal

pt_gene=adata.obs.perturbation.unique()
print(len(pt_gene))
pt_gene=[x for x in pt_gene if 'control' not in x]
print(len(pt_gene))

2394
2393


In [6]:
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

In [None]:
def cal_hUSI(adata):
    mm_l2 = pd.read_csv('Data/SenOCLR_12_1_drop.csv',index_col=0)
    genes = set(mm_l2.index) & set(adata.var_names)
    try:
        exp = adata[:,list(genes)].X.todense()
    except:
        exp = adata[:,list(genes)].X
    exp = pd.DataFrame(exp,index=adata.obs_names,columns=list(genes))
    score = []
    for row in range(len(exp)):  
        score.append(mm_l2.w[list(genes)].corr(exp.iloc[row],method='spearman'))
    return score

In [8]:
hUSI = cal_hUSI(adata)
adata.obs['hUSI'] = hUSI
adata.obs

Unnamed: 0_level_0,batch,gene,gene_id,transcript,gene_transcript,guide_id,percent_mito,UMI_count,z_gemgroup_UMI,core_scale_factor,...,organism,perturbation_type,tissue_type,ncounts,ngenes,nperts,percent_ribo,celltype,n_genes,hUSI
cell_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCATCCAACCGG-21,21,PPP2CB,ENSG00000104695,P1P2,6679_PPP2CB_P1P2_ENSG00000104695,PPP2CB_+_30670275.23-P1P2|PPP2CB_+_30670289.23...,0.058505,4991.0,-0.515229,0.493396,...,human,CRISPR,cell_line,4925.0,1997,1,0.229239,retinal pigment epithelial cells,1997,0.019032
AAACGCTAGTGATAGT-49,49,PPP2CB,ENSG00000104695,P1P2,6679_PPP2CB_P1P2_ENSG00000104695,PPP2CB_+_30670275.23-P1P2|PPP2CB_+_30670289.23...,0.068958,14545.0,0.698256,0.876616,...,human,CRISPR,cell_line,14395.0,3277,1,0.263147,retinal pigment epithelial cells,3277,0.134005
AACGAAAAGTAAACGT-33,33,PPP2CB,ENSG00000104695,P1P2,6679_PPP2CB_P1P2_ENSG00000104695,PPP2CB_+_30670275.23-P1P2|PPP2CB_+_30670289.23...,0.057443,11507.0,-0.630857,1.141283,...,human,CRISPR,cell_line,11370.0,3169,1,0.256376,retinal pigment epithelial cells,3169,-0.010921
AAGACTCAGGCTCAAG-56,56,PPP2CB,ENSG00000104695,P1P2,6679_PPP2CB_P1P2_ENSG00000104695,PPP2CB_+_30670275.23-P1P2|PPP2CB_+_30670289.23...,0.051307,12123.0,0.027935,0.960976,...,human,CRISPR,cell_line,11982.0,3073,1,0.305959,retinal pigment epithelial cells,3073,0.009359
AAGTCGTAGTAAACGT-2,2,PPP2CB,ENSG00000104695,P1P2,6679_PPP2CB_P1P2_ENSG00000104695,PPP2CB_+_30670275.23-P1P2|PPP2CB_+_30670289.23...,0.060373,5847.0,-1.344563,0.936004,...,human,CRISPR,cell_line,5763.0,2016,1,0.289259,retinal pigment epithelial cells,2016,0.073845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGTCGAACACT-49,49,KCNA10,ENSG00000143105,ENST00000369771.2,4292_KCNA10_ENST00000369771.2_ENSG00000143105,KCNA10_-_111061735.23-ENST00000369771.2|KCNA10...,0.071890,13618.0,0.515106,0.876616,...,human,CRISPR,cell_line,13395.0,3594,1,0.195521,retinal pigment epithelial cells,3594,0.086938
TTTGTTGTCGAGAGAC-31,31,OR4K1,ENSG00000155249,ENST00000285600.4,5955_OR4K1_ENST00000285600.4_ENSG00000155249,OR4K1_-_20403828.23-ENST00000285600.4|OR4K1_+_...,0.064337,11533.0,0.046112,0.918100,...,human,CRISPR,cell_line,11317.0,3562,1,0.190421,retinal pigment epithelial cells,3562,0.073359
TTTGTTGTCGGCTCTT-36,36,non-targeting,non-targeting,non-targeting,11208_non-targeting_non-targeting_non-targeting,non-targeting_02977|non-targeting_01146,0.061335,11853.0,1.030208,0.644705,...,human,CRISPR,cell_line,11672.0,3613,0,0.197310,retinal pigment epithelial cells,3613,-0.008919
TTTGTTGTCGTTATCT-37,37,CEP68,ENSG00000011523,P1P2,1510_CEP68_P1P2_ENSG00000011523,CEP68_-_65283588.23-P1P2|CEP68_-_65283603.23-P1P2,0.066311,8988.0,-0.422864,0.908722,...,human,CRISPR,cell_line,8857.0,3030,1,0.191713,retinal pigment epithelial cells,3030,0.028007


In [9]:
adata.obs['hUSI_raw']=hUSI
adata.obs['hUSI']=(adata.obs['hUSI']-adata.obs['hUSI'].min())/(adata.obs['hUSI'].max()-adata.obs['hUSI'].min())
adata.obs['hUSI'].max()

np.float64(1.0)

In [10]:
meta=adata.obs

control_data=meta[meta['perturbation']=='control']['hUSI']
exp_groups=meta['perturbation'].unique()

results_hUSI=[]

for group in exp_groups:
    if group != 'control':
        exp_data=meta[meta['perturbation']==group]['hUSI']

        #fc
        fc=exp_data.mean()/control_data.mean()
        deltaMean=exp_data.mean()-control_data.mean()

        #wilcoxon test
        stat, p_values = mannwhitneyu(exp_data,control_data,alternative='two-sided')

        # make data
        results_hUSI.append({
            'group':group,
            'fc':fc,
            'deltaMean':deltaMean,
            'pvalue':p_values
        })

results_hUSI=pd.DataFrame(results_hUSI)

pv=results_hUSI['pvalue'].values
reject,padj, _, _=smm.multipletests(pv,alpha=0.05,method='fdr_bh')
results_hUSI['padj']=padj
results_hUSI

Unnamed: 0,group,fc,deltaMean,pvalue,padj
0,PPP2CB,1.254381,0.075099,7.368168e-08,1.632595e-07
1,USPL1,1.424136,0.125214,5.187201e-05,8.729236e-05
2,COPA,1.315289,0.093080,4.835780e-02,6.347791e-02
3,USP39,1.272551,0.080463,1.038491e-03,1.578850e-03
4,EIF3J,1.084295,0.024886,1.332752e-01,1.649056e-01
...,...,...,...,...,...
2388,TNFSF10,1.442909,0.130757,2.856769e-02,3.849239e-02
2389,CPEB1,1.903927,0.266859,1.990563e-11,6.162248e-11
2390,DCLRE1B,1.980220,0.289383,9.198530e-08,2.021311e-07
2391,GINS3,1.988711,0.291890,7.649524e-12,2.468195e-11


In [None]:
results_hUSI.to_csv('Data/RPE1_hUSI_'+dd+'_fc_SenOCLR_l2_1_drop.csv',sep='\t')
meta.to_csv('Data/RPE1_hUSI_'+dd+'_meta_SenOCLR_l2_1_drop.csv',sep='\t')

In [11]:
print(results_hUSI.shape)
res_sig=results_hUSI[results_hUSI['padj'] < 0.05]

print(res_sig.shape)
res_sig=results_hUSI[(results_hUSI['padj'] < 0.05) & (results_hUSI['fc'] > 2)]
print(res_sig.shape)

res_sig=res_sig.sort_values(by='fc',ascending=False)
res_sig.reset_index(drop=True,inplace=True)
res_sig.index +=1

res_sig.head(20)


(2393, 5)
(1808, 5)
(264, 5)


Unnamed: 0,group,fc,deltaMean,pvalue,padj
1,NACA,2.356832,0.400567,3.596164e-15,1.605526e-14
2,ECT2,2.347625,0.397849,7.635228e-13,2.735982e-12
3,PWP2,2.346738,0.397587,2.041249e-09,5.263696e-09
4,PRIM1,2.333028,0.39354,1.404089e-24,1.480169e-23
5,PSMA7,2.324195,0.390932,3.167716e-12,1.058707e-11
6,INCENP,2.319369,0.389507,2.406473e-05,4.160903e-05
7,PSMD13,2.313703,0.387835,8.43961e-16,4.039197e-15
8,PSMD2,2.304741,0.385189,9.738258e-13,3.467805e-12
9,DDX49,2.293894,0.381986,2.664407e-07,5.568494e-07
10,PDCD11,2.292617,0.381609,2.097677e-28,3.0795960000000003e-27


In [None]:
sc.pp.highly_variable_genes(adata,n_top_genes=2000, inplace=True)
adata.raw=adata
adata=adata[:,adata.var.highly_variable]
sc.pp.scale(adata,max_value=10)

In [None]:
## PCA
sc.tl.pca(adata,svd_solver='arpack')
sc.pl.pca(adata,color='hUSI')
sc.pl.pca_variance_ratio(adata,log=True)

In [None]:
##umap
sc.pp.neighbors(adata,n_pcs=10)
sc.tl.umap(adata)
sc.pl.umap(adata,color=['hUSI'])

In [None]:
adata.obs['group']=adata.obs['perturbation'].apply(lambda x: 'control' if x=='control' else 'perturbations')
adata.obs['group'].value_counts()

In [None]:
sc.pp.neighbors(adata,n_pcs=10)
sc.tl.umap(adata)

### all cells
sns.set(rc={'figure.figsize':(25,20)},font_scale=1.5)
sc.set_figure_params(dpi_save=300)
sns.set_style("white")
sc.pl.umap(adata, color=['group'],s=5,palette=['#C75C64',"#CEDFEF"],save='Results/PerturbSeq/scPerturb_seq_RPE1_umap_perturbation.pdf',show = True,title='Conditions')