In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence UserWarning and pandas PerformanceWarning messages for the rest of the session.
warnings.simplefilter("ignore", UserWarning)
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

In [2]:
# Configure rpy2 to use the correct R installation.
# R_HOME is set before the rpy2 import below; presumably rpy2 reads it when
# locating R, so the order matters — TODO confirm. The path is a hard-coded
# absolute location.
import os
os.environ['R_HOME'] = '/software/cellgen/team205/kk18/envs/generic_env/lib/R'
# Sanity check: print the R home that rpy2 reports.
from rpy2.rinterface_lib import openrlib
print(openrlib.R_HOME)

/software/cellgen/team205/kk18/envs/generic_env/lib/R


In [3]:
import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging

from rpy2.robjects import pandas2ri
from rpy2.robjects import r

# Keep scanpy quiet and only let R-side messages of level ERROR or above through.
sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Switch on the pandas <-> R and AnnData <-> R object converters.
pandas2ri.activate()
anndata2ri.activate()

# Register the rpy2 IPython extension that provides the %%R cell magic used below.
%load_ext rpy2.ipython

  anndata2ri.activate()


In [4]:
# Render scanpy/matplotlib figures at 80 dpi.
sc.settings.set_figure_params(dpi=80)

In [5]:
import importlib.util
import sys

# Load the shared helper script straight from its file path and expose it as `utils`.
_UTILS_MODULE_NAME = "module.name"
_UTILS_PATH = "/nfs/team205/kk18/function/python/utils.py"

spec = importlib.util.spec_from_file_location(_UTILS_MODULE_NAME, _UTILS_PATH)
utils = importlib.util.module_from_spec(spec)
# Register the module object before executing it.
sys.modules[_UTILS_MODULE_NAME] = utils
spec.loader.exec_module(utils)

In [6]:
%%R
# edgeR: differential expression on the pseudobulk counts (it attaches limma).
# tidyverse: data-frame helpers such as rownames_to_column() used further down.
library(edgeR)
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


Loading required package: limma


# Read in adata

In [7]:
# Pseudobulked counts, one observation per nhood (nhood-by-gene matrix).
adata_path = '/nfs/team205/heart/anndata_objects/Foetal/trisomy21/Euploid_T21Hearts_Aug2024_sel_subsampled-per-cellstate-donor_cells-in-sig-nhoods-sfdr0p1_pseudobulked-nhood-reassigned-sum_finegrain.h5ad'
adata = sc.read_h5ad(adata_path)
adata

AnnData object with n_obs × n_vars = 577 × 36601
    obs: 'index_cell', 'logFC', 'PValue', 'FDR', 'SpatialFDR', 'nhood_annotation', 'deg_group'
    var: 'gene_id', 'gene_name_scRNA-0-0', 'feature_type-1', 'mt-1', 'ribo-1', 'n_cells_by_counts-1', 'mean_counts-1', 'log1p_mean_counts-1', 'pct_dropout_by_counts-1', 'total_counts-1', 'log1p_total_counts-1', 'gene_name_GenomicFeatures', 'gene_biotype_GenomicFeatures', 'gene_seq_start_GenomicFeatures', 'gene_seq_end_GenomicFeatures', 'seq_name_GenomicFeatures', 'seq_strand_GenomicFeatures', 'seq_coord_system_GenomicFeatures', 'description_GenomicFeatures', 'gene_id_version_GenomicFeatures', 'canonical_transcript_GenomicFeatures', 'gene_name'

# DEG test for each cell type; logFC direction: trisomy21 vs euploid (euploid is the reference)

In [8]:
# Every nhood annotation except the 'Mixed' label.
nhood_annotations = adata.obs['nhood_annotation'].unique()
celltype_list = [name for name in nhood_annotations if name != 'Mixed']
celltype_list

['AtrialCardiomyocytesRight',
 'AtrialCardiomyocytesCycling',
 'VentricularConductionSystemProximal',
 'AtrialCardiomyocytesLeft',
 'AtrioventricularNodePacemakerCells',
 'CoronaryCapillaryEndothelialCells',
 'CoronaryPericytes',
 'CoronaryVesselAdventitialFibroblasts',
 'DuctusArteriosusSmoothMuscleCells',
 'EndocardialCells',
 'EndocardialCushionCells',
 'SubEpicardialFibroblasts',
 'GreatVesselAdventitialFibroblasts',
 'GreatVesselArterialEndothelialCells',
 'GreatVesselSmoothMuscleCells',
 'MacrophagesATF3pos',
 'MacrophagesLYVE1pos',
 'Megakaryocytes',
 'MesothelialEpicardialCells',
 'MyocardialInterstitialFibroblasts',
 'ValveInterstitialCells',
 'Myofibroblasts',
 'ParasympatheticNeurons',
 'VentricularCardiomyocytesCycling',
 'VentricularCardiomyocytesRightTrabeculated',
 'VentricularCardiomyocytesLeftCompact',
 'VentricularCardiomyocytesLeftTrabeculated',
 'VentricularCardiomyocytesRightCompact']

In [9]:
# Keep cell types that have at least 3 pseudobulk profiles (nhoods) in BOTH groups.
deg_groups = ['euploid', 'trisomy21']
ctab = pd.crosstab(adata.obs['nhood_annotation'], adata.obs['deg_group'])[deg_groups]
has_enough = (ctab >= 3).all(axis=1)
celltypes_sel = [name for name in ctab.index[has_enough] if name != 'Mixed']

# Restrict to the selected cell types, then to the two groups being compared.
adata_sub = adata[adata.obs['nhood_annotation'].isin(celltypes_sel)]
adata_sub = adata_sub[adata_sub.obs['deg_group'].isin(deg_groups)]

# Show the per-cell-type group sizes that go into the test.
pd.crosstab(adata_sub.obs['nhood_annotation'], adata_sub.obs['deg_group'])

deg_group,euploid,trisomy21
nhood_annotation,Unnamed: 1_level_1,Unnamed: 2_level_1
AtrialCardiomyocytesLeft,12,24
AtrialCardiomyocytesRight,54,16
CoronaryCapillaryEndothelialCells,6,5
EndocardialCells,3,9
GreatVesselAdventitialFibroblasts,4,12
GreatVesselSmoothMuscleCells,13,9
MacrophagesATF3pos,5,7
MacrophagesLYVE1pos,14,30
MyocardialInterstitialFibroblasts,25,18
SubEpicardialFibroblasts,6,9


In [10]:
%%R -i adata_sub
# Pass adata_sub into the R session (-i) and check which class it arrives as.
class(adata_sub)

[1] "SingleCellExperiment"
attr(,"package")
[1] "SingleCellExperiment"


In [11]:
%%R
# Adapted from singleCellTK:
# https://github.com/compbiomed/singleCellTK/blob/40527afd2e58e14451aae655c0d8fae4a7a0f54b/R/subsetSCE.R#L140
#
# Subset the columns of `inSCE`. Several selectors may be combined; a column is
# kept only if every supplied selector keeps it.
#
# Arguments:
#   inSCE   Object that supports ncol(), `[ , j]` and (when `colData` is used)
#           colData(), e.g. a SingleCellExperiment.
#   index   Integer column positions to keep (between 1 and ncol(inSCE)).
#   bool    Logical vector, one entry per column; TRUE keeps the column.
#           NA entries are treated as FALSE.
#   colData Character vector of R expressions, each evaluated against the
#           columns of colData(inSCE) and expected to give one logical per
#           column. Names not found in colData(inSCE) are looked up in the
#           caller's environment. NA results are treated as FALSE.
#
# Returns: `inSCE` restricted to the selected columns.
# Errors:  stop() if no selector is given or a selector is malformed.
subsetSCECols <- function(inSCE, index = NULL, bool = NULL, colData = NULL) {

  # Capture the caller's environment up front so expressions passed in
  # `colData` can see the caller's variables instead of this function's locals.
  caller.env <- parent.frame()

  if(is.null(index) && is.null(bool) && is.null(colData)) {
    stop("At least one of 'index', 'bool', or 'colData' must be supplied.")
  }
  n.cols <- ncol(inSCE)
  final.ix <- rep(TRUE, n.cols)

  # Parse index containing integers
  if(!is.null(index)) {
    # anyNA() first: otherwise min()/max() return NA and if() fails with an
    # uninformative "missing value where TRUE/FALSE needed" error.
    if(anyNA(index) || min(index) < 1 || max(index) > n.cols) {
      stop("'index' must contain integers between 1 and the number of columns ",
           "in 'inSCE': ", ncol(inSCE))
    }
    # seq_len() (not seq()) so a zero-column input does not yield c(1, 0).
    final.ix[!seq_len(n.cols) %in% index] <- FALSE
  }

  # Parse Boolean vector
  if(!is.null(bool)) {
    if(length(bool) != n.cols || !is.logical(bool)) {
      stop("'bool' must be a logical vector the same length as the number of ",
           "colmns in 'inSCE': ", ncol(inSCE))
    }
    # A logical NA subscript assigns nothing, so NA entries would silently be
    # kept; drop them explicitly.
    final.ix[is.na(bool) | !bool] <- FALSE
  }

  # Parse expressions for colData variables
  if(!is.null(colData)) {
    col.data <- as.data.frame(colData(inSCE))
    for(i in seq_along(colData)) {
      temp <- eval(parse(text = as.character(colData[i])),
                   envir = col.data,
                   enclos = caller.env)
      if(length(temp) != n.cols || !is.logical(temp)) {
        stop("The expression ", colData[i], " did not produce a boolean ",
             "vector the same length as the number of columns in 'inSCE'. ",
             "Please ensure that the spelling of the variable you are ",
             "trying to use matches one of the column names in ",
             "'colData(inSCE)' and that your expression is valid.")
      }
      # Same NA handling as for `bool`: an NA comparison result drops the column.
      final.ix[is.na(temp) | !temp] <- FALSE
    }
  }

  inSCE <- inSCE[,final.ix]
  return(inSCE)
  }

# Based on https://www.sc-best-practices.org/conditions/differential_gene_expression.html
#
# Fit an edgeR quasi-likelihood GLM on the "X" assay of `adata_`, with the
# columns grouped by colData(adata_)$deg_group.
#
# Arguments:
#   adata_  Object providing assay(adata_, "X") (counts, genes x samples) and
#           a `deg_group` column in colData(adata_).
#
# Returns: list with
#   fit     result of glmQLFit()
#   design  design matrix with one column per group ("group<level>"), no intercept
#   y       the filtered, normalised DGEList with dispersion estimates
fit_model <- function(adata_){
    # Grouping factor. Unused levels are dropped explicitly: a level with no
    # samples would add an all-zero column to the design matrix, which makes
    # the design rank-deficient.
    group <- droplevels(as.factor(colData(adata_)$deg_group))
    # create an edgeR object with counts and grouping factor
    y <- DGEList(assay(adata_, "X"), group = group)
    # filter out genes with low counts
    print("Dimensions before subsetting:")
    print(dim(y))
    print("")
    keep <- filterByExpr(y)
    # keep.lib.sizes=FALSE recomputes library sizes from the retained genes
    y <- y[keep, , keep.lib.sizes=FALSE]
    print("Dimensions after subsetting:")
    print(dim(y))
    print("")
    # normalize
    y <- calcNormFactors(y)
    # Design matrix: one indicator column per group and no intercept, so that
    # contrasts can be written directly between group columns. Only the group
    # is modelled here; no other covariates (e.g. sex or donor) enter the design.
    design <- model.matrix(~ 0 + group)
    # estimate dispersion
    y <- estimateDisp(y, design = design)
    # fit the model
    fit <- glmQLFit(y, design)
    return(list("fit"=fit, "design"=design, "y"=y))
}

In [12]:
%%time
%%R -o tt_all
# For every cell type: subset the nhood pseudobulks, fit the edgeR model and
# test trisomy21 against euploid (positive logFC = higher in trisomy21).
# The per-cell-type tables are collected in a list and bound once at the end;
# the combined table `tt_all` is exported back to Python (-o).
celltype_list <- as.character(unique(colData(adata_sub)$nhood_annotation))
tt_list <- vector("list", length(celltype_list))
for(i in seq_along(celltype_list)){
    celltype <- celltype_list[i]
    print(celltype)
    # subset: pass the logical mask directly instead of a string expression
    # that relies on `celltype` being resolvable when it is eval()'d
    sce <- subsetSCECols(adata_sub,
                         bool = colData(adata_sub)$nhood_annotation %in% celltype)
    # fit model
    outs <- fit_model(sce)
    # get output
    fit <- outs$fit
    y <- outs$y
    # deg: contrast built from the design matrix the model was fitted with
    myContrast <- makeContrasts('grouptrisomy21-groupeuploid', levels = outs$design)
    qlf <- glmQLFTest(fit, contrast=myContrast)
    # get all of the DE genes and calculate Benjamini-Hochberg adjusted FDR
    tt <- topTags(qlf, n = Inf)
    tt <- tt$table
    tt <- rownames_to_column(tt, var = "gene_name")
    tt$celltype <- celltype
    # collect; bound together after the loop
    tt_list[[i]] <- tt
    print('')
}
# Single rbind over all tables (NULL when there were no cell types).
tt_all <- do.call(rbind, tt_list)

[1] "AtrialCardiomyocytesRight"
[1] "Dimensions before subsetting:"
[1] 36601    70
[1] ""
[1] "Dimensions after subsetting:"
[1] 3761   70
[1] ""
[1] ""
[1] "AtrialCardiomyocytesLeft"
[1] "Dimensions before subsetting:"
[1] 36601    36
[1] ""
[1] "Dimensions after subsetting:"
[1] 3740   36
[1] ""
[1] ""
[1] "CoronaryCapillaryEndothelialCells"
[1] "Dimensions before subsetting:"
[1] 36601    11
[1] ""
[1] "Dimensions after subsetting:"
[1] 2684   11
[1] ""
[1] ""
[1] "EndocardialCells"
[1] "Dimensions before subsetting:"
[1] 36601    12
[1] ""
[1] "Dimensions after subsetting:"
[1] 5171   12
[1] ""
[1] ""
[1] "SubEpicardialFibroblasts"
[1] "Dimensions before subsetting:"
[1] 36601    15
[1] ""
[1] "Dimensions after subsetting:"
[1] 2628   15
[1] ""
[1] ""
[1] "GreatVesselAdventitialFibroblasts"
[1] "Dimensions before subsetting:"
[1] 36601    16
[1] ""
[1] "Dimensions after subsetting:"
[1] 3758   16
[1] ""
[1] ""
[1] "GreatVesselSmoothMuscleCells"
[1] "Dimensions before subsetting:"


In [13]:
# Write the combined edgeR results table to CSV in the working directory.
out_csv = './milo_nhood-reassigned_pseudobulk_edgeR_res_tri21-nhoods_vs_euploid-nhoods_finegrain.csv'
tt_all.to_csv(out_csv)

In [14]:
# Preview the combined results table (one row per gene per cell type).
tt_all

Unnamed: 0,gene_name,logFC,logCPM,F,PValue,FDR,celltype
1,ARL17B,-1.745741,6.847549,94.587301,5.743020e-15,2.159950e-11,AtrialCardiomyocytesRight
2,NEXN,-1.570495,10.283666,73.534824,8.173152e-13,1.222062e-09,AtrialCardiomyocytesRight
3,FHL2,1.364728,9.409786,72.871337,9.747901e-13,1.222062e-09,AtrialCardiomyocytesRight
4,CALD1,-1.808502,11.626061,67.648059,3.860583e-12,3.199400e-09,AtrialCardiomyocytesRight
5,MYH9,-1.932809,8.469364,67.324364,4.253390e-12,3.199400e-09,AtrialCardiomyocytesRight
...,...,...,...,...,...,...,...
42408,VEGFB,0.001968,6.655991,0.000051,9.943554e-01,9.955906e-01,VentricularCardiomyocytesRightCompact
42409,NINL,0.001491,7.221757,0.000045,9.946975e-01,9.956240e-01,VentricularCardiomyocytesRightCompact
42410,RAD50,-0.001546,6.809448,0.000038,9.951446e-01,9.957623e-01,VentricularCardiomyocytesRightCompact
42411,BMP7,-0.001082,7.370648,0.000029,9.957088e-01,9.960177e-01,VentricularCardiomyocytesRightCompact
