# Use snATAC data to validate cell type associations
- Similar to Figure 3E in Sheng et al., bioRxiv 2020 (https://doi.org/10.1101/2020.11.09.375592)
- snATAC data from adult heart from Hocker et al., Science Advances 2021 (https://advances.sciencemag.org/content/7/20/eabf1444/tab-article-info):
  - relative accessibility score: Table S5 (http://ns104190.ip-147-135-44.us/data_CARE_portal/snATAC/matrices/Supplemental_Table_5_RAS.tsv)
  - cell type-specific regulatory elements: Table S6 (https://advances.sciencemag.org/highwire/filestream/254970/field_highwire_adjunct_files/17/abf1444_Table_S6.xlsx)

In [1]:
setwd("/frazer01/projects/CARDIPS/analysis/cardiac_gwas_coloc")

source("script/functions.R"  )


In [2]:
# Create the output folders for this analysis (parent first, then the snATAC subfolder).
# showWarnings = FALSE keeps re-runs quiet when a folder already exists.
invisible(lapply(c("pipeline/eqtl.1.cell_type_validation",
                   "pipeline/eqtl.1.cell_type_validation/snatac"),
                 dir.create, showWarnings = FALSE))


In [3]:
# Load the two Hocker et al. supplemental tables as plain data.frames (data.table = FALSE).
ras_hg38 = fread("/frazer01/reference/public/Hocker_Science_Advances_2021/Supplemental_Table_5_RAS.tsv", sep = "\t", header = TRUE, data.table = FALSE)
cts_hg38 = fread("/frazer01/reference/public/Hocker_Science_Advances_2021/Supplemental_Table_6.tsv"    , sep = "\t", header = TRUE, data.table = FALSE)

# Strip the "_RAS" suffix from the column names of the accessibility table.
colnames(ras_hg38)  = gsub("_RAS", "", colnames(ras_hg38))
# Build a "chr_start_end" key from the hg38 coordinates; it is used below to join the
# lifted-over (hg19) peaks back to these tables.
ras_hg38$coord_hg38 = paste(ras_hg38$chr  , ras_hg38$start, ras_hg38$end, sep = "_")
cts_hg38$coord_hg38 = paste(cts_hg38$chrom, cts_hg38$start, cts_hg38$end, sep = "_")


In [4]:
# Load the precomputed QTL list and the accompanying `expdata_qtl` object.
# Elements read from them in this notebook: phenotypes, qtl, interactions (qtl_list);
# coordinates, color (exp_list).
qtl_list = readRDS("/frazer01/projects/CARDIPS/analysis/cardiac_qtls_combined/input/qtl/qtls.RDS"   )
exp_list = readRDS("/frazer01/projects/CARDIPS/analysis/cardiac_qtls_combined/input/expdata_qtl.rds")

In [5]:
# Unpack elements of the two input lists into standalone objects.
phenotypes            = qtl_list$phenotypes
qtls                  = qtl_list$qtl
coordinates           = exp_list$coordinates
# Index the coordinates table by transcript ID so rows can be looked up by name.
rownames(coordinates) = coordinates$transcript_id

# Lift over snATAC peak coordinates from hg38 to hg19

In [6]:
# Input (hg38) and output (hg19) BED files for the liftOver step.
snatac_hg38_bed = "pipeline/eqtl.1.cell_type_validation/snatac/snatac_hg38.bed"
snatac_hg19_bed = "pipeline/eqtl.1.cell_type_validation/snatac/snatac_hg19.bed"

# Write the hg38 peaks as a headerless 4-column BED (chr, start, end, hg38 key),
# sorted by chromosome, start and end.
fwrite(x         = ras_hg38[with(ras_hg38, order(chr, start, end)), c("chr", "start", "end", "coord_hg38")],
       file      = snatac_hg38_bed,
       sep       = "\t",
       col.names = FALSE,
       row.names = FALSE)

In [7]:
# Assemble the UCSC liftOver call: <binary> <hg38 BED> <chain file> <hg19 BED> <unmapped file>.
liftover   = "/software/ucsc.linux.x86_64.20151103/liftOver"
chain_file = "/frazer01/reference/private/hg38ToHg19.over.chain"

command_liftover = paste(c(liftover, snatac_hg38_bed, chain_file, snatac_hg19_bed, "unMapped"), collapse = " ")

# The call is disabled, so the hg19 BED read in the next cell must already exist on disk.
#system(command_liftover)


In [8]:
# Read the lifted-over peaks (headerless BED) and name its four columns.
snatac           = fread(file = snatac_hg19_bed, sep = "\t", header = FALSE, data.table = FALSE)
colnames(snatac) = c("chrom", "start", "end", "coord_hg38")

# hg19 "chrom_start_end" key, the counterpart of coord_hg38.
snatac[["coord"]] = with(snatac, paste(chrom, start, end, sep = "_"))


In [9]:
# Attach the per-cell-type accessibility scores to the lifted-over peaks, joining on the hg38 key.
# The hg38 chr/start/end columns are dropped first so only the hg19 coordinates from `snatac` remain.
# merge() keeps only keys present in both tables, so peaks missing from the hg19 BED are dropped.
ras = merge(snatac, ras_hg38[, !colnames(ras_hg38) %in% c("chr"  , "start", "end")], by = "coord_hg38")


In [10]:
# Cell-type table used on the eQTL side (taken as-is from the expression data object).
cells_eqtl   = exp_list[["color"]][["by_cell"]]

# Cell-type table for the snATAC side:
#   cell_type  - short code, read from columns 6 to 14 of `ras` (position-dependent)
#   name       - display label
#   cell_type2 - long name, used below to match the `cell_type` column of the cell type-specific peak table
# Each row of `labels` is one (name, cell_type2) pair, listed in the same order as the `ras` columns.
cells_snatac = local({
    labels = rbind(c("fibroblast"    , "Fibroblast"               ),
                   c("ventricular CM", "Ventricular Cardiomyocyte"),
                   c("atrial CM"     , "Atrial Cardiomyocyte"     ),
                   c("endothelial"   , "Endothelial"              ),
                   c("smooth muscle" , "Smooth Muscle"            ),
                   c("macrophages"   , "Macrophage"               ),
                   c("lymphocytes"   , "Lymphocyte"               ),
                   c("adipocytes"    , "Adipocyte"                ),
                   c("cardiac neuron", "Nervous"                  ))

    data.frame(cell_type  = colnames(ras)[6:14],
               name       = labels[, 1],
               cell_type2 = labels[, 2])
})

# Remember the row order of the table.
cells_snatac$order = seq_len(nrow(cells_snatac))

In [11]:
# Attach the hg19 coordinates to the cell type-specific peak table (join on the hg38 key),
# dropping its hg38 chrom/start/end columns first.
cts = merge(snatac, cts_hg38[, !colnames(cts_hg38) %in% c("chrom", "start", "end")], by = "coord_hg38")
# Translate the cell-type names of this table (matched against cells_snatac$cell_type2)
# into the short codes of cells_snatac$cell_type; rows without a match are dropped by merge().
cts = merge(cells_snatac[,c("cell_type", "cell_type2")], cts   , by.x = "cell_type2", by.y = "cell_type")
# The long name is not needed once the short code is attached.
cts$cell_type2 = NULL

In [12]:
# Save both hg19-annotated tables as tab-separated files with a header row.
fwrite(ras, "pipeline/eqtl.1.cell_type_validation/snatac/relative_accessibility_score.txt", sep = "\t", col.names = TRUE, row.names = FALSE)
fwrite(cts, "pipeline/eqtl.1.cell_type_validation/snatac/cell_type_specific_peaks.txt"    , sep = "\t", col.names = TRUE, row.names = FALSE)


In [13]:
# BED file of hg19 peaks; it is passed to bedtools as the "-b" file in intersect_qtls().
snatac_bed = "pipeline/eqtl.1.cell_type_validation/snatac/snatac_peaks.bed"

# Headerless 4-column BED (chrom, start, end, hg19 key), sorted by chromosome, start and end.
fwrite(x         = ras[with(ras, order(chrom, start, end)), c("chrom", "start", "end", "coord")],
       file      = snatac_bed,
       sep       = "\t",
       col.names = FALSE,
       row.names = FALSE)


In [27]:
# Inspect the merged accessibility table and the snATAC cell-type table.
str(ras)
cells_snatac

'data.frame':	286725 obs. of  15 variables:
 $ coord_hg38: chr  "chr1_100006460_100007036" "chr1_100023423_100024000" "chr1_100024429_100024693" "chr1_100037684_100039029" ...
 $ chrom     : chr  "chr1" "chr1" "chr1" "chr1" ...
 $ start     : int  100472016 100488979 100489985 100503240 100514008 100514360 100515014 100517519 100540213 100540532 ...
 $ end       : int  100472592 100489556 100490249 100504585 100514272 100514667 100515429 100517794 100540423 100541040 ...
 $ coord     : chr  "chr1_100472016_100472592" "chr1_100488979_100489556" "chr1_100489985_100490249" "chr1_100503240_100504585" ...
 $ FB        : num  0.1207 0.0453 0.0884 0.0905 0.0471 ...
 $ vCM       : num  0.308 0.516 0.157 0.11 0.168 ...
 $ aCM       : num  0.21 0.229 0.174 0.103 0.315 ...
 $ EC        : num  0.0732 0.0552 0.0927 0.1156 0.0966 ...
 $ SM        : num  0.0871 0.0427 0.063 0.1207 0.1432 ...
 $ MAC       : num  0.1912 0.0274 0.0797 0.1186 0.2168 ...
 $ LC        : num  0.00572 0.00188 0.34129 0.17577

cell_type,name,cell_type2,order
<chr>,<chr>,<chr>,<int>
FB,fibroblast,Fibroblast,1
vCM,ventricular CM,Ventricular Cardiomyocyte,2
aCM,atrial CM,Atrial Cardiomyocyte,3
EC,endothelial,Endothelial,4
SM,smooth muscle,Smooth Muscle,5
MAC,macrophages,Macrophage,6
LC,lymphocytes,Lymphocyte,7
AD,adipocytes,Adipocyte,8
NR,cardiac neuron,Nervous,9


In [15]:
# Z-score the accessibility of every peak across cell types.
# scale() works column-wise, so the matrix is transposed before and after the call;
# the result has one row per peak and one column per snATAC cell type.
ras_norm = local({
    scores  = as.matrix(ras[, cells_snatac$cell_type])
    zscores = t(scale(t(scores), scale = TRUE))

    as.data.frame(zscores)
})

# Key the rows by the hg19 peak coordinate so they can be joined via "row.names".
rownames(ras_norm) = ras$coord

# Read QTLs

In [16]:
# Read the gene-level and isoform-level tables from the eqtls_fine_map folder as plain data.frames.
# Columns used below: id, pp, pval, transcript_id, type.
cs_gene = fread("/frazer01/projects/CARDIPS/analysis/cardiac_eqtls/pipeline/3.2.eqtls/eqtls_fine_map/cardiac_eqtls.gene.txt"   , sep = "\t", header = TRUE, data.table = FALSE)
cs_isof = fread("/frazer01/projects/CARDIPS/analysis/cardiac_eqtls/pipeline/3.2.eqtls/eqtls_fine_map/cardiac_eqtls.isoform.txt", sep = "\t", header = TRUE, data.table = FALSE)


In [17]:
# Build a "<transcript_id> <type>" key (space-separated) identifying each transcript/type combination.
cs_gene$tr2type = paste(cs_gene$transcript_id, cs_gene$type)
cs_isof$tr2type = paste(cs_isof$transcript_id, cs_isof$type)


# Overlap lead eQTLs with snATAC peaks

In [18]:
# Intersect eQTL variants with snATAC peaks using `bedtools intersect -loj`.
#
# Arguments:
#   phenotype  : label used only to name the intermediate variants BED file
#                ("pipeline/eqtl.1.cell_type_validation/variants.<phenotype>.bed").
#   snatac_bed : path to the BED file of peaks (chrom, start, end, coord).
#   cs         : data.frame with at least the columns `id` and `pp`. The 2nd and 3rd
#                "_"-separated fields of `id` are read as chromosome and position
#                (e.g. "VAR_1_63735_CCTA_C"). `pp` is presumably the fine-mapping
#                posterior probability -- TODO confirm.
#
# Returns a data.frame with columns `id` (variant) and `coord` (overlapping peak).
# Because of -loj every variant is reported; `coord` is "." when no peak overlaps it.
# Side effect: (re)writes the variants BED file named above.
intersect_qtls = function(phenotype, snatac_bed, cs)
{
    # Keep variants with pp >= 0.01. Rows with a missing pp are dropped explicitly,
    # otherwise they would turn into NA ids and break the parsing below.
    keep = !is.na(cs$pp) & cs$pp >= 0.01
    ids  = unique(cs[keep, "id"])

    # Nothing to intersect: return an empty result with the usual columns.
    if(length(ids) == 0)
    {
        return(data.frame(id = character(0), coord = character(0), stringsAsFactors = FALSE))
    }

    # Split every id once and pick chromosome (field 2) and position (field 3).
    fields = strsplit(ids, "_", fixed = TRUE)
    indata = data.frame(chrom = paste0    ("chr", vapply(fields, function(f){f[[2]]}, character(1))),
                        # Integer (not double) so that fwrite never writes a position in
                        # scientific notation (e.g. 1e+05), which bedtools cannot parse.
                        pos   = as.integer(       vapply(fields, function(f){f[[3]]}, character(1))),
                        id    = ids,
                        stringsAsFactors = FALSE
                       )

    variants_bed = paste("pipeline/eqtl.1.cell_type_validation/variants", phenotype, "bed", sep = ".")
    command      = paste("bedtools", "intersect", "-loj",
                         "-a", shQuote(variants_bed),
                         "-b", shQuote(snatac_bed))

    # NOTE(review): the position is written as both start and end (zero-length interval).
    # BED is 0-based half-open, so a 1-based position would normally be (pos - 1, pos).
    # Kept as in the original to preserve existing results -- confirm this is intended.
    fwrite(unique(indata[order(indata$chrom, indata$pos), c("chrom", "pos", "pos", "id")]), variants_bed, sep = "\t", col.names = FALSE, row.names = FALSE)

    # Output columns 1-4 come from the variants file and 5-8 from the peak file:
    # column 4 is the variant id, column 8 the peak coordinate.
    intersected           = fread(cmd = command, sep = "\t", header = FALSE, data.table = FALSE)[,c(4,8)]
    colnames(intersected) = c("id", "coord")

    return(intersected)
}

# Run the intersection for gene-level and isoform-level eQTLs.
int_gene = intersect_qtls("rna"    , snatac_bed, cs_gene)
int_isof = intersect_qtls("isoform", snatac_bed, cs_isof)



In [19]:
# Inspect the intersection and count the rows where a variant falls in a peak
# ("." in `coord` marks a variant without an overlapping peak).
str(int_gene)
nrow(int_gene[ int_gene$coord != ".",])

'data.frame':	201082 obs. of  2 variables:
 $ id   : chr  "VAR_1_63735_CCTA_C" "VAR_1_233685_GC_G" "VAR_1_247916_CAGG_C" "VAR_1_250236_CT_C" ...
 $ coord: chr  "." "." "." "." ...


In [20]:
# For every transcript/type combination, pick the peak overlapped by its best variant and
# attach the normalized accessibility of that peak plus 0/1 cell-type association flags.
#
# Arguments:
#   x         : variant-to-peak table with columns `id` and `coord` ("." = no peak).
#   cs        : eQTL table; merged with `x` on their shared columns, and expected to
#               provide pp, pval, transcript_id, type and tr2type.
#   phenotype : name of the element of qtl_list[["interactions"]] to use.
#
# Reads the globals `qtl_list` and `ras_norm`.
# Returns a data.frame with coord, transcript_id, type, tr2type, the columns of `ras_norm`
# and one "eqtl.<interaction>" column (1 = associated, 0 = not) per interaction value.
find_overlap_by_transcript = function(x, cs, phenotype)
{
    interactions = qtl_list[["interactions"]][[phenotype]]

    # Variants inside a peak, annotated with their eQTL statistics.
    in_peak   = x[ x$coord != ".",]
    annotated = merge(cs, in_peak)

    # Highest pp first; ties broken by the smallest p-value (-pval sorted decreasingly).
    annotated = annotated[order(annotated$pp, -annotated$pval, decreasing = TRUE),]

    # After sorting, the first peak within each group belongs to the top-ranked variant.
    top_peak = aggregate(coord ~ transcript_id + type + tr2type, data = annotated, FUN = function(v){v[[1]]})
    out      = merge(top_peak, ras_norm, by.x = "coord", by.y = "row.names")

    # Same "<transcript_id> <type>" key as in `cs`, to match against out$tr2type.
    interactions$tr2type = paste(interactions$transcript_id, interactions$type)

    for(cell in sort(unique(interactions$interaction)))
    {
        flag_col   = paste("eqtl", cell, sep = ".")
        associated = interactions[ interactions$interaction == cell & interactions$cell == TRUE, "tr2type"]

        out[,flag_col]                               = 0
        out[ out$tr2type %in% associated, flag_col] = 1
    }

    return(out)
}

# Build the per-transcript overlap tables for gene-level and isoform-level eQTLs.
eqtl2cell_gene = find_overlap_by_transcript(int_gene, cs_gene, "rna"    )
eqtl2cell_isof = find_overlap_by_transcript(int_isof, cs_isof, "isoform")


In [21]:
# Inspect the gene-level table and count the distinct transcript IDs it contains.
str(eqtl2cell_gene)
length(unique(eqtl2cell_gene$transcript_id))

'data.frame':	14665 obs. of  29 variables:
 $ coord                                : chr  "chr1_100161444_100161987" "chr1_100434979_100436187" "chr1_100503240_100504585" "chr1_100503240_100504585" ...
 $ transcript_id                        : chr  "ENSG00000099260.11_5" "ENSG00000117620.15_8" "ENSG00000162688.17_5" "ENSG00000137996.12_3" ...
 $ type                                 : int  0 0 0 2 0 1 0 1 0 0 ...
 $ tr2type                              : chr  "ENSG00000099260.11_5 0" "ENSG00000117620.15_8 0" "ENSG00000162688.17_5 0" "ENSG00000137996.12_3 2" ...
 $ FB                                   : num  -0.449 -1.669 -0.728 -0.728 -0.728 ...
 $ vCM                                  : num  0.0759 0.3043 -0.0531 -0.0531 -0.0531 ...
 $ aCM                                  : num  -0.234 -0.271 -0.287 -0.287 -0.287 ...
 $ EC                                   : num  0.0254 0.1448 0.1572 0.1572 0.1572 ...
 $ SM                                   : num  -0.549 -1.009 0.337 0.337 0.337 ...
 $ 

In [22]:
# Count the rows flagged (== 1) for the cibersort.regular.cardiac_muscle eQTL cell type.
nrow(eqtl2cell_gene[ eqtl2cell_gene$eqtl.cibersort.regular.cardiac_muscle == 1, ])

In [23]:
# Test, for every (eQTL cell type, snATAC cell type) pair, whether the peaks of the eQTLs
# associated with the eQTL cell type are more accessible in the snATAC cell type than in
# the other snATAC cell types.
#
# Argument:
#   totest : data.frame with one "eqtl.<cell>" 0/1 column per eQTL cell type and one
#            accessibility column per snATAC cell type (as built by find_overlap_by_transcript).
#
# Reads the globals `cells_eqtl` and `cells_snatac`.
# Returns one row per pair with:
#   estimate  - mean paired difference (score in cell_snatac minus the row mean of the
#               remaining snATAC cell types), computed on rows flagged for cell_eqtl
#   ci1, ci2  - confidence interval of that difference
#   pval      - paired t-test p-value
# Pairs with fewer than 2 flagged rows are not tested: estimate/ci are NA and pval is 1.
#
# An earlier variant (kept as a comment in previous versions of this cell) used an unpaired
# t-test between flagged and non-flagged rows and reported both group means instead.
calculate_eqtl2cell = function(totest)
{
    per_eqtl_cell = lapply(cells_eqtl$cell_type, function(cell1)
    {
        # Rows flagged as associated with this eQTL cell type (same for every snATAC cell type).
        is_assoc = totest[, paste("eqtl", cell1, sep = ".")] == 1
        testable = nrow(totest[is_assoc, ]) >= 2

        per_snatac_cell = lapply(cells_snatac$cell_type, function(cell2)
        {
            if(!testable)
            {
                return(data.frame(cell_eqtl   = cell1,
                                  cell_snatac = cell2,
                                  estimate    = NA,
                                  ci1         = NA,
                                  ci2         = NA,
                                  pval        = 1
                                 ))
            }

            other_cells = cells_snatac[ cells_snatac$cell_type != cell2, "cell_type"]
            test        = t.test(totest[is_assoc, cell2], rowMeans(totest[is_assoc, other_cells]), paired = TRUE)

            return(data.frame(cell_eqtl   = cell1,
                              cell_snatac = cell2,
                              estimate    = test$estimate,
                              ci1         = test$conf.int[[1]],
                              ci2         = test$conf.int[[2]],
                              pval        = test$p.value
                             ))
        })

        as.data.frame(rbindlist(per_snatac_cell), stringsAsFactors = FALSE)
    })

    out = as.data.frame(rbindlist(per_eqtl_cell), stringsAsFactors = FALSE)
    return(out)
}

# Run the tests on the gene-level and isoform-level tables.
tests_gene = calculate_eqtl2cell(eqtl2cell_gene)
tests_isof = calculate_eqtl2cell(eqtl2cell_isof)


In [24]:
# Show the pairs with the smallest p-values for genes and isoforms.
head(tests_gene[order(tests_gene$pval),])
head(tests_isof[order(tests_isof$pval),])


Unnamed: 0_level_0,cell_eqtl,cell_snatac,estimate,ci1,ci2,pval
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
6,cibersort.regular.cardiac_muscle,MAC,-0.3538216,-0.4363719,-0.27127124,8.131272e-16
2,cibersort.regular.cardiac_muscle,vCM,0.4863802,0.3644809,0.60827954,4.763539e-14
1,cibersort.regular.cardiac_muscle,FB,-0.255329,-0.3344124,-0.17624572,6.423617e-10
3,cibersort.regular.cardiac_muscle,aCM,0.3699731,0.2466272,0.49331897,8.357681e-09
52,cibersort.regular.fibroblast,LC,0.3836194,0.1408905,0.62634831,0.002181425
30,cibersort.regular.immune,aCM,-0.2176995,-0.359121,-0.07627796,0.002714402


Unnamed: 0_level_0,cell_eqtl,cell_snatac,estimate,ci1,ci2,pval
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
2,cibersort.regular.cardiac_muscle,vCM,0.5055158,0.19296837,0.81806314,0.002022455
19,cibersort.regular.endocardial,FB,-0.3504492,-0.62604685,-0.07485162,0.015563674
60,cibersort.regular.cardiac_neuron,MAC,-0.2763442,-0.4978236,-0.05486484,0.016525461
16,cibersort.regular.smooth_muscle,LC,0.2656899,0.02999305,0.50138667,0.02745613
3,cibersort.regular.cardiac_muscle,aCM,0.2754309,0.02286548,0.52799631,0.033130287
58,cibersort.regular.cardiac_neuron,EC,0.5736864,0.0384607,1.10891212,0.03667716


In [25]:
# Print the full gene-level results table.
tests_gene

cell_eqtl,cell_snatac,estimate,ci1,ci2,pval
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
cibersort.regular.cardiac_muscle,FB,-0.25532904,-0.33441236,-0.176245718,6.423617e-10
cibersort.regular.cardiac_muscle,vCM,0.48638024,0.36448094,0.608279544,4.763539e-14
cibersort.regular.cardiac_muscle,aCM,0.36997311,0.24662724,0.493318975,8.357681e-09
cibersort.regular.cardiac_muscle,EC,-0.01033020,-0.10411659,0.083456186,8.286419e-01
cibersort.regular.cardiac_muscle,SM,-0.08815168,-0.18106536,0.004762005,6.288574e-02
cibersort.regular.cardiac_muscle,MAC,-0.35382159,-0.43637194,-0.271271244,8.131272e-16
cibersort.regular.cardiac_muscle,LC,0.05664873,-0.07704935,0.190346819,4.052728e-01
cibersort.regular.cardiac_muscle,AD,-0.13855234,-0.23782324,-0.039281442,6.357131e-03
cibersort.regular.cardiac_muscle,NR,-0.06681723,-0.17795141,0.044316945,2.378545e-01
cibersort.regular.smooth_muscle,FB,0.06972177,-0.02542132,0.164864854,1.504157e-01


In [28]:
# Save the test results and the snATAC cell-type table as tab-separated files with a header row.
# The last path must not end in whitespace: a trailing space inside the quotes creates
# a file literally named "cells_snatac.txt " that later reads by the plain name will miss.
fwrite(tests_gene  , "pipeline/eqtl.1.cell_type_validation/tests.gene.txt"   , sep = "\t", col.names = TRUE, row.names = FALSE)
fwrite(tests_isof  , "pipeline/eqtl.1.cell_type_validation/tests.isoform.txt", sep = "\t", col.names = TRUE, row.names = FALSE)
fwrite(cells_snatac, "pipeline/eqtl.1.cell_type_validation/cells_snatac.txt" , sep = "\t", col.names = TRUE, row.names = FALSE)


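# NOT working

Exploratory code: the cells in this section were not executed in this run (`In [None]`). Note that they redefine `calculate_eqtl2cell` and `correlation_cells` with different signatures, so re-run cell 23 before reusing the t-test version of `calculate_eqtl2cell`.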

In [None]:
# Build a binary table linking every transcript/type combination to eQTL cell types and to
# snATAC cell types with a cell type-specific peak.
#
# NOTE: this definition replaces the one-argument calculate_eqtl2cell(totest) defined earlier
# in the notebook; the two take different arguments and return different tables.
#
# Arguments:
#   x         : variant-to-peak table with columns `id` and `coord`.
#   cs        : eQTL table with columns `id` and `tr2type`.
#   phenotype : name of the element of qtl_list[["interactions"]] to use.
#
# Reads the globals `qtl_list`, `cells_snatac` and `cts`.
# Returns one row per unique (transcript_id, type, tr2type) with:
#   eqtl.<interaction> - 1 if the combination has cell == TRUE for that interaction, else 0
#   snatac.<cell_type> - 1 if any of its variants overlaps a peak listed in `cts` for that
#                        cell type, else 0
calculate_eqtl2cell = function(x, cs, phenotype)
{
    eqtl2cell         = qtl_list[["interactions"]][[phenotype]]
    eqtl2cell$tr2type = paste(eqtl2cell$transcript_id, eqtl2cell$type)
    out               = unique(eqtl2cell[,c("transcript_id", "type", "tr2type")])

    # eQTL side: flag the combinations associated with each interaction value.
    for(cell in sort(unique(eqtl2cell$interaction)))
    {
        mycol = paste("eqtl", cell, sep = ".")
        out[,mycol] = 0

        out[ out$tr2type %in% eqtl2cell[ eqtl2cell$interaction == cell & eqtl2cell$cell == TRUE, "tr2type"], mycol] = 1
    }

    # snATAC side: variants in a peak of this cell type -> their transcript/type combinations.
    for(cell in cells_snatac$cell_type)
    {
        mycol = paste("snatac", cell, sep = ".")
        out[,mycol] = 0

        in_cell  = x[ x$coord %in% cts[ cts$cell_type == cell, "coord"], "id"]
        tr2types = unique(cs[ cs$id %in% in_cell, "tr2type"])

        out[ out$tr2type %in% tr2types, mycol] = 1
    }
    return(out)
}

# Build the binary tables; this overwrites the eqtl2cell_gene / eqtl2cell_isof objects
# created earlier by find_overlap_by_transcript().
eqtl2cell_gene = calculate_eqtl2cell(int_gene, cs_gene, "rna"    )
eqtl2cell_isof = calculate_eqtl2cell(int_isof, cs_isof, "isoform")


In [None]:
# Jaccard index between eQTL cell-type flags and snATAC cell-type flags.
#
# Argument:
#   totest : data.frame with 0/1 columns "eqtl.<cell>" and "snatac.<cell>".
#
# Reads the globals `cells_eqtl` and `cells_snatac`.
# Returns a data.frame (rows = eQTL cell types, columns = snATAC cell types) holding
# (# rows flagged in both) / (# rows flagged in either). The value is NaN when no row is
# flagged in either column.
correlation_cells = function(totest)
{
    eqtl_cells   = cells_eqtl$cell_type
    snatac_cells = cells_snatac$cell_type

    out = as.data.frame(matrix(0, nrow = nrow(cells_eqtl), ncol = nrow(cells_snatac), dimnames = list(eqtl_cells, snatac_cells)))

    for(cell1 in eqtl_cells)
    {
        for(cell2 in snatac_cells)
        {
            has_eqtl = totest[, paste("eqtl"  , cell1, sep = ".")] == 1
            has_peak = totest[, paste("snatac", cell2, sep = ".")] == 1

            n_both   = nrow(totest[ has_eqtl & has_peak, ])
            n_either = nrow(totest[ has_eqtl | has_peak, ])

            out[cell1, cell2] = n_both / n_either
        }
    }
    return(out)
}

# Overlap ratios for the gene-level binary table.
cor_gene = correlation_cells(eqtl2cell_gene)


In [None]:
# Print the overlap matrix, then the number of flagged rows per eQTL and per snATAC column.
cor_gene

colSums(eqtl2cell_gene[, c(paste("eqtl", cells_eqtl$cell_type, sep = "."), paste("snatac", cells_snatac$cell_type, sep = "."))])

In [None]:
# Compare the effect-size difference |beta_1 - beta_0| of interaction eQTLs between
# transcripts with and without a peak specific to each snATAC cell type.
#
# NOTE: this definition replaces the one-argument correlation_cells(totest) defined above.
#
# Arguments:
#   phenotype : name of the element of qtl_list[["interactions"]] to use.
#   qtl_list  : QTL list providing the interactions tables (with beta_0, beta_1, interaction).
#   int_data  : not used by the function body.
#   eqtl2cell : table with transcript_id, type and 0/1 "snatac.<cell>" columns.
#
# Reads the globals `cells_eqtl` and `cells_snatac`.
# Returns one row per (eQTL cell type, snATAC cell type) pair with:
#   estimate1, estimate2 - mean delta of rows with / without a peak in cell_snatac
#   ci1, ci2             - confidence interval of the difference in means
#   pval                 - t-test p-value (unpaired, t.test defaults)
# Pairs with fewer than 2 rows having a peak are not tested: estimates/ci are NA and pval is 1.
correlation_cells = function(phenotype, qtl_list, int_data, eqtl2cell)
{
    interactions       = qtl_list[["interactions"]][[phenotype]]
    interactions       = merge(interactions, eqtl2cell, by = c("transcript_id", "type"))
    interactions$delta = abs(interactions$beta_1 - interactions$beta_0)

    per_eqtl_cell = lapply(cells_eqtl$cell_type, function(cell1)
    {
        # Interaction tests run against this eQTL cell type.
        totest = interactions[ interactions$interaction == cell1, ]

        per_snatac_cell = lapply(cells_snatac$cell_type, function(cell2)
        {
            peak_col = paste("snatac", cell2, sep = ".")

            if(nrow(totest[totest[, peak_col] == 1, ]) < 2)
            {
                return(data.frame(cell_eqtl   = cell1,
                                  cell_snatac = cell2,
                                  estimate1   = NA,
                                  estimate2   = NA,
                                  ci1         = NA,
                                  ci2         = NA,
                                  pval        = 1
                                 ))
            }

            test = t.test(totest[totest[, peak_col] == 1, "delta"], totest[totest[, peak_col] == 0, "delta"])

            return(data.frame(cell_eqtl   = cell1,
                              cell_snatac = cell2,
                              estimate1   = test$estimate[[1]],
                              estimate2   = test$estimate[[2]],
                              ci1         = test$conf.int[[1]],
                              ci2         = test$conf.int[[2]],
                              pval        = test$p.value
                             ))
        })

        as.data.frame(rbindlist(per_snatac_cell), stringsAsFactors = FALSE)
    })

    out = as.data.frame(rbindlist(per_eqtl_cell), stringsAsFactors = FALSE)
    return(out)
}

# Run the comparison for gene-level eQTLs.
cor_gene = correlation_cells("rna", qtl_list, int_gene, eqtl2cell_gene)


In [None]:
# Print the gene-level comparison table.
cor_gene

In [None]:
# Compare the distribution of |beta_1 - beta_0| between rows with cell == TRUE and
# rows with cell == FALSE in the gene-level interaction table.
x = qtl_list$interactions$rna

hist(abs(x[ x$cell == TRUE , "beta_1"] - x[ x$cell == TRUE , "beta_0"]))
hist(abs(x[ x$cell == FALSE, "beta_1"] - x[ x$cell == FALSE, "beta_0"]))



# Overlap between cell type associated eQTLs and cell type associated ATAC-seq peaks

In [None]:
overlap_cell_types = function()

In [None]:
# Print the eQTL cell-type table.
cells_eqtl

In [None]:
# List the columns of the accessibility table (columns 6 to 14 supply cells_snatac$cell_type).
colnames(ras)

In [None]:
# Compare the cell type-specific peak table before (hg38) and after the hg19 / cell-type merge.
str(cts_hg38)
str(cts)