In [1]:
setwd("/frazer01/projects/CARDIPS/analysis/cardiac_eqtls")

source("script/packages.R"  )
source("script/input_data.R")
source("script/functions.R" )


# Analysis of differential expression results

In [2]:
# Annotation tables for genes and isoforms (tab-delimited files with a header row).
gene_info <- fread("pipeline/1.2.expression/gene_info.txt", sep = "\t", header = TRUE, data.table = FALSE)
isof_info <- fread("pipeline/1.2.expression//isoform_info.txt", sep = "\t", header = TRUE, data.table = FALSE)

# read_diffexp() selects a "transcript_id" column from whichever annotation table
# it is given, so the gene table gets one that simply mirrors gene_id.
gene_info$transcript_id <- gene_info$gene_id

In [3]:
# Read one differential expression result table and annotate it.
#
# Loads "pipeline/4.1.differential_expression/diffexp.<name>.<tissue1>.<tissue2>.txt",
# joins it to the annotation in `gene_info` on `transcript_id`, adjusts the
# p-values and flags significant features.
#
# Arguments:
#   name       analysis type; part of the file name and stored in column `type`
#   tissue1    first tissue of the comparison; part of the file name, stored in `tissue1`
#   tissue2    second tissue of the comparison; part of the file name, stored in `tissue2`
#   gene_info  annotation data frame with columns transcript_id, gene_id,
#              gene_name and gene_type
#
# Returns:
#   A data frame with the annotation columns, the columns of the input file
#   (which must include `pval`), and:
#     qval     Bonferroni-adjusted p-value (despite the name, this is not an FDR)
#     tissue1, tissue2, type   the corresponding arguments
#     diffexp  TRUE when qval < 0.05; FALSE otherwise, including when qval is NA
#
# Side effect: emits a message with name, tissues, row count and number of
# significant rows.
read_diffexp = function(name, tissue1, tissue2, gene_info)
{
    infile               = file.path("pipeline/4.1.differential_expression", paste("diffexp", name, tissue1, tissue2, "txt", sep = "."))
    
    # add_rownames() is defined outside this notebook; presumably it moves the
    # feature identifiers into the row names -- TODO confirm.
    indata               = add_rownames(fread(infile, sep = "\t", header = TRUE, data.table = FALSE))
    indata$transcript_id = rownames(indata)
    
    # No sorting by p-value before the join: merge() sorts its result on the key,
    # so any earlier ordering would be discarded anyway.
    indata               = merge(gene_info[,c("transcript_id", "gene_id", "gene_name", "gene_type")], indata, by = "transcript_id")
    indata$qval          = p.adjust(indata$pval, method = "bonferroni")
    
    # rep() keeps these assignments valid even when the join returns zero rows.
    indata$tissue1       = rep(tissue1, nrow(indata))
    indata$tissue2       = rep(tissue2, nrow(indata))
    indata$type          = rep(name   , nrow(indata))
    
    # A missing q-value counts as "not significant" instead of aborting the
    # assignment (NA is not allowed in a data-frame subscript assignment).
    indata$diffexp       = !is.na(indata$qval) & indata$qval < 0.05
    
    message(paste(name, tissue1, tissue2, nrow(indata), sum(indata$diffexp)))
    return(indata)
}

# Collect the results of all three analysis types for every pair of distinct
# tissues (first tissue in {ipsc_cvpc, heart}, second in {heart, arteria};
# heart is never compared with itself) and stack them into one table.
diffexp = as.data.frame(rbindlist(lapply(
    list(c("ipsc_cvpc", "heart"), c("ipsc_cvpc", "arteria"), c("heart", "arteria")),
    function(tissue_pair)
    {
        first_tissue   = tissue_pair[[1]]
        second_tissue  = tissue_pair[[2]]
        
        by_gene        = read_diffexp("gene_tpm"   , first_tissue, second_tissue, gene_info)
        by_isoform_tpm = read_diffexp("isoform_tpm", first_tissue, second_tissue, isof_info)
        by_isoform_use = read_diffexp("isoform_use", first_tissue, second_tissue, isof_info)
        
        rbind(by_gene, by_isoform_tpm, by_isoform_use)
    })), stringsAsFactors = FALSE)

# Save the combined table as a tab-delimited file with a header row.
fwrite(diffexp, "pipeline/4.1.differential_expression/diffexp.txt", sep = "\t", col.names = TRUE, row.names = FALSE)

gene_tpm ipsc_cvpc heart 20393 15512

isoform_tpm ipsc_cvpc heart 38271 26005

isoform_use ipsc_cvpc heart 38271 21921

gene_tpm ipsc_cvpc arteria 20393 16409

isoform_tpm ipsc_cvpc arteria 38271 27777

isoform_use ipsc_cvpc arteria 38271 23448

gene_tpm heart arteria 20393 15402

isoform_tpm heart arteria 38271 25598

isoform_use heart arteria 38271 19246



In [4]:
# Spot check: isoform_tpm results for ipsc_cvpc vs heart, limited to
# SCN5A, TNNT2, TTN and ABLIM1, displayed by increasing p-value.
a = diffexp[with(diffexp,
                 tissue1   == "ipsc_cvpc"   &
                 tissue2   == "heart"       &
                 type      == "isoform_tpm" &
                 gene_name %in% c("SCN5A", "TNNT2", "TTN", "ABLIM1")), ]
a[order(a[["pval"]]), ]

Unnamed: 0_level_0,transcript_id,gene_id,gene_name,gene_type,beta,se,tval,pval,qval,tissue1,tissue2,type,diffexp
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<lgl>
33550,ENST00000413689.6_5,ENSG00000183873.17_7,SCN5A,protein_coding,1.1652335,0.0707312,16.47411,1.0272559999999999e-50,3.93141e-46,ipsc_cvpc,heart,isoform_tpm,True
33456,ENST00000412633.3_3,ENSG00000118194.20_8,TNNT2,protein_coding,1.2795052,0.07861034,16.276552,9.799986999999999e-50,3.750553e-45,ipsc_cvpc,heart,isoform_tpm,True
34283,ENST00000422165.6_3,ENSG00000118194.20_8,TNNT2,protein_coding,1.1035073,0.07246897,15.227308,1.2970969999999998e-44,4.964119e-40,ipsc_cvpc,heart,isoform_tpm,True
28237,ENST00000367318.10_5,ENSG00000118194.20_8,TNNT2,protein_coding,-0.7297111,0.06770188,-10.778298,6.486667e-25,2.4825119999999998e-20,ipsc_cvpc,heart,isoform_tpm,True
25644,ENST00000333535.9_4,ENSG00000183873.17_7,SCN5A,protein_coding,-1.0137763,0.09641594,-10.514613,7.063926e-24,2.7034349999999997e-19,ipsc_cvpc,heart,isoform_tpm,True
31312,ENST00000392952.7_2,ENSG00000099204.20_8,ABLIM1,protein_coding,-0.6711208,0.07142927,-9.3956,1.123307e-19,4.299007e-15,ipsc_cvpc,heart,isoform_tpm,True
37631,ENST00000460472.6_4,ENSG00000155657.27_6,TTN,protein_coding,-0.6046035,0.06636338,-9.110499,1.165709e-18,4.461284e-14,ipsc_cvpc,heart,isoform_tpm,True
57569,ENST00000651023.1_2,ENSG00000099204.20_8,ABLIM1,protein_coding,0.6304452,0.0844321,7.46689,2.850852e-13,1.09105e-08,ipsc_cvpc,heart,isoform_tpm,True
31314,ENST00000392955.7_4,ENSG00000099204.20_8,ABLIM1,protein_coding,0.6472399,0.08947357,7.233867,1.414112e-12,5.411947e-08,ipsc_cvpc,heart,isoform_tpm,True
27829,ENST00000360870.9_4,ENSG00000155657.27_6,TTN,protein_coding,-0.4211161,0.07362295,-5.719903,1.670599e-08,0.0006393549,ipsc_cvpc,heart,isoform_tpm,True


In [5]:
# Show the structure (row count, column names and types) of the combined table.
str(diffexp)

'data.frame':	290805 obs. of  13 variables:
 $ transcript_id: chr  "ENSG00000000003.15_5" "ENSG00000000005.6_4" "ENSG00000000419.12_6" "ENSG00000000457.14_7" ...
 $ gene_id      : chr  "ENSG00000000003.15_5" "ENSG00000000005.6_4" "ENSG00000000419.12_6" "ENSG00000000457.14_7" ...
 $ gene_name    : chr  "TSPAN6" "TNMD" "DPM1" "SCYL3" ...
 $ gene_type    : chr  "protein_coding" "protein_coding" "protein_coding" "protein_coding" ...
 $ beta         : num  1.935 1.461 0.535 1.98 1.987 ...
 $ se           : num  0.0914 0.0904 0.121 0.1028 0.0952 ...
 $ tval         : num  21.17 16.16 4.42 19.26 20.87 ...
 $ pval         : num  5.25e-75 3.55e-49 1.19e-05 5.98e-65 2.13e-73 ...
 $ qval         : num  1.07e-70 7.25e-45 2.43e-01 1.22e-60 4.34e-69 ...
 $ tissue1      : chr  "ipsc_cvpc" "ipsc_cvpc" "ipsc_cvpc" "ipsc_cvpc" ...
 $ tissue2      : chr  "heart" "heart" "heart" "heart" ...
 $ type         : chr  "gene_tpm" "gene_tpm" "gene_tpm" "gene_tpm" ...
 $ diffexp      : logi  TRUE TRUE FALSE TRUE 

# Add filters for expression levels

In [6]:
# Reader shared by every table below that goes through add_rownames():
# tab-delimited file with a header row, returned as a plain data frame.
read_with_rownames = function(path)
{
    add_rownames(fread(path, sep = "\t", header = TRUE, data.table = FALSE))
}

# Sample metadata is read as-is, without add_rownames().
metadata            = fread("pipeline/3.1.covariates/metadata.txt", sep = "\t", header = TRUE, data.table = FALSE)

# Covariates: add_diffexp() uses the row names as sample identifiers and the
# tissue-named columns as 0/1 membership indicators.
covariates          = read_with_rownames("pipeline/3.1.covariates/covariates.txt")

# Expression tables ("expressed" and "normalized") for gene TPM, isoform TPM
# and isoform use.
gene_tpm_expressed  = read_with_rownames("pipeline/1.2.expression/tpm_gene.expressed.txt")
isof_tpm_expressed  = read_with_rownames("pipeline/1.2.expression/tpm_isoform.expressed.txt")
isof_use_expressed  = read_with_rownames("pipeline/1.2.expression/use_isoform.expressed.txt")
gene_tpm_normalized = read_with_rownames("pipeline/1.2.expression/tpm_gene.normalized.txt")
isof_tpm_normalized = read_with_rownames("pipeline/1.2.expression/tpm_isoform.normalized.txt")
isof_use_normalized = read_with_rownames("pipeline/1.2.expression/use_isoform.normalized.txt")


In [7]:
# Add per-tissue mean expression and a fold-change filter to one comparison.
#
# Selects the rows of `diffexp` for the given analysis type and tissue pair,
# computes the mean of `tpm` across the samples of each tissue, and merges
# these summaries back onto the selected rows.
#
# Arguments:
#   diffexp          data frame with columns type, tissue1, tissue2,
#                    transcript_id and diffexp
#   name             analysis type to select (matched against `type`)
#   tissue1          first tissue; also the name of a column in `covariates`
#   tissue2          second tissue; also the name of a column in `covariates`
#   gene_info        not used; kept so existing calls keep working
#   covariates       data frame whose row names are sample identifiers and
#                    whose tissue columns equal 1 for samples of that tissue
#   tpm              expression table with features in rows (row names matching
#                    transcript_id) and samples in columns
#   log2r_threshold  absolute log2 ratio above which log2r_filter is TRUE
#                    (default 2, the previously hard-coded value)
#
# Returns:
#   The selected rows of `diffexp` plus:
#     tissue1_exp, tissue2_exp  mean of `tpm` across the samples of each tissue
#     delta                     tissue1_exp - tissue2_exp
#     log2r                     log2(tissue1_exp / tissue2_exp)
#     log2r_filter              TRUE when log2r is not NA and |log2r| > log2r_threshold
#
# Side effect: emits a message with name, tissues, row count, number of rows
# with diffexp == TRUE, and number of those that also pass log2r_filter.
add_diffexp = function(diffexp, name, tissue1, tissue2, gene_info, covariates, tpm, log2r_threshold = 2)
{
    indata               = diffexp[diffexp$type == name & diffexp$tissue1 == tissue1 & diffexp$tissue2 == tissue2,]
    ids                  = indata$transcript_id
    
    # which() drops samples whose tissue indicator is NA instead of turning
    # them into NA sample names.
    samples1             = rownames(covariates)[which(covariates[,tissue1] == 1)]
    samples2             = rownames(covariates)[which(covariates[,tissue2] == 1)]
    
    # drop = FALSE keeps a two-dimensional table even when a tissue has a single
    # sample, so rowMeans() always has rows to average over.
    expdata              = data.frame(transcript_id    = ids,
                                      tissue1_exp      = rowMeans(tpm[ids, samples1, drop = FALSE]),
                                      tissue2_exp      = rowMeans(tpm[ids, samples2, drop = FALSE]),
                                      stringsAsFactors = FALSE)
    expdata$delta        = expdata$tissue1_exp - expdata$tissue2_exp
    expdata$log2r        = log2(expdata$tissue1_exp / expdata$tissue2_exp)
    
    # An undefined ratio (NA/NaN, e.g. 0/0) never passes the filter.
    expdata$log2r_filter = !is.na(expdata$log2r) & abs(expdata$log2r) > log2r_threshold
    
    outdata              = merge(indata, expdata, by = "transcript_id")
    n_diffexp            = sum(outdata$diffexp == TRUE, na.rm = TRUE)
    n_diffexp_filtered   = sum(outdata$diffexp == TRUE & outdata$log2r_filter == TRUE, na.rm = TRUE)
    
    message(paste(name, tissue1, tissue2, nrow(outdata), n_diffexp, n_diffexp_filtered))
    return(outdata)
}

# Extend the results of all three analysis types with expression summaries,
# for every pair of distinct tissues (first tissue in {ipsc_cvpc, heart},
# second in {heart, arteria}; heart is never compared with itself), and stack
# them into one table. The "expressed" tables are the ones being averaged.
diffexp_add = as.data.frame(rbindlist(lapply(
    list(c("ipsc_cvpc", "heart"), c("ipsc_cvpc", "arteria"), c("heart", "arteria")),
    function(tissue_pair)
    {
        first_tissue   = tissue_pair[[1]]
        second_tissue  = tissue_pair[[2]]
        
        by_gene        = add_diffexp(diffexp, "gene_tpm"   , first_tissue, second_tissue, gene_info, covariates, gene_tpm_expressed)
        by_isoform_tpm = add_diffexp(diffexp, "isoform_tpm", first_tissue, second_tissue, isof_info, covariates, isof_tpm_expressed)
        by_isoform_use = add_diffexp(diffexp, "isoform_use", first_tissue, second_tissue, isof_info, covariates, isof_use_expressed)
        
        rbind(by_gene, by_isoform_tpm, by_isoform_use)
    })), stringsAsFactors = FALSE)

# NOTE: this is the same path the plain `diffexp` table was exported to earlier
# in the notebook, so that file is replaced by the extended table.
fwrite(diffexp_add, "pipeline/4.1.differential_expression/diffexp.txt", sep = "\t", col.names = TRUE, row.names = FALSE)

gene_tpm ipsc_cvpc heart 20393 15512 6672

isoform_tpm ipsc_cvpc heart 38271 26005 13038

isoform_use ipsc_cvpc heart 38271 21921 5459

gene_tpm ipsc_cvpc arteria 20393 16409 5990

isoform_tpm ipsc_cvpc arteria 38271 27777 12255

isoform_use ipsc_cvpc arteria 38271 23448 5738

gene_tpm heart arteria 20393 15402 3461

isoform_tpm heart arteria 38271 25598 6677

isoform_use heart arteria 38271 19246 1140



In [8]:
# Show the structure (row count, column names and types) of the extended table.
str(diffexp_add)

'data.frame':	290805 obs. of  18 variables:
 $ transcript_id: chr  "ENSG00000000003.15_5" "ENSG00000000005.6_4" "ENSG00000000419.12_6" "ENSG00000000457.14_7" ...
 $ gene_id      : chr  "ENSG00000000003.15_5" "ENSG00000000005.6_4" "ENSG00000000419.12_6" "ENSG00000000457.14_7" ...
 $ gene_name    : chr  "TSPAN6" "TNMD" "DPM1" "SCYL3" ...
 $ gene_type    : chr  "protein_coding" "protein_coding" "protein_coding" "protein_coding" ...
 $ beta         : num  1.935 1.461 0.535 1.98 1.987 ...
 $ se           : num  0.0914 0.0904 0.121 0.1028 0.0952 ...
 $ tval         : num  21.17 16.16 4.42 19.26 20.87 ...
 $ pval         : num  5.25e-75 3.55e-49 1.19e-05 5.98e-65 2.13e-73 ...
 $ qval         : num  1.07e-70 7.25e-45 2.43e-01 1.22e-60 4.34e-69 ...
 $ tissue1      : chr  "ipsc_cvpc" "ipsc_cvpc" "ipsc_cvpc" "ipsc_cvpc" ...
 $ tissue2      : chr  "heart" "heart" "heart" "heart" ...
 $ type         : chr  "gene_tpm" "gene_tpm" "gene_tpm" "gene_tpm" ...
 $ diffexp      : logi  TRUE TRUE FALSE TRUE 