-
Notifications
You must be signed in to change notification settings - Fork 46
/
transform_counts_Tcells.R
executable file
·142 lines (109 loc) · 5 KB
/
transform_counts_Tcells.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
library(statmod)
library(gdata)
library(genefilter)
library(EBImage)
library(rhdf5)
library(DESeq)
library(statmod)
library(hom.Hs.inp.db)
library(AnnotationDbi)
library(org.Mm.eg.db)
#1. REad in data
load('./data_Tcells.Rdata')
dataMouse[ 1:10, 1:5 ]
geneTypes <- factor( c( ENSM="ENSM", ERCC="ERCC" )[
substr( rownames(dataMouse), 1, 4 ) ] )
#2. calculate normalisation for counts
countsMmus <- dataMouse[ which( geneTypes=="ENSM" ), ]
countsERCC <- dataMouse[ which( geneTypes=="ERCC" ), ]
lengthsMmus <- dataMouse[ which( geneTypes=="ENSM" ), 1 ]
lengthsERCC <- dataMouse[ which( geneTypes=="ERCC" ), 1 ]
sfERCC <- estimateSizeFactorsForMatrix( countsERCC )
#we do not want to normalize for differences in biological starting material...
#as this can be due to different stages in cell cycle
sfMmus <- sfERCC
nCountsERCC <- t( t(countsERCC) / sfERCC )
nCountsMmus <- t( t(countsMmus) / sfMmus )
meansERCC <- rowMeans( nCountsERCC )
varsERCC <- rowVars( nCountsERCC )
cv2ERCC <- varsERCC / meansERCC^2
#normalized counts (brennecke)
meansMmus <- rowMeans( nCountsMmus )
varsMmus <- rowVars( nCountsMmus )
cv2Mmus <- varsMmus / meansMmus^2
meansERCC <- rowMeans( nCountsERCC )
varsERCC <- rowVars( nCountsERCC )
cv2ERCC <- varsERCC / meansERCC^2
#Do fitting of technical noise
#normalised counts (with size factor)
minMeanForFitA <- unname( quantile( meansERCC[ which( cv2ERCC > .3 ) ], .8 ) )
useForFitA <- meansERCC >= minMeanForFitA
fitA <- glmgam.fit( cbind( a0 = 1, a1tilde = 1/meansERCC[useForFitA] ),
cv2ERCC[useForFitA] )
#plot fit
plot( meansERCC, cv2ERCC, log="xy", col=1+useForFitA)
xg <- 10^seq( -3, 5, length.out=100 )
lines( xg, coefficients(fitA)["a0"] + coefficients(fitA)["a1tilde"]/xg )
segments( meansERCC[useForFitA], cv2ERCC[useForFitA],
meansERCC[useForFitA], fitA$fitted.values, col="gray" )
#perfrom statistical test
minBiolDisp <- .5^2
xi <- mean( 1 / sfERCC )
m <- ncol(countsMmus)
psia1thetaA <- mean( 1 / sfERCC ) +
( coefficients(fitA)["a1tilde"] - xi ) * mean( sfERCC / sfMmus )
cv2thA <- coefficients(fitA)["a0"] + minBiolDisp + coefficients(fitA)["a0"] * minBiolDisp
testDenomA <- ( meansMmus * psia1thetaA + meansMmus^2 * cv2thA ) / ( 1 + cv2thA/m )
pA <- 1 - pchisq( varsMmus * (m-1) / testDenomA, m-1 )
padjA <- p.adjust( pA, "BH" )
table( padjA < .1 )
#plot mean/cv2 relationship and
plot( meansMmus, cv2Mmus, log="xy", col=1+(padjA<0.1),ylim=c(0.1,95), xlab='Mean Counts', ylab='CV2 Counts')
xg <- 10^seq( -3, 5, length.out=100 )
lines( xg, coefficients(fitA)["a0"] + coefficients(fitA)["a1tilde"]/xg,lwd=2,col='blue' )
points(meansERCC, cv2ERCC,col='blue',pch=15,cex=1.1)
#points(meansMmus[cc_gene_indices], cv2Mmus[cc_gene_indices],col=rgb(0,255,0,100,maxColorValue=255),pch=2,cex=0.75)
#points(meansMmus[ccCBall_gene_indices], cv2Mmus[ccCBall_gene_indices],col=rgb(0,255,0,20,maxColorValue=255),pch=2,cex=0.8)
legend('bottomleft',c('T-cells (padj >= 0.1)','T-cells (padj<0.1)','ERCC','Cell Cycle genes'),pch=c(1,1,15),col=c('black','red','blue','green'),cex=0.7)
#4. Transform to log-space and propagate error
eps=1
LogNcountsMmus=log10(nCountsMmus+eps)
dLogNcountsMmus=1/(meansMmus+eps)
var_techMmus=(coefficients(fitA)["a0"] + coefficients(fitA)["a1tilde"]/meansMmus)*meansMmus^2
LogVar_techMmus=(dLogNcountsMmus*sqrt(var_techMmus))^2 #error propagation
#gene names in the T-cell data.
gene_names=rownames(nCountsMmus)
gene_names_het=gene_names[which(padjA<0.1)]
#all Cycle base genes homologs (top 600 genes)
hu2musAll=inpIDMapper(dataCB[1:600,3],'HOMSA','MUSMU',srcIDType='ENSEMBL',destIDType='ENSEMBL')
ccCBall_gene_indices=match(unlist(hu2musAll),rownames(nCountsMmus))
#get cell cycle genes from GO
xxGO <- as.list(org.Mm.egGO2EG)
cell_cycleEG <-unlist(xxGO['GO:0007049'])
#get ENSEMBLE ids
x <- org.Mm.egENSEMBL
mapped_genes <- mappedkeys(x)
xxE <- as.list(x[mapped_genes])
ens_ids_cc<-unlist(xxE[cell_cycleEG])
cc_gene_indices <- na.omit(match(ens_ids_cc, rownames(dataMouse)))
#ensemble IDs to gene symbols
x <- org.Mm.egSYMBOL
# Get the gene symbol that are mapped to an entrez gene identifiers
mapped_genes <- mappedkeys(x)
# Convert to a list
xx <- as.list(x[mapped_genes])
xxenseg <- as.list(org.Mm.egENSEMBL2EG)
gene_syms=unlist(xx[unlist(xxenseg[gene_names])])
gene_names_list<-(lapply(xxenseg[gene_names],function(x){if(is.null(x)){x=NA}else{x=x[1]}}))
sym_names=unlist(lapply(xx[unlist(gene_names_list)],function(x){if(is.null(x)){x=NA}else{x=x[1]}}))
sym_names[is.na(sym_names)]=gene_names[is.na(sym_names)]
sym_names_het=sym_names[which(padjA<0.1)] #gene symbols of variable genes
#rename a few variables...
cellcyclegenes <- ens_ids_cc
cellcyclegenes_filter <- cc_gene_indices
cell_names <- colnames(nCountsMmus)
Y <- nCountsMmus
genes_heterogen <- (padjA<0.1)*1
countsERCC_mat=as.matrix(countsERCC * 1)
countsMmus_mat = as.matrix(countsMmus * 1)
#h5save(ccCBall_gene_indices,gene_names,sym_names,sym_names_het,cellcyclegenes_filter,cellcyclegenes,cell_names,nCountsMmus,genes_heterogen,LogVar_techMmus,LogNcountsMmus,countsMmus_mat,sfERCC,countsERCC_mat,file='data_Tcells_normCounts.h5f')