Skip to content

Commit

Permalink
updated epigenetic analysis script
Browse files Browse the repository at this point in the history
  • Loading branch information
ShengLi committed Mar 20, 2016
1 parent 5326652 commit 4b6f40e
Showing 1 changed file with 23 additions and 208 deletions.
231 changes: 23 additions & 208 deletions relapsed_AML_epigenetics.R → epigenetics_analysis.R
Expand Up @@ -19,14 +19,13 @@ legend("topright", col=c("green","blue"), c("Low MUT","High MUT"), lty=1, box.lw

# Figure 2 #

setwd("/Volumes/icb2/Awork/AMLpaper/epiloci/")
load("~/Google Drive/MasonLab/AML/manuscript_materials/b138.aug_rtimeupdate_137patient.rda")
AML147DR=read.table("survival/DR_eloci_147_Aug18.txt",header=F,sep="\t",stringsAsFactors=F)
AML147DN=read.table("survival/AML147_DN.txt",header=F,sep="\t",stringsAsFactors=F)
AML147RN=read.table("survival/AML147_RN.txt",header=F,sep="\t",stringsAsFactors=F)
load("b138.aug_rtimeupdate_137patient.rda")
AML147DR=read.table("DR_eloci_147_Aug18.txt",header=F,sep="\t",stringsAsFactors=F)
AML147DN=read.table("AML147_DN.txt",header=F,sep="\t",stringsAsFactors=F)
AML147RN=read.table("AML147_RN.txt",header=F,sep="\t",stringsAsFactors=F)

AML144DN9=read.table("/Volumes/aml2/home/ntd/errbs/epipoly/AML144_DN_9N_Sep6.txt",header=F,sep="\t",stringsAsFactors=F)
AML144RN9=read.table("/Volumes/aml2/home/ntd/errbs/epipoly/AML144_RN_9N_Sep6.txt",header=F,sep="\t",stringsAsFactors=F)
AML144DN9=read.table("AML144_DN_9N_Sep6.txt",header=F,sep="\t",stringsAsFactors=F)
AML144RN9=read.table("AML144_RN_9N_Sep6.txt",header=F,sep="\t",stringsAsFactors=F)

print(load("../manuscript_materials/epicluster/epi_cl_aug.rda"))

Expand All @@ -48,8 +47,8 @@ printtopdf(p, "../manuscript_materials/aug2015/Figure2_log10epm90_epicluster.pdf

p=ggplot(dat2, aes(x=ifelse(stage==1, "D vs. N", "R vs. N"), y=epm90, fill=stage)) + geom_violin() + stat_summary(fun.y=median.quartile,geom='point')+ facet_grid(.~cl)+
gb2.theme("none") + ylab("EPM") + scale_fill_manual(values=cbPalette[2:3]) + xlab(NULL) + ggtitle("All AML (n=138)")
printtopdf(p, "../manuscript_materials/aug2015/Figure2_epm90_epicluster.pdf")
pdf("../manuscript_materials/aug2015/Figure2_epm90_epicluster_pval.pdf")
printtopdf(p, "Figure2_epm90_epicluster.pdf")
pdf("Figure2_epm90_epicluster_pval.pdf")
pval.by.stage=ddply(dat2, .(stage), function(X){data.frame(not.cl=1:3, pval=sapply(1:3, function(i) wilcox.test(epm90~cl, X[gsub("Cluster ","",X$cl)!=i,])$p.value))})
pval.by.cl=ddply(dat2, .(cl), function(X){data.frame(pval=wilcox.test(epm90~stage, X)$p.value)})
textplot(pval.by.stage)
Expand All @@ -60,26 +59,26 @@ names(DN.epm90.aug)=dat2$sample[dat2$stage==1]
b138.aug$DN.epm90.aug=DN.epm90.aug[b138.aug$sample]

b138.aug$cl.aug=cl[b138.aug$sample]
summary(coxph(Surv(relapsetime.aug)~ cl.aug, b138.aug)) # pvalue=0.365
survdiff(Surv(relapsetime.aug)~ cl.aug, b138.aug[which(b138.aug$cl.aug !=1),]) #p= 0.589
survdiff(Surv(relapsetime.aug)~ cl.aug, b138.aug[which(b138.aug$cl.aug !=2),]) #p= 0.317
survdiff(Surv(relapsetime.aug)~ cl.aug, b138.aug[which(b138.aug$cl.aug !=3),]) #p= 0.704
summary(coxph(Surv(relapsetime.aug)~ cl.aug, b138.aug))
survdiff(Surv(relapsetime.aug)~ cl.aug, b138.aug[which(b138.aug$cl.aug !=1),])
survdiff(Surv(relapsetime.aug)~ cl.aug, b138.aug[which(b138.aug$cl.aug !=2),])
survdiff(Surv(relapsetime.aug)~ cl.aug, b138.aug[which(b138.aug$cl.aug !=3),])

# cluster and age/gender
anova(lm(dage~as.factor(cl.aug), b138.aug)) # pvalue=0.4413
anova(lm(dage~as.factor(cl.aug), b138.aug))

p=ggplot(b138.aug, aes(x=factor(cl.aug), y=dage, fill=factor(cl.aug))) + geom_violin() + stat_summary(fun.y=median.quartile,geom='point')+
gb2.theme("none") + ylab("Age") + scale_fill_manual(values=cbPalette[5:7]) + xlab("Cluster") + ggtitle("All AML (n=138)\np-value=0.4413")
printtopdf(p, pdf="aug2015/epm_vs_age.pdf",width=4,height=4)

anova(lm(sex~as.factor(cl.aug), b138.aug)) # pvalue=0.4413
anova(lm(sex~as.factor(cl.aug), b138.aug))
chisq.test(table(b138.aug$sex, b138.aug$cl.aug))

dat.aug=ddply(b138.aug, .(cl.aug), function(X)data.frame(table(X$sex)))
p=ggplot(dat.aug, aes(x=Var1, y=Freq,fill=factor(cl.aug))) +
geom_bar(stat="identity", position="fill") +
gb2.theme("none") + ylab("Proportion of patients\nfrom three clusters") + scale_fill_manual(values=cbPalette[5:7]) + xlab("Gender") + ggtitle("All AML (n=138)\np-value=0.01048")
printtopdf(p, pdf="aug2015/epm_vs_sex.pdf",width=4,height=4)
printtopdf(p, pdf="epm_vs_sex.pdf",width=4,height=4)


# add dr
Expand All @@ -98,12 +97,12 @@ dat4$epm90[is.na(dat4$epm90)]=0
p=ggplot( dat4, aes(x=as.numeric(ID), y=log10(epm90+1), fill=as.factor(stage))) +
geom_bar(stat="identity") + xlab("sample index") +scale_x_continuous(breaks=c(1, 50,100, 138))+
gb2.theme("none") + ylab("EPM (log10)") + scale_fill_manual(values=cbPalette[4]) + ylim(c(0,5.1))
printtopdf(p,"~/awork/projects/paml/figures/Figure2/figure2d.pdf", width=5,height=2.5)
printtopdf(p,"figure2d.pdf", width=5,height=2.5)

p=ggplot( dat2[dat2$stage==2,], aes(x=as.numeric(ID), y=log10(epm90+1), fill=as.factor(stage))) +
geom_bar(stat="identity") + xlab("sample index") +scale_x_continuous(breaks=c(1, 50,100, 138)) +
gb2.theme("none") + ylab("EPM (log10)") + scale_fill_manual(values=cbPalette[3]) + ylim(c(0,5.1))
printtopdf(p,"~/awork/projects/paml/figures/Figure2/figure2b.pdf", width=5,height=2.5)
printtopdf(p,"figure2b.pdf", width=5,height=2.5)

p=ggplot( dat2[dat2$stage==1,], aes(x=as.numeric(ID), y=log10(epm90+1), fill=as.factor(stage))) +
geom_bar(stat="identity") + xlab("sample index") +scale_x_continuous(breaks=c(1, 50,100, 138)) +
Expand All @@ -114,207 +113,23 @@ wilcox.test(dat2[dat2$stage==1,"epm90"],dat2[dat2$stage==2,"epm90"]) #p-value =

# Figure 3 #
# Eloci shared
print(load("/Volumes/icb2/Awork/AMLpaper/manuscript_materials//epicluster/fig3_14N.shared.aug.rda"))
print(load("fig3_14N.shared.aug.rda"))
greens=c("#a1d99b","#41ab5d", "#006d2c")
dat4$variable=gsub("\\.", " ", dat4$variable)
p=ggplot(dat4, aes(ID, epiallele.shift,fill=variable))+
geom_bar(stat="identity", position="fill") + scale_fill_manual(values=greens) +
facet_grid(~cluster,scale="free",labeller=label_both, space="free_x") + gb2.theme("bottom") +
scale_x_continuous(breaks=c(1, 138))
printtopdf(p, "~/awork/projects/paml/figures/Figure3/fig3.shared_14N_3_epi_green2_oct.pdf", width=8,height=5)

# SNV callings by merging three snv caller results.
library(VariantAnnotation)
getFinalSnvsInfo=function(pid){
nid=gsub("-[12]$", "-N", pid)

vvcf=readVcf(paste0(pid, "/", pid, "_", nid, ".varscan.snv.fltbyindel.Somatic.hc.vcf"),"hg19")
mvcf=readVcf(paste0(pid, "/", pid, "_", nid, ".mutect.snv.Somatic.vcf"),"hg19")
# mutect
pid2=grep("N", colnames(geno(mvcf)$AD), value=T, invert=T)
nid2=grep("N", colnames(geno(mvcf)$AD), value=T, invert=F)
t.alt.count=round(unlist(geno(mvcf)$DP[,pid2])* unlist(geno(mvcf)$FA[,pid2]))
t.depth=unlist(geno(mvcf)$DP[,pid2])
t.ref.count=t.depth-t.alt.count
n.alt.count=round(unlist(geno(mvcf)$DP[,nid2])* unlist(geno(mvcf)$FA[,nid2]))
n.depth=unlist(geno(mvcf)$DP[,nid2])
n.ref.count=n.depth-n.alt.count
freq=unlist(geno(mvcf)$FA[,pid2])
info=as.data.frame(rowRanges(mvcf))
msnvs=data.frame(info[,1:2],t.depth, t.ref.count, t.alt.count, n.depth, n.ref.count, n.alt.count, freq, info[,7:8])
mids=paste(msnvs$seqnames, msnvs$start, msnvs$REF, sep="_")
# varscan
t="TUMOR"
n="NORMAL"
t.depth=geno(vvcf)$DP[,t]
t.ref.count=geno(vvcf)$RD[,t]
t.alt.count=geno(vvcf)$AD[,t]
n.depth=geno(vvcf)$DP[,n]
n.alt.count=geno(vvcf)$AD[,n]
n.ref.count=geno(vvcf)$RD[,n]
freq=as.numeric(gsub("%", "", geno(vvcf)$FREQ)[,t])/100
info=as.data.frame(rowRanges(vvcf))
vsnvs=data.frame(info[,1:2],t.depth, t.ref.count, t.alt.count, n.depth, n.ref.count, n.alt.count, freq, info[,7:8])
vids=paste(vsnvs$seqnames, vsnvs$start, vsnvs$REF, sep="_")

# merge varscan + mutect
merged.vcf=rbind(msnvs, vsnvs[which(! vids %in% mids),])
mergedids=paste(merged.vcf$seqnames, merged.vcf$start, merged.vcf$REF, sep="_")

# use of somatic loci shared by at least two callers.
snvloci=read.table(paste0(pid, "/", pid, ".somatic.snv.bed"), header=F, sep="\t", stringsAsFactors = F)
somaticids=paste(snvloci$V1, snvloci$V2, snvloci$V4, sep="_")

finalsnv=merged.vcf[mergedids %in% somaticids,]

write.table(finalsnv, file = paste0(pid, "/", pid, ".somatic.snv.info.txt"), row.names=F, col.names=T, sep="\t", quote=F)
save(finalsnv,file=paste0(pid, "/", pid, ".somatic.snv.info.rda"))
finalsnv
}

library(doMC)
registerDoMC(cores=5);

snvs=foreach(id = dir(pattern="-[12]$")) %dopar% {
snv=getFinalSnvsInfo(id);
}
names(snvs)=dir(pattern="-[12]$")
save(snvs, file="merged.snvs.rda")

# high confidence indels filterings
vindelvcf=readVcf(paste0(pid, "/", pid, "_", nid, ".varscan.indel.Somatic.hc.vcf"),"hg19")
vindelvcf1=vindelvcf[unique(findOverlaps(rowRanges(vindelvcf), ref.exon.gr)@queryHits)]
# varscan
t="TUMOR"
n="NORMAL"
t.depth=geno(vindelvcf1)$DP[,t]
t.ref.count=geno(vindelvcf1)$RD[,t]
t.alt.count=geno(vindelvcf1)$AD[,t]
n.depth=geno(vindelvcf1)$DP[,n]
n.alt.count=geno(vindelvcf1)$AD[,n]
n.ref.count=geno(vindelvcf1)$RD[,n]
freq=as.numeric(gsub("%", "", geno(vindelvcf1)$FREQ)[,t])/100
info=as.data.frame(rowRanges(vindelvcf1))
vindels=data.frame(info[,1:2],t.depth, t.ref.count, t.alt.count, n.depth, n.ref.count, n.alt.count, freq, info[,7:8])
vids=paste(vindels$seqnames, vindels$start, vindels$REF, sep="_")

# counts snvs:
setwd("/Volumes/hippo/home/aml/Exome/Sep2015/pairedAML")
pids=unique(gsub("-[12N]$", "",dir(pattern="-")))
pids1=pids[!pids %in% c("SGUR4", "SGUP4", paste0("PAML-", c(77,80, 89, 96, 105)))]
library(reshape)
library(doMC)
df1=foreach(i = pids1, .combine=rbind)%do%{
loci=foreach(stage=1:2) %do% {
snvloci=read.table(paste0(i, "-", stage, "/", i, "-", stage, ".somatic.snv.bed"), stringsAsFactors=F);
somaticids=paste(snvloci$V1, snvloci$V2, snvloci$V4, sep="_")
};
d=length(setdiff(loci[[1]], loci[[2]]));
r=length(setdiff(loci[[2]], loci[[1]]));
dr=length(intersect(loci[[1]], loci[[2]]));
data.frame(ID=i, type='snv', D=d, DR=dr, R=r)
}

indeldf1=foreach(i = pids1, .combine=rbind)%do%{
loci=foreach(stage=1:2) %do% {
snvloci=read.table(paste0(i, "-", stage, "/", i, "-", stage, "_", i, "-N", ".varscan.indel.Somatic.hc.exon.vcf"), stringsAsFactors=F);
somaticids=paste(snvloci$V1, snvloci$V2, snvloci$V4, sep="_")
};
d=length(setdiff(loci[[1]], loci[[2]]));
r=length(setdiff(loci[[2]], loci[[1]]));
dr=length(intersect(loci[[1]], loci[[2]]));
data.frame(ID=i, type='indel',D=d, DR=dr, R=r)
}

mutationdf=ddply(rbind(df1, indeldf1), .(ID),function(X){colSums(X[,3:5])})
mutationdf1=melt(mutationdf)
ids=gsub("SG", "",gsub("PAML","HU", as.character(mutationdf$ID)))
i=grep("HU",invert=T, ids)
ids2=c(ids[-i], paste(substr(ids[i], 1,2), substr(ids[i], 3, 4), sep="-"))
names(ids2)=mutationdf$ID
mutationdf$cluster=cl[ids2]
mutationdf1$cluster=cl[ids2[as.character(mutationdf1$ID)]]
mutationdf2=cast(mutationdf1)

# Order as epigenetic
ids=unique(dat4$sample)
ids2=gsub("HU-","PAML-", ids)
ids2[grep("PAML", invert=T, ids2)]=gsub("-","", paste0("SG",ids2[grep("PAML", invert=T, ids2)]))
ordered.id=ids2[ids2 %in% unique(as.character(mutationdf1$ID))]

mutationdf1$ID=factor(as.character(mutationdf1$ID), levels = ordered.id)
mutationdf1$types="Diagnosis-specific"
mutationdf1$types[mutationdf1$variable=="DR"]="Shared"
mutationdf1$types[mutationdf1$variable=="R"]="Relapse-specific"
mutationdf1$types=factor(mutationdf1$types, levels=c("Diagnosis-specific", "Shared", "Relapse-specific"))


blues=c("#74a9cf","#0570b0", "#034e7b")
p=ggplot(mutationdf1,aes(x=as.integer(ID), y=value, fill=types)) + geom_bar(stat='identity', position="fill") +
facet_grid(~cluster,scale="free_x",labeller=label_both, space="free_x") +
scale_fill_manual(values=blues) + gb2.theme("bottom") + xlab("ID")+
ylab("proportion of somatic mutations") +scale_x_continuous(breaks=c(1, 48))
printtopdf(p,"~/awork/projects/paml/figures/Figure3/mutations.burden.snv.indel2.Oct.48.pdf", width=8,height=5)
save(mutationdf1, df1, indeldf1, file="mutationdf1.48.rda")
save(cl, file="~/awork/projects/paml/figures/Figure3/cl.rda")

# Supp Figure 6 #

# Clonal analysis using sciClone

library(sciClone)
pids=unique(gsub("-[12N]$", "",dir(pattern="-")))
pids1=pids[!pids %in% paste0("PAML-", c(80, 89, 96, 105))]
library(sciClone)
registerDoMC(cores=5)
foreach(p = pids1) %dopar%{
Tsnvs=foreach(i = 1:2) %do% {
sample.id=paste(p,i,sep="-");
snvf=paste0(sample.id,"/", sample.id, ".somatic.snv.info.rda");
load(snvf);
finalsnv[,c(1,2,4,5,9)]
}
names(Tsnvs)=paste(p, 1:2, sep="-")
acnvs=foreach(i = 1:2) %do% {
sample.id=paste(p,i,sep="-");
cnvf=paste0(sample.id,"/", sample.id, "_mergedcnv.rda");
load(cnvf)
cnv
}
names(acnvs)=paste(p, 1:2, sep="-")
xids=paste(p,1:2,sep="-");
print(p)
#Tsnvs=gm.asnvs.exons[xids]
# filling the SNVs gaps by making psudo zero vaf postion
Tsnvsr=do.call(rbind, Tsnvs)
Tsnvsu=unique(Tsnvsr[,1:2])
Tsnvsu$refn=50
Tsnvsu$altn=0
Tsnvsu$freq=0
rownames(Tsnvsu)=paste(Tsnvsu$seqnames, Tsnvsu$start,sep=".")
Tsnvs2=lapply(Tsnvs, function(X){
x=setdiff(rownames(Tsnvsu), paste(X$seqnames, X$start, sep="."))
colnames(Tsnvsu)=colnames(X)
rbind(X, Tsnvsu[x,])
})
for(i in 8){
tryCatch({
sc2=sciClone(Tsnvs2, copyNumberCalls = acnvs[xids],sampleNames =xids,cnCallsAreLog2 = F,minimumDepth = 50, maximumClusters = i)
sc.plot2d(sc2,outputFile = paste0(xids,"/",xids,"_sc2clo.exons.max", i, ".pdf"))
save(sc2, file=paste0(xids[1],"/",xids[1],"_sc2clo.exons.max", i, ".rda"))
writeClusterTable(sc2, paste0(xids[1],"/",xids[1],"_sc2clo.exons.max", i, ".txt"))
}, error=function(e){})
}
}
printtopdf(p, "fig3.shared_14N_3_epi_green2_oct.pdf", width=8,height=5)

# Figure 4 #
library(pheatmap);
library(doMC);
setwd("/scratchLocal/shl2018/AML/ERRBS/epipoly2/")
sumt=read.fast("~/ntd/errbs/amlc/T1/T1_entropy.txt",header=T,sep="\t")

sumt=read.fast("T1_entropy.txt",header=T,sep="\t")
pheatmap(sumt[which(rowMeans(sumt[,-1:-3])< -30), -1:-3])

files=dir("~/ntd/errbs/amlc/", pattern="_entropy.txt", recursive = T, full.names = T)
files=dir(".", pattern="_entropy.txt", recursive = T, full.names = T)
tnsum=list()
for(f in files){
tnsum[[basename(f)]]=read.fast(f,header=T,sep="\t")
Expand All @@ -336,7 +151,7 @@ unionm=foreach(n=1:5, .combine=cbind) %do% {
tmp
}
colnames(unionm)=splitn(names(tnsum),"_",1)
pdf("~/ntd/errbs/amlc/entropy_heatmap.pdf",width=4,height=4)
pdf("entropy_heatmap.pdf",width=4,height=4)
pheatmap(unionm, cluster_rows = T, cluster_cols = F,
clustering_distance_rows = "binary",
c("grey","black"))
Expand Down

0 comments on commit 4b6f40e

Please sign in to comment.