/
analyzing_signatures.R
313 lines (188 loc) · 13.3 KB
/
analyzing_signatures.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# install.packages("feather")
library(feather)
library(tidyverse)
apobec_hnscc_df <- read_feather("~/Documents/manuscripts/APOBEC_HNSCC/nfkb.feather")
head(apobec_hnscc_df)
apobec_hnscc_df %>%
group_by(categ) %>%
summarize(mean(S2),mean(S13))
# APOBEC_expression <- read.table(file = "expression/hnsc_tcga_pub_hnsc_tcga_pub_rna_seq_v2_mrna_median_Zscores_we_want.txt",skip=1,header = T,stringsAsFactors = F,sep=" ")
# downloaded from http://www.cbioportal.org/index.do?session_id=5a0f48fa498e5df2e2986845&show_samples=false&
APOBEC_expression <- read.table(file = "expression/hnsc_tcga_hnsc_tcga_rna_seq_v2_mrna_median_Zscores_tab.txt",header = T,stringsAsFactors = F,sep="\t")
load("HNSC_MAF.RData")
HNSC.MAF <- HNSC.MAF[which(HNSC.MAF$Variant_Type=="SNP"),]
head(HNSC.MAF)
length(unique(HNSC.MAF$Unique_patient_identifier))
unique(names(APOBEC_expression))
unique(HNSC.MAF$Unique_patient_identifier)
names(APOBEC_expression)
# Need to only keep names that correspond to APOBEC
APOBEC_expression <- APOBEC_expression[startsWith(x = names(APOBEC_expression),prefix = "TCGA")]
APOBEC_expression <- APOBEC_expression[endsWith(x = names(APOBEC_expression),suffix = ".01")]
# Need to replace all . with - and delete last 3
names(APOBEC_expression) <- gsub("\\.", "-", names(APOBEC_expression))
first.12 <- function(string_to_12){
return(paste(unlist(strsplit(string_to_12,split = ""))[1:12],collapse = ""))
}
names(APOBEC_expression) <- unlist(lapply(names(APOBEC_expression),first.12))
APOBEC3B.expression.and.hpv <- as.data.frame(matrix(data = NA,nrow = length(unique(HNSC.MAF$Unique_patient_identifier)),ncol=3))
rownames(APOBEC3B.expression.and.hpv) <- unique(HNSC.MAF$Unique_patient_identifier)
colnames(APOBEC3B.expression.and.hpv) <- c("HPV_status","APOBEC3B_expression_Z_score","SNV_count")
for(i in 1:nrow(APOBEC3B.expression.and.hpv)){
APOBEC3B.expression.and.hpv[i,"SNV_count"] <- length(which(HNSC.MAF$Unique_patient_identifier==rownames(APOBEC3B.expression.and.hpv)[i]))
APOBEC3B.expression.and.hpv[i,"HPV_status"] <- apobec_hnscc_df$categ[which(apobec_hnscc_df$patient_id==rownames(APOBEC3B.expression.and.hpv)[i])]
if(length(which(names(APOBEC_expression)==rownames(APOBEC3B.expression.and.hpv)[i]))>0){
APOBEC3B.expression.and.hpv[i,"APOBEC3B_expression_Z_score"] <- as.numeric(APOBEC_expression[which(names(APOBEC_expression)==rownames(APOBEC3B.expression.and.hpv)[i])])
}
}
APOBEC.data <- subset(APOBEC3B.expression.and.hpv, HPV_status!="ambiguous")
library(ggplot2)
APOBEC.plot <- ggplot(data = APOBEC.data, aes(x=APOBEC3B_expression_Z_score,y=SNV_count)) +
geom_point(aes(x=APOBEC3B_expression_Z_score,y=SNV_count)) +
geom_smooth(method=lm) +
theme_bw() +
facet_grid(HPV_status ~ .)
APOBEC.plot
ggsave(filename = "Figures/SNV_vs_APOBEC3B_expression.png")
APOBEC.plot.subset <- ggplot(data = subset(APOBEC.data,SNV_count<2000), aes(x=APOBEC3B_expression_Z_score,y=SNV_count)) +
geom_point(aes(x=APOBEC3B_expression_Z_score,y=SNV_count)) +
geom_smooth(method=lm) +
theme_bw() +
facet_grid(HPV_status ~ .)
APOBEC.plot.subset
ggsave(filename = "Figures/subset_SNV_vs_APOBEC3B_expression.png",plot = APOBEC.plot.subset)
summary(lm(APOBEC.data$SNV_count ~ APOBEC.data$APOBEC3B_expression_Z_score))
summary(lm(APOBEC.data$SNV_count[which(APOBEC.data$HPV_status=="HPV+")] ~ APOBEC.data$APOBEC3B_expression_Z_score[which(APOBEC.data$HPV_status=="HPV+")]))
summary(lm(APOBEC.data$SNV_count[which(APOBEC.data$HPV_status=="HPV-")] ~ APOBEC.data$APOBEC3B_expression_Z_score[which(APOBEC.data$HPV_status=="HPV-")]))
summary(lm(APOBEC.data$SNV_count[which(APOBEC.data$HPV_status=="HPV-" & APOBEC.data$SNV_count<2000)] ~ APOBEC.data$APOBEC3B_expression_Z_score[which(APOBEC.data$HPV_status=="HPV-" & APOBEC.data$SNV_count<2000)]))
APOBEC.plot <- ggplot(data = APOBEC.data, aes(y=APOBEC3B_expression_Z_score,x=SNV_count)) +
geom_point(aes(y=APOBEC3B_expression_Z_score,x=SNV_count)) +
geom_smooth(method=lm) +
theme_bw() +
facet_grid(HPV_status ~ .)
APOBEC.plot
APOBEC.plot.subset <- ggplot(data = subset(APOBEC.data,SNV_count<2000), aes(x=APOBEC3B_expression_Z_score,x=SNV_count)) +
geom_point(aes(y=APOBEC3B_expression_Z_score,x=SNV_count)) +
geom_smooth(method=lm) +
theme_bw() +
facet_grid(HPV_status ~ .)
APOBEC.plot.subset
summary(lm(APOBEC.data$APOBEC3B_expression_Z_score ~APOBEC.data$SNV_count ))
summary(lm( APOBEC.data$APOBEC3B_expression_Z_score[which(APOBEC.data$HPV_status=="HPV+")]~APOBEC.data$SNV_count[which(APOBEC.data$HPV_status=="HPV+")] ))
summary(lm(APOBEC.data$APOBEC3B_expression_Z_score[which(APOBEC.data$HPV_status=="HPV-")] ~ APOBEC.data$SNV_count[which(APOBEC.data$HPV_status=="HPV-")] ))
summary(lm( APOBEC.data$APOBEC3B_expression_Z_score[which(APOBEC.data$HPV_status=="HPV-" & APOBEC.data$SNV_count<2000)]~APOBEC.data$SNV_count[which(APOBEC.data$HPV_status=="HPV-" & APOBEC.data$SNV_count<2000)] ))
# SNV count vs. APOBEC vs. HPV+
APOBEC.data$APOBEC_pos_neg <- NA
for(i in 1:nrow(APOBEC.data)){
APOBEC.data$APOBEC_pos_neg[i] <- apobec_hnscc_df$has_apobec[which(apobec_hnscc_df$patient_id==rownames(APOBEC.data)[i])]
}
require(ggplot2)
ggplot(data=APOBEC.data,aes(y=log10(SNV_count), x=APOBEC_pos_neg))+ geom_boxplot() + geom_jitter(alpha=0.4,width = 0.1) + facet_grid(HPV_status ~.) + coord_flip() + labs(x="APOBEC signal",y="log10(SNV count)")
APOBEC.data %>% group_by(HPV_status, APOBEC_pos_neg) %>% count()
hpv.pos.apo.pos <- APOBEC.data$SNV_count[which(APOBEC.data$HPV_status=="HPV+" & APOBEC.data$APOBEC_pos_neg==T)]
hpv.pos.apo.neg <- APOBEC.data$SNV_count[which(APOBEC.data$HPV_status=="HPV+" & APOBEC.data$APOBEC_pos_neg==F)]
wilcox.test(hpv.pos.apo.neg,hpv.pos.apo.pos,paired = F,conf.int = T)
wilcox.test(hpv.pos.apo.pos,hpv.pos.apo.neg,paired = F,conf.int = T)
median(hpv.pos.apo.neg);median(hpv.pos.apo.pos)
t.test(hpv.pos.apo.neg,hpv.pos.apo.pos)
hpv.neg.apo.pos <- APOBEC.data$SNV_count[which(APOBEC.data$HPV_status=="HPV-" & APOBEC.data$APOBEC_pos_neg==T)]
hpv.neg.apo.neg <- APOBEC.data$SNV_count[which(APOBEC.data$HPV_status=="HPV-" & APOBEC.data$APOBEC_pos_neg==F)]
wilcox.test(hpv.neg.apo.neg,hpv.neg.apo.pos,paired = F,conf.int = T)
wilcox.test(hpv.neg.apo.pos,hpv.neg.apo.neg,paired = F,conf.int = T)
t.test(hpv.neg.apo.neg,hpv.neg.apo.pos)
median(hpv.neg.apo.neg);median(hpv.neg.apo.pos)
# wilcox.test(log10(hpv.pos.apo.neg),log10(hpv.pos.apo.pos))
# wilcox.test(log10(hpv.neg.apo.neg),log10(hpv.neg.apo.pos))
# wilcox.test(hpv.neg.apo.pos,hpv.neg.apo.neg)
wilcox.test(hpv.neg.apo.pos,hpv.pos.apo.pos)
wilcox.test(hpv.pos.apo.neg,hpv.neg.apo.neg)
#Testing
# mean(apobec_hnscc_df$S2[which(apobec_hnscc_df$categ=="HPV+")])
hpv.neg <- APOBEC.data$SNV_count[which(APOBEC.data$HPV_status=="HPV-")]
wilcox.test(hpv.pos.apo.pos,hpv.neg,paired = F,conf.int = T)
# ggplot(data = APOBEC.data,aes(y=log10(SNV_count),x=APOBEC_pos_neg))
wilcox.test(hpv.pos.apo.neg,hpv.neg,paired = F,conf.int = T)
# Need to catalogue all mutations as APOBEC mutations or not.
load("~/Documents/Selection_analysis/HNSC/selection_output/HNSC_selection_output.RData")
genes <- unique(selection.output$complete_mutation_data$Gene)
nrow(selection.output$complete_mutation_data)
selection.output$complete_mutation_data$TCW_TKW <- NA
# weight.and.trinuc.df <- as.data.frame(matrix(nrow=nrow(selection.output$complete_mutation_data),ncol=5,data=NA))
# colnames(weight.and.trinuc.df) <- c("gene","mutation","tumor","APOBEC_weight","TCW_TKW")
if(length(which(is.na(selection.output$complete_mutation_data$Nucleotide_trinuc_context)))>0){
selection.output$complete_mutation_data <- selection.output$complete_mutation_data[-which(is.na(selection.output$complete_mutation_data$Nucleotide_trinuc_context)),]
}
for(i in 1:nrow(selection.output$complete_mutation_data)){
if(((selection.output$complete_mutation_data$Nucleotide_trinuc_context[i]=="TCA" & selection.output$complete_mutation_data$Alternative_Nucleotide[i]=="T") |
(selection.output$complete_mutation_data$Nucleotide_trinuc_context[i]=="TGA" & selection.output$complete_mutation_data$Alternative_Nucleotide[i]=="A")) |
((selection.output$complete_mutation_data$Nucleotide_trinuc_context[i]=="TCT" & selection.output$complete_mutation_data$Alternative_Nucleotide[i]=="T") |
(selection.output$complete_mutation_data$Nucleotide_trinuc_context[i]=="AGA" & selection.output$complete_mutation_data$Alternative_Nucleotide[i]=="A")) |
((selection.output$complete_mutation_data$Nucleotide_trinuc_context[i]=="TCA" & selection.output$complete_mutation_data$Alternative_Nucleotide[i]=="G") |
(selection.output$complete_mutation_data$Nucleotide_trinuc_context[i]=="TGA" & selection.output$complete_mutation_data$Alternative_Nucleotide[i]=="C")) |
((selection.output$complete_mutation_data$Nucleotide_trinuc_context[i]=="TCT" & selection.output$complete_mutation_data$Alternative_Nucleotide[i]=="G") |
(selection.output$complete_mutation_data$Nucleotide_trinuc_context[i]=="AGA" & selection.output$complete_mutation_data$Alternative_Nucleotide[i]=="C"))){
selection.output$complete_mutation_data$TCW_TKW[i] <- 1
}else{
selection.output$complete_mutation_data$TCW_TKW[i] <- 0
}
}
APOBEC.data$percent_TCW_TKW <- NA
for(i in 1:nrow(APOBEC.data)){
APOBEC.data$percent_TCW_TKW[i] <- sum(selection.output$complete_mutation_data$TCW_TKW[which(selection.output$complete_mutation_data$Unique_patient_identifier==rownames(APOBEC.data)[i])])/length(which(selection.output$complete_mutation_data$Unique_patient_identifier==rownames(APOBEC.data)[i]))
}
head(APOBEC.data)
library(ggplot2)
apobec.signal.vs.percent <- ggplot(data=APOBEC.data,aes(y=percent_TCW_TKW, x=APOBEC_pos_neg))+ geom_boxplot() + geom_jitter(alpha=0.4,width = 0.1) + facet_grid(HPV_status ~.) + coord_flip() + labs(x="APOBEC signal",y="Percent of SNV that are TCW --> TKW") + theme_bw()
ggsave(filename = "~/Documents/manuscripts/APOBEC_HNSCC/Figures/APOBEC_signal_vs_percent.png",plot = apobec.signal.vs.percent)
tcw.percent.apo.pos.hpv.pos <- APOBEC.data$percent_TCW_TKW[which(APOBEC.data$HPV_status=="HPV+" & APOBEC.data$APOBEC_pos_neg==T)]
tcw.percent.apo.pos.hpv.neg <- APOBEC.data$percent_TCW_TKW[which(APOBEC.data$HPV_status=="HPV-" & APOBEC.data$APOBEC_pos_neg==T)]
median(tcw.percent.apo.pos.hpv.pos)
median(tcw.percent.apo.pos.hpv.neg)
wilcox.test(tcw.percent.apo.pos.hpv.pos, tcw.percent.apo.pos.hpv.neg, paired = F, conf.int = T)
tcw.percent.apo.neg.hpv.pos <- APOBEC.data$percent_TCW_TKW[which(APOBEC.data$HPV_status=="HPV+" & APOBEC.data$APOBEC_pos_neg==F)]
tcw.percent.apo.neg.hpv.neg <- APOBEC.data$percent_TCW_TKW[which(APOBEC.data$HPV_status=="HPV-" & APOBEC.data$APOBEC_pos_neg==F)]
wilcox.test(tcw.percent.apo.neg.hpv.pos,tcw.percent.apo.neg.hpv.neg,conf.int = T,paired = F)
# comparing HPV status of our method and tang 2013 ----
# from https://www.nature.com/articles/ncomms3513#supplementary-information
# Tang et al 2013 The landscape of viral expression and host gene fusion and adaptation in human cancer
# Where Hinderson 2014 got their viral data from
tang.2013 <- read.csv(file = "input_data/ncomms3513-s2.txt",skip = 3,sep="\t",header = T,stringsAsFactors = F)
head(tang.2013)
tang.2013 <- subset(tang.2013, Cancer=="HNSC")
tang.2013$Sample.barcode[1]
names.split <- strsplit(tang.2013$Sample.barcode,split="-")
tang.2013$sample.short <- NA
for(i in 1:nrow(tang.2013)){
tang.2013$sample.short[i] <- paste(names.split[[i]][1:3],collapse = "-")
}
both.datasets <- union(tang.2013$sample.short,apobec_hnscc_df$patient_id)
HPV.comparison <- as.data.frame(matrix(data = NA,nrow=length(both.datasets),ncol=3))
colnames(HPV.comparison) <- c("tumor","tang2013","ours")
HPV.comparison$tumor <- both.datasets
for(i in 1:nrow(HPV.comparison)){
if(length(which(apobec_hnscc_df$patient_id==HPV.comparison$tumor[i]))==1){
HPV.comparison$ours[i] <- apobec_hnscc_df$categ[which(apobec_hnscc_df$patient_id==HPV.comparison$tumor[i])]
}
if(length(which(tang.2013$sample.short==HPV.comparison$tumor[i]))==1){
HPV.comparison$tang2013[i] <- tang.2013$Virus.description[which(tang.2013$sample.short==HPV.comparison$tumor[i])]
if(HPV.comparison$tang2013[i]=="N/A"){ HPV.comparison$tang2013[i] <- "not_detected"}
}
}
HPV.comparison$both_pos <- NA
HPV.comparison$both_neg <- NA
HPV.comparison$tang_pos_us_neg <- NA
HPV.comparison$tang_neg_us_pos <- NA
HPV.comparison$tang_known_us_unknown <- NA
HPV.comparison$tang_unknown_us_known <- NA
HPV.comparison$both_pos[which(HPV.comparison$ours=="HPV+" & (!(HPV.comparison$tang2013 %in% c("not_detected")) & !is.na(HPV.comparison$tang2013)))] <- T
HPV.comparison$both_neg[which(HPV.comparison$ours=="HPV-" & HPV.comparison$tang2013 == "not_detected")] <- T
HPV.comparison$tang_pos_us_neg[which(HPV.comparison$ours=="HPV-" & (!(HPV.comparison$tang2013 %in% c("not_detected")) & !is.na(HPV.comparison$tang2013)))] <- T
HPV.comparison$tang_neg_us_pos[which(HPV.comparison$ours=="HPV+" & HPV.comparison$tang2013 == "not_detected")] <- T
HPV.comparison$tang_known_us_unknown[which(is.na(HPV.comparison$ours) & !is.na(HPV.comparison$tang2013))] <- T
HPV.comparison$tang_unknown_us_known[which(!is.na(HPV.comparison$ours) & is.na(HPV.comparison$tang2013))] <- T
write.table(x = HPV.comparison,file = "output_data/comparing_tang2013_and_our_HPV_status.txt",quote = F,sep = "\t",row.names = F)
length(which(HPV.comparison$both_pos==T))
length(which(HPV.comparison$both_neg==T))
length(which(HPV.comparison$tang_neg_us_pos==T))
length(which(HPV.comparison$tang_pos_us_neg==T))
nrow(tang.2013)