-
Notifications
You must be signed in to change notification settings - Fork 2
/
UR_ranking.R
166 lines (138 loc) · 6.27 KB
/
UR_ranking.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly = TRUE)
stop_quietly <- function() {
opt <- options(show.error.messages = FALSE)
on.exit(options(opt))
stop()
}
if (args[1]=='--help') {
print('singleR.R <full path to ipa-input main directory (i.e., containing subdirectories and data from all samples to be compared)> <full output path>')
print('Built on R version 4.1.2')
stop_quietly()
}
# Test so that the number of arguments are correct: if not, return an error
if (length(args)<2) {
stop("At least two arguments must be supplied (see --help guide).n", call.=FALSE)
}
library(readxl)
library(dplyr)
library(pheatmap)
# dir.home <- getwd()
# dir.data <- paste(dir.home, '/ipa_output', sep = '')
# dir.out <- paste(dir.home, '/output', sep = '')
dir.data <- args[1]
dir.out <- args[3]
if (dir.exists(dir.out)==FALSE){
dir.create(dir.out, recursive = T)
print(paste('created', dir.out))
}
molecule_types <- c('complex', 'cytokine', 'G-protein coupled receptor',
'group', 'growth factor', 'ligand-dependent nuclear receptor',
'transmembrane receptor')
ipa_columns <- c("Upstream Regulator", "Molecule Type",
"Activation z-score", "p-value of overlap")
# Rank the URs based on in how many cell types and at how many time points they were predicted in
# read in the ipa output files
ipa_files <- list.files(dir.data, recursive = T, pattern = '^[^UR_]', full.names = T)
ipa_files <- ipa_files[-grep('manually_processed', ipa_files)]
ipa <- c()
for (i in 1:length(ipa_files)){
ipa[[i]] <- read_excel(ipa_files[i])
ipa[[i]] <- ipa[[i]][,1:9]
if (length(grep('All rights reserved', colnames(ipa[[i]])[1]))==1){
colnames(ipa[[i]]) <- ipa[[i]][1,]
ipa[[i]] <- ipa[[i]][-1,]
}
ipa[[i]] <- ipa[[i]][,which(colnames(ipa[[i]]) %in% ipa_columns)]
# sort the data to only include significant predictions
if (any(is.na(ipa[[i]]$`Activation z-score`))){
ipa[[i]]$`Activation z-score`[which(is.na(ipa[[i]]$`Activation z-score`))] <- 0 # NAs are considered not significant
ipa[[i]]$`Activation z-score` <- as.numeric(ipa[[i]]$`Activation z-score`)
}
if (any(is.na(ipa[[i]]$`p-value of overlap`))){
ipa[[i]]$`p-value of overlap`[which(is.na(ipa[[i]]$`p-value of overlap`))] <- 1 # NAs are considered not significant
ipa[[i]]$`p-value of overlap` <- as.numeric(ipa[[i]]$`p-value of overlap`)
}
ipa[[i]] <- ipa[[i]][which(abs(ipa[[i]]$`Activation z-score`) >= 2),]
ipa[[i]] <- ipa[[i]][which(ipa[[i]]$`p-value of overlap` <= 0.05),]
# sort the data only to include URs of interest
ipa[[i]] <- ipa[[i]][which(ipa[[i]]$`Molecule Type` %in% molecule_types),]
}
celltypes <- sapply(strsplit(ipa_files, '/'), '[[', 9)
celltypes <- gsub(' ', '', celltypes)
time_points <- sapply(strsplit(celltypes, '_'), '[[', 2)
celltypes <- sapply(strsplit(celltypes, '_'), '[[', 1)
celltypes <- gsub('monocytes', 'Monocytes', celltypes)
celltypes <- gsub('th1', 'Th1', celltypes)
celltypes <- gsub('Bcells', 'Bcell', celltypes)
celltypes <- gsub('Th1cell', 'Th1', celltypes)
celltypes <- gsub('Th17cell', 'Th17', celltypes)
celltypes <- gsub('Th2cell', 'Th2', celltypes)
celltypes <- gsub('CD8cell', 'CD8', celltypes)
celltypes <- gsub('Tregcell', 'Treg', celltypes)
celltypes <- gsub('Monocytescell', 'Monocytes', celltypes)
celltypes <- gsub('Dendriticcell', 'Dendritic', celltypes)
celltypes <- gsub('NKcell', 'NK', celltypes)
celltypes <- gsub('NTcell', 'NT', celltypes)
celltypes <- gsub('Bcell', 'Bcells', celltypes)
unique(sort(celltypes))
names(ipa) <- paste(celltypes, time_points, sep = '_')
# Summarize z-scores for each cellType_timePoint and UR
for (i in 1:length(ipa)){
ipa[[i]]$'Sample' <- names(ipa)[i]
}
ipa_comb <- rbind(ipa[[1]], ipa[[2]])
if (length(ipa) > 2){
for (i in 3:length(ipa)){
ipa_comb <- rbind(ipa_comb, ipa[[i]])
}
}
# count the number of occurrences for each UR
URs <- unique(sort(ipa_comb$`Upstream Regulator`))
UR_rank <- as.data.frame(matrix(NA, ncol = 2, nrow = length(URs)))
colnames(UR_rank) <- c('UR', 'N cell types and time points')
UR_rank$UR <- URs
for (i in 1:length(UR_rank$UR)){
UR_rank$`N cell types and time points`[i] <- length(which(ipa_comb$`Upstream Regulator` == UR_rank$UR[i]))
}
UR_rank <- UR_rank[order(UR_rank$`N cell types and time points`, decreasing = T),]
write.csv(ipa_comb, paste(dir.out, '/combined_UR_regulations.csv', sep = ''), row.names = F)
write.csv(UR_rank, paste(dir.out, '/UR_ranking.csv', sep = ''), row.names = F)
# Make a heatmap of the z-scores for each UR in each cell type at each time point
zscores <- matrix(NA, ncol = length(ipa), nrow = length(URs))
colnames(zscores) <- names(ipa)
rownames(zscores) <- UR_rank$UR
for (i in 1:length(colnames(zscores))){
for (z in 1:length(rownames(zscores))){
# i <- 1
# z <- 1
subx <- ipa_comb[which(ipa_comb$Sample == colnames(zscores)[i]),]
subx <- subx[which(subx$`Upstream Regulator` == rownames(zscores)[z]),]
if (length(subx$`Activation z-score`) == 1){
zscores[z,i] <- subx$`Activation z-score`
} else if (length(subx$`Activation z-score`) == 0){
zscores[z,i] <- 0
} else if (length(subx$`Activation z-score`) > 1){
print(paste(i, z, 'Error: more than one z-score selected'))
}
}
}
# create the heatmap for FC data
bk1 <- c(seq(-10,-0.1,by=0.1),-0.001)
bk2 <- c(0.001,seq(0.1,10,by=0.1))
bk <- c(bk1,bk2) #combine the break limits for purpose of graphing
my_palette <- c(colorRampPalette(colors = c("darkblue", "lightblue"))(n = length(bk1)-1),
"black", "black",
c(colorRampPalette(colors = c("pink", "darkred"))(n = length(bk2)-1)))
annotation_colors <- data.frame(row.names = colnames(zscores),
CellType = sapply(strsplit(colnames(zscores), '_'), '[[', 1),
TimePoint = sapply(strsplit(colnames(zscores), '_'), '[[', 2))
ph <- pheatmap(zscores, color = my_palette, breaks = bk,
show_colnames = F,
annotation_col = annotation_colors,
cluster_cols = F,
cluster_rows = F,
cellwidth = 4)
pdf(paste(dir.out, '/UR_ranking_heatmap.pdf', sep = ''), height = 13, width = 8)
print(ph)
dev.off()