-
Notifications
You must be signed in to change notification settings - Fork 4
/
fct_regressors_grouping.R
162 lines (149 loc) · 6.05 KB
/
fct_regressors_grouping.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#' Get correlation
#'
#' @param pair character of two gene IDs sepearte by a white space
#' @param normalized.count normalized expression matrix containing
#' the IDs of the pair
#'
#' @return spearman correlation bewteen the two expression vectors
get_correlation <- function(pair, normalized.count) {
tf1 <- stringr::str_split_fixed(pair, ' ', 2)[1]
tf2 <- stringr::str_split_fixed(pair, ' ', 2)[2]
if (tf1 %in% rownames(normalized.count) &
tf2 %in% rownames(normalized.count))
return(cor(
as.numeric(normalized.count[tf1, ]),
as.numeric(normalized.count[tf2, ]),
method = "spearman"
))
}
#' Group correlated regulators
#'
#' @description During network inference, the importance of regressors is assessed for aech target gene.
#' But regression methods are very sensitive to correlation between predictive variables, that should be
#' dealt with for stability reasons.
#' To account for this, this function summarizes the expression of the regulators being correlated above
#' a certain threshold as new variables, being the mean of several correlated profiles.
#'
#' To do so, a graph of all regressors correlated (spearman) above the threshold id built, and groups
#' are formed with a community detection algorithm. Each group is then averaged in a single variable.
#' Strongly negatively correlated regressors are added to the group it is correlated to, but their expression
#' is not taken into account in the summary mean profile.
#'
#' @param normalized.count normalized expression matrix containing genes and regressors.
#' @param genes target genes that want to be used in the inference process
#' @param regressors regressor genes to be used in the inference process.
#' @param corr_thr correlation threshold to be used for regressors grouping.
#'
#' @return a named list.
#'
#'
#' counts : the normalized expression data containing the new summarized variables, in
#' the format mean_geneID1-geneID2... The individual genes that were grouped are removed.
#'
#'
#' correlated_regressors_graph : visNetwork data
#' of the correlated regulators
#'
#'
#' grouped_genes : new vector of target genes, with individual correlated genes replaced by groups
#'
#'
#' grouped_regressors : new vector of regressors, with individual correlated genes replaced by groups
#'
#' @export
#'
#' @examples
#' data(abiotic_stresses)
#' aggregated_data <- aggregate_splice_variants(abiotic_stresses$normalized_counts)
#' genes <- get_locus(abiotic_stresses$heat_DEGs)
#' regressors <- intersect(genes, regulators_per_organism[["Arabidopsis thaliana"]])
#' grouping <- DIANE::group_regressors(aggregated_data, genes, regressors)
#' print(names(grouping))
group_regressors <-
function(normalized.count,
genes,
regressors,
corr_thr = 0.9) {
if(length(regressors) <= 2){
stop("More than 2 regressors are required for grouping")
}
#calculating correlations for each TF pairs
pairs <- data.frame(t(combn(regressors, 2)))
pairs$cor <- sapply(paste(pairs[, 1], pairs[, 2]), get_correlation,
normalized.count = normalized.count)
top <- pairs[pairs$cor > corr_thr, ]
if (nrow(top) == 0) {
warning("No grouping was performed as no regulators pair
was correlated over the threshold.")
return(
list(
counts = normalized.count,
correlated_regressors_graph = NULL,
grouped_genes = genes,
grouped_regressors = regressors
)
)
}
# graph and communities detection of highly correlated TFs
net_un <- igraph::graph_from_data_frame(top, directed = FALSE)
louvain <- igraph::cluster_louvain(net_un)
groups <- igraph::membership(louvain)
# graph plot creation
d <- visNetwork::toVisNetworkData(net_un)
d$edges$value <- d$edges$cor
d$edges$label <- round(d$edges$cor, 3)
d$nodes$group <- groups[match(d$nodes$id, names(groups))]
graph_plot <- d
other_tfs <- regressors[!regressors %in% names(groups)]
# Builiding the new consensus variables
new_reg <- c()
grouped_regs <- data.frame(matrix(nrow = length(unique(groups)),
ncol = length(colnames(normalized.count))))
colnames(grouped_regs) <- colnames(normalized.count)
rownames(grouped_regs) <- unique(groups)
grouped_tfs <- c()
for (group in unique(groups)) {
tfs <- names(groups)[groups == group]
mean_tf <- colMeans(normalized.count[tfs, ])
grouped_regs[group, ] <- mean_tf
# find the negatively correlated regulators to that group, and add them, without
# using their expression un the mean group expression
for (tf in other_tfs) {
if (cor(as.numeric(normalized.count[tf, ]), mean_tf, method = "spearman") < -corr_thr) {
print(
paste(
"adding tf ",
tf,
" to group ",
group,
"because correlation of",
cor(as.numeric(normalized.count[tf, ]), mean_tf, method = "spearman"),
"to mean"
)
)
tfs <- c(tfs, tf)
# remove this tf from the list so it is not assigned to another group later
other_tfs <- other_tfs[other_tfs != tf]
print(length(other_tfs))
}
}
new_reg <-
c(new_reg, paste0("mean_", paste(tfs, collapse = "-")))
grouped_tfs <- c(grouped_tfs, tfs)
}
rownames(grouped_regs) <- new_reg
normalized.count <-
rbind.data.frame(normalized.count, grouped_regs)
#remove regressors that are grouped from the data
normalized.count <-
normalized.count[!rownames(normalized.count) %in% grouped_tfs, ]
return(
list(
counts = normalized.count,
correlated_regressors_graph = graph_plot,
grouped_genes = c(intersect(genes, rownames(normalized.count)),
rownames(normalized.count)[grepl("mean_", rownames(normalized.count))]),
grouped_regressors = c(new_reg, regressors[!regressors %in% grouped_tfs])
)
)
}