-
Notifications
You must be signed in to change notification settings - Fork 0
/
helpers.py
175 lines (151 loc) · 7.9 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import numpy as np
import pandas as pd
from sklearn.metrics import silhouette_score, silhouette_samples, calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
# Helper Functions
def shifted_hill_foo(y, b0, b, n): # Shifted Hill Function
# also representable as y + (1 - y)(1/(1 + (b/b0) ** n))
return (b0 ** n) / ((b0 ** n) + (b ** n)) + y * (b ** n) / ((b0 ** n) + (b ** n))
def column_normalize_foo(dframe, reference_dframe):
# To normalize the gene expression values (log2-scale) using the mean and sd from the reference_run
allgenes = ['u200', 'ZEB', 'SNAIL', 'LIN28', 'let7', 'NFkB', 'GRHL2', 'KEAP1', 'NRF2', 'ECad',
'OVOL'] # all possible genes in all the runs
newdframe = pd.DataFrame()
for i in dframe.columns:
x = dframe[i].to_numpy()
if i in allgenes:
if i in reference_dframe.columns:
m = np.mean(reference_dframe[i])
s = np.std(reference_dframe[i], ddof=1)
newdframe[i] = (x - m) / s
else: # if the gene not present in the reference DataFrame (only for completeness)
m = np.mean(dframe[i])
s = np.std(dframe[i], ddof=1)
newdframe[i] = (x - m) / s
else: # if the column is not a gene-column
newdframe[i] = x
return newdframe
def count_foo(model):
# To count children at different levels of a HierarchicalClustering tree
# modified from https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html
counts = np.zeros(model.children_.shape[0])
n_samples = len(model.labels_)
for i, merge in enumerate(model.children_):
current_count = 0
for child_idx in merge:
if child_idx < n_samples:
current_count += 1 # leaf node
else:
current_count += counts[child_idx - n_samples]
counts[i] = current_count
return counts
def custom_dendrogram_foo(hc, counts, n):
# To return the linkage_matrix expected by scipy.cluster.hierarchy.dendrogram having data of only the last n merges
# Explanation:
# nodes merged at ith iteration have index n + i, with original observations having i < n
# for z = linkage matrix, z[i,0] and z[i,1] are merged at ith iteration to form a cluster of index n+i, and these
# nodes have a distance z[i,2] and the number of original observations (i<n) in the new cluster is z[i,3]
# Also, number of merges = number of observations - 1
children = hc.children_
distances = hc.distances_
original_n = len(counts) + 1
seen_clusters = dict() # the cluster numbers we have already observed
linkage_matrix = np.zeros((n, 4))
iteration = 0
leafnodes = -1 # begin at -1 so that the first assigned value is 0 (leafnodes + 1)
for c1, c2 in children[-n:, :]:
if c1 not in seen_clusters:
seen_clusters[c1] = leafnodes + 1
leafnodes += 1
if c2 not in seen_clusters:
seen_clusters[c2] = leafnodes + 1
leafnodes += 1
c1 = seen_clusters[c1]
c2 = seen_clusters[c2]
seen_clusters[original_n + len(counts) - n + iteration] = n + 1 + iteration
linkage_matrix[iteration, :] = [c1, c2, distances[-n + iteration], counts[-n + iteration]]
iteration += 1
return linkage_matrix
def hierarchical_cluster_labels(hc, n):
# To cut the HierarchicalClustering tree to obtain n clusters and return the labels
# starts joining clusters from bottom until the requisite number is reached
labs = np.array([x for x in range(len(hc.labels_))])
cluster_groups = dict([(x, [x]) for x in range(len(hc.labels_))])
curr_clusters = len(hc.labels_)
n_observations = len(hc.labels_)
i = 0
while curr_clusters > n:
c1, c2 = hc.children_[i, :]
gr = cluster_groups.pop(c1) + cluster_groups.pop(c2)
labs[np.array(gr)] = labs[gr[0]] # choose any one common label to apply to all
cluster_groups[n_observations + i] = gr
curr_clusters -= 1
i += 1
uq = np.unique(labs).tolist()
labs = [uq.index(x) for x in labs] # assign cluster numbers to observations based on the unique labels found
return labs
def overall_scores_foo(data, labs):
# To compute the cluster-quality metrics for the given labels and data using the euclidean distance norm
ss = silhouette_score(data, metric='euclidean', labels=labs)
ch = calinski_harabasz_score(data, labels=labs)
db = davies_bouldin_score(data, labels=labs)
return ss, ch, db
def silhouette_plot(data, labs, ax): # not used. Kept for the benefit of a possible (though improbable) reader :D
# To plot a Silhouette plot (individual sample silhouette widths arranged in descending fashion)
ss = silhouette_samples(data, labels=labs, metric='euclidean')
ss.sort()
ax.fill_between([i for i in range(len(ss))], ss, [0 for i in range(len(ss))])
ax.hlines(np.mean(ss), 0, len(ss), linestyles='dashed') # line for the average silhouette width
ax.set_xlabel('Sorted Observations')
ax.set_ylabel('Silhouette widths for each observation')
def custom_boxplot(ax, arr, position, col, median):
# Prettier custom boxplot-equivalent with colors
# arr contains the requisite percentiles, position is the center of the box (0.5 is the width of the box)
ax.fill_between([position - 0.25, position + 0.25], [arr[2], arr[2]], [arr[3], arr[3]], color=col)
ax.fill_between([position - 0.25, position + 0.25], [arr[1], arr[1]], [arr[2], arr[2]], color=col, alpha=0.6)
ax.fill_between([position - 0.25, position + 0.25], [arr[3], arr[3]], [arr[4], arr[4]], color=col, alpha=0.6)
ax.fill_between([position - 0.25, position + 0.25], [arr[0], arr[0]], [arr[1], arr[1]], color=col, alpha=0.3)
ax.fill_between([position - 0.25, position + 0.25], [arr[4], arr[4]], [arr[5], arr[5]], color=col, alpha=0.3)
ax.plot([position - 0.4, position + 0.4], [median, median], color=col, linewidth=2)
def bond_strength(y, mu, bigG, k, interaction_type='Act'):
if interaction_type == 'Act':
return y * bigG / mu / k # bond strength
elif interaction_type == 'Inh':
return bigG / mu / k / y # inverted lambda for inhibitory relations
else:
assert False, 'Incorrect Type.'
def asymmetry_bond_strength(b1, b2):
# returns asymmetry parameter for two bonds (used for double negative feedback loops)
log_ratio = np.log2(b1 / b2)
return log_ratio
def significance_annotate(xy1, xy2, p, ax, h=0.5, h2=0.3, h3=0.1):
# allows adding the significance bars to denote p-value in the plots
if p > 0.01:
s = '+'
elif p > 0.001:
s = '*'
elif p > 0.0001:
s = '**'
else:
s = '***'
m = max(xy1[1], xy2[1]) # located at the higher of the two y-values
ax.plot([xy1[0], xy1[0]], [m + h2, m + h2 + h], color='black')
ax.plot([xy1[0], xy2[0]], [m + h2 + h, m + h2 + h], color='black')
ax.plot([xy2[0], xy2[0]], [m + h2, m + h2 + h], color='black')
ax.text((xy1[0] + xy2[0]) / 2, m + h2 + h + h3, s, horizontalalignment='center', fontsize=14)
def significance_annotate_inverted(xy1, xy2, p, ax, h=0.5, h2=0.3, h3=0.1):
# allows adding the significance bars to denote p-value in the plots
if p > 0.01:
s = '+'
elif p > 0.001:
s = '*'
elif p > 0.0001:
s = '**'
else:
s = '***'
m = max(xy1[1], xy2[1]) # located at the higher of the two y-values
ax.plot([xy1[0], xy1[0]], [m - h2, m - h2 - h], color='black')
ax.plot([xy1[0], xy2[0]], [m - h2 - h, m - h2 - h], color='black')
ax.plot([xy2[0], xy2[0]], [m - h2, m - h2 - h], color='black')
ax.text((xy1[0] + xy2[0]) / 2, m - h2 - h - h3, s, horizontalalignment='center', verticalalignment='top',
fontsize=14)