-
Notifications
You must be signed in to change notification settings - Fork 0
/
cluster_data.py
99 lines (88 loc) · 5.72 KB
/
cluster_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import numpy as np
import pandas as pd
import os
from sklearn.cluster import KMeans
import time # To know how much time it took (for time management)
helpers = __import__('helpers')
# To compute the clustering of core genes and plot/tabulate basic clustering information
t_start = time.time()
np.random.seed(12345)
paths = ['GRHL2/full/oe0', 'GRHL2/full/oe10', 'GRHL2/full/de10',
'GRHL2/partial/oe0', 'GRHL2/partial/oe10', 'GRHL2/partial/de10',
'NRF2/oe0', 'NRF2/oe10', 'NRF2/de10',
'OVOL/oe0', 'OVOL/oe10', 'OVOL/de10',
'base/oe0']
raw_data_paths = ['collated_data/' + x for x in paths] # where the csv files are stored
save_paths = ['figures/clustering/'] * 13 # where the plots will be stored (kept as a list for uniformity)
df_cluster_indices = pd.DataFrame() # to store all the cluster indices to later use for plotting and for easy reference
for path_to_folder, save_path in zip(raw_data_paths, save_paths):
t_per_folder = time.time()
print('\nStarting ' + path_to_folder)
files = os.listdir(path_to_folder)
circuit_names = [f.split('_')[0] for f in files if 'oe0Normalized.csv' in f] # names of the runs in the folder
oe_level = path_to_folder.split('/')[-1] # the over/down-expression level (from among oe0, oe10, de10)
# assuming no name has '_' in it
df_list = [] # stores the DataFrames for all the runs/replicates
df_param_list = []
for circuit_name in circuit_names:
df_list.append(pd.read_csv(path_to_folder + '/' + circuit_name + '_oe0Normalized.csv'))
df_param_list.append(pd.read_csv(path_to_folder + '/' + circuit_name + '_parameters_formatted.csv'))
psfname = circuit_names[0][:-1] # from among ovol, grhl2full, grhl2partial, base, nrf2
tier1_genes = ['u200', 'ZEB', 'LIN28', 'let7'] # coreGenes
# main clustering and saving the DataFrames along with phenotype calling
final_labels = [] # to store the final labels of all replicates (corresponding to cluster number = 4)
main_ss_vec = [] # to store the cluster quality indices for different runs
main_db_vec = []
main_ch_vec = []
main_inertia_vec = []
for run_index in range(5):
main_data = df_list[run_index].loc[:, tier1_genes].to_numpy()
major_labs_base = [np.array([]), np.array([])] # to allow indexing directly by c (number of clusters)
inertia_vec = [] # store KMeans inertia (one of the cluster metrics)
for c in range(2, 9):
kmeans_obj = KMeans(c, init='k-means++', n_init=50, random_state=1, algorithm='full', max_iter=500,
tol=1e-5)
kmeans_obj.fit(main_data)
major_labs_base.append(np.array(kmeans_obj.labels_, copy=True))
inertia_vec.append(kmeans_obj.inertia_)
if c == 4: # save the 4 cluster solution
final_labels.append(np.array(kmeans_obj.labels_, copy=True))
ssvec, chvec, dbvec = [], [], [] # Silhouette score, Calinski-Harabasz criteria, Davies-Bouldin score
for c in range(2, 9):
ss, ch, db = helpers.overall_scores_foo(main_data, major_labs_base[c])
ssvec.append(ss)
chvec.append(ch)
dbvec.append(db)
main_ss_vec.append(ssvec)
main_db_vec.append(dbvec)
main_ch_vec.append(chvec)
main_inertia_vec.append(inertia_vec)
# add the across-replicate means and sds for each of the indices for each of the cluster numbers from 2 - 8
column_prefix = psfname + oe_level
df_cluster_indices[column_prefix + 'sils_means'] = [np.mean([x[i] for x in main_ss_vec]) for i in range(7)]
df_cluster_indices[column_prefix + 'sils_sd'] = [np.std([x[i] for x in main_ss_vec], ddof=1) for i in range(7)]
df_cluster_indices[column_prefix + 'ch_means'] = [np.mean([x[i] for x in main_ch_vec]) for i in range(7)]
df_cluster_indices[column_prefix + 'ch_sd'] = [np.std([x[i] for x in main_ch_vec], ddof=1) for i in range(7)]
df_cluster_indices[column_prefix + 'db_means'] = [np.mean([x[i] for x in main_db_vec]) for i in range(7)]
df_cluster_indices[column_prefix + 'db_sd'] = [np.std([x[i] for x in main_db_vec], ddof=1) for i in range(7)]
df_cluster_indices[column_prefix + 'inert_means'] = [np.mean([x[i] for x in main_inertia_vec]) for i in range(7)]
df_cluster_indices[column_prefix + 'inert_sd'] = [np.std([x[i] for x in main_inertia_vec], ddof=1) for i in range(7)]
# phenotype calling
phen = ['e', 'he', 'hm', 'm'] # increasing order of ZEB
for run_index in range(5):
df_list[run_index]['clus_labels'] = final_labels[run_index] # add the original cluster labels (not needed)
df_list[run_index]['phenotype'] = 'Not Computed' # placeholder until the phenotypes are assigned
df = df_list[run_index]
zeb_medians = [] # calculate the ZEB medians of the 4 clusters
for c in range(4):
zeb_medians.append(np.median(df.loc[df['clus_labels'].to_numpy() == c, 'ZEB']))
clus_labels = [0, 1, 2, 3]
sorted_zeb_medians = sorted(zeb_medians)
for i in range(4):
zeb_value = sorted_zeb_medians[i] # in increasing order
clus_label_value = clus_labels[zeb_medians.index(zeb_value)]
df.loc[df['clus_labels'].to_numpy() == clus_label_value, 'phenotype'] = phen[i] # assign phenotype
df.to_csv(path_to_folder + '/' + psfname + str(run_index + 1) + '_clusData.csv', index=False)
print('Time for last folder: ' + str(round(time.time() - t_per_folder, 2)) + ' seconds.')
df_cluster_indices.to_csv(save_paths[0] + '/' + 'cluster_indices.csv', index=False)
print('\nAll finished. It took ' + str(round((time.time() - t_start) / 3600, 2)) + ' hours.')