/
clustering.py
143 lines (125 loc) · 6.48 KB
/
clustering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from lib2to3.pgen2.token import VBAR
from termios import FIONBIO
import numpy as np
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN, OPTICS, SpectralClustering
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import silhouette_score, rand_score, adjusted_rand_score, mutual_info_score, adjusted_mutual_info_score, normalized_mutual_info_score, calinski_harabasz_score, davies_bouldin_score
file_results_clustering = './data/output/results_clustering_{}.txt'
def cluster_algo(dataset, n_clusters, prefix="obs", algo="hierarchy"):
if algo == "hierarchy":
model = AgglomerativeClustering(n_clusters=n_clusters, affinity="euclidean", linkage="ward")
elif algo == "kmeans":
model = KMeans(n_clusters=n_clusters, random_state=0)
elif algo == "dbscan":
model = DBSCAN(eps=0.25, min_samples=3)
elif algo == "optics":
model = OPTICS(min_samples=5, xi=0.2, p=1)
elif algo == "spectral":
# model = SpectralClustering(n_clusters=n_clusters, assign_labels='cluster_qr') # this mode is for higher sklearn versions
model = SpectralClustering(n_clusters=n_clusters, assign_labels='discretize')
else:
raise ValueError(f"No such algo: {algo}")
model.fit(dataset)
return [f"{prefix}{i}" for i in model.labels_]
def check_accuracy(fake_floor_labels, floor_labels): # not used, as we modified floor labelling, see show_performance()
correct_labels = 0
counts = 0
for label in list(set(fake_floor_labels)):
tmp_floor_labels = [floor_labels[idx] for idx in range(len(fake_floor_labels))
if fake_floor_labels[idx] == label]
label_counts = {}
for item in tmp_floor_labels:
if item not in label_counts:
label_counts[item] = 0
label_counts[item] += 1
max_count = np.max(list(label_counts.values()))
correct_labels += max_count
counts += np.sum(list(label_counts.values()))
return correct_labels / counts
def evaluate(conf_matrix, building, args=None):
TP = np.diag(conf_matrix)
FP = np.sum(conf_matrix, axis=0) - TP
FN = np.sum(conf_matrix, axis=1) - TP
pred_list = TP+FP
true_list = TP+FN
f_list = []
t_list = []
for i in range(len(pred_list)):
for _ in range(int(pred_list[i])):
f_list.append(i)
for _ in range(int(true_list[i])):
t_list.append(i)
ari = adjusted_rand_score(t_list, f_list)
nmi = normalized_mutual_info_score(t_list, f_list, average_method='arithmetic')
print (f'{building}: {len(conf_matrix)}, {ari}, {nmi} \n')
if args:
with open(file_results_clustering.format(args.dim), 'a+') as f_out:
f_out.write(f'{building}: {len(conf_matrix)}, ari: {ari}, nmi: {nmi} \n')
def show_performance(fake_floor_labels, floor_labels, building, args=None): # performance when we "know" the true labels, not used as this is a clustering job
cluster_dict = {}
occupied_true_labels = []
true_floors = list(set(floor_labels))
for label in list(set(fake_floor_labels)):
cluster_dict[label] = fake_floor_labels.count(label)
fake_floor_list = [k for (k, v) in sorted(cluster_dict.items(), key=lambda x:x[1], reverse=True)] # fake floor labels (cluster labels)
conf_matrix = np.zeros((len(fake_floor_list), len(fake_floor_list))) # newly designed, as we modified the accuracy measure
for label in (fake_floor_list):
tmp_floor_labels = [floor_labels[idx] for idx in range(len(fake_floor_labels)) if fake_floor_labels[idx] == label]
label_counts = {}
for item in tmp_floor_labels:
if item not in label_counts:
label_counts[item] = 0
label_counts[item] += 1
for (k, v) in sorted(label_counts.items(), key=lambda x:x[1], reverse=True):
if k not in occupied_true_labels:
i = true_floors.index(k)
occupied_true_labels.append(k)
conf_matrix[i][i] = v
break
for (k, v) in sorted(label_counts.items(), key=lambda x:x[1], reverse=True):
if k not in occupied_true_labels:
conf_matrix[true_floors.index(k)][i] = v
print("Number of clusters: ", len(set(fake_floor_labels)))
evaluate(conf_matrix, building, args)
def get_predicted_labels(fake_floor_labels, floor_labels):
cluster_dict = {}
occupied_true_labels = []
predicted_floor_labels = ['-1' for item in fake_floor_labels] # store the result after we match fake floor labels to real ones
true_floors = list(set(floor_labels))
for label in list(set(fake_floor_labels)):
cluster_dict[label] = fake_floor_labels.count(label)
fake_floor_list = [k for (k, v) in sorted(cluster_dict.items(), key=lambda x: x[1],
reverse=True)] # fake floor labels (cluster labels)
conf_matrix = np.zeros(
(len(fake_floor_list), len(fake_floor_list))) # newly designed, as we modified the accuracy measure
labeled = []
for label in (fake_floor_list):
tmp_floor_labels = [floor_labels[idx] for idx in range(len(fake_floor_labels)) if
fake_floor_labels[idx] == label]
label_counts = {} # Find the majority of true labels in the clusters.
for item in tmp_floor_labels:
if item not in label_counts:
label_counts[item] = 0
label_counts[item] += 1
for (k, v) in sorted(label_counts.items(), key=lambda x: x[1], reverse=True):
if k not in occupied_true_labels:
labeled.append(label)
i = true_floors.index(k)
occupied_true_labels.append(k)
conf_matrix[i][i] = v
indexes = [idx for idx in range(len(fake_floor_labels)) if
fake_floor_labels[idx] == label]
for idx in indexes:
predicted_floor_labels[idx] = k
break
for label in fake_floor_list:
if label not in labeled:
indexes = [idx for idx in range(len(fake_floor_labels)) if
fake_floor_labels[idx] == label]
for true_label in true_floors:
if true_label not in occupied_true_labels:
occupied_true_labels.append(true_label)
for idx in indexes:
predicted_floor_labels[idx] = true_label
break
return predicted_floor_labels, occupied_true_labels