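# lsh.py
"""Cluster sentence embeddings with Locality Sensitive Hashing (LSH).

Loads precomputed sentence embeddings from one or more corpora, optionally
reduces their dimensionality with PCA, writes the id-tagged embeddings to
CSV, and clusters them with the LocalitySensitiveHashing module.
"""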
import argparse

import numpy as np

import utilities as utils
from dim_reduction import pca_dim_reduction, tsne_dim_reduction
from LocalitySensitiveHashing import *


def get_lsh_clusters(data_path, output_path, dim, num_clusters, r=50, b=100):
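    """Cluster the samples in the CSV at data_path with LSH and write the
    clusters to output_path.

    The CSV is expected in the format consumed by get_data_from_csv(): each
    row starts with a symbolic sample id followed by `dim` feature values.
    r is the number of rows per band (one hash function per row) and b is
    the number of bands, so r * b hash functions are used in total.
    """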
    lsh = LocalitySensitiveHashing(
        datafile=data_path,
        dim=dim,                            # data dimensionality
        r=r,                                # number of rows in each band (one hash function per row)
        b=b,                                # number of bands; r * b hash functions in total
        expected_num_of_clusters=num_clusters,
    )
    # load data
    lsh.get_data_from_csv()
    # hash all data points into the hash store
    lsh.initialize_hash_store()
    lsh.hash_all_data()
    # build neighborhood-based similarity groups, then merge them
    sim_grps = lsh.lsh_basic_for_neighborhood_clusters()
    sim_grps_coalesced = lsh.merge_similarity_groups_with_coalescence(sim_grps)
    sim_grps_merged = lsh.merge_similarity_groups_with_l2norm_sample_based(sim_grps_coalesced)
    # write clusters to the output
    lsh.write_clusters_to_file(sim_grps_merged, output_path)


def load_sentence_embeddings(corpora_dirs, corpora_names, cap_size, reduced_dim=None):
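    """Load precomputed sentence embeddings from each corpus directory,
    keeping at most cap_size sentences per corpus.

    Every sentence and embedding row is prefixed with a unique
    '<corpus_name>_<index>' id so clusters can be traced back to their
    sentences. If reduced_dim is given, the embeddings are first projected
    down to reduced_dim dimensions with PCA.
    """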
    sentences = []
    embeddings = []
    for (source, name) in zip(corpora_dirs, corpora_names):
        sentences_embeddings = utils.load_precomputed_embeddings(source)
        print('{}: {} sentence embeddings loaded'.format(name, len(sentences_embeddings)))
        sents = [[name, e[0]] for e in sentences_embeddings[:cap_size]]
        embds = [e[1][0] for e in sentences_embeddings[:cap_size]]
        sentences.extend(sents)
        embeddings.extend(embds)
    # reduce dimensionality
    if reduced_dim:
        embeddings = pca_dim_reduction(embeddings, pca_dims=reduced_dim)
    # prefix every sentence and embedding row with a unique id
    embeddings_with_id = []
    sentences_with_id = []
    for count, (s, e) in enumerate(zip(sentences, embeddings)):
        sent_id = '{}_{}'.format(s[0], count)
        row = [sent_id]
        row.extend(e)
        embeddings_with_id.append(row)
        sentences_with_id.append([sent_id, s[1]])
    return sentences_with_id, embeddings_with_id
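
# Shape of the returned structures (hypothetical corpus name 'NHL'):
#   sentences_with_id  -> [['NHL_0', 'first sentence text ...'], ...]
#   embeddings_with_id -> [['NHL_0', 0.12, -0.03, ...], ...]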


# Example invocation:
# python3 lsh.py --corpora_dirs=data/04_embeddings/embeddings_NHL_raw_partial,data/04_embeddings/embeddings_MPC_raw_full --corpora_names=NHL,MPC --clustering_dir=data/05_clustering --cap_size=2000 --reduced_dim=50 --num_clusters=10
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--corpora_dirs', dest='corpora_dirs', default='/path/to/corpus_embeddings1,/path/to/corpus_embeddings2', help='comma-separated list of corpus embeddings directories')
    parser.add_argument('--corpora_names', dest='corpora_names', default='corpus1,corpus2', help='comma-separated list of corpus names')
    parser.add_argument('--colors', dest='colors', default='purple,gold,cyan,black', help='comma-separated list of colors (parsed but unused by this script)')
    parser.add_argument('--cap_size', dest='cap_size', type=int, default=2000, help='maximum number of sentences to load from each corpus')
    # clustering params
    parser.add_argument('--clustering_dir', dest='clustering_dir', default='/path/to/clusters', help='directory to write clustering results to')
    parser.add_argument('--reduced_dim', dest='reduced_dim', type=int, default=50, help='target dimensionality for the PCA reduction')
    parser.add_argument('--num_clusters', dest='num_clusters', type=int, default=10, help='expected number of clusters')
    args = parser.parse_args()
    ## dataset embedding dirs
    corpora_dirs = args.corpora_dirs.split(',')
    corpora_names = args.corpora_names.split(',')
    colors = args.colors.split(',')
    # clustering params
    clustering_dir = args.clustering_dir
    reduced_dim = args.reduced_dim
    num_clusters = args.num_clusters
    ## limit the number of data points taken from each corpus
    cap_size = args.cap_size
    # note: the experiment name assumes exactly two corpora
    experiment_name = 'experiment_sents_{}({})_{}({})_reduceddim_{}_numclusters_{}'.format(cap_size, corpora_names[0], cap_size, corpora_names[1], reduced_dim, num_clusters)
    experiment_path = '{}/{}'.format(clustering_dir, experiment_name)
    utils.create_dir_if_not_exist(experiment_path)
    # load sentences and their embeddings from previously extracted files;
    # add an id to each sentence to prepare them for clustering
    sents_with_id, embs_with_id = load_sentence_embeddings(corpora_dirs, corpora_names, cap_size, reduced_dim=reduced_dim)
    embs_with_id_path = '{}/embeddings_with_sent_id.csv'.format(experiment_path)
    np.savetxt(embs_with_id_path, np.asarray(embs_with_id), delimiter=',', fmt='%s')
    sents_with_id_path = '{}/sentences_with_sent_id.csv'.format(experiment_path)
    np.savetxt(sents_with_id_path, np.asarray(sents_with_id), delimiter=',', fmt='%s')
    output_path = '{}/clusters.txt'.format(experiment_path)
    get_lsh_clusters(embs_with_id_path, output_path, dim=reduced_dim, num_clusters=num_clusters, r=50, b=100)