/
wmd.py
35 lines (32 loc) · 1.18 KB
/
wmd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import os
import numpy as np
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import stop_words
from sklearn.metrics import euclidean_distances
from pyemd import emd
# Word2Vec model loaded once, at import time, from a path relative to the
# current working directory. get_wmd_distance() reads it as a module global.
model = Word2Vec.load("data/word_model.mod")
def get_wmd_distance(d1, d2, min_vocab=7, verbose=False):
    """Return the Word Mover's Distance between two documents.

    Both documents are lower-cased and split on whitespace. The shared
    vocabulary is every distinct token that the module-level word2vec
    ``model`` knows and that is not an English stop word. Each document is
    turned into a normalised bag-of-words histogram over that vocabulary,
    and the result is the earth mover's distance between the two histograms,
    using Euclidean distances between word vectors (scaled so that the
    largest one is 1) as the ground cost.

    Args:
        d1: First document, a string.
        d2: Second document, a string.
        min_vocab: Minimum number of usable vocabulary words. With fewer,
            the comparison is skipped and 1 is returned.
        verbose: When true, print the vocabulary and both histograms.

    Returns:
        The value computed by ``pyemd.emd``, or 1 when the documents cannot
        be compared (vocabulary too small or empty, or one document
        contains none of the vocabulary words).
    """
    tokens = set(d1.lower().split()) | set(d2.lower().split())
    # Sorted so that the feature order (and the verbose output) is
    # reproducible from run to run instead of following set ordering.
    vocabulary = sorted(
        w for w in tokens
        if w in model.vocab and w not in stop_words.ENGLISH_STOP_WORDS
    )
    # An empty vocabulary cannot be vectorised, whatever min_vocab says.
    if not vocabulary or len(vocabulary) < min_vocab:
        return 1

    vect = CountVectorizer(vocabulary=vocabulary).fit([d1, d2])
    counts = vect.transform([d1, d2]).toarray()
    # pyemd needs double precision input
    v_1 = counts[0].astype(np.double)
    v_2 = counts[1].astype(np.double)

    total_1 = v_1.sum()
    total_2 = v_2.sum()
    # A document that contains none of the vocabulary words has an all-zero
    # histogram; normalising it would divide by zero and hand NaNs to emd.
    if total_1 == 0 or total_2 == 0:
        return 1
    v_1 /= total_1
    v_2 /= total_2

    # Every vocabulary word is known to the model, and the vectorizer keeps
    # the order of the list it was given, so row i belongs to feature i.
    W_ = np.array([model[w] for w in vocabulary])
    D_ = euclidean_distances(W_).astype(np.double)
    # Scale so the largest ground distance is 1 (just for comparison
    # purposes); skip the scaling when every pairwise distance is zero.
    max_dist = D_.max()
    if max_dist > 0:
        D_ /= max_dist

    if verbose:
        # Written so the output is the same text under Python 2 and 3.
        print(vocabulary)
        print("%s %s" % (v_1, v_2))
    return emd(v_1, v_2, D_)
# d1 = "Government speaks to the media in Illinois"
# d2 = "The state addresses the press in Chicago"
# print get_wmd_distance(d1, d2)