In [18]:
import numpy as np
from gensim.models import KeyedVectors
from itertools import repeat
model_glove = KeyedVectors.load_word2vec_format('glove.6B.50d_word2vec.txt')

In [19]:
def find_range(point, range_limits):
    n_limits = len(range_limits)
    for limit in range(n_limits - 1):
        if (point > range_limits[limit]) and (point < range_limits[limit + 1]):
            return limit
    raise ValueError

In [20]:
def colapse_into_25(space_50d):
    ndim = space_50d.ndim
    if ndim == 1:  # If a word embedding
        space_25d = np.zeros(25)
        for idx in range(25):
            space_25d[idx] = np.sum(space_50d[idx*2:(idx+1)*2])
    else:
        space_25d = np.zeros([space_50d.shape[0], 25])
        for idx in range(25):
            space_25d[idx] = np.sum(space_50d[idx*2:(idx+1)*2],)
    return space_25d

In [35]:
def embspace_to_midi(word_embedding, n_words):
    """
        word_embedding: The 50dim vector resulting of difference between multiple words embedding
        n_words: Number of words used to create the word_embedding
    """
    embedding = np.load('./mappings.npy')
    reduced_25 = colapse_into_25(embedding)
    maxs = np.max(reduced_25, axis=0)*n_words
    mins = np.min(reduced_25, axis=0)*n_words
    steps = (maxs - mins) / 129
    mappings = np.array(list((map(np.arange, mins, maxs, steps))))

    reduced_embedding = colapse_into_25(word_embedding)
        
    midi = np.zeros(25)
    for dimension in range(25):
        midi[dimension] = find_range(reduced_embedding[dimension], mappings[dimension])
    return midi

In [36]:
diffwords = model_glove['republican'] - model_glove['party']
dist1 = embspace_to_midi(diffwords, 2)

In [37]:
diffwords = model_glove['republican'] - model_glove['democratic']
dist2 = embspace_to_midi(diffwords, 2)

In [38]:
diffwords = model_glove['republican'] - model_glove['banana']
dist3 = embspace_to_midi(diffwords, 2)

In [39]:
diffwords = model_glove['republican'] - model_glove['avocado']
dist4 = embspace_to_midi(diffwords, 2)

In [40]:
print(np.sum((dist1)**2))
print(np.sum((dist2)**2))
print(np.sum((dist3)**2))
print(np.sum((dist4)**2))


44751.0
46745.0
41112.0
40767.0


In [41]:
print(dist1)
print(dist2)
print(dist3)
print(dist4)

[42. 45. 45. 39. 43. 35. 42. 39. 43. 38. 38. 39. 39. 47. 46. 33. 40. 46.
 41. 40. 46. 46. 45. 49. 47.]
[42. 42. 46. 41. 43. 41. 44. 42. 42. 37. 43. 42. 39. 45. 44. 38. 46. 48.
 44. 42. 46. 45. 46. 45. 46.]
[44. 47. 45. 43. 29. 26. 39. 32. 43. 39. 44. 41. 33. 37. 40. 50. 24. 38.
 48. 37. 46. 40. 35. 44. 54.]
[43. 50. 46. 38. 35. 35. 35. 39. 39. 41. 48. 39. 30. 36. 44. 55. 28. 31.
 46. 31. 49. 43. 34. 35. 45.]
