In [1]:
import edward as ed
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from scipy.stats import multivariate_normal as mvn

Value error parsing header in AFM: b'Notice' b'\xa9 Copyright SoftMaker Software GmbH and its licensors'
Value error parsing header in AFM: b'Notice' b'\xa9 Copyright SoftMaker Software GmbH and its licensors'
Value error parsing header in AFM: b'Notice' b'\xa9 Copyright SoftMaker Software GmbH and its licensors'


In [11]:
def build_toy_dataset(N):
  mu0 = np.zeros(D) # prior mean of the sense means
  sigma0 = np.ones(D) # prior covariance of the sense means
  a0 = np.ones(D) # prior shape for the scale of the sense covariance
  b0 = np.ones(D) # prior scale for the scale of the sense covariance   
  alpha0 = np.ones(S) # priors for the sense probabilities
  beta0 = np.ones(V) # priors over the word frequencies
    
  mus_all = []
  Sigmas_all = []
  pis_all = []

  # draw word frequencies
  pword = np.random.dirichlet(beta0)
  
  # draw the word sense distributions
  for w in range(V):
    mus = []
    Sigmas = []
    pis = np.random.dirichlet(alpha0)
    
    # draw the means for each sense
    for s in range(S):
        mus.append(np.random.multivariate_normal(mu0, np.diag(sigma0)))
        Sigmas.append(np.diag(1.0 / np.random.gamma(a0, b0)))
        
    mus_all.append(mus)
    Sigmas_all.append(Sigmas)
    pis_all.append(pis)
    
  s_all = []
  z_all = []
  c_all = []
    
  # draw the context windows for each word
  for w in range(V):
    
    # draw the sense for each context
    print('Word %i. Sense distribution = %s' % (w, str(pis_all[w])))
    s_ws = np.argmax(np.random.multinomial(1, pis_all[w], N), 1)
    z_ws = []
    
    c_ws = []
    
    for n in range(N):
        
        #print('Sense for pair n = %i' % s_ws[n])
        #print('Mean for chosen sense = %s' % str(mus_all[w][s_ws[n]]))
        #print('Cov for chosen sense = %s' % str(Sigmas_all[w][s_ws[n]]))
        
        # draw the embedding for each context
        z_ws.append(np.random.multivariate_normal(mus_all[w][s_ws[n]], Sigmas_all[w][s_ws[n]]))
    
        # construct the categorical distribution over all words
        joint = []
        for w2 in range(V):
            
            pw2 = 0
            for s in range(S):
                pw2 += pis_all[w2][s] * mvn.pdf(z_ws[-1], mus_all[w2][s], Sigmas_all[w2][s])
            pw2 *= pword[w2]
            joint.append(pw2)
        
        pc_giv_z = joint / np.sum(joint)
        
        c = np.argmax(np.random.multinomial(1, pc_giv_z, C), 1)
        c_ws.append(c)
        
    s_all.append(s_ws)
    z_all.append(z_ws)
    c_all.append(c_ws)
        
  c_all = np.array(c_all).swapaxes(0, 1).swapaxes(1, 2) # so we get N x C x V from V x N x C
        
  return c_all

C = 10
N = 100  # number of context windows for each word in the vocabulary
S = 2  # number of different senses per word. TODO: make this random for each word
D = 10 # dimensionality of the embeddings
V = 100 # vocabulary size

c_all = build_toy_dataset(N)

cw_train = c_all

Word 0. Sense distribution = [0.78987805 0.21012195]
Word 1. Sense distribution = [0.6760104 0.3239896]
Word 2. Sense distribution = [0.60780286 0.39219714]
Word 3. Sense distribution = [0.67907698 0.32092302]
Word 4. Sense distribution = [0.55288706 0.44711294]
Word 5. Sense distribution = [0.64739543 0.35260457]
Word 6. Sense distribution = [0.89756743 0.10243257]
Word 7. Sense distribution = [0.25691385 0.74308615]
Word 8. Sense distribution = [0.50855823 0.49144177]
Word 9. Sense distribution = [0.46731234 0.53268766]
Word 10. Sense distribution = [0.12639621 0.87360379]
Word 11. Sense distribution = [0.728379 0.271621]
Word 12. Sense distribution = [0.65390785 0.34609215]
Word 13. Sense distribution = [0.8354555 0.1645445]
Word 14. Sense distribution = [0.03579466 0.96420534]
Word 15. Sense distribution = [0.26227659 0.73772341]
Word 16. Sense distribution = [0.49030342 0.50969658]
Word 17. Sense distribution = [0.58381061 0.41618939]
Word 18. Sense distribution = [0.1697819 0.830

In [8]:
from edward.models import Dirichlet, InverseGamma, MultivariateNormalDiag, \
    Normal, ParamMixture, Categorical

from edward.models import RandomVariable
from tensorflow.contrib.distributions import Distribution

class distributions_ContextWindow(Distribution):
  def __init__(self, senses, mus, Sigmas, validate_args=False,
                  allow_nan_stats=True,
                  name="ContextWindow"):
    
    self.senses = senses
    self.mus = mus
    self.Sigmas = Sigmas
    
    super(distributions_ContextWindow, self).__init__(
            dtype=tf.float32,
            reparameterization_type=tf.contrib.distributions.FULLY_REPARAMETERIZED,
            validate_args=validate_args,
            allow_nan_stats=allow_nan_stats,
            name=name)
    
    
  def _log_prob(self, value):
    raise NotImplementedError("log_prob is not implemented")

  def _sample_n(self, n, seed=None):
   
    c_all = []
    
    context_words = []

    # sample the vectors for each word given its selected sense    
    z_w = MultivariateNormalDiag(self.mus[self.senses, range(V)], 
                                 self.Sigmas[self.senses, range(V)]
                                )._sample_n(1, seed)[0]

    # construct the categorical distribution over all words
    joint = []

    mvn = MultivariateNormalDiag(
                                    loc=self.mus[self.senses, range(V)], # this assumes that the senses were instantiated at random from pi and were different for each word
                                    scale_diag=self.Sigmas[self.senses, range(V)]
    )

    pw2 = mvn.prob(z_w) # z_w has shape V. Mvn contains V distributions. So pw2 has shape V x V
    pw2 *= tf.reshape(pword, shape=(1, V))

    pc_giv_z = pw2 / tf.reduce_sum(joint, keepdims=True, axis=1) # shape VxV

    c_all = Categorical(probs=pc_giv_z).sample((n, C), seed) # V different distributions in one line. Shape = N x C x V
        
    return c_all
   
               
class ContextWindow(RandomVariable, distributions_ContextWindow):
               
  def __init__(self, *args, **kwargs):
    
    RandomVariable.__init__(self, *args, **kwargs)

D = 3
S = 2
V = 100
N = 100 # sample contexts per word
C = 10 # context window size

pword = Dirichlet(tf.ones(V))

pi = Dirichlet(tf.ones(S), sample_shape=V) # sense distributions for each word. Needs replacing with CRP

mu = Normal(tf.zeros(D), tf.ones(D), sample_shape=(S, V))
sigmasq = InverseGamma(tf.ones(D), tf.ones(D), sample_shape=(S, V))

sense = Categorical(probs=pi, sample_shape=N)

cw = ContextWindow(sense, mu, sigmas,
                sample_shape=N) # result should be N X V x C
 


AttributeError: 'Tensor' object has no attribute 'T'

In [None]:
# approximate distributions

q_mu = Normal(
    loc=tf.get_variable("q_mu/loc", [S * V]),
    scale=tf.nn.softplus(tf.get_variable("q_mu/scale", [S * V]))
)

q_sigmasq = InverseGamma(
    concentration=tf.nn.softplus(tf.get_variable("q_sigmasq/concentration", [S * V])),
    rate=tf.nn.softplus(tf.get_variable("q_sigmasq/rate", [S * V]))
)

q_pi = Dirichlet(
    concentration=tf.nn.softplus(tf.get_variable("q_pi/concentration", [S * V]))
)

q_sense = Categorical(
    probs=tf.nn.softmax(tf.get_variable("q_sense/probs", [S * V * N]))
)

q_pword = Categorical(
    probs=tf.nn.softmax(tf.get_variable("q_pword/probs", [S * V * N]))
)

In [None]:
latent_vars = {
    #sense: q_sense,
    mu: q_mu,
    sigmasq: q_sigmasq,
    pi: q_pi, 
    pword: q_pword
}

data = {
    cw: cw_train,
}

inference = ed.KLqp(latent_vars, data)
inference.run(n_iter=500)

In [None]:
# Calculate likelihood for each data point and cluster assignment,
# averaged over many posterior samples. ``x_post`` has shape (N, 100, K, D).
mu_sample = qmu.sample(100)
sigmasq_sample = qsigmasq.sample(100)
x_post = Normal(loc=tf.ones([N, 1, 1, 1]) * mu_sample,
                scale=tf.ones([N, 1, 1, 1]) * tf.sqrt(sigmasq_sample))
x_broadcasted = tf.tile(tf.reshape(x_train, [N, 1, 1, D]), [1, 100, K, 1])

# Sum over latent dimension, then average over posterior samples.
# ``log_liks`` ends up with shape (N, K).
log_liks = x_post.log_prob(x_broadcasted)
log_liks = tf.reduce_sum(log_liks, 3)
log_liks = tf.reduce_mean(log_liks, 1)


In [None]:
clusters = tf.argmax(log_liks, 1).eval()


In [None]:
from matplotlib import cm 
plt.scatter(x_train[:, 0], x_train[:, 1], c=clusters, cmap=cm.bwr)
plt.axis([-3, 3, -3, 3])
plt.title("Predicted cluster assignments")
plt.show()
