TODO: change the data structure and context window distribution so that we can have different numbers of context windows per word.

In [1]:
import edward as ed
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from scipy.stats import multivariate_normal as mvn

Value error parsing header in AFM: b'Notice' b'\xa9 Copyright SoftMaker Software GmbH and its licensors'
Value error parsing header in AFM: b'Notice' b'\xa9 Copyright SoftMaker Software GmbH and its licensors'
Value error parsing header in AFM: b'Notice' b'\xa9 Copyright SoftMaker Software GmbH and its licensors'


In [2]:
D = 3 # dimensionality of the embeddings
S = 2 # number of different senses per word. TODO: make this random for each word
V = 5 # vocabulary size
N = 6 # number of context windows for each word in the vocabulary
C = 10 # context window size

tf.reset_default_graph() 

In [3]:
def build_toy_dataset(N):
  mu0 = np.zeros(D) # prior mean of the sense means
  sigma0 = np.ones(D) # prior covariance of the sense means
  a0 = np.ones(D) # prior shape for the scale of the sense covariance
  b0 = np.ones(D) # prior scale for the scale of the sense covariance   
  alpha0 = np.ones(S) # priors for the sense probabilities
  beta0 = np.ones(V) # priors over the word frequencies
    
  mus_all = []
  Sigmas_all = []
  pis_all = []

  # draw word frequencies
  pword = np.random.dirichlet(beta0)
  
  # draw the word sense distributions
  for w in range(V):
    mus = []
    Sigmas = []
    pis = np.random.dirichlet(alpha0)
    
    # draw the means for each sense
    for s in range(S):
        mus.append(np.random.multivariate_normal(mu0, np.diag(sigma0)))
        Sigmas.append(np.diag(1.0 / np.random.gamma(a0, b0)))
        
    mus_all.append(mus)
    Sigmas_all.append(Sigmas)
    pis_all.append(pis)
    
  s_all = []
  z_all = []
  c_all = []
    
  # draw the context windows for each word
  for w in range(V):
    
    # draw the sense for each context
    print('Word %i. Sense distribution = %s' % (w, str(pis_all[w])))
    s_ws = np.argmax(np.random.multinomial(1, pis_all[w], N), 1)
    z_ws = []
    
    c_ws = []
    
    for n in range(N):
        
        #print('Sense for pair n = %i' % s_ws[n])
        #print('Mean for chosen sense = %s' % str(mus_all[w][s_ws[n]]))
        #print('Cov for chosen sense = %s' % str(Sigmas_all[w][s_ws[n]]))
        
        # draw the embedding for each context
        z_ws.append(np.random.multivariate_normal(mus_all[w][s_ws[n]], Sigmas_all[w][s_ws[n]]))
    
        # construct the categorical distribution over all words
        joint = []
        for w2 in range(V):
            
            pw2 = 0
            for s in range(S):
                pw2 += pis_all[w2][s] * mvn.pdf(z_ws[-1], mus_all[w2][s], Sigmas_all[w2][s])
            pw2 *= pword[w2]
            joint.append(pw2)
        
        pc_giv_z = joint / np.sum(joint)
        
        c = np.argmax(np.random.multinomial(1, pc_giv_z, C), 1)
        c_ws.append(c)
        
    s_all.append(s_ws)
    z_all.append(z_ws)
    c_all.append(c_ws)
        
  c_all = np.array(c_all, dtype=int).swapaxes(0, 2).swapaxes(0, 1) # so we get N x C x V from V x N x C
        
  return c_all, s_all, z_all, mus_all, Sigmas_all, pis_all

c_all, s_all, z_all, mus_all, Sigmas_all, pis_all = build_toy_dataset(N)

cw_train = c_all
print('Data shape: %s' % str(cw_train.shape))
print('Type of cw: %s' % str(cw_train.dtype))

Word 0. Sense distribution = [0.29787045 0.70212955]
Word 1. Sense distribution = [0.45077866 0.54922134]
Word 2. Sense distribution = [0.40266363 0.59733637]
Word 3. Sense distribution = [0.77429513 0.22570487]
Word 4. Sense distribution = [0.82309081 0.17690919]
Data shape: (6, 10, 5)
Type of cw: int64


In [4]:
from edward.models import Dirichlet, InverseGamma, MultivariateNormalDiag, \
    Normal, ParamMixture, Categorical

from edward.models import RandomVariable
from tensorflow.contrib.distributions import Distribution

class distributions_ContextWindow(Distribution):
  def __init__(self, senses, mus, Sigmas, pword, 
               validate_args=False,
               allow_nan_stats=True,
               name="ContextWindow"):
    
    self.senses = senses
    self.mus = mus
    self.Sigmas = Sigmas
    self.pword = pword
        
    super(distributions_ContextWindow, self).__init__(
            dtype=tf.float32,
            reparameterization_type=tf.contrib.distributions.FULLY_REPARAMETERIZED,
            validate_args=validate_args,
            allow_nan_stats=allow_nan_stats,
            name=name,
            #graph_parents=[pis, mus, Sigmas, pword],
            parameters={'senses':senses, 'mus':mus, 'Sigmas':Sigmas, 'pword':pword},
    )
        
  def _log_prob(self, value):
    # value has shape C x V
    
    value = tf.to_int32(value)
    
    print('Value in _log_prob: %s' % str(value))    
            
    logpc_giv_w = 0
    
    C = value.shape[1]
    V = value.shape[2]
    
    for n in range(value.shape[0]):
    
        for w in range(V):

            for c in range(C): # prob of each context word of w, integrating over possible senses and embeddings
                wc = value[n, c, w] # the context word

                Sigma = 1.0 / (1.0 / self.Sigmas[self.senses[n, wc], wc, :] + 1.0 / self.Sigmas[self.senses[n, w], w, :])

                logpc_giv_w +=  MultivariateNormalDiag(
                    (self.mus[self.senses[n, wc], wc, :] / self.Sigmas[self.senses[n, wc], wc, :] + 
                    self.mus[self.senses[n, w], w, :] / self.Sigmas[self.senses[n, w], w, :]) / Sigma, 
                    Sigma
                    # here we integrate out the 
                )._log_prob(self.mus[self.senses[n, wc], wc, :])

                logpc_giv_w += tf.log(self.pword[wc])
            
    return logpc_giv_w         
    
  def _sample_n(self, n, seed=None):
   
    c_all = []
    
    context_words = []
    
    pc_giv_z = []
    
    senses = self.senses
        
    for x in range(n):
        
        pw2 = []
        
        for w in range(V):
        
            sense_nw = senses[x, w]

            z_w = MultivariateNormalDiag(
                    self.mus[sense_nw, w, :], 
                    self.Sigmas[sense_nw, w, :]
            )            
            joint = []
            
            for w2 in range(V):
                mvn = MultivariateNormalDiag(
                    loc=self.mus[senses[x, w2], w2, :], # this assumes that the senses were instantiated at random from pi and were different for each word
                    scale_diag=self.Sigmas[senses[x, w2], w2, :]
                )
            
                # Get a V x 1 vector of probabilities
                p_z_giv_c = mvn.prob(z_w)
                joint.append(p_z_giv_c)
            
            joint = tf.stack(joint, axis=0) * self.pword
            pw2.append(joint)

        pw2 = tf.stack(pw2, axis=0)
        pc_giv_z = pw2 / tf.reduce_sum(pw2, keepdims=True, axis=1) # shape VxV

        c_sample = Categorical(probs=pc_giv_z).sample(C, seed) # V different distributions in one line. Shape = n x C x N x V
        c_all.append(c_sample)
        
    c_all = tf.stack(c_all, axis=0)

    print('c_all: %s' % str(c_all))
    return c_all
   
               
class ContextWindow(RandomVariable, distributions_ContextWindow):
               
  def __init__(self, *args, **kwargs):
    RandomVariable.__init__(self, *args, **kwargs)
    self.conjugate_log_prob = self._log_prob
    
pword = Dirichlet(tf.ones(V))

pi = Dirichlet(tf.ones(S), sample_shape=V) # sense distributions for each word. Needs replacing with CRP
mu = Normal(tf.zeros(D), tf.ones(D), sample_shape=(S, V))
sigmasq = InverseGamma(tf.ones(D), tf.ones(D), sample_shape=(S, V))

senses = Categorical(pi, sample_shape=(N))
cw = ContextWindow(senses, mu, sigmasq, pword, sample_shape=(N)) # result should be N x C x V

c_all: Tensor("ContextWindow/sample/stack_36:0", shape=(6, 10, 5), dtype=int32)


In [5]:
from edward.models import Empirical

T = 500

# approximate distributions
q_mu = Empirical(
    tf.get_variable("q_mu/params", [T, S, V, D], initializer=tf.zeros_initializer() )
)

q_sigmasq = Empirical(
    tf.get_variable("q_sigmasq/params", [T, S, V, D], initializer=tf.ones_initializer() )
)

q_pi = Empirical(
    tf.get_variable("q_pi/params", [T, V, S], initializer=tf.constant_initializer(1.0 / S) )
)

q_pword = Empirical(
    tf.get_variable("q_pword/params", [T, V], initializer=tf.constant_initializer(1.0 / V) )
)

q_senses = Empirical(
    tf.get_variable("q_senses/params", [T, V, S], initializer=tf.zeros_initializer(), dtype=tf.int32) 
)

In [6]:
print(pword.shape)
print(q_pword.shape)

print(mu.shape)
print(q_mu.shape)

print(sigmasq.shape)
print(q_sigmasq.shape)

print(pi.shape)
print(q_pi.shape)

print(cw.shape)
print(cw_train.shape)

print(senses.shape)
print(q_senses.shape)

latent_vars = {
    senses: q_senses,
    mu: q_mu,
    sigmasq: q_sigmasq,
    pi: q_pi, 
    pword: q_pword
}

data = {
    cw: cw_train,
}

(5,)
(5,)
(2, 5, 3)
(2, 5, 3)
(2, 5, 3)
(2, 5, 3)
(5, 2)
(5, 2)
(6, 10, 5)
(6, 10, 5)
(6, 5)
(5, 2)


In [7]:
inference = ed.Gibbs(latent_vars, data=data)
inference.initialize()
sess = edward.get_session()
tf.global_variables_initializer().run()

for i in range(inference.niter):
    info_dict = inference.update()
    inference.print_progress(info_dict)

AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
# approximate distributions
q_mu = Normal(
    loc=tf.Variable(tf.zeros([S, V, D])),
    scale=tf.Variable(tf.zeros([S, V, D]))
)

q_sigmasq = InverseGamma(
    concentration=tf.Variable(tf.zeros([S, V, D])),
    rate=tf.Variable(tf.zeros([S, V, D]))
)

q_pi = Dirichlet(
    concentration=tf.Variable(tf.zeros([V, S]))
)

q_pword = Dirichlet(
    concentration=tf.Variable(tf.zeros([V]))
)

q_senses = Categorical(
    probs=tf.Variable(tf.zeros([N, V, S]))
)

In [None]:
print(pword.shape)
print(q_pword.shape)

print(mu.shape)
print(q_mu.shape)

print(sigmasq.shape)
print(q_sigmasq.shape)

print(pi.shape)
print(q_pi.shape)

print(cw.shape)
print(cw_train.shape)

print(senses.shape)
print(q_senses.shape)

latent_vars = {
    senses: q_senses,
    mu: q_mu,
    sigmasq: q_sigmasq,
    pi: q_pi, 
    pword: q_pword
}

data = {
    cw: cw_train,
}

In [None]:
inference = ed.KLqp(latent_vars, data)

n_iter = 100

#inference.run(n_iter=n_iter)

inference.initialize()
tf.global_variables_initializer().run()

for i in range(n_iter):
    print('VB iteration %i' % i)
    info_dict = inference.update()
    inference.print_progress(info_dict)
    print(q_pi.mean().eval())
    
inference.finalize()


In [None]:
# print out the sense labels inferred for all the central word occurrences

Esenses = q_senses.eval()

for w in range(V):
    for n in range(N):    
        print('word %i, sample %i, -- probability of senses = %s, true sense is %i' % (n, w, str(Esenses[n, w, s]), s_all[w, n]) )
            

In [None]:
def build_toy_dataset(N, w, noise_std=0.1):
  D = len(w)
  x = np.random.randn(N, D)
  y = np.dot(x, w) + np.random.normal(0, noise_std, size=N)
  return x, y

N = 40  # number of data points
D = 10  # number of features

w_true = np.random.randn(D)
X_train, y_train = build_toy_dataset(N, w_true)
X_test, y_test = build_toy_dataset(N, w_true)

from edward.models import Normal

X = tf.placeholder(tf.float32, [N, D])
w = Normal(loc=tf.zeros(D), scale=tf.ones(D))
b = Normal(loc=tf.zeros(1), scale=tf.ones(1))
y = Normal(loc=ed.dot(X, w) + b, scale=tf.ones(N))

qw = Normal(loc=tf.get_variable("qw/loc", [D]),
            scale=tf.nn.softplus(tf.get_variable("qw/scale", [D])))
qb = Normal(loc=tf.get_variable("qb/loc", [1]),
            scale=tf.nn.softplus(tf.get_variable("qb/scale", [1])))

inference = ed.KLqp({w: qw, b: qb}, data={X: X_train, y: y_train})
inference.run(n_samples=5, n_iter=250)

### What makes our method novel?

A fully Bayesian approach to learning word embeddings with multiple, potentially infinite numbers of distinct senses per token.

The Bayesian treatment is intended to help with:
* Rare words in the training corpus, whose embeddings cannot be confidently estimated -- variance means we don't put too much weight onto these uncertain cases during learning
* Inferring the number of senses -- priors effectively regularise the model toward fewer senses
* Domain adaptation/Transfer learning -- we can inflate variances to indicate uncertainty in new domains
* (As in Barkan, Brazinskas et al) context-specific embeddings for each word instance
* (As in Barkan, Brazinskas et al) composition of sentence or document embeddings -- word occurrences with more confident or precise embeddings will have stronger influence in the combined sentence embedding. I think this will push the sentence embeddings away from generic words and toward the extremes.

### How do we test our model?

* Look at the tasks tried by Brazinskas et al, Barkan, and the ACL 2018 paper and ty to reuse their code where possible
* Compute the context-specific embeddings for each word (posterior means)
* Test what happens if we concatenate the variances to the embedding vector as a vagueness or uncertainty feature