In [35]:
import numpy as np
import tensorflow as tf
import itertools,time
import sys, os
from collections import OrderedDict,Counter
from copy import deepcopy
from time import time
import matplotlib.pyplot as plt
import pickle as pkl
import sys, getopt

In [36]:
slim = tf.contrib.slim

tf.reset_default_graph()

class VAE(object):
    """
    See "Auto-Encoding Variational Bayes" by Kingma and Welling for more details.
    """


    def __init__(self, network_architecture, transfer_fct=tf.nn.softplus,
                 learning_rate=0.001, batch_size=100):
        self.network_architecture = network_architecture
        self.transfer_fct = transfer_fct
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        print('Learning Rate:', self.learning_rate)

        # tf Graph input
        self.x = tf.placeholder(tf.float32, [None, network_architecture["n_input"]], name='input')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')

        self.h_dim = (network_architecture["n_z"]) # had a float before
        self.a = 1*np.ones((1 , self.h_dim)).astype(np.float32)                         # a    = 1
        self.prior_mean = tf.constant((np.log(self.a).T-np.mean(np.log(self.a),1)).T)          # prior_mean  = 0
        self.prior_var = tf.constant(  ( ( (1.0/self.a)*( 1 - (2.0/self.h_dim) ) ).T +       # prior_var = 0.99 + 0.005 = 0.995
                                ( 1.0/(self.h_dim*self.h_dim) )*np.sum(1.0/self.a,1) ).T  )
        self.prior_logvar = tf.log(self.prior_var)

        self._create_network()
        with tf.name_scope('cost'):
            self._create_loss_optimizer()

        init = tf.initialize_all_variables()

        self.sess = tf.InteractiveSession()
        self.sess.run(init)

    def _create_network(self):
        """
        steps:
        1. initialize weights
        2. build recognition network
        3. build reconstruction network
        """
        n_z = self.network_architecture['n_z']
        n_hidden_gener_1 = self.network_architecture['n_hidden_gener_1']
        en1 = slim.layers.linear(self.x, self.network_architecture['n_hidden_recog_1'], scope='FC_en1')
        en1 = tf.nn.softplus(en1, name='softplus1')
        en2 = slim.layers.linear(en1,    self.network_architecture['n_hidden_recog_2'], scope='FC_en2')
        en2 = tf.nn.softplus(en2, name='softplus2')
        en2_do = slim.layers.dropout(en2, self.keep_prob, scope='en2_dropped')
        self.posterior_mean   = slim.layers.linear(en2_do, self.network_architecture['n_z'], scope='FC_mean')
        self.posterior_logvar = slim.layers.linear(en2_do, self.network_architecture['n_z'], scope='FC_logvar')
        self.posterior_mean   = slim.layers.batch_norm(self.posterior_mean, scope='BN_mean')
        self.posterior_logvar = slim.layers.batch_norm(self.posterior_logvar, scope='BN_logvar')

        with tf.name_scope('z_scope'):
            eps = tf.random_normal((self.batch_size, n_z), 0, 1,                            # take noise
                                   dtype=tf.float32)
            self.z = tf.add(self.posterior_mean,
                            tf.multiply(tf.sqrt(tf.exp(self.posterior_logvar)), eps))         # reparameterization z
            self.posterior_var = tf.exp(self.posterior_logvar) 

        p = slim.layers.softmax(self.z)
        p_do = slim.layers.dropout(p, self.keep_prob, scope='p_dropped')               # dropout(softmax(z))
        decoded = slim.layers.linear(p_do, n_hidden_gener_1, scope='FC_decoder')

        self.x_reconstr_mean = tf.nn.softmax(slim.layers.batch_norm(decoded, scope='BN_decoder'))                    # softmax(bn(50->1995))

        print(self.x_reconstr_mean)

    def _create_loss_optimizer(self):

        #self.x_reconstr_mean+=1e-10                                                     # prevent log(0)

        NL = -tf.reduce_sum(self.x * tf.log(self.x_reconstr_mean+1e-10), 1)     # cross entropy on categorical
        #reconstr_loss = -tf.reduce_sum(self.x * tf.log(self.x_reconstr_mean), 1)

        var_division    = self.posterior_var  / self.prior_var
        diff            = self.posterior_mean - self.prior_mean
        diff_term       = diff * diff / self.prior_var
        logvar_division = self.prior_logvar - self.posterior_logvar
        KLD = 0.5 * (tf.reduce_sum(var_division + diff_term + logvar_division, 1) - self.h_dim )

        self.cost = tf.reduce_mean(NL + KLD)

        self.optimizer = \
            tf.train.AdamOptimizer(learning_rate=self.learning_rate,beta1=0.99).minimize(self.cost)

    def partial_fit(self, X):

        #if hasattr(self, 'decoder_weight'):
            #decoder_weight = self.decoder_weight
        #else:
        decoder_weight = [v for v in tf.global_variables() if v.name=='FC_decoder/weights:0'][0]
        opt, cost,emb = self.sess.run((self.optimizer, self.cost, decoder_weight),feed_dict={self.x: X,self.keep_prob: .8})
        return cost,emb

    def test(self, X):
        """Test the model and return the lowerbound on the log-likelihood.
        """
        cost = self.sess.run((self.cost),feed_dict={self.x: np.expand_dims(X, axis=0),self.keep_prob: 1.0})
        return cost
    def topic_prop(self, X):
        """heta_ is the topic proportion vector. Apply softmax transformation to it before use.
        """
        theta_ = self.sess.run((self.z),feed_dict={self.x: np.expand_dims(X, axis=0),self.keep_prob: 1.0})
        return theta_

In [37]:
def onehot(data, min_length):
    return np.bincount(data, minlength=min_length)

In [38]:
all_tokens = pkl.load(open("Downloads/all_tokens.p", "rb"))

In [39]:
data = pkl.load(open("Downloads/sent_tokens.p", "rb"))

In [40]:
max_vocab_size = 10000
# save index 0 for unk and 1 for pad
PAD_IDX = 0
UNK_IDX = 1

In [41]:
from nltk.corpus import stopwords

stops = set(stopwords.words('english'))  # nltk stopwords list

In [42]:
def build_vocab(all_tokens, max_vocab_size):
    # Returns:
    # id2token
    # token2id
    token_counter = Counter(all_tokens)
    token_counter = Counter({k:v for k,v in token_counter.items() if k not in stops and k.isalpha()})
    # unzip the vocab and its corresponding count
    vocab, count = zip(*token_counter.most_common(max_vocab_size))
    id2token = list(vocab)
    # Give indices from 2 to the vocab
    token2id = dict(zip(vocab, range(2, 2+len(vocab))))
    # Add pad and unk to vocab
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX
    token2id['<unk>'] = UNK_IDX
    return token2id, id2token

token2id, id2token = build_vocab(all_tokens,max_vocab_size)    

In [43]:
def token2index_dataset(tokens_data, token2id, id2token):
    indices_data = []
    for tokens in tokens_data:
        index_list = [token2id[token] if token in token2id else UNK_IDX for token in tokens]
        indices_data.append(index_list)
    return indices_data

In [44]:
x_train = token2index_dataset(data, token2id, id2token)

In [45]:
x_train = np.array([np.array(document) for document in x_train])

In [46]:
x_train = np.array([onehot(doc.astype('int'),max_vocab_size+2) for doc in x_train if np.sum(doc)!=0])

In [47]:
n_samples_tr = x_train.shape[0]
docs_tr = x_train

In [48]:
batch_size=200
learning_rate=0.002
network_architecture = \
    dict(n_hidden_recog_1=100, # 1st layer encoder neurons
         n_hidden_recog_2=100, # 2nd layer encoder neurons
         n_hidden_gener_1=x_train.shape[1], # 1st layer decoder neurons
         n_input=x_train.shape[1], # MNIST data input (img shape: 28*28)
         n_z=50)  # dimensionality of latent space

In [49]:
def make_network(layer1=100,layer2=100,num_topics=50,bs=200,eta=0.002):
    tf.reset_default_graph()
    network_architecture = \
        dict(n_hidden_recog_1=layer1, # 1st layer encoder neurons
             n_hidden_recog_2=layer2, # 2nd layer encoder neurons
             n_hidden_gener_1=x_train.shape[1], # 1st layer decoder neurons
             n_input=x_train.shape[1], # MNIST data input (img shape: 28*28)
             n_z=num_topics)  # dimensionality of latent space
    batch_size=bs
    learning_rate=eta
    return network_architecture,batch_size,learning_rate

In [50]:
def create_minibatch(data):
    rng = np.random.RandomState(10)

    while True:
        # Return random data samples of a size 'minibatch_size' at each iteration
        ixs = rng.randint(data.shape[0], size=batch_size)
        yield data[ixs]

In [51]:
def train(network_architecture, minibatches, type='prodlda',learning_rate=0.001,
          batch_size=200, training_epochs=100, display_step=5):
    tf.reset_default_graph()
    vae = VAE(network_architecture,
                                 learning_rate=learning_rate,
                                 batch_size=batch_size)
    writer = tf.summary.FileWriter('logs', tf.get_default_graph())
    emb=0
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(n_samples_tr / batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs = next(minibatches)
            # Fit training using batch data
            cost,emb = vae.partial_fit(batch_xs)
            # Compute average loss
            avg_cost += cost / n_samples_tr * batch_size

            if np.isnan(avg_cost):
                print(epoch,i,np.sum(batch_xs,1).astype(np.int),batch_xs.shape)
                print('Encountered NaN, stopping training. Please check the learning_rate settings and the momentum.')
                # return vae,emb
                sys.exit()

        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), \
                  "cost=", "{:.9f}".format(avg_cost))
    return vae,emb

In [74]:
def print_top_words(beta, feature_names, n_top_words=10):
    print('---------------Printing the Topics------------------')
    for i in range(len(beta)):
        print(" ".join([feature_names[j]
            for j in beta[i].argsort()[:-n_top_words - 1:-1]]))
    print('---------------End of Topics------------------')

In [53]:
minibatches = create_minibatch(docs_tr.astype('float32'))

In [54]:
network_architecture,batch_size,learning_rate=make_network()

In [55]:
vae,emb = train(network_architecture, 
                minibatches)

Learning Rate: 0.001
Tensor("Softmax:0", shape=(200, 10002), dtype=float32)
Instructions for updating:
Use `tf.global_variables_initializer` instead.
Epoch: 0001 cost= 8267.231494141
Epoch: 0006 cost= 5601.142812500
Epoch: 0011 cost= 5395.203759766
Epoch: 0016 cost= 5000.472275391
Epoch: 0021 cost= 4888.346455078
Epoch: 0026 cost= 4760.160297852
Epoch: 0031 cost= 4598.640258789
Epoch: 0036 cost= 4345.423139648
Epoch: 0041 cost= 4217.210322266
Epoch: 0046 cost= 4042.558408203
Epoch: 0051 cost= 4036.144296875
Epoch: 0056 cost= 3887.852172852
Epoch: 0061 cost= 3861.446752930
Epoch: 0066 cost= 3689.239399414
Epoch: 0071 cost= 3586.152426758
Epoch: 0076 cost= 3468.924501953
Epoch: 0081 cost= 3454.514555664
Epoch: 0086 cost= 3439.926684570
Epoch: 0091 cost= 3265.541440430
Epoch: 0096 cost= 3194.837392578


In [76]:
print_top_words(emb, list(zip(*sorted(token2id.items(), key=lambda x: x[1])))[0])

---------------Printing the Topics------------------
cnn tuesday abortions tweeted scientists democrats president trump donald abortion
comicbook tweeted tuesday cnn cbs donald wednesday subscribed lawmakers police
appropriately thorough clarify translated destroying gem absent albeit ah slowed
comicbook subscribed podcast commented airs theaters fans russo starred tweeted
constitutional statute surveillance taxpayer treasury insurers reinsurance reform advocates requirement
albeit distant disappear scholars absent <pad> gem inserted masses downward
clauses contractual quizzes cardholder salutation throughs consents datonics gifs accountholder
renters motorcycle proposal assets investor investment proposals coverage farmers recreational
flowing absent appropriately divine someday descent <pad> instructed backwards ought
occupation appropriately wonders <pad> gem abdominal dedication assumption inherently ought
senate politicians democrats voters republicans constitutional legislation b