In [1]:
import pickle
import itertools
import numpy as np
from scipy import spatial
from scipy.stats import norm
import nltk.data
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import reuters
from nltk. corpus import gutenberg
from nltk.corpus import brown
from nltk.tokenize import sent_tokenize
from gensim.models import KeyedVectors
from keras.layers import Input, Dense, Lambda, Layer
from keras import callbacks
from keras.models import Model
from keras import backend as K
from keras import metrics

Using TensorFlow backend.


In [2]:
# Load the pre-trained word vectors into the notebook-global `w2v`; later cells
# index it by word and query it for nearest neighbours.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
w2v = KeyedVectors.load_word2vec_format('/Users/shreyajain/Downloads/datasets/wiki-news-300d-1M.vec')


In [3]:
def split_into_sent(text):
    """Turn a sequence of tokens into a list of cleaned, lower-cased sentences.

    The tokens are glued together with a single space after each one, the
    result is lower-cased, every fragment in the strip list is deleted, and the
    remaining text is handed to nltk's ``sent_tokenize``.

    Args:
        text: iterable of string tokens.

    Returns:
        The list of sentence strings produced by ``sent_tokenize``.
    """
    joined = ''.join(token + ' ' for token in text)
    cleaned = joined.lower()
    # '.' and "'" are not in this list, so they survive the clean-up.
    unwanted = ['\xd5d','\n','"',"!", '#','$','%','&','(',')','*','+',',','-','/',':',';','<','=','>','?','@','[','^',']','_','`','{','|','}','~','\t']
    for fragment in unwanted:
        cleaned = cleaned.replace(fragment, '')
    return sent_tokenize(cleaned)

In [4]:
# Smoke test: a token list without sentence-ending punctuation comes back as a
# single lower-cased sentence.
a = split_into_sent (['A','man', 'in', 'white','runs', 'like', 'he','never','before','and','I','just','need','to'])
print ("texts ", a)

texts  ['a man in white runs like he never before and i just need to']


In [5]:
def vectorize_sentences(sentences, vectors=None):
    """Map each sentence to the embedding vectors of its in-vocabulary words.

    Args:
        sentences: iterable of strings; each one is split on whitespace.
        vectors: optional lookup supporting ``vectors[word]`` that raises
            ``KeyError`` for unknown words. Defaults to the notebook-level
            ``w2v`` model.

    Returns:
        A list with one entry per sentence. Each entry is the list of vectors
        for the words that were found, in sentence order. Unknown words are
        skipped, so an entry may be shorter than the sentence or empty.
    """
    if vectors is None:
        vectors = w2v
    vectorized = []
    for sentence in sentences:
        word_vectors = []
        for word in sentence.split():
            try:
                word_vectors.append(vectors[word])
            except KeyError:
                # Out-of-vocabulary word: skip it. Only KeyError is caught so
                # that real problems (missing model, interrupts) still surface.
                continue
        vectorized.append(word_vectors)
    return vectorized

In [6]:
# Collects one flat feature vector per usable sentence across all corpora.
data_concat = []

for t in [brown.words(), reuters.words(), gutenberg.words()]:
    text = split_into_sent(t)
    vect = vectorize_sentences(text)
    # Keep only sentences with exactly 10 in-vocabulary tokens so that every
    # sample ends up with the same length.
    data = [x for x in vect if len(x) == 10]
    for x in data:
        # Concatenate the 10 word vectors end to end into one flat list.
        data_concat.append(list(itertools.chain.from_iterable(x)))

# Disabled: optional extra data source (pre-tokenized sentences read from a pickle).
# NOTE(review): pickle.load can execute arbitrary code; only enable for trusted files.
# with open ('/home/ubuntu/pynb/wikitokens.pickle', 'rb') as f:
#     wiki_tokens = pickle.load(f)
# wiki_tokens = vectorize_sentences(wiki_tokens)
# wikidata = [x for x in wiki_tokens if len(x) == 10]
# for x in wikidata:
#     data_concat.append(list(itertools.chain.from_iterable(x)))


In [7]:
# Stack the samples into an array and shuffle its rows in place.
# NOTE(review): no random seed is set, so the train/test split differs between runs.
data_array = np.array(data_concat)
np.random.shuffle(data_array)

# train = data_array[:8000]
# test = data_array[8000:10000]

# First 5000 shuffled rows for training, the next 1500 for validation. Both
# counts are multiples of 500, the fixed batch size the model input is built with.
train = data_array[:5000]
test = data_array[5000:6500]

In [8]:
# Sanity check: each element of train is 10 word vectors of length 300
# concatenated into one flat vector, and test holds the expected number of rows.
print ("train ", train[0])
print ("train ", len(train[0]))

print ("test ", len(test))

train  [-0.0065 -0.033   0.006  ...  0.207   0.0689 -0.0467]
train  3000
test  1500


In [17]:
# Hyperparameters of the variational autoencoder.
# batch_size is baked into the input tensor below, so every predict/fit call
# on this graph has to use the same batch size.
batch_size = 500
# Length of one flattened sentence vector.
original_dim = 3000
# Size of the latent code.
latent_dim = 1000
# Width of the hidden layer used in both encoder and decoder.
intermediate_dim = 1200
epochs = 200
# Standard deviation of the noise drawn in sampling().
epsilon_std = 1.0

# Encoder: input -> hidden ReLU layer -> two linear heads producing the mean
# and the log-variance of the latent distribution.
x = Input(batch_shape=(batch_size, original_dim))
h = Dense(intermediate_dim, activation='relu')(x)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

# re-parametrization trick
def sampling(args):
    """Draw a latent sample from the given mean and log-variance tensors.

    Args:
        args: pair ``(mean, log_variance)`` of tensors.

    Returns:
        ``mean + exp(log_variance / 2) * noise`` where the noise is drawn from
        a normal distribution with mean 0 and standard deviation
        ``epsilon_std``, shaped ``(batch_size, latent_dim)``.
    """
    # Local names differ from the module-level z_mean / z_log_var on purpose,
    # so the function does not shadow them.
    mu, log_var = args
    noise = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
                            stddev=epsilon_std)
    sigma = K.exp(log_var / 2)
    return mu + sigma * noise

# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

# we instantiate these layers separately so as to reuse them later
# (the standalone generator model built after training applies the same two layers)
# NOTE(review): a sigmoid output is confined to (0, 1), while the printed
# training sample contains negative components, so the decoder cannot reproduce
# those values exactly -- confirm this is intended.
decoder_h = Dense(intermediate_dim, activation='relu')
decoder_mean = Dense(original_dim, activation='sigmoid')
h_decoded = decoder_h(z)
x_decoded_mean = decoder_mean(h_decoded)

# placeholder loss
def zero_loss(y_true, y_pred):
    """Return a tensor of zeros shaped like ``y_pred``; ``y_true`` is ignored.

    Used as the compiled loss so that the only effective training signal is the
    loss registered by CustomVariationalLayer.
    """
    return K.zeros_like(y_pred)

# Custom loss layer
class CustomVariationalLayer(Layer):
    """Layer that registers the VAE loss on the model via ``add_loss``.

    It is called with ``[x, x_decoded_mean]`` and returns a dummy tensor of
    ones shaped like ``x``; the output itself carries no information.
    """
    def __init__(self, **kwargs):
        self.is_placeholder = True
        super(CustomVariationalLayer, self).__init__(**kwargs)

    def vae_loss(self, x, x_decoded_mean):
        """Return the batch mean of reconstruction loss plus KL term.

        Reads the module-level ``z_mean`` and ``z_log_var`` tensors directly
        rather than receiving them as arguments.
        """
        # Reconstruction term: binary cross-entropy scaled by original_dim
        # (presumably to turn a per-feature mean into a per-sample sum -- confirm).
        # NOTE(review): the inputs contain values outside [0, 1] (see the
        # printed training sample), for which binary cross-entropy is not a
        # proper reconstruction loss; this is consistent with the negative
        # loss values in the training log -- confirm this is intended.
        xent_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean)
        # KL term between the encoder's Gaussian and a standard normal,
        # summed over the latent dimensions.
        kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
        return K.mean(xent_loss + kl_loss)

    def call(self, inputs):
        """Register the VAE loss for ``inputs`` and return a dummy output."""
        x = inputs[0]
        x_decoded_mean = inputs[1]
        loss = self.vae_loss(x, x_decoded_mean)
        self.add_loss(loss, inputs=inputs)
        # we don't use this output, but it has to have the correct shape:
        return K.ones_like(x)

# Attach the loss layer to the graph. Its output is a dummy tensor; the actual
# training loss is the one the layer registers through add_loss.
loss_layer = CustomVariationalLayer()([x, x_decoded_mean])
vae = Model(x, [loss_layer])
# zero_loss contributes nothing, so only the layer's registered loss is minimised.
vae.compile(optimizer='rmsprop', loss=[zero_loss])

#checkpoint
# Keeps only the best model seen so far (the log below reports val_loss).
# NOTE(review): absolute, machine-specific path -- consider a configurable directory.
cp = [callbacks.ModelCheckpoint(filepath="/Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5", verbose=1, save_best_only=True)]

#train
# The inputs are also passed as targets, but zero_loss ignores the targets.
vae.fit(train, train,
        shuffle=True,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(test, test), callbacks=cp)

# build a model to project inputs on the latent space
# (it outputs z_mean, i.e. no sampling noise is applied)
encoder = Model(x, z_mean)

# build a generator that can sample from the learned distribution
# It reuses the trained decoder_h / decoder_mean layers on a fresh input that,
# unlike x, has no fixed batch size.
decoder_input = Input(shape=(latent_dim,))
_h_decoded = decoder_h(decoder_input)
_x_decoded_mean = decoder_mean(_h_decoded)
generator = Model(decoder_input, _x_decoded_mean)


Train on 5000 samples, validate on 1500 samples
Epoch 1/200

Epoch 00001: val_loss improved from inf to -456.22819, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 2/200

Epoch 00002: val_loss improved from -456.22819 to -511.29829, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 3/200

Epoch 00003: val_loss improved from -511.29829 to -523.87779, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 4/200

Epoch 00004: val_loss improved from -523.87779 to -531.39648, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 5/200

Epoch 00005: val_loss improved from -531.39648 to -541.92503, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 6/200

Epoch 00006: val_loss improved from -541.92503 to -546.54205, savi


Epoch 00031: val_loss improved from -628.82550 to -629.35150, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 32/200

Epoch 00032: val_loss improved from -629.35150 to -630.35040, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 33/200

Epoch 00033: val_loss improved from -630.35040 to -631.97579, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 34/200

Epoch 00034: val_loss improved from -631.97579 to -634.83748, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 35/200

Epoch 00035: val_loss did not improve from -634.83748
Epoch 36/200

Epoch 00036: val_loss improved from -634.83748 to -635.88320, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 37/200

Epoch 00037: val_loss improved from -635.88320


Epoch 00064: val_loss improved from -656.26733 to -656.34644, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 65/200

Epoch 00065: val_loss improved from -656.34644 to -657.92704, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 66/200

Epoch 00066: val_loss did not improve from -657.92704
Epoch 67/200

Epoch 00067: val_loss improved from -657.92704 to -657.97426, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 68/200

Epoch 00068: val_loss did not improve from -657.97426
Epoch 69/200

Epoch 00069: val_loss improved from -657.97426 to -659.74670, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 70/200

Epoch 00070: val_loss improved from -659.74670 to -660.39771, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/m


Epoch 00098: val_loss improved from -674.42800 to -675.49135, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 99/200

Epoch 00099: val_loss improved from -675.49135 to -676.21238, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 100/200

Epoch 00100: val_loss did not improve from -676.21238
Epoch 101/200

Epoch 00101: val_loss improved from -676.21238 to -676.63702, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 102/200

Epoch 00102: val_loss improved from -676.63702 to -677.55123, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 103/200

Epoch 00103: val_loss did not improve from -677.55123
Epoch 104/200

Epoch 00104: val_loss did not improve from -677.55123
Epoch 105/200

Epoch 00105: val_loss did not improve from -677.55123
Epoch 106/200

Epoch 00106:


Epoch 00134: val_loss improved from -687.70150 to -688.47085, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 135/200

Epoch 00135: val_loss did not improve from -688.47085
Epoch 136/200

Epoch 00136: val_loss improved from -688.47085 to -689.10457, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 137/200

Epoch 00137: val_loss did not improve from -689.10457
Epoch 138/200

Epoch 00138: val_loss improved from -689.10457 to -690.13316, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 139/200

Epoch 00139: val_loss did not improve from -690.13316
Epoch 140/200

Epoch 00140: val_loss improved from -690.13316 to -690.70717, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 141/200

Epoch 00141: val_loss improved from -690.70717 to -690.81956, saving model to /U


Epoch 00172: val_loss did not improve from -698.05821
Epoch 173/200

Epoch 00173: val_loss did not improve from -698.05821
Epoch 174/200

Epoch 00174: val_loss did not improve from -698.05821
Epoch 175/200

Epoch 00175: val_loss improved from -698.05821 to -698.50728, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 176/200

Epoch 00176: val_loss improved from -698.50728 to -699.54553, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 177/200

Epoch 00177: val_loss did not improve from -699.54553
Epoch 178/200

Epoch 00178: val_loss did not improve from -699.54553
Epoch 179/200

Epoch 00179: val_loss did not improve from -699.54553
Epoch 180/200

Epoch 00180: val_loss improved from -699.54553 to -699.69820, saving model to /Users/shreyajain/PycharmProjects/textGeneration/VAE_keras/checkpoints/pynb/model.h5
Epoch 181/200

Epoch 00181: val_loss improved from -699.69820 to -

In [18]:
def sent_parse(sentence, mat_shape):
    """Embed sentences and place the result in row 0 of a zero matrix.

    Args:
        sentence: list of sentence strings, passed to ``vectorize_sentences``.
        mat_shape: shape of the zero matrix to create.

    Returns:
        A ``np.zeros(mat_shape)`` array whose first row is overwritten with the
        flattened word vectors. numpy raises if the flattened vectors cannot be
        broadcast into that row.
    """
    flattened = [
        list(itertools.chain.from_iterable(word_vectors))
        for word_vectors in vectorize_sentences(sentence)
    ]
    padded = np.zeros(mat_shape)
    padded[0] = np.array(flattened)
    return padded

In [19]:
def print_sentence_with_w2v(sent_vect):
    """Print the text represented by a flattened sentence vector.

    The vector is cut into consecutive slices of 300 values. For each slice the
    single most similar word in ``w2v`` is looked up, and the words are printed
    on one line, each followed by a space. Values beyond the last complete
    slice are ignored.

    Args:
        sent_vect: flat sequence of numbers (original-dimension sentence vector).
    """
    n_words = int(len(sent_vect)/300)
    words = []
    for idx in range(n_words):
        chunk = sent_vect[idx * 300:(idx + 1) * 300]
        best_match = w2v.most_similar(positive=[chunk], topn=1)[0]
        words.append(best_match[0])
    print(''.join(word + ' ' for word in words))

In [20]:
def find_similar_encoding(sent_vect):
    """Return the dataset encoding with the second-highest cosine similarity.

    Every row of the notebook-level ``sent_encoded`` is compared with
    ``sent_vect``; the row ranked second by cosine similarity is returned, so
    the single best match is skipped.

    Args:
        sent_vect: encoded sentence vector.

    Returns:
        The selected row of ``sent_encoded``.
    """
    similarities = np.array(
        [1 - spatial.distance.cosine(sent_vect, candidate) for candidate in sent_encoded]
    )
    # argsort is ascending, so position -2 is the second most similar row.
    runner_up = similarities.argsort()[-2]
    return sent_encoded[runner_up]

In [21]:
def shortest_homology(point_one, point_two, num):
    """Return ``num`` equidistant points on the segment between two points.

    Args:
        point_one: start point (array-like supporting arithmetic).
        point_two: end point.
        num: number of points to produce, both end points included.

    Returns:
        A list of ``num`` points running from ``point_one`` to ``point_two``.
    """
    delta = point_two - point_one
    fractions = np.linspace(0, 1, num, endpoint = True)
    return [point_one + fraction * delta for fraction in fractions]

In [22]:
def sent_2_sent(sent1,sent2, batch, dim):
    """Interpolate between two sentences in latent space and print the results.

    Both sentences are embedded with ``sent_parse`` (used as given, with no
    lower-casing or punctuation stripping), encoded with ``encoder``, and five
    equidistant latent points between the two encodings (end points included)
    are decoded with ``generator`` and printed as text.

    Args:
        sent1: first sentence as a string.
        sent2: second sentence as a string.
        batch: batch size the encoder was built with; the sentence occupies
            row 0 of a zero matrix with this many rows.
        dim: input dimension of the VAE.
    """
    a = sent_parse([sent1], (batch,dim))
    b = sent_parse([sent2], (batch,dim))
    encode_a = encoder.predict(a, batch_size = batch)
    encode_b = encoder.predict(b, batch_size = batch)
    # Only row 0 holds the real sentence; the other rows are zero padding.
    # Fixed: this used to call hom_shortest(), which is not defined in this notebook.
    test_hom = shortest_homology(encode_a[0], encode_b[0], 5)

    for point in test_hom:
        p = generator.predict(np.array([point]))[0]
        # Fixed: this used to call print_sentence(), which is not defined in this notebook.
        print_sentence_with_w2v(p)

In [23]:
# Decode two training samples back to text through nearest-neighbour lookup
# of each 300-value slice.
print_sentence_with_w2v(train[1])
print_sentence_with_w2v(train[2])

oh look sir beware the hatchway thank ye man . 
humph harriet ' s ready wit all the better . 


In [24]:
# Confirm the flattened sample length matches original_dim (3000).
print ("train[1] ", len(train[1]))

train[1]  3000


In [25]:
# Encode every training sentence; the encoder model outputs z_mean.
# batch_size repeats the literal 500 the model input was built with
# (presumably required because that input has a fixed batch size).
sent_encoded = encoder.predict(np.array(train), batch_size = 500)


In [26]:
# Decode the encodings back to sentence vectors with the standalone generator.
sent_decoded = generator.predict(sent_encoded)


In [27]:
# Walk a straight line in latent space between two encoded training sentences
# and print the decoded text of 5 evenly spaced points (end points included).
test_hom = shortest_homology(sent_encoded[3], sent_encoded[10], 5)
for point in test_hom:
    p = generator.predict(np.array([point]))[0]
    print_sentence_with_w2v(p)

3 pct of the days 170 000 days 39 . 
3 pct of the october 169 000 thousand remaining . 
4 pct of the october 169 thousand thousand eligible . 
american th texas of washington ended equal the industrial . 
george hence u.s. and usa belonged a the kinds . 


In [28]:
# Same walk with 20 points, but each interpolated point is first replaced by
# the training-set encoding chosen by find_similar_encoding before decoding.
test_hom = shortest_homology(sent_encoded[2], sent_encoded[1500], 20)
for point in test_hom:
    p = generator.predict(np.array([find_similar_encoding(point)]))[0]
    print_sentence_with_w2v(p)

oh ald tell Scot feelin too just just pity . 
oh ald tell Scot feelin too just just pity . 
oh ald tell Scot feelin too just just pity . 
oh ald tell Scot feelin too just just pity . 
oh ald tell Scot feelin too just just pity . 
oh ald tell Scot feelin too just just pity . 
oh ald tell Scot feelin too just just pity . 
oh ald tell Scot feelin too just just pity . 
i couldn eh t able a too just hardly . 
i couldn eh t able a too just hardly . 
oh ald ' s feeling either feeling of taste . 
he was a moment of the feeling without something . 
he was a moment of the feeling without something . 
he was a moment of the feeling without something . 
he was a moment of the feeling without something . 
he was a moment of the feeling without something . 
he was a moment of the feeling without something . 
he was a moment of the feeling without something . 
he was a moment of the feeling without something . 
he was a moment of the feeling without something . 
