In [75]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import pickle
import itertools
import numpy as np
from scipy import spatial
from scipy.stats import norm
import nltk.data
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.corpus import reuters
from nltk.corpus import gutenberg
from nltk.corpus import brown
from nltk.tokenize import sent_tokenize
from gensim.models import KeyedVectors
from keras.layers import Input, Dense, Lambda, Layer
from keras.callbacks import ModelCheckpoint
from keras.models import Model
from keras import backend as K
from keras import metrics

w2v = KeyedVectors.load_word2vec_format('/content/drive/My Drive/projet_Deep_Learning/wiki-news-300d-1M.vec')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


KeyboardInterrupt: ignored

In [80]:
import pandas as pd
import nltk
from nltk.corpus import nps_chat
from nltk.corpus import webtext

nltk.download('brown')
nltk.download('gutenberg')
nltk.download('reuters')
nltk.download('punkt')
nltk.download('webtext')
nltk.download('nps_chat')

def split_into_sent (text):
    strg = ''
    for word in text:
        strg += word
        strg += ' '
    strg_cleaned = strg.lower()
    for x in ['\xd5d','\n','"',"!", '#','$','%','&','(',')','*','+',',','-','/',':',';','<','=','>','?','@','[','^',']','_','`','{','|','}','~','\t']:
        strg_cleaned = strg_cleaned.replace(x, '')
    sentences = sent_tokenize(strg_cleaned)
    return sentences

def vectorize_sentences(sentences):
    vectorized = []
    for sentence in sentences:
        byword = sentence.split()
        concat_vector = []
        for word in byword:
            try:
                concat_vector.append(w2v[word])
            except:
                pass
        vectorized.append(concat_vector)
    return vectorized

data_concat = []
for t in [brown.words(), reuters.words(), gutenberg.words(), nps_chat.words(), webtext.words()]:
    text = split_into_sent(t)
    vect = vectorize_sentences(text)
    data = [x for x in vect if len(x) == 10]
    for x in data:
        data_concat.append(list(itertools.chain.from_iterable(x)))

# vectorize the data from set
input_texts = []
num_input = 5000
data_path = "/content/drive/My Drive/projet_Deep_Learning/train.txt"
with open(data_path, "r", encoding="utf-8") as f:
    lines = f.read().lower().split("\n")

for line in lines[0:(num_input - 1)]:

    input_text = split_into_sent(line)
    vect = vectorize_sentences(input_text)
    data = [x for x in vect if len(x) == 10]
    for x in data:
        data_concat.append(list(itertools.chain.from_iterable(x)))


data_array = np.array(data_concat)
np.random.shuffle(data_array)
print("nombre de phrases : ",len(data_array))
train = data_array[:8000]
test = data_array[8000:8500]


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package nps_chat to /root/nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!
nombre de phrases :  8533


In [82]:
from keras import callbacks
batch_size = 100
original_dim = 3000
latent_dim = 1000
intermediate_dim = 1200
epochs = 100
epsilon_std = 1.0

x = Input(batch_shape=(batch_size, original_dim))
h = Dense(intermediate_dim, activation='relu')(x)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
                              stddev=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

# we instantiate these layers separately so as to reuse them later
decoder_h = Dense(intermediate_dim, activation='relu')
decoder_mean = Dense(original_dim, activation='sigmoid')
h_decoded = decoder_h(z)
x_decoded_mean = decoder_mean(h_decoded)

# placeholder loss
def zero_loss(y_true, y_pred):
    return K.zeros_like(y_pred)

# Custom loss layer
class CustomVariationalLayer(Layer):
    def __init__(self, **kwargs):
        self.is_placeholder = True
        super(CustomVariationalLayer, self).__init__(**kwargs)

    def vae_loss(self, x, x_decoded_mean):
        xent_loss = original_dim * metrics.binary_crossentropy(x, x_decoded_mean)
        kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
        return K.mean(xent_loss + kl_loss)

    def call(self, inputs):
        x = inputs[0]
        x_decoded_mean = inputs[1]
        loss = self.vae_loss(x, x_decoded_mean)
        self.add_loss(loss, inputs=inputs)
        # we don't use this output, but it has to have the correct shape:
        return K.ones_like(x)

loss_layer = CustomVariationalLayer()([x, x_decoded_mean])
vae = Model(x, [loss_layer])
vae.compile(optimizer='rmsprop', loss=[zero_loss])

#checkpoint
cp = [callbacks.ModelCheckpoint(filepath="/content/drive/My Drive/projet_Deep_Learning/model.h5", verbose=1, save_best_only=True)]

#train
vae.fit(train, train,
        shuffle=True,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(test, test), callbacks=cp)

# build a model to project inputs on the latent space
encoder = Model(x, z_mean)

# build a generator that can sample from the learned distribution
decoder_input = Input(shape=(latent_dim,))
_h_decoded = decoder_h(decoder_input)
_x_decoded_mean = decoder_mean(_h_decoded)
generator = Model(decoder_input, _x_decoded_mean)



Train on 8000 samples, validate on 500 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to -566.82307, saving model to /content/drive/My Drive/projet_Deep_Learning/model.h5
Epoch 2/100

Epoch 00002: val_loss improved from -566.82307 to -647.15532, saving model to /content/drive/My Drive/projet_Deep_Learning/model.h5
Epoch 3/100

Epoch 00003: val_loss improved from -647.15532 to -667.51768, saving model to /content/drive/My Drive/projet_Deep_Learning/model.h5
Epoch 4/100

Epoch 00004: val_loss improved from -667.51768 to -672.70387, saving model to /content/drive/My Drive/projet_Deep_Learning/model.h5
Epoch 5/100

Epoch 00005: val_loss improved from -672.70387 to -677.21759, saving model to /content/drive/My Drive/projet_Deep_Learning/model.h5
Epoch 6/100

Epoch 00006: val_loss improved from -677.21759 to -681.16799, saving model to /content/drive/My Drive/projet_Deep_Learning/model.h5
Epoch 7/100

Epoch 00007: val_loss improved from -681.16799 to -686.68727, saving model to

In [11]:
vae.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (100, 3000)          0                                            
__________________________________________________________________________________________________
dense_11 (Dense)                (100, 1200)          3601200     input_3[0][0]                    
__________________________________________________________________________________________________
dense_12 (Dense)                (100, 1000)          1201000     dense_11[0][0]                   
__________________________________________________________________________________________________
dense_13 (Dense)                (100, 1000)          1201000     dense_11[0][0]                   
____________________________________________________________________________________________

In [0]:
# some matrix magic
def sent_parse(sentence, mat_shape):
    data_concat = []
    word_vecs = vectorize_sentences(sentence)
    for x in word_vecs:
        data_concat.append(list(itertools.chain.from_iterable(x)))
    zero_matr = np.zeros(mat_shape)
    zero_matr[0] = np.array(data_concat)
    return zero_matr

In [0]:
# input: original dimension sentence vector
# output: text
def print_sentence_with_w2v(sent_vect):
    word_sent = ''
    tocut = sent_vect
    for i in range (int(len(sent_vect)/300)):
        word_sent += w2v.most_similar(positive=[tocut[:300]], topn=1)[0][0]
        word_sent += ' '
        tocut = tocut[300:]
    print(word_sent)

# input: encoded sentence vector
# output: encoded sentence vector in dataset with highest cosine similarity
def find_similar_encoding(sent_vect):
    all_cosine = []
    for sent in sent_encoded:
        result = 1 - spatial.distance.cosine(sent_vect, sent)
        all_cosine.append(result)
    data_array = np.array(all_cosine)
    
    maximum = data_array.argsort()[-3:][::-1][1]
    print(maximum)
    new_vec = sent_encoded[maximum]
    return new_vec

# input: two points, integer n
# output: n equidistant points on the line between the input points (inclusive)
def shortest_homology(point_one, point_two, num):
    dist_vec = point_two - point_one
    sample = np.linspace(0, 1, num, endpoint = True)
    hom_sample = []
    for s in sample:
        hom_sample.append(point_one + s * dist_vec)
    return hom_sample


# input: two written sentences, VAE batch-size, dimension of VAE input
# output: the function embeds the sentences in latent-space, and then prints their generated text representations
# along with the text representations of several points in between them
def sent_2_sent(sent1,sent2, batch, dim):
    a = sent_parse([sent1], (batch,dim))
    b = sent_parse([sent2], (batch,dim))
    encode_a = encoder.predict(a, batch_size = batch)
    encode_b = encoder.predict(b, batch_size = batch)
    test_hom = hom_shortest(encode_a[0], encode_b[0], 5)
    
    for point in test_hom:
        p = generator.predict(np.array([point]))[0]
        print_sentence(p)

In [92]:
print_sentence_with_w2v(train[4789])
sent_encoded = encoder.predict(np.array(train), batch_size = 100)
sent_decoded = generator.predict(sent_encoded)
test_hom1 = shortest_homology(sent_encoded[4789], sent_encoded[88], 10)

print("---phrases originales---")
print_sentence_with_w2v(train[4789])
print_sentence_with_w2v(train[88])
print("---phrases decodées distribuées uniformément---")
for point in test_hom1:
    p = generator.predict(np.array([point]))[0]
    print_sentence_with_w2v(p)

print("---phrases originales---")
print_sentence_with_w2v(train[7111])
print_sentence_with_w2v(train[1312])
test_hom2 = shortest_homology(sent_encoded[7111], sent_encoded[1312], 5)
print("---phrases décodées les plus proches---")
for point in test_hom2:
    p = generator.predict(np.array([find_similar_encoding(point)]))[0]
    print_sentence_with_w2v(p)

  if np.issubdtype(vec.dtype, np.int):


a word with you attend those men our pleasure . 
---phrases originales---
a word with you attend those men our pleasure . 
i w a s r e a d y . 
---phrases decodées distribuées uniformément---
of man of do even one nature of sorrow . 
of man of we even one kind of sorrow . 
one tho of just one one kind of dear . 
just tho of just one one one of man . 
i c a one th e equal a f . 
i w a s l e c e f . 
i w a s l e c e f . 
i w a s r e c e y . 
i w a s r e c e y . 
i w a s r e c e y . 
---phrases originales---
und westbank to reorganize network und westbank ag lt . 
elton but i have not much faith in mrs . 
---phrases décodées les plus proches---
7062
english water a direct global tho oil ltd lt . 
7062
english water a direct global tho oil ltd lt . 
1312
jason say i have not more for of mrs . 
7524
elizabeth could not not not little one of mrs . 
7002
i say i have even a truly kind lady . 


In [26]:
senten1 = ['Hello','world','have','to','run']
senten2 = ['Hello','back']
def inserer_sen(sent1,sent2):
  sent1_vec = []
  sent2_vec = []
  for word in sent1:
    sent1_vec.append(w2v[word])

  for word in sent2:
    sent2_vec.append(w2v[word])

  return sent1_vec, sent2_vec

sent1_vec, sent2_vec = inserer_sen(senten1,senten2)
print("sent1")
print_sentence_with_w2v(sent1_vec)


sent1

