In [None]:
#! /usr/bin/python
# -*- coding: utf-8 -*-
!pip install tensorlayer
#!pip install pickle

import tensorflow as tf
import tensorlayer as tl
import numpy as np
from tensorlayer.cost import cross_entropy_seq, cross_entropy_seq_with_mask
from tqdm import tqdm
from sklearn.utils import shuffle
from tensorlayer.models.seq2seq import Seq2seq
from tensorlayer.models.seq2seq_with_attention import Seq2seqLuongAttention
import os
import pickle

def split_dataset(x, y, ratio=(0.7, 0.15, 0.15)):
    """Split paired data into contiguous train, test and validation parts.

    With the default ratio this is train (70%), test (15%) and valid (15%).
    Each part size is ``int(len(x) * fraction)``; because of the rounding
    down, a few examples between the test and validation parts may end up in
    none of the three parts.

    Args:
        x: Sliceable collection of inputs (e.g. a list or a numpy array).
        y: Sliceable collection of targets, aligned with ``x``.
        ratio: Three fractions for the train, test and validation parts.

    Returns:
        tuple: ``((trainX, trainY), (testX, testY), (validX, validY))``.
    """
    # number of examples
    data_len = len(x)
    lens = [int(data_len * item) for item in ratio]

    train_end = lens[0]
    test_end = train_end + lens[1]
    n_valid = lens[-1]

    trainX, trainY = x[:train_end], y[:train_end]
    testX, testY = x[train_end:test_end], y[train_end:test_end]
    # ``x[-0:]`` is the WHOLE sequence, so an empty validation part has to be
    # handled explicitly instead of relying on the negative-index slice.
    validX = x[-n_valid:] if n_valid else x[:0]
    validY = y[-n_valid:] if n_valid else y[:0]

    return (trainX, trainY), (testX, testY), (validX, validY)


def batch_gen(x, y, batch_size):
    """Endlessly yield consecutive, non-overlapping batches from the dataset.

    Each pass walks ``x`` and ``y`` from the start in steps of ``batch_size``.
    A trailing chunk shorter than ``batch_size`` is skipped, after which the
    pass starts over from the beginning.

    Args:
        x: Inputs supporting ``len``, slicing and ``.T`` (e.g. a numpy array).
        y: Targets aligned with ``x``, supporting slicing and ``.T``.
        batch_size: Number of examples per batch.

    Yields:
        tuple: ``(x_gen, y_gen)``, the transposed slices of ``x`` and ``y``
        for the current batch.

    Raises:
        ValueError: On first iteration, if ``batch_size`` is not positive or
            is larger than ``len(x)`` (no full batch could ever be produced).
    """
    if batch_size <= 0 or batch_size > len(x):
        # Without this guard the endless loop below would spin forever
        # without ever yielding a batch.
        raise ValueError(
            'batch_size must be between 1 and len(x) = {}, got {}'.format(len(x), batch_size)
        )

    last_start = len(x) - batch_size
    # infinite while
    while True:
        # ``start`` is already an element offset (it advances by batch_size),
        # so the batch ends at start + batch_size.
        for start in range(0, last_start + 1, batch_size):
            stop = start + batch_size
            yield x[start:stop].T, y[start:stop].T

def rand_batch_gen(x, y, batch_size):
    """Endlessly yield batches made of randomly sampled examples.

    For every batch, ``batch_size`` distinct row indices are drawn and the
    same indices are used to pick rows from both ``x`` and ``y``.

    Args:
        x: Inputs supporting ``len``, indexing with a list of indices and
            ``.T`` (e.g. a numpy array).
        y: Targets aligned with ``x``.
        batch_size: Number of examples per batch.

    Yields:
        tuple: ``(x_gen, y_gen)``, the transposed selections from ``x`` and
        ``y``.

    Raises:
        ValueError: From ``random.sample`` when ``batch_size`` is negative or
            larger than ``len(x)``.
    """
    # ``sample`` is not imported anywhere at file level, so bring it into
    # scope here; otherwise the first ``next()`` fails with a NameError.
    from random import sample

    while True:
        sample_idx = sample(range(len(x)), batch_size)
        yield x[sample_idx].T, y[sample_idx].T

def decode(sequence, lookup, separator=''):
    """Translate a sequence of ids into a single string via ``lookup``.

    Falsy ids are skipped (0 is used for padding); every other id is replaced
    by ``lookup[id]`` and the pieces are joined with ``separator``.
    """
    pieces = []
    for token_id in sequence:
        if not token_id:
            # padding entry, contributes nothing to the output
            continue
        pieces.append(lookup[token_id])
    return separator.join(pieces)

def load_data():
    """Read the pickled metadata and the two index arrays from disk.

    All three files are read from the relative folder
    'drive/My Drive/Data Exploration Project'.

    Returns:
        tuple: ``(metadata, idx_q, idx_a)`` - the object unpickled from
        metadata.pkl, the array stored in idxm.npy and the array stored in
        idxr.npy.
    """
    # NOTE: unpickling can execute arbitrary code, so metadata.pkl must come
    # from a trusted source.
    with open('drive/My Drive/Data Exploration Project/metadata.pkl', 'rb') as metadata_file:
        control_dicts = pickle.load(metadata_file)

    # The numpy arrays are loaded in the same order as they are returned.
    return (
        control_dicts,
        np.load('drive/My Drive/Data Exploration Project/idxm.npy'),
        np.load('drive/My Drive/Data Exploration Project/idxr.npy'),
    )

def initial_setup(data_corpus):
    """Load the dataset, split it and strip the padding from every part.

    Args:
        data_corpus: Not used by this function; the data location is fixed
            inside ``load_data``.

    Returns:
        tuple: ``(metadata, trainX, trainY, testX, testY, validX, validY)``
        where each of the six data parts is the result of
        ``tl.prepro.remove_pad_sequences`` applied to the part converted to
        nested lists.
    """
    metadata, idx_q, idx_a = load_data()
    train_part, test_part, valid_part = split_dataset(idx_q, idx_a)

    # Flatten the three (X, Y) pairs into trainX, trainY, testX, testY,
    # validX, validY and drop the padding from each one in that order.
    unpadded = [
        tl.prepro.remove_pad_sequences(part.tolist())
        for pair in (train_part, test_part, valid_part)
        for part in pair
    ]
    return (metadata, *unpadded)



if __name__ == "__main__":
    # Training script: builds the vocabulary bookkeeping, creates the Seq2seq
    # model, optionally restores saved weights and trains for `num_epochs`
    # epochs, printing sample replies for the seed sentences after each epoch.
    data_corpus = "cornell_corpus"

    # Data preprocessing: the data is split into train/test/valid parts and
    # the zero padding is removed again.
    metadata, trainX, trainY, testX, testY, validX, validY = initial_setup(data_corpus)

    # Parameters
    src_len = len(trainX)
    tgt_len = len(trainY)

    # There must be exactly as many messages as responses, otherwise fail early.
    assert src_len == tgt_len

    batch_size = 32
    n_step = src_len // batch_size
    src_vocab_size = len(metadata['index2word'])  # number of distinct words
    emb_dim = 1024  # embedding size of the word vectors

    word2idx = metadata['wordindex']   # dict  word 2 index
    idx2word = metadata['index2word']   # list index 2 word

    unk_id = idx2word.index('unk')   # id used for out-of-vocabulary words
    pad_id = idx2word.index('_')     # id of the padding symbol

    # The two special tokens get the first two ids after the existing vocabulary.
    start_id = src_vocab_size
    end_id = src_vocab_size + 1

    word2idx.update({'start_id': start_id})
    word2idx.update({'end_id': end_id})
    idx2word = idx2word + ['start_id', 'end_id']

    src_vocab_size = tgt_vocab_size = src_vocab_size + 2

    num_epochs = 50  # number of training epochs
    vocabulary_size = src_vocab_size

    def inference(seed, top_n):
        """Generate a reply for `seed` with the current model.

        Args:
            seed: Input sentence; it is split on single spaces and words
                missing from `word2idx` are mapped to `unk_id`.
            top_n: Forwarded to the model call as `top_n`.

        Returns:
            list: The generated words up to (not including) 'end_id'.
        """
        # Switch to evaluation mode; the epoch loop switches back to train mode.
        model_.eval()
        seed_id = [word2idx.get(w, unk_id) for w in seed.split(" ")]
        sentence_id = model_(inputs=[[seed_id]], seq_length=20, start_token=start_id, top_n=top_n)
        sentence = []
        for w_id in sentence_id[0]:
            w = idx2word[w_id]
            if w == 'end_id':
                break
            sentence.append(w)
        return sentence

    decoder_seq_length = 20
    model_ = Seq2seq(
        decoder_seq_length=decoder_seq_length,
        cell_enc=tf.keras.layers.GRUCell,
        cell_dec=tf.keras.layers.GRUCell,
        n_layer=3,
        n_units=256,
        embedding_layer=tl.layers.Embedding(vocabulary_size=vocabulary_size, embedding_size=emb_dim),
    )

    # Try to load model to continue training
    # NOTE(review): the loop below saves 'model_epoche{epoch}.npz' and
    # 'model_finished.npz' but never 'model_epoche.npz', so this only resumes
    # if that file was created by other means - confirm the intended name.
    try:
        load_weights = tl.files.load_npz(name='drive/My Drive/Data Exploration Project/model_epoche.npz')
        tl.files.assign_weights(load_weights, model_)
    except FileNotFoundError:
        # No checkpoint available: start training from freshly initialised weights.
        pass

    optimizer = tf.optimizers.Adam(learning_rate=0.001)
    model_.train()

    seeds = ["happy birthday have a nice day",
             "donald trump won last nights presidential debate according to snap online polls"]
    for epoch in range(num_epochs):
        # inference() leaves the model in eval mode, so re-enable train mode.
        model_.train()
        trainX, trainY = shuffle(trainX, trainY, random_state=0)
        total_loss, n_iter = 0, 0
        for X, Y in tqdm(tl.iterate.minibatches(inputs=trainX, targets=trainY, batch_size=batch_size, shuffle=False),
                         total=n_step, desc='Epoch[{}/{}]'.format(epoch + 1, num_epochs), leave=False):

            X = tl.prepro.pad_sequences(X)
            # Targets: Y with end_id added; decoder inputs: Y with start_id
            # added; both padded to decoder_seq_length.
            _target_seqs = tl.prepro.sequences_add_end_id(Y, end_id=end_id)
            _target_seqs = tl.prepro.pad_sequences(_target_seqs, maxlen=decoder_seq_length)
            _decode_seqs = tl.prepro.sequences_add_start_id(Y, start_id=start_id, remove_last=False)
            _decode_seqs = tl.prepro.pad_sequences(_decode_seqs, maxlen=decoder_seq_length)
            _target_mask = tl.prepro.sequences_get_mask(_target_seqs)

            with tf.GradientTape() as tape:
                ## compute outputs
                output = model_(inputs=[X, _decode_seqs])

                output = tf.reshape(output, [-1, vocabulary_size])
                ## compute loss
                loss = cross_entropy_seq_with_mask(logits=output, target_seqs=_target_seqs, input_mask=_target_mask)

            ## update model
            # Done outside the tape context so the gradient and optimizer ops
            # are not themselves recorded by the tape.
            grad = tape.gradient(loss, model_.all_weights)
            optimizer.apply_gradients(zip(grad, model_.all_weights))

            total_loss += loss
            n_iter += 1
        tl.files.save_npz(model_.all_weights, name=f'drive/My Drive/Data Exploration Project/model_epoche{epoch}.npz')

        # printing average loss after every epoch
        # (max() avoids a ZeroDivisionError when no full batch was produced)
        print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, num_epochs, total_loss / max(n_iter, 1)))

        for seed in seeds:
            print("Query >", seed)
            top_n = 3
            for i in range(top_n):
                sentence = inference(seed, top_n)
                print(" >", ' '.join(sentence))

        # NOTE(review): this runs inside the epoch loop, so 'model_finished.npz'
        # is overwritten after every epoch and always holds the latest weights.
        tl.files.save_npz(model_.all_weights, name='drive/My Drive/Data Exploration Project/model_finished.npz')

[TL] Embedding embedding_3: (63794, 1024)
[TL] RNN rnn_13: cell: GRUCell, n_units: 256
[TL] RNN rnn_14: cell: GRUCell, n_units: 256
[TL] RNN rnn_15: cell: GRUCell, n_units: 256
[TL] RNN rnn_16: cell: GRUCell, n_units: 256
[TL] RNN rnn_17: cell: GRUCell, n_units: 256
[TL] RNN rnn_18: cell: GRUCell, n_units: 256
[TL] Reshape reshape_7
[TL] Dense  dense_3: 63794 No Activation
[TL] Reshape reshape_8
[TL] Reshape reshape_9




Epoch[1/50]:   0%|          | 0/3728 [00:00<?, ?it/s][A[A

Epoch[1/50]:   0%|          | 1/3728 [00:00<38:32,  1.61it/s][A[A

Epoch[1/50]:   0%|          | 2/3728 [00:01<38:26,  1.62it/s][A[A

Epoch[1/50]:   0%|          | 3/3728 [00:01<38:29,  1.61it/s][A[A

Epoch[1/50]:   0%|          | 4/3728 [00:02<37:59,  1.63it/s][A[A

Epoch[1/50]:   0%|          | 5/3728 [00:03<38:20,  1.62it/s][A[A

Epoch[1/50]:   0%|          | 6/3728 [00:03<37:50,  1.64it/s][A[A

Epoch[1/50]:   0%|          | 7/3728 [00:04<37:54,  1.64it/s][A[A

Epoch[1/50]:   0%|          | 8/3728 [00:04<38:04,  1.63it/s][A[A

Epoch[1/50]:   0%|          | 9/3728 [00:05<38:23,  1.61it/s][A[A

Epoch[1/50]:   0%|          | 10/3728 [00:06<38:25,  1.61it/s][A[A

Epoch[1/50]:   0%|          | 11/3728 [00:06<37:14,  1.66it/s][A[A

Epoch[1/50]:   0%|          | 12/3728 [00:07<38:03,  1.63it/s][A[A

Epoch[1/50]:   0%|          | 13/3728 [00:07<38:20,  1.61it/s][A[A

Epoch[1/50]:   0%|          | 14/372

In [None]:
# Mount Google Drive into the Colab runtime at '/content/gdrive'.
# NOTE(review): the training cell reads and writes files under the relative
# path 'drive/My Drive/...'; confirm that this mount point matches those paths
# and that the mount happens before the training cell is run.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
