In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals
#!pip install underthesea
# Ask Colab for TensorFlow 2.x. Any exception raised by the magic is ignored,
# so the cell keeps going with whatever TensorFlow version is installed.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
# The notebook refuses to run without a GPU: the assert below fails when
# TensorFlow reports no GPU devices.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# Enable on-demand memory growth on the first GPU only (other GPUs, if any,
# are left with TensorFlow's default allocation behaviour).
tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [0]:
# Install the Vietnamese NLP toolkit into the Colab runtime (unpinned version).
!pip install underthesea
import unicodedata
import re
import numpy as np
import os
import io
import time
import underthesea 
from underthesea import word_tokenize
import nltk
# NLTK resources: 'stopwords' feeds stopword_set below; 'punkt' is presumably
# what nltk.tokenize.word_tokenize needs -- TODO confirm for the NLTK version in use.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
# English stop words removed from glosses by remove_eng_stopword.
stopword_set = stopwords.words('english')
# NOTE(review): this imports from standalone `keras` while the models use
# `tf.keras` -- confirm both resolve to compatible implementations.
from keras.initializers import Constant
from google.colab import drive
# Mount Google Drive; every path below lives under the mount point.
drive.mount('/content/gdrive')
# Working directory for this experiment (the process chdir()s into it).
dirr='/content/gdrive/My Drive/ColabNotebooks/my_text_generation/models/trying7'
os.chdir(dirr)
# Tab-separated corpus read by create_dataset (three fields per line).
path_to_file = '/content/gdrive/My Drive/ColabNotebooks/corpus/paralle dataset/dataset.txt'

In [0]:
def preprocess_sentence(w):
    """Normalise one raw text field.

    The text is lower-cased and stripped, then rewritten by four regex
    substitutions applied in this order:

    1. surround each of ``? . ! , ¿`` with spaces;
    2. insert a space before every apostrophe;
    3. collapse each run of spaces and/or double quotes into one space;
    4. delete parenthesised spans (plus one optional leading space).

    Args:
        w: the string to normalise.

    Returns:
        The normalised string. It is not stripped again, so it may end with
        a space when the input ended with punctuation or a double quote.
    """
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r"([\'])", r" \1"),
        (r'[" "]+', " "),
        (r" ?\([^)]+\)", ""),
    )
    text = w.lower().strip()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
def vie_preprocess(w):
    """Segment a Vietnamese sentence and wrap it in sequence markers.

    Args:
        w: the Vietnamese sentence.

    Returns:
        ``'<start> ' + segmented + ' <end>'`` where ``segmented`` is the
        string returned by underthesea's ``word_tokenize(w, format='text')``.
    """
    segmented = word_tokenize(w, format='text')
    return ' '.join(('<start>', segmented, '<end>'))
def remove_eng_stopword(w,stopword_set):
    """Drop stop words from an English string.

    Args:
        w: the text to filter; it is split with ``nltk.tokenize.word_tokenize``.
        stopword_set: container of tokens to remove (membership is tested
            with ``in``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    tokens = nltk.tokenize.word_tokenize(w)
    return ' '.join(token for token in tokens if token not in stopword_set)

In [0]:
def create_dataset(path, num_examples):
    """Read the tab-separated corpus and return its three cleaned columns.

    Each line must hold exactly three tab-separated fields: English word,
    English gloss, Vietnamese text. Every field goes through
    ``preprocess_sentence``; then
      * the word has all spaces removed,
      * the gloss keeps only the part before the first ``;`` and loses its
        stop words,
      * the Vietnamese text is segmented and wrapped by ``vie_preprocess``.

    Args:
        path: path of the UTF-8 corpus file.
        num_examples: number of leading lines to use (``None`` = all lines).

    Returns:
        ``zip(*rows)``: an iterator that yields three tuples (words, glosses,
        Vietnamese sentences), meant to be unpacked into three names.

    Raises:
        ValueError: if a used line does not split into exactly three fields.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the previous version left it to the garbage collector).
    with io.open(path, encoding='UTF-8') as corpus:
        lines = corpus.read().strip().split('\n')
    rows = [[preprocess_sentence(field) for field in line.split('\t')]
            for line in lines[:num_examples]]
    obs = [[re.sub(r' ', '', word),
            remove_eng_stopword(gloss.split(';')[0], stopword_set),
            vie_preprocess(vie)]
           for word, gloss, vie in rows]
    return zip(*obs)
def tokenize(lang,vocab_size):
    """Fit a Keras ``Tokenizer`` on ``lang`` and trim its vocabulary maps.

    The tokenizer is created with ``vocab_size`` as its first argument and
    with character filtering disabled (``filters=''``). After fitting, its
    ``word_index`` and ``index_word`` dictionaries are replaced by copies
    that keep only entries whose index is ``<= vocab_size``.

    Args:
        lang: iterable of texts to fit on.
        vocab_size: largest word index to keep.

    Returns:
        The fitted tokenizer with trimmed lookup dictionaries.
    """
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(vocab_size, filters='')
    lang_tokenizer.fit_on_texts(lang)
    lang_tokenizer.word_index = {
        token: index
        for token, index in lang_tokenizer.word_index.items()
        if index <= vocab_size
    }
    lang_tokenizer.index_word = {
        index: token
        for index, token in lang_tokenizer.index_word.items()
        if index <= vocab_size
    }
    return lang_tokenizer
def padding(text,tokenizer):
    """Convert texts to one post-padded tensor of token ids.

    Args:
        text: the texts to encode.
        tokenizer: object providing ``texts_to_sequences``.

    Returns:
        The result of ``tf.convert_to_tensor`` on the id sequences after
        ``pad_sequences(..., padding='post')``.
    """
    id_sequences = tokenizer.texts_to_sequences(text)
    padded = tf.keras.preprocessing.sequence.pad_sequences(id_sequences, padding='post')
    return tf.convert_to_tensor(padded)
def in_out_split(sens):
    """Build shifted (input, output) sentence pairs.

    Each sentence is split on whitespace. The "input" variant drops the last
    token and the "output" variant drops the first token; both are re-joined
    with single spaces.

    Args:
        sens: iterable of sentences (strings).

    Returns:
        A pair ``(inputs, outputs)`` of tuples with one string per sentence.
    """
    token_lists = [sentence.split() for sentence in sens]
    inputs = tuple(' '.join(tokens[:-1]) for tokens in token_lists)
    outputs = tuple(' '.join(tokens[1:]) for tokens in token_lists)
    return inputs, outputs
def load_dataset(path,num_saple=None):
    """Load the corpus at ``path`` and turn it into padded id tensors.

    Args:
        path: corpus file passed on to ``create_dataset``.
        num_saple: number of leading corpus lines to use (``None`` = all).

    Returns:
        ``(word_tensor, gloss_tensor, vie_tensor, vie_tokenizer,
        eng_tokenizer)``. Words and glosses share ``eng_tokenizer``, which is
        fitted on both columns and limited by ``ENG_VOCABULARY_SIZE``;
        ``vie_tokenizer`` is limited by ``VIE_VOCABULARY_SIZE``.
    """
    # Honour the caller's path (previously the global `path_to_file` was read
    # here and the `path` argument was silently ignored).
    word, gloss, vie = create_dataset(path, num_saple)
    vie_tokenizer = tokenize(vie, VIE_VOCABULARY_SIZE)
    # `word` and `gloss` are tuples, so `+` concatenates the two columns.
    eng_tokenizer = tokenize(word + gloss, ENG_VOCABULARY_SIZE)
    word_tensor = padding(word, eng_tokenizer)
    gloss_tensor = padding(gloss, eng_tokenizer)
    vie_tensor = padding(vie, vie_tokenizer)
    return word_tensor, gloss_tensor, vie_tensor, vie_tokenizer, eng_tokenizer

In [0]:
# --- Data ---------------------------------------------------------------
NUM_SAMPLE = 150000                         # corpus lines requested from load_dataset
BATCH_SIZE = 128                            # examples per training batch
BUFFER_SIZE = NUM_SAMPLE                    # shuffle buffer size
steps_per_epoch = NUM_SAMPLE // BATCH_SIZE  # batches taken from the dataset per epoch

# --- Vocabulary ---------------------------------------------------------
ENG_VOCABULARY_SIZE = 15000                 # cap for the shared word/gloss tokenizer
VIE_VOCABULARY_SIZE = 20000                 # cap for the Vietnamese tokenizer

# --- Model --------------------------------------------------------------
EMBEDDING_DIM = 300                         # embedding width of encoder and decoder
UNITS = 1024                                # GRU units in encoder and decoder

In [0]:
# Encode the corpus, then build the training pipeline one stage at a time:
# slice into examples -> shuffle -> fixed-size batches (short tail dropped).
word_tensor, gloss_tensor, vie_tensor, vie_tokenizer, eng_tokenizer = load_dataset(
    path_to_file, NUM_SAMPLE)
dataset = tf.data.Dataset.from_tensor_slices((word_tensor, gloss_tensor, vie_tensor))
dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [0]:
def load_word_embedding(filename="cc.en.300.vec",DIR='/content/gdrive/My Drive/ColabNotebooks/word_embedding'):
    """Read a text word-vector file into a ``{word: vector}`` dictionary.

    Each data line is ``word v1 v2 ... vn`` separated by single spaces.
    fastText ``.vec`` files start with a header line ``<count> <dim>``; when
    the first line consists of exactly two integers it is treated as that
    header and skipped, so it is not stored as a bogus one-element "vector".

    Args:
        filename: name of the vector file inside ``DIR``.
        DIR: directory containing the file.

    Returns:
        dict mapping each word to a ``float32`` NumPy array.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    embeddings_index = {}
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(os.path.join(DIR, filename), encoding="UTF-8") as f:
        for line_no, line in enumerate(f):
            values = line.rstrip().split(' ')
            is_header = (line_no == 0 and len(values) == 2
                         and values[0].isdigit() and values[1].isdigit())
            if is_header:
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index
def embedding_vocabulary(embedding_index,v,embedding_dim=300):
    """Build an embedding matrix aligned with a tokenizer vocabulary.

    The matrix has ``len(v) + 1`` rows (row 0 is not assigned from ``v``
    when indices start at 1) and is initialised uniformly in
    ``[-0.25, 0.25)``. Row ``i`` is overwritten with the pretrained vector of
    the word whose index is ``i`` when such a vector exists *and* has
    exactly ``embedding_dim`` components. Any other word keeps its random row
    and is counted as not found; previously a length-1 vector was silently
    broadcast over the whole row and other lengths raised ``ValueError``.

    Args:
        embedding_index: dict ``word -> vector`` of pretrained embeddings.
        v: dict ``word -> row index`` (e.g. a tokenizer's ``word_index``).
        embedding_dim: number of columns of the matrix (default 300).

    Returns:
        NumPy array of shape ``(len(v) + 1, embedding_dim)``.
    """
    words_not_found = []
    vocab = len(v) + 1
    embedding_matrix = np.random.uniform(-0.25, 0.25, size=(vocab, embedding_dim))
    for word, i in v.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None and len(embedding_vector) == embedding_dim:
            embedding_matrix[i] = embedding_vector
        else:
            words_not_found.append(word)
    print("\tShape of embedding matrix: %s" % str(embedding_matrix.shape))
    print("\tNo. of words not found in GloVe: ", len(words_not_found))
    return embedding_matrix

# Load the pretrained English vectors from load_word_embedding's default file
# and directory.
eng_embedding_index=load_word_embedding()
# Matrix indexed by the English tokenizer's word indices; the Encoder reads
# this global as the constant initializer of its embedding layer.
eng_weights_matrix=embedding_vocabulary(eng_embedding_index,eng_tokenizer.word_index)

In [0]:
class Encoder(tf.keras.Model):
  """GRU encoder over a blend of word and gloss embeddings.

  The embedding layer has ``vocab_size + 1`` rows and is initialised from the
  module-level ``eng_weights_matrix``, which must already exist when the
  class is instantiated.
  """

  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    """Create the embedding and GRU layers.

    Args:
      vocab_size: vocabulary size; the embedding gets one extra row.
      embedding_dim: embedding width.
      enc_units: number of GRU units.
      batch_sz: batch size used by ``initialize_hidden_state``.
    """
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(
        vocab_size + 1,
        embedding_dim,
        embeddings_initializer=Constant(eng_weights_matrix))
    self.gru = tf.keras.layers.GRU(
        self.enc_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')

  def call(self, x1, x2, hidden):
    """Encode a word (``x1``) together with its gloss (``x2``).

    Both id tensors go through the shared embedding. The gloss embeddings
    are averaged over axis 1 and given back a length-1 axis there; the GRU
    input is the element-wise mean of [word, word, gloss-average], so the
    word counts twice as much as the gloss.

    NOTE(review): averaging the list requires the word embeddings and the
    gloss average to have the same shape -- presumably each word is a single
    token; confirm against the data.

    Returns:
      ``(output, state)`` from the GRU started at ``hidden``.
    """
    word_emb = self.embedding(x1)
    gloss_emb = self.embedding(x2)
    gloss_avg = tf.expand_dims(tf.math.reduce_mean(gloss_emb, 1), axis=1)
    blended = tf.math.reduce_mean([word_emb, word_emb, gloss_avg], 0)
    output, state = self.gru(blended, initial_state=hidden)
    return output, state

  def initialize_hidden_state(self):
    """Return an all-zero initial state of shape ``(batch_sz, enc_units)``."""
    state_shape = (self.batch_sz, self.enc_units)
    return tf.zeros(state_shape)

In [0]:
class Decoder(tf.keras.Model):
  """GRU decoder that emits vocabulary logits (no attention)."""

  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    """Create the embedding, GRU and output projection.

    Args:
      vocab_size: size of the output layer; the embedding gets one extra row.
      embedding_dim: embedding width.
      dec_units: number of GRU units.
      batch_sz: stored on the instance; not read by this class.
    """
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size + 1, embedding_dim)
    self.gru = tf.keras.layers.GRU(
        self.dec_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

  def call(self, x, hidden, enc_output):
    """Run the decoder on input ids ``x`` starting from ``hidden``.

    ``enc_output`` is accepted for interface compatibility but is not used.

    Returns:
      ``(logits, state)``: the GRU output sequence flattened to two
      dimensions (last dimension kept) and projected by the dense layer,
      plus the GRU's final state.
    """
    embedded = self.embedding(x)
    gru_out, state = self.gru(embedded, initial_state=hidden)
    flattened = tf.reshape(gru_out, (-1, gru_out.shape[2]))
    logits = self.fc(flattened)
    return logits, state

In [0]:
def loss_function(real, pred):
  """Cross-entropy that ignores padding targets.

  ``loss_object`` (module level) yields one loss per element; entries whose
  target id equals 0 are multiplied by zero. The mean is then taken over
  *all* entries, zeroed ones included.

  Args:
    real: target ids.
    pred: predictions passed to ``loss_object``.

  Returns:
    Scalar tensor with the masked mean loss.
  """
  per_example = loss_object(real, pred)
  is_real_token = tf.math.logical_not(tf.math.equal(real, 0))
  per_example = per_example * tf.cast(is_real_token, dtype=per_example.dtype)
  return tf.reduce_mean(per_example)

In [0]:
# Model: English encoder and Vietnamese decoder share width and batch size.
encoder = Encoder(vocab_size=ENG_VOCABULARY_SIZE,
                  embedding_dim=EMBEDDING_DIM,
                  enc_units=UNITS,
                  batch_sz=BATCH_SIZE)
decoder = Decoder(vocab_size=VIE_VOCABULARY_SIZE,
                  embedding_dim=EMBEDDING_DIM,
                  dec_units=UNITS,
                  batch_sz=BATCH_SIZE)

# Optimisation: unreduced cross-entropy on logits (loss_function applies the
# padding mask and the reduction).
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                            reduction='none')

# Running metrics, reset at the start of every epoch by the training loop.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

# Checkpointing of optimizer, encoder and decoder state.
checkpoint_dir = '/content/gdrive/My Drive/ColabNotebooks/my_text_generation/models/trying7.1'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

In [0]:
@tf.function
def train_step(inp1,inp2, targ, enc_hidden):
  """Run one teacher-forced optimisation step on a batch.

  Args:
    inp1: word id tensor for the encoder.
    inp2: gloss id tensor for the encoder.
    targ: target id tensor; column 0 is expected to hold ``<start>``.
    enc_hidden: initial encoder state.

  Returns:
    The batch loss: summed per-step losses divided by ``targ.shape[1]``.

  Side effects:
    Applies one optimizer update to encoder and decoder variables, adds the
    batch loss to ``train_loss`` and updates ``train_accuracy`` with every
    non-padding target token of the batch.
  """
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp1, inp2, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([vie_tokenizer.word_index['<start>']] * BATCH_SIZE, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # enc_output is passed through although the decoder does not use it
      predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_output)
      step_target = targ[:, t]

      loss += loss_function(step_target, predictions)

      # Score every time step and weight padding (id 0) with zero. The
      # previous code scored only the final step, which is padding for
      # almost every sequence, so the reported accuracy was meaningless.
      not_padding = tf.cast(tf.math.not_equal(step_target, 0), tf.float32)
      train_accuracy.update_state(step_target, predictions,
                                  sample_weight=not_padding)

      # using teacher forcing
      dec_input = tf.expand_dims(step_target, 1)

  batch_loss = (loss / int(targ.shape[1]))
  # Record the normalised loss so the epoch summary is on the same scale as
  # the per-batch log lines (the raw sum over time steps was recorded before).
  train_loss(batch_loss)

  variables = encoder.trainable_variables + decoder.trainable_variables

  # Gradients are taken from the un-normalised sum, as before.
  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [0]:
# Number of passes over the dataset.
EPOCHS =20
for epoch in range(EPOCHS):
  # Start every epoch with fresh running metrics.
  train_loss.reset_states()
  train_accuracy.reset_states()
  start = time.time()
  # The same all-zero encoder state is handed to every batch of the epoch.
  enc_hidden = encoder.initialize_hidden_state()
  # NOTE(review): total_loss is accumulated but never read in the visible code.
  total_loss = 0
  for (batch, (inp1,inp2, targ)) in enumerate(dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp1,inp2, targ, enc_hidden)
    total_loss += batch_loss
    # Progress line every 100 batches.
    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                     batch,
                                                     batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)
  # Epoch summary: whatever train_step accumulated in the two metrics.
  print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, 
                                                train_loss.result(), 
                                                train_accuracy.result()))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [0]:
max_length=40
def greedy(word,gloss):
    """Greedily decode Vietnamese text for an English word and its gloss.

    The inputs are normalised the way ``create_dataset`` normalises training
    data: both go through ``preprocess_sentence``; the gloss additionally
    loses its stop words and the word loses all spaces (so it is a single
    token). Tokens missing from ``eng_tokenizer.word_index`` are skipped
    instead of raising ``KeyError``; if nothing is left, a single padding id
    (0) is fed so the encoder still receives a non-empty sequence.

    Decoding starts from ``<start>`` and repeatedly feeds back the arg-max
    token. It stops after ``max_length`` steps, when ``<end>`` is predicted,
    or when the predicted id has no entry in ``vie_tokenizer.index_word``
    (e.g. the padding id 0, which previously raised ``KeyError``).

    Args:
        word: the English head word.
        gloss: the English gloss/definition.

    Returns:
        The decoded tokens, each followed by one space (``''`` if none).
    """
    def encode(tokens):
        # Known tokens -> ids; fall back to one padding id when none is known.
        ids = [eng_tokenizer.word_index[tok] for tok in tokens
               if tok in eng_tokenizer.word_index]
        padded = tf.keras.preprocessing.sequence.pad_sequences([ids or [0]],
                                                               padding='post')
        return tf.convert_to_tensor(padded)

    gloss = remove_eng_stopword(preprocess_sentence(gloss), stopword_set)
    word = re.sub(r' ', '', preprocess_sentence(word))
    gloss_ids = encode(gloss.split(' '))
    word_ids = encode([word])
    result = ''

    hidden = [tf.zeros((1, UNITS))]
    enc_out, enc_hidden = encoder(word_ids, gloss_ids, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([vie_tokenizer.word_index['<start>']], 0)

    for _ in range(max_length):
        predictions, dec_hidden = decoder(dec_input, dec_hidden, enc_out)
        predicted_id = tf.argmax(predictions[0]).numpy()
        token = vie_tokenizer.index_word.get(predicted_id)
        if token is None or token == '<end>':
            return result
        result += token + ' '

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result

In [0]:
# Restoring the latest checkpoint in checkpoint_dir.
# The restore code below is wrapped in a string literal, so it is NOT executed;
# remove the surrounding triple quotes to actually restore the weights.
'''print(checkpoint_dir)
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))'''