In [9]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [10]:
cd 'gdrive/MyDrive/MSc DSML//UCL/Main/COMP0090 CW + Lab/Assignment 2/Task 3 checkpoints/'

/content/gdrive/MyDrive/MSc DSML/UCL/Main/COMP0090 CW + Lab/Assignment 2/Task 3 checkpoints


In [11]:
!ls

In [12]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
print(tf.__version__)

Instructions for updating:
non-resource variables are not supported in the long term
2.3.0


In [13]:
tf.test.gpu_device_name()

'/device:GPU:0'

In [14]:
import numpy as np
import urllib.request
import zipfile
import os
import random
import time
import matplotlib.pyplot as plt
from google.colab import files

In [15]:
# Download the Stanford Sentiment Treebank archive into the working directory.
url = "https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip"
try:
    local_filename, headers = urllib.request.urlretrieve(url, "trainDevTestTrees_PTB.zip")
except Exception as e:
    # Report the failure but do not swallow it: the print below and the
    # extraction cell both need local_filename, so carrying on would only
    # surface later as a misleading NameError.
    print(e)
    raise
print(local_filename)

trainDevTestTrees_PTB.zip


In [16]:
# Unpack the downloaded archive into its own folder, then show what it contained
with zipfile.ZipFile(local_filename, mode='r') as archive:
    archive.extractall(path="trainDevTestTrees_PTB")
os.listdir("trainDevTestTrees_PTB")

['trees']

In [20]:
# Load Data
def loadsst(path):
  """Read a bracketed tree file (one tree per line) into tokens and labels.

  For every non-blank line the first whitespace-separated chunk, with its
  leading "(" removed, is taken as the label. Every later chunk that ends
  with ")" is kept as a token after stripping its trailing ")" characters.
  A "\n" token is appended to each token list.

  Args:
    path: path of the text file to read.

  Returns:
    (xs, ys): xs is a list of token lists, ys the list of label strings.
  """
  xs = []
  ys = []
  # The context manager closes the handle even if parsing raises.
  with open(path, "r") as file:
    for line in file:
      soup = line.split()
      if not soup:
        continue # Blank line: nothing to parse
      ys.append(soup[0].lstrip("("))
      tokens = [chunk.rstrip(")") for chunk in soup[1:] if chunk.endswith(")")]
      tokens.append("\n") # Retain enter character at the end of each line
      xs.append(tokens)
  return (xs, ys)

In [21]:
# Only the token lists are needed for character modelling; the labels are discarded.
ssttrainxs, sstvalidxs, ssttestxs = (
    loadsst(split_path)[0]
    for split_path in (
        "trainDevTestTrees_PTB/trees/train.txt",
        "trainDevTestTrees_PTB/trees/dev.txt",
        "trainDevTestTrees_PTB/trees/test.txt",
    )
)

In [22]:
def get_everything_in_one_char_str(str_set,chars_needed):
  """Flatten token lists into one string and keep only the allowed characters.

  Tokens of each sentence are joined with single spaces, the sentences are
  joined with single spaces as well, leading whitespace of the whole string
  is removed, and finally every character not in `chars_needed` is dropped.

  Args:
    str_set: iterable of token lists (lists of strings).
    chars_needed: collection of single characters to keep.

  Returns:
    The filtered text as a single string ("" for an empty `str_set`).
  """
  allowed = frozenset(chars_needed) # O(1) membership test per character
  # One join instead of repeated "+=" keeps this linear in the text length.
  final_str = " ".join(" ".join(tokens) for tokens in str_set).lstrip()
  return ''.join(char for char in final_str if char in allowed)

In [23]:
# Whitelist passed to get_everything_in_one_char_str: every character of the
# corpus that is not listed here is dropped from the text.
# NOTE(review): the uppercase letters G, K, Q, X, Z and all digits are absent —
# confirm this is the intended character set.
chars_needed = ['\n', ' ', '!', '\'', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

In [24]:
# Padding character: appended to text_total so it becomes part of the
# vocabulary, and used by predict_setences to right-pad prompts shorter than seq_len.
end_char = '_'

In [25]:
# Flatten every split into a single filtered character stream
text_train, text_val, text_test = (
    get_everything_in_one_char_str(split_tokens, chars_needed)
    for split_tokens in (ssttrainxs, sstvalidxs, ssttestxs)
)
# All splits plus the padding character; used only to derive the vocabulary
text_total = "".join([text_train, text_val, text_test, end_char])

In [26]:
# Vocabulary: the distinct characters of the corpus in sorted order
unique_char_list = sorted(set(text_total))

In [27]:
# Hyper-parameters
batch_size = 1024              # fixed batch dimension of the model placeholders
seq_len = 32                   # number of input characters per sample
learning_rate = 0.01           # base learning rate handed to train_model
lambd = 0.05                   # weight of the L2 penalty added to the training loss
keep_prob = 0.9                # dropout keep probability on RNN outputs while training
temperature = 0.5              # sampling temperature used when generating text
convergence_threshold = 0.0001 # stop when the train loss changes by less than this between epochs
max_epoch = 100                # upper bound on the number of training epochs
class_count = len(unique_char_list) # vocabulary size = one-hot width = output size
## Choice of recurrent cell and optimizer (accepted values listed on the right)
RNN_layer_option = "LSTM" # "Vanilla","LSTM","GRU"
optimizer_option = "Adam" # "Adam","SGD","RMSProp","Adagrad"
## Recurrent stack: one entry of rnn_units per layer
rnn_layers = 2
rnn_units = [128,128]
assert rnn_layers == len(rnn_units)
## Dense stack after the RNN: hidden sizes first, the last entry is the output size
depths = 2
layer_sizes = [128,class_count]
assert depths == len(layer_sizes)

In [28]:
def prepare_data(text,seq_len):
  """Cut `text` into overlapping windows of seq_len+1 characters.

  One window is produced per start offset in range(len(text)-seq_len-1),
  in increasing offset order. With that bound the window that would end on
  the very last character of `text` is not produced.

  Args:
    text: the character stream to slice.
    seq_len: number of input characters per window (the window holds one more).

  Returns:
    List of substrings, each of length seq_len+1.
  """
  window = seq_len + 1
  return [text[start:start + window] for start in range(len(text) - window)]

In [29]:
# Slice each split into (seq_len + 1)-character windows
train_seq, val_seq, test_seq = (
    prepare_data(split_text, seq_len)
    for split_text in (text_train, text_val, text_test)
)

In [30]:
print(len(train_seq),len(val_seq),len(test_seq))

899602 116853 232862


In [31]:
def trunc(seqs,batch_size):
  """Return the longest prefix of `seqs` whose length is a multiple of `batch_size`."""
  leftover = len(seqs) % batch_size # items that do not fill a whole batch
  return seqs[:len(seqs) - leftover]

In [32]:
# Drop the trailing windows of every split so each one holds only full batches
train_seq, val_seq, test_seq = (
    trunc(split_seq, batch_size)
    for split_seq in (train_seq, val_seq, test_seq)
)

In [33]:
print(len(train_seq),len(val_seq),len(test_seq))

899072 116736 232448


In [34]:
def onehot(x,y,seqs,unique_char_list):
  """Fill `x` with one-hot inputs and `y` with target indices, in place.

  For sequence number i, all characters but the last are the input and the
  last character is the target: x[i, j, k] is set to 1 where k is the
  position of the j-th input character in `unique_char_list`, and y[i] is
  set to the position of the target character.

  Args:
    x: array indexed as [sequence, position, character]; expected to be zeroed by the caller.
    y: array indexed as [sequence].
    seqs: iterable of non-empty strings.
    unique_char_list: ordered vocabulary; positions in it are the class ids.

  Returns:
    The same `x` and `y` objects, after filling.

  Raises:
    ValueError: if a character of `seqs` is not in `unique_char_list`.
  """
  # Build the character -> index table once instead of a linear list.index()
  # per character; setdefault keeps the first position, as list.index() does.
  char_to_index = {}
  for position, symbol in enumerate(unique_char_list):
    char_to_index.setdefault(symbol, position)

  try:
    for i,seq in enumerate(seqs):
      context = seq[:-1]
      target = seq[-1]
      for j,char in enumerate(context):
        x[i,j,char_to_index[char]] = 1 # Turn the encoding of a character at the found position (in the unique list) to 1
      y[i] = char_to_index[target]
  except KeyError as missing:
    # Keep the exception type list.index() used to raise for unknown characters.
    raise ValueError("{!r} is not in list".format(missing.args[0])) from None

  return x,y

In [35]:
def onehot_for_sampling(x,y,seqs,unique_char_list):
  """Fill `x` with one-hot codes of whole sequences and `y` with last-character indices, in place.

  Unlike a next-character training encoding, every character of a sequence
  (including the last one) is written to `x`: x[i, j, k] is set to 1 where
  k is the position of the j-th character in `unique_char_list`. y[i] is
  set to the position of the last character of sequence i.

  Args:
    x: array indexed as [sequence, position, character]; expected to be zeroed by the caller.
    y: array indexed as [sequence].
    seqs: iterable of non-empty strings.
    unique_char_list: ordered vocabulary; positions in it are the class ids.

  Returns:
    The same `x` and `y` objects, after filling.

  Raises:
    ValueError: if a character of `seqs` is not in `unique_char_list`.
  """
  # Build the character -> index table once instead of a linear list.index()
  # per character; setdefault keeps the first position, as list.index() does.
  char_to_index = {}
  for position, symbol in enumerate(unique_char_list):
    char_to_index.setdefault(symbol, position)

  try:
    for i,seq in enumerate(seqs):
      for j,char in enumerate(seq):
        x[i,j,char_to_index[char]] = 1 # Turn the encoding of a character at the found position (in the unique list) to 1
      y[i] = char_to_index[seq[-1]]
  except KeyError as missing:
    # Keep the exception type list.index() used to raise for unknown characters.
    raise ValueError("{!r} is not in list".format(missing.args[0])) from None

  return x,y

In [36]:
def build_model(rnn_units, class_count, rnn_layers, learning_rate, RNN_layer_option
                , batch_size, seq_len, optimizer_option, lambd, keep_prob, depths, layer_sizes):
  """Build the character-level RNN graph (TF1 graph mode) in the default graph.

  The network is a stack of recurrent cells with output dropout, followed by
  `depths-1` ReLU dense layers and a final linear dense layer that produces
  one logit per class for a single time step of every sequence.

  Args:
    rnn_units: list with the number of units of each recurrent layer.
    class_count: one-hot width of the input characters.
    rnn_layers: not used by this function (the stack depth is len(rnn_units)).
    learning_rate: not used by this function; the learning rate is fed per
      step through the "decayed_lr" placeholder.
    RNN_layer_option: "Vanilla", "LSTM" or "GRU".
    batch_size: fixed batch dimension of the placeholders.
    seq_len: fixed time dimension of the input placeholder.
    optimizer_option: "Adam", "SGD", "RMSProp" or "Adagrad".
    lambd: coefficient of the L2 penalty in the training loss.
    keep_prob: dropout keep probability applied to RNN outputs while the
      "is_training" placeholder is fed True (1.0 otherwise).
    depths: number of dense layers after the RNN (hidden layers + output layer).
    layer_sizes: sizes of those dense layers; the last entry is the output size.

  Returns:
    Dict of placeholders and ops. Note that "posterior_pmf" holds the raw
    output of the last dense layer (no softmax applied), "loss" is the
    regularized training objective, and "total_loss"/"total_matches" are
    per-batch sums without regularization.

  Raises:
    ValueError: if RNN_layer_option or optimizer_option is not one of the
      accepted values.
  """
  # Fail fast with a clear message instead of hitting an unbound `layer` /
  # `optimizer` further down.
  if RNN_layer_option not in ("Vanilla", "LSTM", "GRU"):
    raise ValueError("Unknown RNN_layer_option: {!r}".format(RNN_layer_option))
  if optimizer_option not in ("Adam", "SGD", "RMSProp", "Adagrad"):
    raise ValueError("Unknown optimizer_option: {!r}".format(optimizer_option))

  # Placeholders
  x = tf.placeholder(tf.float32, [batch_size, seq_len, class_count], name='placeholder_x')
  y = tf.placeholder(tf.int64, [batch_size], name='placeholder_y')
  seq_size = tf.placeholder(tf.int32, [batch_size], name='placeholder_seq_size')
  decayed_lr = tf.placeholder(tf.float64, name='placeholder_decayed_lr')
  is_training = tf.placeholder(tf.bool, name='placeholder_is_training')
  is_sampling = tf.placeholder(tf.bool, name='placeholder_is_sampling')
  sampling_index = tf.placeholder(tf.int32, name='placeholder_sampling_index')

  # Dropout is only active while training. The switch is built once and shared
  # by every layer, rather than rebinding keep_prob to a nested cond per layer.
  dropout_keep_prob = tf.cond(is_training, lambda:keep_prob, lambda:1.0)

  # RNN Cells
  layers = []
  for size in rnn_units:
    if RNN_layer_option == "Vanilla":
      layer = tf.nn.rnn_cell.BasicRNNCell(size)
    elif RNN_layer_option == "LSTM":
      layer = tf.nn.rnn_cell.LSTMCell(size)
    elif RNN_layer_option == "GRU":
      layer = tf.nn.rnn_cell.GRUCell(size)
    layers.append(tf.nn.rnn_cell.DropoutWrapper(cell=layer,output_keep_prob=dropout_keep_prob))
  cells = tf.nn.rnn_cell.MultiRNNCell(layers)
  output, final_state = tf.nn.dynamic_rnn(cell=cells, inputs=x, sequence_length=seq_size, dtype=tf.float32)
  # Time step whose RNN output feeds the dense layers: the fed sampling_index
  # while sampling, otherwise the last step.
  index = tf.cond(is_sampling, lambda:sampling_index, lambda:tf.constant(-1))
  output = tf.reshape(output[:,index,:],[batch_size,rnn_units[-1]])

  # Hidden Layers
  for i in range(depths-1):
    output = tf.compat.v1.layers.dense(output, layer_sizes[i], activation = 'relu')

  # Output layer: produces logits; softmax is applied inside the loss op
  pred = tf.compat.v1.layers.dense(output, layer_sizes[-1])

  # Loss: summed cross-entropy over the batch
  loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=pred))

  # L2 penalty over every trainable variable. It is part of `loss` (the
  # training objective) only; `total_loss` below is reported without it.
  l2_norms = [tf.nn.l2_loss(v) for v in tf.trainable_variables()]
  l2_norm = tf.reduce_sum(l2_norms)
  loss = (loss + lambd*l2_norm)/batch_size

  # Select optimizer
  if optimizer_option == "Adam":
    optimizer = tf.train.AdamOptimizer(learning_rate=decayed_lr)
  elif optimizer_option == "SGD":
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=decayed_lr)
  elif optimizer_option == "RMSProp":
    optimizer = tf.train.RMSPropOptimizer(learning_rate=decayed_lr)
  elif optimizer_option == "Adagrad":
    optimizer = tf.train.AdagradOptimizer(learning_rate=decayed_lr)

  train_op = optimizer.minimize(loss)

  # Evaluation
  prediction = tf.reshape(tf.argmax(pred, 1),[batch_size])
  expectation = y
  matches = tf.equal(prediction, expectation)
  accuracy = tf.reduce_mean(tf.cast(matches, tf.float32))
  total_loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=pred))
  total_matches = tf.reduce_sum(tf.cast(matches, tf.float32))

  return {
        "x": x
        , "y": y
        , "final_state": final_state
        , "loss": loss
        , "accuracy": accuracy
        , "total_loss": total_loss
        , "total_matches": total_matches
        , "optimizer": train_op
        , "sequence_size": seq_size
        , "prediction": prediction
        , "expectation": expectation
        , "posterior_pmf": pred
        , "decayed_lr":decayed_lr
        , "output": output
        , "is_training": is_training
        , "is_sampling": is_sampling
        , "sampling_index": sampling_index
      }

In [45]:
def train_model(session, model, train_seq, seq_len, batch_size, step, learning_rate, decay_rate=1):
  """Run one training pass over `train_seq` and return (mean loss, accuracy).

  Batches are consecutive slices of `train_seq` of size `batch_size`,
  visited in random order. Each batch is one-hot encoded with `onehot` using
  the notebook-level `class_count` and `unique_char_list`, then one
  optimizer step is run.

  Args:
    session: TF session holding the model variables.
    model: dict returned by build_model.
    train_seq: list of (seq_len + 1)-character strings.
    seq_len: number of input characters per sample.
    batch_size: batch size the model was built with.
    step: step counter used for learning-rate decay.
    learning_rate: base learning rate.
    decay_rate: multiplicative decay applied every 10 steps (1 = no decay).

  Returns:
    (sum of per-batch "total_loss" / len(train_seq),
     sum of per-batch "total_matches" / len(train_seq)).
  """
  decayed_lr = learning_rate*((decay_rate)**(step/10)) # Learning rate decay
  total_loss = 0.0
  total_matches = 0.0

  batch_starts = list(range(0,len(train_seq),batch_size))
  random.shuffle(batch_starts)

  # Every sample uses the full window, so the lengths vector is loop-invariant.
  full_lengths = np.array([seq_len]*batch_size, dtype=np.int32)

  for start in batch_starts:
    batch_train_seq = train_seq[start:start+batch_size]
    # Fresh zero buffers per batch: onehot only ever writes ones.
    trainx_zeros = np.zeros((batch_size,seq_len,class_count))
    trainy_zeros = np.zeros((batch_size))
    x, y = onehot(trainx_zeros,trainy_zeros,batch_train_seq,unique_char_list)
    input_dict = {
              model['x']: x
              , model['y']: y
              , model['sequence_size']: full_lengths
              , model['decayed_lr']: decayed_lr
              , model['is_training']: True
              , model['is_sampling']: False
              , model['sampling_index']: -1
            }
    # Only the values that are accumulated are fetched, next to the train op.
    _, loss, matches = session.run(fetches=[model["optimizer"],model["total_loss"],model["total_matches"]], feed_dict=input_dict)
    total_loss += loss
    total_matches += matches

  total_samples = len(train_seq)

  return total_loss/total_samples, total_matches/total_samples

In [38]:
def run_model(session, model, seq, seq_len, batch_size):
  """Evaluate the model on `seq` without training and return (mean loss, accuracy).

  Batches are consecutive slices of `seq` of size `batch_size`, one-hot
  encoded with `onehot` using the notebook-level `class_count` and
  `unique_char_list`. The model is run with is_training=False.

  Args:
    session: TF session holding the model variables.
    model: dict returned by build_model.
    seq: list of (seq_len + 1)-character strings.
    seq_len: number of input characters per sample.
    batch_size: batch size the model was built with.

  Returns:
    (sum of per-batch "total_loss" / len(seq),
     sum of per-batch "total_matches" / len(seq)).
  """
  total_loss = 0.0
  total_matches = 0.0

  # Every sample uses the full window, so the lengths vector is loop-invariant.
  full_lengths = np.array([seq_len]*batch_size, dtype=np.int32)

  for start in range(0,len(seq),batch_size):
    batch_seq = seq[start:start+batch_size]
    # Fresh zero buffers per batch: onehot only ever writes ones.
    x_zeros = np.zeros((batch_size,seq_len,class_count))
    y_zeros = np.zeros((batch_size))
    x, y = onehot(x_zeros,y_zeros,batch_seq,unique_char_list)
    input_dict = {
              model['x']: x
              , model['y']: y
              , model['sequence_size']: full_lengths
              , model['is_training']: False
              , model['is_sampling']: False
              , model['sampling_index']: -1
            }

    # Only the two accumulated values are fetched.
    loss, matches = session.run(fetches=[model["total_loss"],model["total_matches"]], feed_dict=input_dict)
    total_loss += loss
    total_matches += matches

  total_samples = len(seq)

  return total_loss/total_samples, total_matches/total_samples

In [39]:
def predict_posterior(session, model, x, y, seq_len, batch_size, sampling_index):
  """Run the model in sampling mode and return the output vector of the first batch row.

  The model is fed with is_training=False and is_sampling=True, so the time
  step given by `sampling_index` is the one read out. Only the
  "posterior_pmf" entry of `model` is fetched.

  Args:
    session: session object exposing run(fetches=..., feed_dict=...).
    model: dict returned by build_model.
    x, y: input and label batches to feed.
    seq_len: sequence length reported for every batch row.
    batch_size: number of rows in the batch.
    sampling_index: time step to read the prediction from.

  Returns:
    Row 0 of the fetched "posterior_pmf" value.
  """
  full_lengths = np.array([seq_len]*batch_size, dtype=np.int32)
  feed = {
      model['x']: x,
      model['y']: y,
      model['sequence_size']: full_lengths,
      model['is_training']: False,
      model['is_sampling']: True,
      model['sampling_index']: sampling_index,
  }

  fetched = session.run(fetches=[model["posterior_pmf"]], feed_dict=feed)
  batch_output = fetched[0] # the single fetch
  return batch_output[0]    # first row of the batch

In [40]:
def sampling(prediction_pmf,temperature,unique_char_list):
  """Draw one character from `unique_char_list` according to `prediction_pmf`.

  With temperature == 1 the distribution is used as given. Otherwise it is
  re-scaled as exp(log(p) / temperature) and re-normalized before drawing.

  Args:
    prediction_pmf: probabilities, one per entry of `unique_char_list`.
    temperature: scaling factor applied in log space.
    unique_char_list: candidate characters.

  Returns:
    The sampled character (as returned by np.random.choice).
  """
  if temperature != 1: # Control the scale of probabilities of different categories
    rescaled = np.exp(np.log(prediction_pmf)/temperature)
    prediction_pmf = rescaled/np.sum(rescaled) # Normalize

  # Randomly generate a character based on the (possibly re-scaled) distribution
  return np.random.choice(unique_char_list, p=prediction_pmf)

In [41]:
def predict_setences(seq_to_pred,model,session,unique_char_list,seq_len,batch_size,max_new_chars=None):
  """Extend `seq_to_pred` one sampled character at a time until a "." is produced.

  At every step the most recent `seq_len` characters of the text are one-hot
  encoded (the same row is written to every row of the batch), the model
  output for that window is turned into a probability distribution with a
  softmax, and the next character is drawn with `sampling`. Texts shorter
  than `seq_len` are right-padded with `end_char`, and the prediction is
  read at the position of the last real character.

  Reads the notebook-level `end_char`, `class_count` and `temperature`.

  Args:
    seq_to_pred: prompt string; its characters must be in `unique_char_list`.
    model: dict returned by build_model.
    session: TF session holding the model variables.
    unique_char_list: ordered vocabulary.
    seq_len: number of input characters the model consumes.
    batch_size: batch size the model was built with.
    max_new_chars: optional cap on the number of generated characters;
      None (default) keeps generating until "." is sampled.

  Returns:
    The prompt followed by the generated characters.
  """
  def _model_window(text):
    # Returns (exactly seq_len characters to feed, index of the step to read).
    if len(text) < seq_len:
      return text + (seq_len-len(text))*end_char, len(text) - 1 # Padding
    # Keep the LAST seq_len characters. Slicing seq_len+1 characters here
    # would drop the newest character, because only seq_len are encoded below.
    return text[-seq_len:], -1

  result = seq_to_pred
  sentence_generate, sampling_index = _model_window(result)
  generated = 0

  while True:
    x = np.zeros((batch_size,seq_len,class_count)) # Generate one-hot encoding matrix of the window, replicated over the batch
    dummy_y = np.zeros((batch_size))
    for i in range(seq_len):
      x[:,i,unique_char_list.index(sentence_generate[i])] = 1
    logits = predict_posterior(session, model, x, dummy_y, seq_len, batch_size, sampling_index) # Raw output-layer values
    # Softmax, shifted by the maximum so exp() cannot overflow
    exp_shifted = np.exp(logits - np.max(logits))
    prediction_pmf = exp_shifted/np.sum(exp_shifted)
    next_char = sampling(prediction_pmf,temperature,unique_char_list)
    result += next_char
    generated += 1
    if next_char == ".":
      break
    if max_new_chars is not None and generated >= max_new_chars:
      break
    sentence_generate, sampling_index = _model_window(result)

  return result

In [42]:
seq_to_pred = "I have a pen, I have an app "

In [47]:
restart = True

# Checkpoints are saved as <model_prefix><epoch>, to which TensorFlow appends
# its own file suffixes.
model_prefix = 'task3_model_v4.2_checkpoint_'

if restart:
  latest_epoch = 0
  starting_epoch = 0
else:
  # Get the latest checkpoint.
  # The whole epoch number is parsed (not just its first digit), and the
  # directory listing is not stored under the name `files`, which is the
  # google.colab module used by the plotting cells.
  checkpoint_epochs = []
  for file_name in os.listdir():
    if not file_name.startswith(model_prefix):
      continue
    epoch_text = file_name[len(model_prefix):].split(".")[0]
    if epoch_text.isdecimal():
      checkpoint_epochs.append(int(epoch_text))
  if not checkpoint_epochs:
    raise FileNotFoundError("No checkpoint named '{}<epoch>' found in {}".format(model_prefix, os.getcwd()))
  latest_epoch = max(checkpoint_epochs)
  starting_epoch = latest_epoch

In [44]:
os.listdir()

['trainDevTestTrees_PTB.zip', 'trainDevTestTrees_PTB']

In [None]:
# Build the model, optionally restore the latest checkpoint, then train epoch
# by epoch, recording train/validation metrics and saving a checkpoint per epoch.
with tf.device('/device:GPU:0'):

    # Prepare performance measures
    train_acc = []
    train_loss = []
    val_acc = []
    val_loss = []
    prev_train_loss = 100.0
    best_val_loss = 100
    sentences_generate = []

    # Start new session
    # NOTE(review): the graph is reset after the device scope above was
    # entered — confirm the scope still applies to the newly created graph.
    tf.reset_default_graph()
    model = build_model(rnn_units, class_count, rnn_layers, learning_rate, RNN_layer_option
                        , batch_size, seq_len, optimizer_option, lambd, keep_prob, depths, layer_sizes)
    init = tf.global_variables_initializer()
    session = tf.Session()
    session.run(init)
    # Only the trainable variables are handed to the saver.
    # NOTE(review): presumably optimizer state is therefore not part of the
    # checkpoints and restarts from its initial value on resume — confirm.
    saver = tf.train.Saver(tf.trainable_variables(),max_to_keep=3000)

    if not restart:
      # Load Existing Model
      saver.restore(session,model_prefix+str(latest_epoch)) 
      print("Loading weights success")  

    # Training
    for epoch in range(starting_epoch,max_epoch):
      print("#============= Epoch {} =============#".format(epoch+1))
      start_time = time.time()
      checkpoint_name = model_prefix+str(epoch+1)

      # decay_rate is left at train_model's default of 1, so the learning rate stays constant
      current_train_loss, current_train_acc = train_model(session, model, train_seq, seq_len, batch_size, epoch+1, learning_rate)
      current_val_loss, current_val_acc = run_model(session, model, val_seq, seq_len, batch_size)
      train_acc.append(current_train_acc)
      train_loss.append(current_train_loss)
      val_acc.append(current_val_acc)
      val_loss.append(current_val_loss)
      print("Train Loss: {} Train Accuracy: {} ".format(current_train_loss, current_train_acc))
      print("Validation Loss: {} Validation Accuracy: {} ".format(current_val_loss, current_val_acc))

      end_time = time.time()
      print("--- %s seconds ---" % (end_time - start_time))

      # Generate text
      # if current_val_acc >= 0.5:
      #   sentence_generated = predict_setences(seq_to_pred,model,session,unique_char_list,seq_len,batch_size)
      #   sentences_generate.append(sentence_generated)
      #   print("Sentences Generated:")
      #   print(sentence_generated)

      # Convergence rule
      # Stops when the train loss changed by less than convergence_threshold.
      # The break happens before saver.save below, so the epoch that triggers
      # the stop is not checkpointed. best_val_loss is tracked here but is not
      # read anywhere else in this cell.
      # if (abs(current_train_loss-prev_train_loss) < convergence_threshold) or ((current_val_loss-best_val_loss)/best_val_loss > 0.1):
      if abs(current_train_loss-prev_train_loss) < convergence_threshold:
        break
      else:
        prev_train_loss = current_train_loss
        if current_val_loss < best_val_loss:
          best_val_loss = current_val_loss  
      
      saver.save(session, checkpoint_name)
      print('saved '+checkpoint_name)

Train Loss: 2.174068037768431 Train Accuracy: 0.3706955616457859 
Validation Loss: 1.7836087387904787 Validation Accuracy: 0.4658460115131579 
--- 83.15674352645874 seconds ---
saved task3_model_v4.2_checkpoint_1
Train Loss: 1.7762447569799316 Train Accuracy: 0.4693695276907745 
Validation Loss: 1.6630679599025793 Validation Accuracy: 0.49935752467105265 
--- 81.4774718284607 seconds ---
saved task3_model_v4.2_checkpoint_2
Train Loss: 1.6985327628587537 Train Accuracy: 0.49008866920558086 
Validation Loss: 1.6077603718690705 Validation Accuracy: 0.5158391584429824 
--- 81.00032997131348 seconds ---
saved task3_model_v4.2_checkpoint_3
Train Loss: 1.6671913689795823 Train Accuracy: 0.4998086916287016 
Validation Loss: 1.5856042577509295 Validation Accuracy: 0.5230263157894737 
--- 81.14301776885986 seconds ---
saved task3_model_v4.2_checkpoint_4
Train Loss: 1.6472315925400458 Train Accuracy: 0.5052787763382688 
Validation Loss: 1.5721461762461746 Validation Accuracy: 0.528234649122807 
-

In [None]:
print(train_acc)

In [None]:
print(train_loss)

In [None]:
print(val_acc)

In [None]:
print(val_loss)

In [None]:
# Truncated prompts for the model to complete
seqs_to_pred = [
    "This is a nice mov",
    "I love trave",
    "Life is tou",
    "There is nothing more intere",
    "Wish you happ",
]

# Generate text based on the final model: show each prompt, then its completion
for seq in seqs_to_pred:
  print("By Human: "+seq)
  print(
      "By Robot: "
      + predict_setences(seq, model, session, unique_char_list, seq_len, batch_size)
  )

In [None]:
# Evaluation: report loss and accuracy of the final model on every split
for label, split_seq in (('Train', train_seq), ('Validation', val_seq), ('Test', test_seq)):
  loss, acc = run_model(session, model, split_seq, seq_len, batch_size)
  print('{} Loss:'.format(label), loss)
  print('{} Accuracy:'.format(label), acc)

In [None]:
# Loss curves: training vs. validation cross-entropy per recorded epoch
x = list(range(len(train_loss)))
for series, legend_label in ((train_loss, "Train Loss"), (val_loss, "Validation Loss")):
  plt.plot(x, series, label = legend_label)
plt.xlabel('Epoch')
plt.ylabel('Cross-Entropy Loss')
plt.legend()
# Save before show() so the written image contains the figure, then offer it for download
loss_plot_file = "assignment_2_task_3_loss_result_v2.jpg"
plt.savefig(loss_plot_file)
files.download(loss_plot_file)
plt.show()

In [None]:
# Accuracy curves: training vs. validation accuracy per recorded epoch
x = list(range(len(train_acc)))
for series, legend_label in ((train_acc, "Train Accuracy"), (val_acc, "Validation Accuracy")):
  plt.plot(x, series, label = legend_label)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
# Save before show() so the written image contains the figure, then offer it for download
acc_plot_file = "assignment_2_task_3_acc_result_v2.jpg"
plt.savefig(acc_plot_file)
files.download(acc_plot_file)
plt.show()

In [None]:
# End session
session.close()