In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import matplotlib.pyplot as plt
import tensorflow.keras.utils as ku 
import numpy as np 
import pandas as pd

Definition of a plot function for training result visualization

In [3]:
def plot_results(history):
    hist_df = pd.DataFrame(history.history)
    hist_df.columns=["loss", "accuracy", "val_loss", "val_accuracy"]
    hist_df.index = np.arange(1, len(hist_df)+1)
    
    fig, axs = plt.subplots(nrows=2, sharex=True, figsize=(16, 10))
    axs[0].plot(hist_df.val_accuracy, lw=3, label='Validation Accuracy')
    axs[0].plot(hist_df.accuracy, lw=3, label='Training Accuracy')
    axs[0].set_ylabel('Accuracy')
    axs[0].set_xlabel('Epoch')
    axs[0].grid()
    axs[0].legend(loc=0)
    axs[1].plot(hist_df.val_loss, lw=3, label='Validation Loss')
    axs[1].plot(hist_df.loss, lw=3, label='Training Loss')
    axs[1].set_ylabel('Loss')
    axs[1].set_xlabel('Epoch')
    axs[1].grid()
    axs[1].legend(loc=0)
    
    plt.show();

## Data preproccesing

In [4]:
# Mounting the google drive to google colab in order to load the data files directly from it
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

We load a txt file containing Shakespeare sonnets

In [5]:
data = open('sonnets.txt').read()

corpus = data.lower().split("\n")

In [6]:
corpus[0:10]

['from fairest creatures we desire increase,',
 "that thereby beauty's rose might never die,",
 'but as the riper should by time decease,',
 'his tender heir might bear his memory:',
 'but thou, contracted to thine own bright eyes,',
 "feed'st thy light'st flame with self-substantial fuel,",
 'making a famine where abundance lies,',
 'thyself thy foe, to thy sweet self too cruel.',
 "thou that art now the world's fresh ornament",
 'and only herald to the gaudy spring,']

We need to transform each text sentence into token sequence.

Then we need to generate from each token sequence, several token subsequences in order to augment the dataset. **Remember** that we want to learn how to predict the next word of a sentence. A sentence of 5 words can so be used to generate 4 training sequences.

e.g. the sentence "*to be or not to be*" can give the following training sequences: 

"*to* **be**"

"*to be* **or**"

"*to be or* **not**"

"*to be or not* **to**"

"*to be or not to* **be**"

In [49]:
tokenizer = Tokenizer()

# Question 1: use the preprocessing steps learned in course3_text_sequence_preprocessing_ex.ipynb to 
#create the padded sequences needed to train the model
# Hint: Be careful about the length you will use for the padding sequences AND about where you put the extra 
#zero coming from the padding (at the beginning of the sequence or at the end)

# Tokenize
tokenizer.fit_on_texts(corpus)
tokenized_sentences = tokenizer.texts_to_sequences(corpus)
total_words = len(tokenizer.word_index) + 1

#Padding
max_length = 300
#Put paddings in the front
padded_sentences = pad_sequences(tokenized_sentences, maxlen=max_length, padding='pre')

In [50]:
tokenizer = Tokenizer()
 
tokenizer.fit_on_texts(corpus)

 
# create input sequences using list of tokens
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
#         print(n_gram_sequence)
        input_sequences.append(n_gram_sequence)
    
# pad sequences 
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

#--- create predictors and label
# The predictors correspond to the input sequence without the last token
# the label corresponds to the last token (the one we want to predict)
predictors, label = input_sequences[:,:-1],input_sequences[:,-1]

## Neural network model definition

Build a neural network using at least one LSTM layer

(you may have a look at https://keras.io/api/layers/)

In [54]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

# Question 2: define a neural network model using at least one LSTM layer
# Hint1: we advise you to use as first layer an Embedding layer (have a look at course3_sentiment_analysis_LSTM_ex.ipynb for a reminder of how to use it)
# Hint2: you can import any additional layers from tensorflow.keras.layers if needed

model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len - 1))
model.add(Bidirectional(LSTM(150, activation='tanh', return_sequences=True)))
model.add(Dropout(.2))
model.add(LSTM(256, activation='tanh', recurrent_activation='sigmoid', return_sequences=True))

model.add(Dense(2, activation='softmax'))
# Question 3: define a relevant loss function and optimizer

loss_function = 'sparse_categorical_crossentropy'
optimizer = Adam()

model.compile(loss=loss_function, optimizer=optimizer,metrics=['accuracy'])

print(model.summary())

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 10, 100)           321100    
                                                                 
 bidirectional_3 (Bidirectio  (None, 10, 300)          301200    
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 10, 300)           0         
                                                                 
 lstm_13 (LSTM)              (None, 10, 256)           570368    
                                                                 
 dense_1 (Dense)             (None, 10, 2)             514       
                                                                 
Total params: 1,193,182
Trainable params: 1,193,182
Non-trainable params: 0
___________________________________________

In [58]:
# Question 4: train your model with relevant parameters

# predictors = ??????
# label = ??????
# epochs_value = ??????
# validation_split_value = ??????
epochs_value=20
history = model.fit(predictors, label, epochs=epochs_value, verbose=1, validation_split=validation_split_value )

NameError: name 'validation_split_value' is not defined

## Result visualization

In [None]:
plot_results(history)

## Text generation

You can now use your model to sequentially generate new words from a given uncomplete sentence

In [None]:
seed_text = "Put here the begining of a sentence of your own"
nb_additional_words = 100
  
# Question 5: generate nb_additional_words at the end of seed_text according to your trained model
