<a href="https://colab.research.google.com/github/Sahanaka/ASP.NET-CORE-with-jwt-tokens/blob/master/Chapter16_NaturalLanguageProcessingwithRNNsandAttention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Usual imports and the plotting functions
import sys
import sklearn
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os

# To make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [None]:
np.random.seed(42)
tf.random.set_seed(42)

n_steps = 5
dataset = tf.data.Dataset.from_tensor_slices(tf.range(15))
dataset = dataset.window(n_steps, shift=2, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(n_steps))
dataset = dataset.shuffle(10).map(lambda window: (window[:-1], window[1:]))
dataset = dataset.batch(3).prefetch(1)
for index, (X_batch, Y_batch) in enumerate(dataset):
    print("_" * 20, "Batch", index, "\nX_batch")
    print(X_batch.numpy())
    print("=" * 5, "\nY_batch")
    print(Y_batch.numpy())

____________________ Batch 0 
X_batch
[[6 7 8 9]
 [2 3 4 5]
 [4 5 6 7]]
===== 
Y_batch
[[ 7  8  9 10]
 [ 3  4  5  6]
 [ 5  6  7  8]]
____________________ Batch 1 
X_batch
[[ 0  1  2  3]
 [ 8  9 10 11]
 [10 11 12 13]]
===== 
Y_batch
[[ 1  2  3  4]
 [ 9 10 11 12]
 [11 12 13 14]]


# **Char RNN**

## **Loading the Data and Preparing the Dataset**

In [None]:
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt" # Dataset url
filepath = keras.utils.get_file('shakespeare.txt', shakespeare_url)
with open(filepath) as f:
    shakespeare_text = f.read()

Downloading data from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


In [None]:
print(shakespeare_text[:200]) # Contains a dialog. Change the number and run 

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


Tokenizing

In [None]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True) # Using the keras tokenizer; char_level true gives character level tokenization. Default is the word tokenization
tokenizer.fit_on_texts(shakespeare_text)

In [None]:
tokenizer.texts_to_sequences(["First"]) # Gets the id's

[[20, 6, 9, 8, 3]]

In [None]:
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]]) # Gets the characters

['f i r s t']

In [None]:
max_id = len(tokenizer.word_index) # Number of distinct characters
dataset_size = tokenizer.document_count # Total number of characters

In [None]:
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1 # -1 to get the IDs from 0
train_size = dataset_size * 90 // 100 # Train split
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [None]:
n_steps = 100
window_length = n_steps + 1 # target = input shifted 1 character ahead 
dataset = dataset.window(window_length, shift=1, drop_remainder=True) # Shift=1 to get the largest possbile set


In [None]:
# Flatten the dataset
dataset = dataset.map(lambda window: window.batch(window_length)) # transform the nested dataset into a flat list

<MapDataset shapes: DatasetSpec(TensorSpec(shape=(None, None), dtype=tf.int64, name=None), TensorShape([])), types: DatasetSpec(TensorSpec(shape=(None, None), dtype=tf.int64, name=None), TensorShape([]))>


In [None]:
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Shuffle the dataset
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
# dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [None]:
# One hot encode the characters
dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

In [None]:
dataset = dataset.prefetch(1) # PRefetching

In [None]:
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

(3, 4, 39) (3, 4)


## **Exercises**

Exercise: Train an Encoder–Decoder model that can convert a date string from one format to another (e.g., from "April 22, 2019" to "2019-04-22").

**First we have to make the dataset. Let's create a random dataset in the following time range**

In [2]:
from datetime import date

In [3]:
# Creating a dataset of inputs and targets
MONTHS = ["January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]

def random_dates(n_dates):
  # MIN MAX dates
  min_date = date(1000, 1, 1).toordinal()
  max_date = date(9999, 12, 31).toordinal()

  # Creating random ordinals in the range
  ordinals = np.random.randint(max_date - min_date, size=n_dates) + min_date
  
  # Convert to dates
  dates = [date.fromordinal(ordinal) for ordinal in ordinals]
  x = [MONTHS[dt.month - 1] + " " + dt.strftime("%d, %Y") for dt in dates]
  y = [dt.isoformat() for dt in dates]
  return x, y


In [4]:
X, y = random_dates(23)

In [5]:
print(X)
print(y)

['September 20, 7075', 'May 15, 8579', 'January 11, 7103', 'June 01, 7451', 'July 29, 5634', 'November 27, 1301', 'August 23, 3004', 'December 26, 9762', 'October 29, 7117', 'June 01, 9479', 'July 13, 5298', 'October 11, 6484', 'June 20, 4110', 'November 06, 9240', 'July 01, 7221', 'October 06, 4394', 'August 06, 1761', 'April 23, 6854', 'October 10, 1901', 'March 08, 9790', 'April 15, 3155', 'March 07, 4752', 'August 02, 5837']
['7075-09-20', '8579-05-15', '7103-01-11', '7451-06-01', '5634-07-29', '1301-11-27', '3004-08-23', '9762-12-26', '7117-10-29', '9479-06-01', '5298-07-13', '6484-10-11', '4110-06-20', '9240-11-06', '7221-07-01', '4394-10-06', '1761-08-06', '6854-04-23', '1901-10-10', '9790-03-08', '3155-04-15', '4752-03-07', '5837-08-02']


**Let's find all possible inputs chars and output chars**

In [6]:
INPUT_CHARS = "".join(sorted(set("".join(MONTHS) + "0123456789, ")))
INPUT_CHARS

' ,0123456789ADFJMNOSabceghilmnoprstuvy'

In [7]:
OUTPUT_CHARS = "0123456789-"

Function to convert to list of characters

In [8]:
def date_str_to_ids(date_str, chars=INPUT_CHARS):
    return [chars.index(c) for c in date_str]

In [9]:
def prepare_date_strs(date_strs, chars=INPUT_CHARS):
    X_ids = [date_str_to_ids(dt, chars) for dt in date_strs]
    X = tf.ragged.constant(X_ids, ragged_rank=1)
    return (X + 1).to_tensor() # using 0 as the padding token ID

def create_dataset(n_dates):
    x, y = random_dates(n_dates)
    return prepare_date_strs(x, INPUT_CHARS), prepare_date_strs(y, OUTPUT_CHARS)

In [10]:
# Create train, validation and test sets
np.random.seed(42)

# All are tensors
X_train, Y_train = create_dataset(10000)
X_valid, Y_valid = create_dataset(2000)
X_test, Y_test = create_dataset(2000)

## **v1: A very basic seq2seq model**

**We feed in the input sequence, which first goes through the encoder (an embedding layer followed by a single LSTM layer), which outputs a vector, then it goes through a decoder (a single LSTM layer, followed by a dense output layer), which outputs a sequence of vectors, each representing the estimated probabilities for all possible output character.**

In [11]:
embedding_size = 32
max_output_length = Y_train.shape[1]

np.random.seed(42)
tf.random.set_seed(42)

In [12]:
# Encoder
encoder = keras.models.Sequential([
                                   keras.layers.Embedding(input_dim=len(INPUT_CHARS) + 1,
                                                          output_dim=embedding_size,
                                                          input_shape=[None]),
                                   keras.layers.LSTM(128)
])

In [13]:
# Decoder
decoder = keras.models.Sequential([
                                   keras.layers.LSTM(128, return_sequences=True),
                                   keras.layers.Dense(len(OUTPUT_CHARS) + 1, activation="softmax") # Output layer
])

In [14]:
# Final model
model = keras.models.Sequential([
    encoder,
    keras.layers.RepeatVector(max_output_length),
    decoder
])

In [15]:
model.compile(optimizer="Nadam", loss="sparse_categorical_crossentropy", metrics=['accuracy'])

In [16]:
history = model.fit(X_train, Y_train, epochs=20,
                    validation_data=(X_valid, Y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
