<a href="https://colab.research.google.com/github/Snigdho64/Google_Collab_Notebooks/blob/main/Chapter_16_Natural_language_processing_With_RNNs_And_Attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
from tensorflow import keras

# Char-RNN

## Splitting a sequence into batches of shuffled windows

In [None]:
# Toy example: slide length-5 windows (shift 2) over the integers 0..14,
# shuffle them, and split each window into an (input, target) pair where the
# target is the input shifted one step ahead.
tf.random.set_seed(42)
n_steps = 5
dataset = (
    tf.data.Dataset.from_tensor_slices(tf.range(15))
    .window(n_steps, shift=2, drop_remainder=True)
    # every window is itself a Dataset; batching it by n_steps yields one tensor per window
    .flat_map(lambda win: win.batch(n_steps))
    .shuffle(10)
    .map(lambda win: (win[:-1], win[1:]))
    .batch(3)
    .prefetch(1)
)
list(dataset.as_numpy_iterator())

[(array([[6, 7, 8, 9],
         [2, 3, 4, 5],
         [4, 5, 6, 7]], dtype=int32), array([[ 7,  8,  9, 10],
         [ 3,  4,  5,  6],
         [ 5,  6,  7,  8]], dtype=int32)), (array([[ 0,  1,  2,  3],
         [ 8,  9, 10, 11],
         [10, 11, 12, 13]], dtype=int32), array([[ 1,  2,  3,  4],
         [ 9, 10, 11, 12],
         [11, 12, 13, 14]], dtype=int32))]

## Loading and Preparing the dataset

In [None]:
# Download the Tiny Shakespeare text from GitHub and read it into memory
shakespeare_url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
filepath = keras.utils.get_file('shakespeare.txt',shakespeare_url)
# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default locale encoding (which open() would otherwise use).
with open(filepath, encoding="utf-8") as f:
  shakespeare_text = f.read()

Downloading data from https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


In [None]:
# Preview the first 148 characters of the corpus
print(shakespeare_text[:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [None]:
# Collect every distinct character of the lower-cased text, in sorted order,
# joined into a single string for display.
unique_chars = ''.join(sorted(set(shakespeare_text.lower())))
unique_chars

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

In [None]:
# Tokenize the dataset at character level (char_level=True: every character is a token)
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)

In [None]:
# Build the character vocabulary from the whole corpus.
# NOTE(review): a bare string (not a list of texts) is passed here; the later
# use of tokenizer.document_count as "total number of characters" presumably
# relies on that — confirm before wrapping the argument in a list.
tokenizer.fit_on_texts(shakespeare_text)

In [None]:
# Round-trip check: encode the first 13 characters to IDs, then decode them back to text
tokenizer.sequences_to_texts(tokenizer.texts_to_sequences([shakespeare_text[:13]]))

['f i r s t   c i t i z e n']

In [None]:
# Vocabulary statistics read from the fitted tokenizer
max_id = len(tokenizer.word_index) #number of distinct characters
dataset_size = tokenizer.document_count #total number of characters

In [None]:
# sorted(tokenizer.word_index.items(),key=lambda kv:kv[1])

In [None]:
# Encode the full text as a 1-D array of integer character IDs.
# The Keras Tokenizer numbers tokens starting at 1, so subtract 1 to get IDs in
# 0..max_id-1. That is the range expected by tf.one_hot(depth=max_id) on the
# inputs and by a max_id-unit softmax with sparse categorical cross-entropy on
# the targets; without the shift the highest ID falls outside that range.
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1
encoded

array([20,  6,  9, ..., 21, 27, 11])

In [None]:
# Create the training dataset from the leading portion of the encoded text.
# NOTE(review): `* 9 // 100` keeps only the first 9% of the characters; confirm
# that 9 (rather than 90) is the intended percentage.
train_size = dataset_size * 9 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

In [None]:
# Each window carries n_steps input characters plus one extra, because the
# target sequence is the input shifted one character ahead.
n_steps = 100
window_length = n_steps + 1
dataset = dataset.window(
    size=window_length,
    shift=1,
    drop_remainder=True,
)

In [None]:
# Flatten the dataset of windows: each window is batched to its full length
# (window_length) so it becomes a single tensor, then show the first one.
dataset = dataset.flat_map(lambda window: window.batch(window_length))
list(dataset.take(1))

[<tf.Tensor: shape=(101,), dtype=int64, numpy=
 array([20,  6,  9,  8,  3,  1, 19,  6,  3,  6, 36,  2, 10, 24, 11, 22,  2,
        20,  4,  9,  2,  1, 17,  2,  1, 23,  9,  4, 19,  2,  2, 13,  1,  5,
        10, 16,  1, 20, 14,  9,  3,  7,  2,  9, 18,  1,  7,  2,  5,  9,  1,
        15,  2,  1,  8, 23,  2,  5, 25, 27, 11, 11,  5, 12, 12, 24, 11,  8,
        23,  2,  5, 25, 18,  1,  8, 23,  2,  5, 25, 27, 11, 11, 20,  6,  9,
         8,  3,  1, 19,  6,  3,  6, 36,  2, 10, 24, 11, 16,  4, 14,  1])>]

In [None]:
# Shuffle the windows, group them into batches, and split every batch into
# inputs (all but the last character) and targets (all but the first).
tf.random.set_seed(42)
batch_size = 32
dataset = (
    dataset
    .shuffle(10000)
    .batch(batch_size)
    .map(lambda windows: (windows[:, :-1], windows[:, 1:]))
)
list(dataset.take(1))

[(<tf.Tensor: shape=(32, 100), dtype=int64, numpy=
  array([[ 7,  6,  8, ...,  2,  1, 19],
         [ 7,  2,  1, ...,  2,  1, 12],
         [12, 13,  1, ..., 10, 30, 11],
         ...,
         [ 4, 28,  1, ..., 11,  8,  4],
         [19,  4, 10, ...,  6,  5, 10],
         [15,  8,  1, ...,  5,  9,  2]])>,
  <tf.Tensor: shape=(32, 100), dtype=int64, numpy=
  array([[ 6,  8,  1, ...,  1, 19,  5],
         [ 2,  1, 25, ...,  1, 12,  2],
         [13,  1, 22, ..., 30, 11, 11],
         ...,
         [28,  1,  3, ...,  8,  4, 20],
         [ 4, 10, 13, ...,  5, 10,  8],
         [ 8,  1,  4, ...,  9,  2,  1]])>)]

In [None]:
# One-hot encode the input IDs (depth = max_id); the targets are passed
# through unchanged as integer IDs.
dataset = dataset.map(
    lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets)
)
list(dataset.take(1))

[(<tf.Tensor: shape=(32, 100, 39), dtype=float32, numpy=
  array([[[0., 0., 0., ..., 0., 0., 0.],
          [0., 1., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],
  
         [[0., 1., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 1., ..., 0., 0., 0.],
          ...,
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 1., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],
  
         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 1., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          ...,
          [0., 1., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.]],
  
         ...,
  
         [[0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
          [0., 0., 0., ..., 0., 0., 0.],
        

In [None]:
# Prefetch with a buffer of one element (here: one batch)
dataset = dataset.prefetch(1)

In [None]:
# Sanity check: print the shapes of the inputs and targets of one batch
for X_batch, Y_batch in dataset.take(1):
  print(X_batch.shape, Y_batch.shape)

(32, 100, 39) (32, 100)


## Creating And Training The Model

In [None]:
# Build the model: two stacked GRU layers that emit an output at every time
# step, followed by a per-time-step softmax over the max_id character classes.
model = keras.models.Sequential([
    keras.layers.GRU(
        128,
        return_sequences=True,
        input_shape=[None, max_id],
        dropout=0.2,
    ),
    keras.layers.GRU(128, return_sequences=True, dropout=0.2),
    keras.layers.TimeDistributed(
        keras.layers.Dense(max_id, activation='softmax')
    ),
])

# Compile: the targets are integer IDs, hence the sparse cross-entropy loss
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")

# Train for 5 epochs and keep the returned History object
history = model.fit(dataset, epochs=5)

Epoch 1/5
    794/Unknown - 52s 51ms/step - loss: 2.2457

KeyboardInterrupt: ignored

In [None]:
import tensorflow_datasets as tfds

In [None]:
# Load the IMDB reviews dataset as (text, label) pairs, together with its metadata
datasets, info = tfds.load(
    'imdb_reviews',
    with_info=True,
    as_supervised=True,
)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompletePMVVMX/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompletePMVVMX/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompletePMVVMX/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
# Number of examples in the IMDB 'train' split, read from the dataset metadata.
# NOTE: this rebinds train_size, which the Char-RNN section above used for the
# number of training characters.
train_size = info.splits['train'].num_examples
train_size

25000

In [None]:
def preprocess(X_batch,y_batch):
  X_batch = tf.strings.substr(X_batch,0, 300)
  X_batch = tf.strings.regex_replace()