In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import os

In [None]:
# Remote folder holding the three translations, and the file names to fetch.
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

# Download each file (get_file returns the local path of the downloaded copy).
downloaded_paths = [
    tf.keras.utils.get_file(file_name, origin=DIRECTORY_URL + file_name)
    for file_name in FILE_NAMES
]

# The directory of the last downloaded file is used as the data directory.
text_dir = downloaded_paths[-1]
parent_dir = os.path.dirname(text_dir)
parent_dir

'/root/.keras/datasets'

In [None]:
def labeler(example,index):
  """Pair a dataset element with an integer label.

  Args:
    example: A single dataset element (here, one line of text).
    index: Integer label to attach to the element.

  Returns:
    A tuple ``(example, label)`` where ``label`` is ``index`` cast to
    ``tf.int64``.
  """
  # Use the `index` argument rather than a module-level loop variable, so the
  # label depends only on what the caller passes in.
  return example, tf.cast(index, tf.int64)

# One labeled dataset per source file; `i` (the file's position in FILE_NAMES)
# is used as that file's label.
labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
  file_path = os.path.join(parent_dir, file_name)
  labeled_data_sets.append(
      tf.data.TextLineDataset(file_path).map(lambda ex: labeler(ex, i)))

In [None]:
BUFFER_SIZE = 50000  # buffer size passed to Dataset.shuffle
BATCH_SIZE = 64      # number of examples per padded batch
TAKE_SIZE = 5000     # number of examples split off with take()/skip()

In [None]:
# Chain the per-file datasets into a single dataset, in list order.
combined = labeled_data_sets[0]
for extra_dataset in labeled_data_sets[1:]:
  combined = combined.concatenate(extra_dataset)

# Shuffle once; reshuffle_each_iteration=False asks for the same order on
# every pass over the dataset.
all_labeled_data = combined.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

In [None]:
# NOTE(review): newer tensorflow_datasets releases may expose this API under
# `tfds.deprecated.text` instead - confirm against the installed version.
tokenizer = tfds.features.text.Tokenizer()

# Collect every distinct token that appears in any line of the corpus.
vocabulary_set = {
    token
    for line_tensor, _label in all_labeled_data
    for token in tokenizer.tokenize(line_tensor.numpy())
}

vocab_size = len(vocabulary_set)
vocab_size

17178

In [None]:
# Inspect the vocabulary: the set of unique tokens gathered from every line.
vocabulary_set

{'compasser',
 'morsels',
 'revilest',
 'haven',
 'Clytius',
 'misconducting',
 'dyed',
 'mood',
 'shrewd',
 'palpitating',
 'demesne',
 'Calchas',
 'signal',
 'cavity',
 'Unrivall',
 'relaxing',
 'somehow',
 'Pyres',
 'partook',
 'gored',
 'Flight',
 'surpassest',
 'sure',
 'Urg',
 'Upwafted',
 'drives',
 'suspecting',
 'Mæra',
 'ensnare',
 'Threïcian',
 'settled',
 'envoys',
 'vociferating',
 'inclining',
 'highlands',
 'struggles',
 'for',
 'fumed',
 'Alcander',
 'Cytorus',
 'Slaughter',
 'entangled',
 'worst',
 'Phorcis',
 'Horrent',
 'figs',
 'Glaphyrae',
 'requited',
 'eased',
 'Dissembling',
 'herds',
 'Innumerous',
 'Champing',
 'Avert',
 'Portheus',
 'fugitives',
 'shortly',
 'Men',
 'pangs',
 'Niobe',
 'Diomede',
 'Whereby',
 'Golden',
 'hold',
 'sliced',
 'beaker',
 'Refused',
 'charge',
 'Asunder',
 'foremost',
 'rivers',
 'Seer',
 'creeping',
 'dweller',
 'huntsman',
 'mete',
 'oxgoad',
 'wrangling',
 'utt',
 'too',
 'Averted',
 'antidotes',
 'swoops',
 'Clear',
 'resemble

In [None]:
# Build a text-to-integer-ids encoder from the collected vocabulary.
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [None]:
# Grab the first (text, label) pair and keep only the raw text bytes.
first_text, _first_label = next(iter(all_labeled_data))
example_text = first_text.numpy()
print(example_text)

b"While yet the sun ascending climb'd the heavens,"


In [None]:
# Turn the example line into its list of integer token ids.
encoder_example = encoder.encode(example_text)
print(encoder_example)

[3328, 5922, 13842, 5952, 13553, 3198, 5257, 13842, 10083]


In [None]:
# Round-trip check: map the token ids back to text.
decoded_text = encoder.decode(encoder_example)
print(decoded_text)

While yet the sun ascending climb d the heavens


In [None]:
def encode(text_tensor,label):
  """Encode one eager text tensor into token ids, passing the label through.

  Args:
    text_tensor: Eager tensor holding one line of text (``.numpy()`` is
      called on it).
    label: Label associated with the line; returned unchanged.

  Returns:
    A tuple ``(encoded_text, label)`` where ``encoded_text`` is the result of
    ``encoder.encode`` on the tensor's value.
  """
  return encoder.encode(text_tensor.numpy()), label

In [None]:
def encode_map_fn(text,label):
  """Dataset.map wrapper that runs `encode` through tf.py_function.

  Args:
    text: Text tensor for one dataset element.
    label: Label tensor for the same element.

  Returns:
    A tuple ``(token_ids, label_out)`` of int64 tensors with their static
    shapes set to a 1-D vector of unknown length and a scalar, respectively.
  """
  token_ids, label_out = tf.py_function(
      encode, inp=[text, label], Tout=[tf.int64, tf.int64])

  # Declare the static shapes explicitly on the py_function outputs.
  token_ids.set_shape([None])
  label_out.set_shape([])

  return token_ids, label_out


all_encoded_data = all_labeled_data.map(encode_map_fn)

In [None]:
# Peek at the first encoded (token ids, label) pair.
first_encoded_example = next(iter(all_encoded_data))
first_encoded_example

(<tf.Tensor: shape=(9,), dtype=int64, numpy=array([ 3328,  5922, 13842,  5952, 13553,  3198,  5257, 13842, 10083])>,
 <tf.Tensor: shape=(), dtype=int64, numpy=0>)

In [None]:
# Training split: everything after the first TAKE_SIZE examples, shuffled and
# grouped into padded batches.
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE)
)

In [None]:
# Test split: the first TAKE_SIZE encoded examples.
test_data = all_encoded_data.take(TAKE_SIZE)
# Batch the held-out subset itself. Batching `all_encoded_data` here would
# discard the take() above and evaluate on the entire corpus, including the
# examples used for training.
test_data = test_data.padded_batch(BATCH_SIZE)

In [None]:
# Pull one training batch and show its first (padded text, label) pair.
train_batch = next(iter(train_data))
sample_text, sample_labels = train_batch
sample_text[0], sample_labels[0]

(<tf.Tensor: shape=(14,), dtype=int64, numpy=
 array([ 9409, 16047,  1259, 13842,  7612,  4758, 13842,  7732,  7859,
         2472,  5257,     0,     0,     0])>,
 <tf.Tensor: shape=(), dtype=int64, numpy=1>)

In [None]:
# Pull one test batch and show its first (padded text, label) pair.
test_batch = next(iter(test_data))
sample_text, sample_labels = test_batch
sample_text[0], sample_labels[0]

(<tf.Tensor: shape=(16,), dtype=int64, numpy=
 array([ 3328,  5922, 13842,  5952, 13553,  3198,  5257, 13842, 10083,
            0,     0,     0,     0,     0,     0,     0])>,
 <tf.Tensor: shape=(), dtype=int64, numpy=0>)

In [None]:
# Padded batches fill short sequences with 0, so the model needs one id more
# than the number of vocabulary tokens. Recomputing from `vocabulary_set`
# (instead of `+= 1`) keeps this cell idempotent when it is re-run.
vocab_size = len(vocabulary_set) + 1

In [None]:
# Start a sequential model whose first layer embeds token ids into
# 64-dimensional vectors.
model = tf.keras.Sequential()
embedding_layer = tf.keras.layers.Embedding(vocab_size, 64)
model.add(embedding_layer)

In [None]:
# Bidirectional wrapper around a 64-unit LSTM.
recurrent_layer = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
model.add(recurrent_layer)

In [None]:
# Two fully connected ReLU layers of 64 units each.
for hidden_units in (64, 64):
  model.add(tf.keras.layers.Dense(hidden_units, activation='relu'))

# Output layer: 3 raw scores (no activation), one per label.
model.add(tf.keras.layers.Dense(3))

In [None]:
# The final Dense layer has no activation, so the loss is told to expect
# logits rather than probabilities.
logit_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=logit_loss, metrics=['accuracy'])

In [None]:
# Train for 10 epochs, validating on the test batches after each epoch.
history = model.fit(train_data, epochs=10, validation_data=test_data)
history

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe3cd7fae10>

In [None]:
# Final loss and accuracy on the test batches.
eval_loss, eval_acc = model.evaluate(test_data)

