In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow.keras import layers

import re

# Report the runtime environment so the recorded outputs can be interpreted.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
# Use the stable `tf.config.list_physical_devices` rather than the deprecated
# `tf.config.experimental` alias; an empty list means no GPU is visible.
print("GPU is", "available" if tf.config.list_physical_devices("GPU") else "NOT AVAILABLE")

Version:  2.5.0
Eager mode:  True
GPU is available


In [None]:
# Number of (input, label) pairs per batch when the training dataset is built.
batch_size = 128
# Sentences are kept only if shorter than this before the final "." is added;
# every encoded sentence is padded to exactly this many characters.
max_len = 100
# Sentences shorter than this many characters are discarded.
min_len = 15
# Padding symbol; it is placed first in the model vocabulary (index 0).
fictional = '$'

In [None]:
# Download the ACL titles-and-abstracts corpus (Keras caches the file locally).
file = tf.keras.utils.get_file("papers.txt", "https://raw.githubusercontent.com/EagleW/ACL_titles_abstracts_dataset/master/acl_titles_and_abstracts.txt")
# Read as bytes and decode explicitly so the "\r\n" separators that the later
# cells split on are preserved; the context manager closes the handle promptly.
with open(file, "rb") as corpus_file:
    papers = corpus_file.read().decode(encoding="utf-8")

In [None]:
# Every distinct character that occurs in the raw corpus, in sorted order.
vocabulary = sorted(set(papers))
vocabulary

['\n',
 '\x0c',
 '\r',
 ' ',
 '!',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~',
 '—',
 '\u3000']

In [None]:
# Split the corpus into records on blank lines ("\r\n\r\n") and show the first.
texts = papers.split("\r\n\r\n")
texts[0]

'evaluation technology from speaker identification to affective analysis : a multi-step system for analyzing childrens stories\r\nwe propose a multi-step system for the analysis of childrens stories that is intended to be part of a larger text-to-speechbased storytelling system . a hybrid approach is adopted , where pattern-based and statistical methods are used along with utilization of external knowledge sources . this system performs the following story analysis tasks : identification of characters in each story ; attribution of quotes to specific story characters ; identification of character age , gender and other salient personality attributes ; and finally , affective analysis of the quoted material . the different types of analyses were evaluated using several datasets . for the quote attribution , as well as for the gender and age estimation , substantial improvement over baseline was realized , whereas results for personality attribute estimation and valence estimation are mo

In [None]:
def process(text, max_length=None, min_length=None):
    """Split the second line of one corpus record into short, clean clauses.

    The record is split on CRLF and only its second line is used. If that
    line contains any of a fixed set of symbol characters the whole record is
    rejected. Otherwise it is cut on " . " and then on " , "; digits, dots,
    bracketed spans and stray brackets are removed from each piece and runs
    of whitespace are collapsed to a single space.

    Args:
        text: One record of the corpus (lines separated by CRLF).
        max_length: Exclusive upper bound on the cleaned clause length.
            Defaults to the module-level ``max_len``.
        min_length: Inclusive lower bound on the cleaned clause length.
            Defaults to the module-level ``min_len``.

    Returns:
        A list of cleaned clauses, each terminated by ".". The list is empty
        when the record has no second line or contains a rejected symbol.
    """
    if max_length is None:
        max_length = max_len
    if min_length is None:
        min_length = min_len

    lines = text.split("\r\n")
    if len(lines) < 2:
        # A record without a second line has nothing to process.
        return []
    abstract = lines[1]

    # Reject records containing symbols we do not want in the vocabulary.
    if re.search(r'[?$%#&*\\/=<>@!_\-+`~;:|—]', abstract):
        return []

    sentences = []
    for big_sentence in abstract.split(" . "):
        for sentence in big_sentence.split(" , "):
            sentence = re.sub(r'[0-9.]', '', sentence)          # digits and dots
            sentence = re.sub(r'\([^)]*\)', '', sentence)       # (...) spans
            sentence = re.sub(r'\{[^}]*\}', '', sentence)       # {...} spans
            sentence = re.sub(r'\[[^\]]*\]', '', sentence)      # [...] spans
            sentence = re.sub(r'[()\[\]{}]', '', sentence)      # unmatched brackets
            sentence = re.sub(r'\s+', ' ', sentence)
            if min_length <= len(sentence) < max_length:
                sentences.append(sentence + ".")
    return sentences

In [None]:
# Flatten the per-record clause lists into one corpus-wide list.
sentences = [sentence for text in texts for sentence in process(text)]
len(sentences)

7420

In [None]:
# Model vocabulary: the padding symbol first, then every character that
# survived preprocessing, in sorted order.
vocab = [fictional] + sorted(set("".join(sentences)))
vocab_size = len(vocab)

# Show the longest and the shortest training sentence as a sanity check.
longest_sentence = max(sentences, key=len)
shortest_sentence = min(sentences, key=len)
print(longest_sentence)
print(shortest_sentence)

mainly due to the large body of electronic resources and tools that are available for this language.
to this purpose.


In [None]:
# Character -> integer id lookup over the model vocabulary.
indices = {char: idx for idx, char in enumerate(vocab)}
dot_index = indices['.']
fictional_index = 0

inputs, labels = [], []

for sentence in sentences:
    # Encode at most `max_len` characters, then right-pad with the padding id.
    codes = [indices[char] for char in sentence[:max_len]]
    codes.extend([fictional_index] * (max_len - len(codes)))
    encoded_sentence = np.array(codes)
    # Next-character prediction: labels are the inputs shifted left by one.
    inputs.append(encoded_sentence[:-1])
    labels.append(encoded_sentence[1:])

dataset = tf.data.Dataset.from_tensor_slices((inputs, labels)).batch(batch_size)

In [None]:
def get_angles(pos, i, d_model):
    """Return the sinusoid arguments ``pos / 10000 ** (2 * (i // 2) / d_model)``.

    ``pos`` and ``i`` are combined with ordinary NumPy broadcasting, so the
    result has the broadcast shape of the two.
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    angle_rates = 1 / np.power(10000, exponent)
    return pos * angle_rates

In [None]:
def positional_encoding(position, d_model):
    """Build the sinusoidal position table.

    Returns a float32 tensor of shape ``(1, position, d_model)`` whose even
    columns hold the sine and whose odd columns hold the cosine of the angles
    produced by ``get_angles``.
    """
    row_positions = np.arange(position)[:, np.newaxis]
    column_ids = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_positions, column_ids, d_model)

    # Even columns (2i) take the sine, odd columns (2i+1) the cosine.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Add a leading axis of size one so the table broadcasts over a batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """One-hot token encoding summed with a fixed sinusoidal position table.

    The layer has no trainable embedding matrix: tokens are one-hot vectors
    of width ``vocab_size`` and the position table has the same width.
    """

    def __init__(self, maxlen, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size
        self.positional_encoding = positional_encoding(maxlen, vocab_size)

    def call(self, x):
        """Encode integer token ids ``x`` and add the matching position rows."""
        seq_len = tf.shape(x)[-1]
        one_hot_tokens = tf.one_hot(x, self.vocab_size)
        # Use only as many position rows as the input has time steps.
        return one_hot_tokens + self.positional_encoding[:, :seq_len, :]

In [None]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """Build a batched causal mask for self attention.

    Entry ``(i, j)`` is set where ``i >= j - n_src + n_dest``, i.e. the lower
    triangle counted from the lower-right corner, so a position cannot attend
    to later positions. The single ``(1, n_dest, n_src)`` mask is cast to
    ``dtype`` and tiled ``batch_size`` times along the first axis.
    """
    dest_positions = tf.range(n_dest)[:, None]
    src_positions = tf.range(n_src)
    allowed = dest_positions >= src_positions - n_src + n_dest
    single_mask = tf.reshape(tf.cast(allowed, dtype), [1, n_dest, n_src])
    # Tile multiples are [batch_size, 1, 1]: repeat along the batch axis only.
    repeats = tf.concat(
        [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0
    )
    return tf.tile(single_mask, repeats)


class TransformerBlock(layers.Layer):
    """Causal self-attention followed by a two-layer feed-forward network.

    Each sub-layer output passes through dropout, is added to the sub-layer
    input and then layer-normalised.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads, embed_dim)
        hidden_layer = layers.Dense(ff_dim, activation="relu")
        projection_layer = layers.Dense(embed_dim)
        self.ffn = tf.keras.Sequential([hidden_layer, projection_layer])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        """Apply masked self-attention and the feed-forward network to ``inputs``."""
        shape = tf.shape(inputs)
        n_batch, n_steps = shape[0], shape[1]
        mask = causal_attention_mask(n_batch, n_steps, n_steps, tf.bool)

        # Self-attention sub-layer with residual connection.
        attended = self.dropout1(self.att(inputs, inputs, attention_mask=mask))
        normed = self.layernorm1(inputs + attended)

        # Feed-forward sub-layer with residual connection.
        projected = self.dropout2(self.ffn(normed))
        return self.layernorm2(normed + projected)

In [None]:
num_heads = 16
feed_forward_dim = 256

def create_model(num_blocks=6):
    """Build and compile the character-level transformer language model.

    The model takes ``max_len - 1`` integer character ids, encodes them with
    ``TokenAndPositionEmbedding`` and passes them through a stack of
    ``TransformerBlock`` layers whose width equals ``vocab_size``.

    Args:
        num_blocks: Number of stacked transformer blocks (default 6).

    Returns:
        A compiled ``tf.keras.Model`` with two outputs: the per-position
        logits over the vocabulary and the output of the last transformer
        block. Only the logits contribute to the loss.
    """
    inputs = layers.Input(shape=(max_len - 1,), dtype=tf.int32)
    x = TokenAndPositionEmbedding(max_len - 1, vocab_size)(inputs)
    # Stack identical blocks; each is created and applied in turn.
    for _ in range(num_blocks):
        x = TransformerBlock(vocab_size, num_heads, feed_forward_dim)(x)
    outputs = layers.Dense(vocab_size)(x)
    model = tf.keras.Model(inputs=inputs, outputs=[outputs, x])
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(
        "adam", loss=[loss_fn, None],
    )  # No loss and optimization based on word embeddings from transformer block
    return model


In [None]:
class TextGenerator(tf.keras.callbacks.Callback):
    """Callback that prints a text sample generated by the model being trained.

    Generation starts from ``start_tokens`` and draws each next token from the
    ``top_k`` most likely candidates. It stops after ``max_tokens`` tokens or
    as soon as the sentence terminator or the padding token is produced.

    Relies on the module-level ``max_len``, ``dot_index`` and
    ``fictional_index``.

    Args:
        max_tokens: Maximum number of tokens to generate per sample.
        start_tokens: Token ids of the prompt.
        vocab: Sequence mapping a token id to its character.
        top_k: Number of highest-scoring candidates to sample from.
        print_every: Generate a sample every this many epochs.
    """

    def __init__(
        self, max_tokens, start_tokens, vocab, top_k=5, print_every=1
    ):
        # Let the base class set up the attributes Keras expects on callbacks.
        super().__init__()
        self.max_tokens = max_tokens
        self.start_tokens = start_tokens
        self.vocab = vocab
        self.print_every = print_every
        self.k = top_k

    def sample_from(self, logits):
        """Sample one token id from the softmax over the ``k`` largest logits."""
        logits, indices = tf.math.top_k(logits, k=self.k, sorted=True)
        indices = np.asarray(indices).astype("int32")
        preds = tf.keras.activations.softmax(tf.expand_dims(logits, 0))[0]
        preds = np.asarray(preds).astype("float32")
        return np.random.choice(indices, p=preds)

    def detokenize(self, number):
        """Map a token id back to its character."""
        return self.vocab[number]

    def on_epoch_end(self, epoch, logs=None):
        """Generate and print a sample every ``print_every`` epochs."""
        if (epoch + 1) % self.print_every != 0:
            return

        context_len = max_len - 1  # fixed input length of the model
        stop_tokens = (dot_index, fictional_index)
        start_tokens = list(self.start_tokens)
        tokens_generated = []

        # Strict bound: produce at most `max_tokens` tokens.
        while len(tokens_generated) < self.max_tokens:
            if len(start_tokens) > context_len:
                # Keep the most recent window so newly generated tokens still
                # condition the next prediction.
                x = start_tokens[-context_len:]
                sample_index = context_len - 1
            else:
                # Right-pad the context to the model's input length.
                pad_len = context_len - len(start_tokens)
                x = start_tokens + [fictional_index] * pad_len
                sample_index = len(start_tokens) - 1
            y, _ = self.model.predict(np.array([x]))
            sample_token = self.sample_from(y[0][sample_index])
            tokens_generated.append(sample_token)
            start_tokens.append(sample_token)
            if sample_token in stop_tokens:
                break

        txt = "".join(
            [self.detokenize(token) for token in self.start_tokens + tokens_generated]
        )
        print(f"generated text:\n{txt}\n")

In [None]:
# Prompt that every generated sample starts from.
line_start = "we "
# Encode the prompt with the same character -> id mapping used for training.
start_tokens = [indices[char] for char in line_start]
# Sample from the two most likely characters and print a sample every 10 epochs.
text_gen_callback = TextGenerator(max_len, start_tokens, vocab, top_k=2, print_every=10)

In [None]:
# Build a freshly initialised model.
model = create_model()

# Train on the batched dataset; the callback prints generated text as training proceeds.
model.fit(dataset, verbose=1, epochs=1000, callbacks=[text_gen_callback])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
generated text:
we prore the the a these tor an the the an the and these the tof theratis.

Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
generated text:
we propose these topers a sumplent a the a setem a and and a the to the the the anding anding.

Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
generated text:
we proposed a method the to the are and and the propose an a text of sentences .

Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
generated text:
we describe a semantics and approach that a substantation summarization structure strumew.

Epoch 41/1000
Epoch 42/1000
Epoch 43/10

<tensorflow.python.keras.callbacks.History at 0x7fc6f31c0250>