# FNET AutoEncoder for Text Generation
Created by Paul K. Mandal

This code is an autoencoder based off of the proposed FNET Architecture that can be found here: https://arxiv.org/abs/2105.03824

In [1]:
import os, json
import pandas as pd

path_to_json = 'c4/realnewslike/'
json_files = [ 'c4/realnewslike/' + pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]

data = []
i = 0
for file in json_files:
        for line in open(file, 'r'):
            if i < 1000000:
                data.append(json.loads(line))
                i+=1

In [2]:
len(data)

1000000

In [3]:
titles = []

for i in range(0,len(data)):
    titles.append(data[i]['text'])
    

In [4]:
len(titles)

1000000

In [5]:
x_test = titles[:100000]
x_train = titles[100000:]
x_val = x_train[:100000]
x_partial_train = x_train[100000:]

In [6]:
x_test[0]

'A good deal of sunshine. High around 85F. Winds light and variable..\nThe first time the McDowell Lady Titans played the Watauga Pioneers this season, McDowell was admittedly overwhelmed in a 9-0 defeat that ended in the 63rd minute via the mercy rule.\nThe second match between the two also resulted in loss for the Lady Titans, but Thursday’s 4-0 defeat was nothing like the first game. McDowell gave up three goals in the opening 15 minutes and then battled the league-leading Pioneers tooth-and-nail for the final 75 in a much closer 4-0 defeat.\nThe Lady Titans (4-9 overall, 2-5 Northwestern 3A/4A Conference) couldn’t find the net, but Hunt said they played hard the whole way and never quit despite being behind Watauga (9-3, 7-0) most of the way.\nMcDowell got a big game from junior goalkeeper Leila Bragg, who stopped 18 Watauga shots. One of the Pioneers’ three goals came on an error by the Lady Titans.\n“Top honors go to senior Noelle Fields for her leadership on the field, senior Be

In [7]:
import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((x_partial_train, x_partial_train))
val_dataset = tf.data.Dataset.from_tensor_slices((x_val,x_val))
train_full = tf.data.Dataset.from_tensor_slices((x_train, x_train))
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, x_test))

2022-08-06 14:36:12.529564: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-06 14:36:12.590067: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-06 14:36:12.592169: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-06 14:36:12.613343: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [8]:
from keras import layers

def preprocess_text(sentence):
    sentence = tf.strings.lower(sentence)
    # Adding a space between the punctuation and the last word to allow better tokenization
    sentence = tf.strings.regex_replace(sentence, r"([?.!,])", r" \1 ")
    # Replacing multiple continuous spaces with a single space
    sentence = tf.strings.regex_replace(sentence, r"\s\s+", " ")
    # Replacing non english words with spaces
    sentence = tf.strings.regex_replace(sentence, r"[^a-z?.!,]+", " ")
    sentence = tf.strings.strip(sentence)
    sentence = tf.strings.join(["[start]", sentence, "[end]"], separator=" ")
    return sentence

VOCAB_SIZE = 10000
MAX_LENGTH = 150
BATCH_SIZE = 1024

#VOCAB_SIZE = max_words
MAX_SAMPLES = 50000
BUFFER_SIZE = 20000
#MAX_LENGTH = maxlen
EMBED_DIM = 128
LATENT_DIM = 64
NUM_HEADS = 8
BATCH_SIZE = 64

vectorizer = layers.TextVectorization(
    VOCAB_SIZE,
    standardize=preprocess_text,
    output_mode="int",
    output_sequence_length=MAX_LENGTH,
)

# We will adapt the vectorizer to both the questions and answers
# This dataset is batched to parallelize and speed up the process
vectorizer.adapt(tf.data.Dataset.from_tensor_slices(titles).batch(128))


In [9]:
def vectorize_text(inputs, outputs):
    inputs, outputs = vectorizer(inputs), vectorizer(outputs)
    # One extra padding token to the right to match the output shape
    outputs = tf.pad(outputs, [[0, 1]])
    return (
        {"encoder_inputs": inputs, "decoder_inputs": outputs[:-1]},
        {"outputs": outputs[1:]},
    )


train_dataset = train_dataset.map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)
val_dataset = val_dataset.map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)

train_dataset = (
    train_dataset.cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

train_full = (
    train_full.cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)


val_dataset = val_dataset.cache().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.cache().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [18]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras.layers import LSTM
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.layers import Bidirectional

#from tensorflow.python.framework.ops import disable_eager_execution
#disable_eager_execution()

class FNetEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, **kwargs):
        super(FNetEncoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(dense_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs):
        # Casting the inputs to complex64
        inp_complex = tf.cast(inputs, tf.complex64)
        # Projecting the inputs to the frequency domain using FFT2D and
        # extracting the real part of the output
        fft = tf.math.real(tf.signal.fft2d(inp_complex))
        proj_input = self.layernorm_1(inputs + fft)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super(PositionalEmbedding, self).__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)


class FNetDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super(FNetDecoder, self).__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(latent_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)
    


def create_model():
    encoder_inputs = keras.Input(shape=(None,), dtype="int32", name="encoder_inputs")
    x = PositionalEmbedding(MAX_LENGTH, VOCAB_SIZE, EMBED_DIM)(encoder_inputs)
    encoder_outputs = FNetEncoder(EMBED_DIM, LATENT_DIM)(x)
    encoder = keras.Model(encoder_inputs, encoder_outputs)
    decoder_inputs = keras.Input(shape=(None,), dtype="int32", name="decoder_inputs")
    encoded_seq_inputs = keras.Input(
        shape=(None, EMBED_DIM), name="decoder_state_inputs"
    )
    x = PositionalEmbedding(MAX_LENGTH, VOCAB_SIZE, EMBED_DIM)(decoder_inputs)
    x = FNetDecoder(EMBED_DIM, LATENT_DIM, NUM_HEADS)(x, encoded_seq_inputs)
    x = layers.Dropout(0.5)(x)
    decoder_outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
    decoder = keras.Model(
        [decoder_inputs, encoded_seq_inputs], decoder_outputs, name="outputs"
    )
    decoder_outputs = decoder([decoder_inputs, encoder_outputs])
    fnet = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs, name="fnet")
    return fnet

fnet = create_model()
fnet.compile("adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
fnet.summary()

Model: "fnet"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding_4 (Positi  (None, None, 128)   1299200     ['encoder_inputs[0][0]']         
 onalEmbedding)                                                                                   
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 f_net_encoder_2 (FNetEncoder)  (None, None, 128)    17088       ['positional_embedding_4[0][0]

In [19]:
VOCAB = vectorizer.get_vocabulary()


def decode_sentence(input_sentence):
    # Mapping the input sentence to tokens and adding start and end tokens
    tokenized_input_sentence = vectorizer(
        tf.constant("[start] " + preprocess_text(input_sentence) + " [end]")
    )
    # Initializing the initial sentence consisting of only the start token.
    tokenized_target_sentence = tf.expand_dims(VOCAB.index("[start]"), 0)
    decoded_sentence = ""

    for i in range(MAX_LENGTH):
        # Get the predictions
        predictions = fnet.predict(
            {
                "encoder_inputs": tf.expand_dims(tokenized_input_sentence, 0),
                "decoder_inputs": tf.expand_dims(
                    tf.pad(
                        tokenized_target_sentence,
                        [[0, MAX_LENGTH - tf.shape(tokenized_target_sentence)[0]]],
                    ),
                    0,
                ),
            }
        )
        # Calculating the token with maximum probability and getting the corresponding word
        sampled_token_index = tf.argmax(predictions[0, i, :])
        sampled_token = VOCAB[sampled_token_index.numpy()]
        # If sampled token is the end token then stop generating and return the sentence
        if tf.equal(sampled_token_index, VOCAB.index("[end]")):
            break
        decoded_sentence += sampled_token + " "
        tokenized_target_sentence = tf.concat(
            [tokenized_target_sentence, [sampled_token_index]], 0
        )

    return decoded_sentence


In [21]:
y = []

for i in range (0,50):
    history = fnet.fit(train_dataset, epochs=1, validation_data=val_dataset)
    y.append(decode_sentence(""))
    print("epoch: %s" %(i+1))

#vae.save('models/FNETv2.h5')

#history = model.fit(x_partial_train, y_partial_train, epochs = 20, batch_size = BATCH_SIZE, validation_data=(x_val, y_val))

epoch: 1
epoch: 2
epoch: 3
epoch: 4
epoch: 5
epoch: 6
epoch: 7
epoch: 8
epoch: 9
epoch: 10
epoch: 11
epoch: 12
epoch: 13
epoch: 14
epoch: 15
epoch: 16
epoch: 17
epoch: 18
epoch: 19
epoch: 20
epoch: 21
epoch: 22
epoch: 23
epoch: 24
epoch: 25
epoch: 26
epoch: 27
epoch: 28
epoch: 29
epoch: 30
epoch: 31
epoch: 32
epoch: 33
epoch: 34
epoch: 35
epoch: 36
epoch: 37
epoch: 38
epoch: 39
epoch: 40
epoch: 41
epoch: 42
epoch: 43
epoch: 44
epoch: 45
epoch: 46
epoch: 47
epoch: 48
epoch: 49
epoch: 50


In [23]:
f = open('null_sent_by_epoch.json', 'w', encoding='utf-8')

for i in range(0,len(y)):
    data = {'epoch': i+1, 'pred': y[i]}
    json.dump(data, f, ensure_ascii=False, indent=4)
    print(i)
f.close()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [None]:
f = open('results_FNETv3_full.json', 'w', encoding='utf-8')

for i in range(0,50):
    data = {'original': x_test[i], 'pred': decode_sentence(x_test[i])}
    json.dump(data, f, ensure_ascii=False, indent=4)
    print(i)
f.close()

In [26]:
decode_sentence("this is a test sentence")



'[UNK] end start , start this start is a novel amendment ready legislature however start fortune viewers peaceful exchange dairy sometime ! talked tribal changing service blake counter prof road . weaker parks indian mess up domestic hell clinics legendary services bad services start black sector damn health weekend , start john merchants nothing telecommunications don t . , . start of start over then hostage then take other take stack level . [UNK] . . each [UNK] the [UNK] off . end . . [UNK] . rivers [UNK] '

In [27]:
decode_sentence("Neural Networks have been shown to have use in Natural Language Processing")



'[UNK] have been start paced start to pose non regime have created california gateway get policy unlikely point thanks to coffee mohammad completely season border wallet part fresh football grew partial gear indiana hands lighter education injured entertainment taxpayer experience despite scratch poorly bucks planning beating meals last additions last bathrooms last bathrooms . off vice matter clothing matter clothing matter later nation shoot jobs . start . start , [UNK] directly . , . . . [UNK] , [UNK] . , end all anymore [UNK] about [UNK] [UNK] reaching start . [UNK] each start higher colored start detail . at some last some [UNK] john . t [UNK] . adam [UNK] , '

In [29]:
decode_sentence("Paul Mandal, a Neural Networks reseracher, has indadvertently proved that any moron with a computer can pursue research in machine learning")



'[UNK] , [UNK] start start paul a networks , [UNK] [UNK] imaging has proved that any time with a computer can pursue research machine in end learning end '

In [30]:
decode_sentence("Where are the snowden's of yesteryear?")



'[UNK] start at the start where are it s snowden clinton end ? excited bank virtual rebel clinton services excitement certified comic illegal allies recording temperature investing lies normal communications friendly april offensive supermarket often roll families yet plaza so tackle consumer officers pga link seven grandchildren wild operating crowd operating crowd until global until connected go inside . start over start . even square defeat spotify defeat . defeat along defeat . '

In [31]:
decode_sentence("More human remains have been found at drought-stricken Lake Mead National Recreation Area east of Las Vegas, authorities said Sunday.")



'[UNK] have been more start increasing problem saw north dakota volunteer building issued price beer eastern open la tribune of computers manchester el area said championship end emerge job c neighbours online wide intentionally panic indoor sen . dancing senate cars host motorists movie support sun disappointment officials touch officials touch officials vocal road interviews until leave . , start over then take points . hadn start such start amounts city such . [UNK] . behind nearly [UNK] , [UNK] , [UNK] end . , start , what go [UNK] . , certain per start . nancy [UNK] ! . ! clinton starting , start '