# Variational Auto Encoder based off of seq2seq for News Headline Generation
Created by Paul K. Mandal

This is a very simple Feed Forward Neural Network that I wrote in a few hours. As Andrew Ng said, the best thing to do is to immediately create a simple model and then add to it. I will be attempting to create a more advanced model later.

In [1]:
import json

data = []
for line in open('Sarcasm_Headlines_Dataset.json', 'r'):
    data.append(json.loads(line))

In [2]:
titles = []
y_vals = []

for i in range(0,len(data)):
    titles.append(data[i]['headline'])
    y_vals.append(data[i]['is_sarcastic'])
    

In [3]:
#Let's now do some preprocessing

import nltk
nltk.download('punkt')
from nltk import word_tokenize

titles_tokenized = []
for title in titles:
    titles_tokenized.append(word_tokenize(title))

[nltk_data] Downloading package punkt to /home/paul/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
titles_an = [] #alphanumeric
for title in titles_tokenized:
    words = [word for word in title if word.isalpha()]
    titles_an.append(words)

In [5]:
titles_an[0]

['former',
 'versace',
 'store',
 'clerk',
 'sues',
 'over',
 'secret',
 'code',
 'for',
 'minority',
 'shoppers']

In [6]:
#Let's now stem the words
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
titles_preprocessed = []
for title in titles_an:
    stemmed = [porter.stem(word) for word in title]
    titles_preprocessed.append(stemmed)

In [7]:
titles_preprocessed[0]

['former',
 'versac',
 'store',
 'clerk',
 'sue',
 'over',
 'secret',
 'code',
 'for',
 'minor',
 'shopper']

In [59]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

MAX_SEQUENCE_LENGTH = 20
MAX_NB_WORDS = 9999

#code from https://github.com/NicGian/text_VAE/blob/master/text_VAE_v18.ipynb
tokenizer = Tokenizer(MAX_NB_WORDS+1, oov_token='unk') #+1 for 'unk' token
tokenizer.fit_on_texts(titles_preprocessed)
print('Found %s unique tokens' % len(tokenizer.word_index))
## **Key Step** to make it work correctly otherwise drops OOV tokens anyway!
tokenizer.word_index = {e:i for e,i in tokenizer.word_index.items() if i <= MAX_NB_WORDS} # <= because tokenizer is 1 indexed
tokenizer.word_index[tokenizer.oov_token] = MAX_NB_WORDS + 1
sequences = tokenizer.texts_to_sequences(titles_preprocessed)

Found 16437 unique tokens


In [60]:
sequences[0]

[371, 9792, 716, 2739, 1285, 56, 322, 1893, 6, 1286, 3309]

In [61]:
#Convert back to text
text = tokenizer.sequences_to_texts(sequences)
text[0]

'former versac store clerk sue over secret code for minor shopper'

In [62]:
word_index = tokenizer.word_index #the dict values start from 1 so this is fine with zeropadding
index2word = {v: k for k, v in word_index.items()}
print('Found %s unique tokens.' %len(word_index))

Found 9999 unique tokens.


In [63]:
x_vals = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [64]:
print('Shape of data tensor:', data_1.shape)
NB_WORDS = (min(tokenizer.num_words, len(word_index))+1) #+1 for zero padding 

Shape of data tensor: (26709, 20)


In [65]:
x_real_vals = []
x_fake_vals = []

for i in range(0,len(x_vals)):
    if y_vals[i] == 0:
        x_real_vals.append(x_vals[i])
    else:
        x_fake_vals.append(x_vals[i])

In [66]:
len(x_real_vals)

14985

In [67]:
x_test = x_real_vals[:1500]
x_train = x_real_vals[1500:]
x_val = x_train[:1500]
x_partial_train = x_train[1500:]

In [68]:
#These two lines need to be uncommented to download the weight embeddings if it has not been done before
#!wget http://nlp.stanford.edu/data/glove.6B.zip
#!unzip -q glove.6B.zip

embeddings_index = {}
f = open('glove.6B.100d.txt', encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype = 'float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [69]:
embedding_dim = 100

glove_embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i<max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            glove_embedding_matrix[i] = embedding_vector

In [74]:
!pip install tensorflow-addons

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow-addons
  Downloading tensorflow_addons-0.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m731.4 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting typeguard>=2.7
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, tensorflow-addons
Successfully installed tensorflow-addons-0.17.1 typeguard-2.13.3


In [75]:
#Much thanks to NicGian for his VAE code!
#NicGian's Repo can be found here https://github.com/NicGian/text_VAE/blob/master/text_VAE_v18.ipynb
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from keras.layers import Bidirectional, Dense, Embedding, Input, Lambda, LSTM, RepeatVector, TimeDistributed, Layer, Activation, Dropout
#from keras.preprocessing.sequence import pad_sequences
from keras.layers import ELU
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras import backend as K
from keras.models import Model
from scipy import spatial
import tensorflow as tf
import pandas as pd
import numpy as np
import codecs
import csv
import os


max_len = 20
batch_size = 100
emb_dim = 100
latent_dim = 64
intermediate_dim = 256
epsilon_std = 1.0
kl_weight = 0.01
num_sampled=500
act = ELU()


x = Input(shape=(max_len,))
x_embed = Embedding(NB_WORDS, emb_dim, weights=[glove_embedding_matrix],
                            input_length=max_len, trainable=False)(x)
h = Bidirectional(LSTM(intermediate_dim, return_sequences=False, recurrent_dropout=0.2), merge_mode='concat')(x_embed)
#h = Bidirectional(LSTM(intermediate_dim, return_sequences=False), merge_mode='concat')(h)
#h = Dropout(0.2)(h)
#h = Dense(intermediate_dim, activation='linear')(h)
#h = act(h)
#h = Dropout(0.2)(h)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
                              stddev=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])
# we instantiate these layers separately so as to reuse them later
repeated_context = RepeatVector(max_len)
decoder_h = LSTM(intermediate_dim, return_sequences=True, recurrent_dropout=0.2)
decoder_mean = Dense(NB_WORDS, activation='linear')#softmax is applied in the seq2seqloss by tf #TimeDistributed()
h_decoded = decoder_h(repeated_context(z))
x_decoded_mean = decoder_mean(h_decoded)


# placeholder loss
def zero_loss(y_true, y_pred):
    return K.zeros_like(y_pred)

#Sampled softmax
#logits = tf.constant(np.random.randn(batch_size, max_len, NB_WORDS), tf.float32)
#targets = tf.constant(np.random.randint(NB_WORDS, size=(batch_size, max_len)), tf.int32)
#proj_w = tf.constant(np.random.randn(NB_WORDS, NB_WORDS), tf.float32)
#proj_b = tf.constant(np.zeros(NB_WORDS), tf.float32)
#
#def _sampled_loss(labels, logits):
#    labels = tf.cast(labels, tf.int64)
#    labels = tf.reshape(labels, [-1, 1])
#    logits = tf.cast(logits, tf.float32)
#    return tf.cast(
#                    tf.nn.sampled_softmax_loss(
#                        proj_w,
#                        proj_b,
#                        labels,
#                        logits,
#                        num_sampled=num_sampled,
#                        num_classes=NB_WORDS),
#                    tf.float32)
#softmax_loss_f = _sampled_loss


# Custom loss layer
class CustomVariationalLayer(Layer):
    def __init__(self, **kwargs):
        self.is_placeholder = True
        super(CustomVariationalLayer, self).__init__(**kwargs)
        self.target_weights = tf.constant(np.ones((batch_size, max_len)), tf.float32)

    def vae_loss(self, x, x_decoded_mean):
        #xent_loss = K.sum(metrics.categorical_crossentropy(x, x_decoded_mean), axis=-1)
        labels = tf.cast(x, tf.int32)
        xent_loss = K.sum(tfa.seq2seq.sequence_loss(x_decoded_mean, labels, 
                                                     weights=self.target_weights,
                                                     average_across_timesteps=False,
                                                     average_across_batch=False), axis=-1)#,
                                                     #softmax_loss_function=softmax_loss_f), axis=-1)#,
        kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
        xent_loss = K.mean(xent_loss)
        kl_loss = K.mean(kl_loss)
        return K.mean(xent_loss + kl_weight * kl_loss)

    def call(self, inputs):
        x = inputs[0]
        x_decoded_mean = inputs[1]
        print(x.shape, x_decoded_mean.shape)
        loss = self.vae_loss(x, x_decoded_mean)
        self.add_loss(loss, inputs=inputs)
        # we don't use this output, but it has to have the correct shape:
        return K.ones_like(x)
    
def kl_loss(x, x_decoded_mean):
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    kl_loss = kl_weight * kl_loss
    return kl_loss

loss_layer = CustomVariationalLayer()([x, x_decoded_mean])
vae = Model(x, [loss_layer])
opt = Adam(lr=0.01) 
vae.compile(optimizer='adam', loss=[zero_loss], metrics=[kl_loss])
vae.summary()

(None, 20) (100, 20, 10000)
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_9 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_5 (Embedding)        (None, 20, 100)      1000000     ['input_9[0][0]']                
                                                                                                  
 bidirectional_3 (Bidirectional  (None, 512)         731136      ['embedding_5[0][0]']            
 )                                                                                                
                                                                                                  
 dense_9 (Dense)                (None, 64)           32832       [

  super(Adam, self).__init__(name, **kwargs)


In [21]:
from tensorflow import keras
from keras import layers
from keras.layers import LSTM
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.layers import Bidirectional

BATCH_SIZE = 128

model = Sequential()
model.add(layers.Embedding(max_words, 128, input_length=maxlen))
model.add(layers.Conv1D(32, 7, activation='relu', padding='same'))
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7, activation='relu', padding='same'))
model.add(layers.TimeDistributed(Flatten()))
model.add(Bidirectional(LSTM(16, recurrent_dropout=0.5)))
model.add(Dense(1,activation='sigmoid'))
model.compile(optimizer = 'adam', loss = "binary_crossentropy", metrics = ['accuracy'])



In [20]:
history = model.fit(x_partial_train, y_partial_train, epochs = 20, batch_size = BATCH_SIZE, validation_data=(x_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

KeyboardInterrupt: 

In [22]:
history = model.fit(x_train, y_train, epochs = 2, batch_size = BATCH_SIZE)

Epoch 1/2
Epoch 2/2


In [23]:
results = model.evaluate(x_test, y_test)
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 20, 128)           1280000   
                                                                 
 conv1d_4 (Conv1D)           (None, 20, 32)            28704     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 4, 32)            0         
 1D)                                                             
                                                                 
 conv1d_5 (Conv1D)           (None, 4, 32)             7200      
                                                                 
 time_distributed_2 (TimeDis  (None, 4, 32)            0         
 tributed)                                                       
                                                                 
 bidirectional_2 (Bidirectio  (None, 32)              