# Clickbait Generator
Give me those clicks!!!
Data taken from https://github.com/bhargaviparanjape/clickbait

## Data Collection
1. Download source file
2. Save the archive locally (it is decompressed later, during data processing)

In [None]:
import os
import requests

In [None]:
# Local path of the gzip archive: written by download_data() and read by load_text().
data_file = "clickbait.gz"

def download_data():
    """Download the gzipped clickbait headline archive to ``data_file``.

    Does nothing (apart from printing a notice) when ``data_file`` already
    exists, so the notebook can be re-run without hitting the network.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.RequestException: on connection problems or timeout.
    """
    if os.path.exists(data_file):
        print("Found data file, not downloading")
        return

    print("Downloading data...")

    zip_file = requests.get(
        "https://raw.githubusercontent.com/bhargaviparanjape/clickbait/master/dataset/clickbait_data.gz",
        timeout=60,  # do not hang forever on a stalled connection
    )
    # Fail before writing anything: otherwise an error page would be saved as
    # data_file and the existence check above would block every later retry.
    zip_file.raise_for_status()

    with open(data_file, "wb") as f:
        f.write(zip_file.content)
        

    

In [None]:
# Fetch the dataset archive (skipped when the file already exists locally).
download_data()

## Data Processing
1. Extract archived data
2. Add end_of_headline terminator word
3. Tokenize titles
4. Split into X (input word windows) and Y (the word that follows each window)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import gzip
import numpy as np

# Vocabulary cap: used as the Tokenizer's num_words, the number of one-hot
# classes for Y, and the size of the embedding matrix / softmax output.
vocab_size = 8000
# Number of tokens in each training window.
sample_length = 20
# Distance, in tokens, between the starts of consecutive windows.
step = 2
# Marker word that load_text() substitutes for every blank line in the data;
# generate_seq() stops when the model emits it.
end_of_headline = 'eoh'

In [None]:
def load_text():
    """Read the gzipped data file and return its contents as one string.

    Every blank-line separator (two consecutive newlines) is replaced by the
    ``end_of_headline`` marker surrounded by spaces.

    Returns:
        str: the full decompressed text with separators replaced.
    """
    # Decode explicitly as UTF-8; without an encoding, text mode falls back to
    # the platform default, which differs between machines.
    with gzip.open(data_file, 'rt', encoding='utf-8') as f:
        return f.read().replace('\n\n', f" {end_of_headline} ")

def split_into_samples(text):
    """Cut a token sequence into shuffled (window, next token) training pairs.

    A window of ``sample_length`` tokens is taken every ``step`` positions;
    the token right after each window is its target. Both arrays are shuffled
    with the same random permutation so the pairs stay aligned.

    Args:
        text: indexable sequence of tokens.

    Returns:
        tuple: (windows, targets) as NumPy arrays in matching shuffled order.
    """
    window_starts = range(0, len(text) - sample_length, step)

    windows = [text[start:start + sample_length] for start in window_starts]
    targets = [text[start + sample_length] for start in window_starts]

    order = np.random.permutation(len(windows))
    shuffled_windows = np.asarray(windows)[order]
    shuffled_targets = np.asarray(targets)[order]
    return shuffled_windows, shuffled_targets

In [None]:
# Seed NumPy's global RNG so the shuffle inside split_into_samples() is repeatable.
np.random.seed(0)

text = load_text()

# Fit the vocabulary on the whole text, then rebind `text` to its sequence of
# word indices (the Tokenizer is limited to vocab_size words via num_words).
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts([text])
text = tokenizer.texts_to_sequences([text])[0]
X, Y = split_into_samples(text)

print(f"{len(tokenizer.word_index)} total words")
print(f"{len(X)} samples")
# One-hot encode the next-word targets over vocab_size classes.
Y = to_categorical(Y, num_classes=vocab_size)


## LSTM Model
Uses GloVe word embeddings

In [None]:
import matplotlib.pyplot as plt
import tensorflow.keras
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Dropout
from tensorflow.keras.models import Model

In [None]:
def get_embedding_weights(tokenizer):
    """Build an embedding matrix for the tokenizer's vocabulary from GloVe vectors.

    Reads ``glove.6B.100d.txt`` from the working directory. Row ``i`` of the
    result holds the vector of the word whose tokenizer index is ``i``; words
    with an index >= ``vocab_size`` are skipped, and rows without a matching
    vector stay all-zero. When the word itself is missing from GloVe, two
    fallbacks are tried in order: the word with every ``'s`` removed, then
    that form without a trailing ``s``.

    Prints the number and list of vocabulary words that could not be matched.

    Args:
        tokenizer: fitted tokenizer exposing ``word_index`` (word -> index).

    Returns:
        numpy.ndarray: matrix of shape ``(vocab_size, 100)``.
    """
    # code based on Deep Learning with Python, 6.1.3
    glove_file = 'glove.6B.100d.txt'
    embedding_dim = 100 # should match glove filename

    embedding_index = {}

    # Open explicitly as UTF-8 instead of relying on the platform default
    # encoding, which may fail on non-ASCII tokens.
    with open(glove_file, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # tolerate blank lines instead of raising IndexError
                continue
            embedding_index[values[0]] = np.asarray(values[1:], dtype='float32')

    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    not_found = []

    for word, i in tokenizer.word_index.items():
        if i >= vocab_size:
            continue

        # try the word itself, then common 's and plural endings
        without_possessive = word.replace('\'s', '')
        candidates = [word, without_possessive]
        if without_possessive.endswith('s'):
            candidates.append(without_possessive[:-1])

        for candidate in candidates:
            embedding_vector = embedding_index.get(candidate)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                break
        else:
            # report the vocabulary word itself, not its stripped-down form
            not_found.append(word)

    print(f"Not found: {len(not_found)}\n{not_found}")

    return embedding_matrix

In [None]:
# Build the embedding matrix for the fitted tokenizer's vocabulary.
embedding_weights = get_embedding_weights(tokenizer)

In [None]:
def create_model(sample_length, vocab_size, embedding_weights):
    """Build the next-word prediction network.

    Architecture: frozen embedding -> two stacked 256-unit LSTMs (input and
    recurrent dropout 0.5) -> Dense(500, relu) -> Dropout(0.5) -> softmax over
    ``vocab_size`` words. The returned model is not compiled.

    Args:
        sample_length: number of token indices in each input window.
        vocab_size: number of rows in the embedding and of output classes.
        embedding_weights: 2-D matrix with ``vocab_size`` rows; its second
            dimension determines the embedding width.

    Returns:
        The assembled Keras ``Model``.
    """
    # Take the embedding width from the supplied matrix rather than
    # hard-coding 100, so other pretrained vector sizes work too.
    embedding_dim = np.shape(embedding_weights)[1]

    input_layer = Input(shape=(sample_length,))

    # Keep a handle on the layer so it does not have to be found by position
    # in model.layers afterwards.
    embedding_layer = Embedding(vocab_size, embedding_dim, input_length=sample_length)

    m = embedding_layer(input_layer)
    m = LSTM(256, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)(m)
    m = LSTM(256, dropout=0.5, recurrent_dropout=0.5)(m)
    m = Dense(500, activation='relu')(m)
    m = Dropout(0.5)(m)
    m = Dense(vocab_size, activation='softmax')(m)

    model = Model(inputs=[input_layer], outputs=m)

    # Load the pretrained vectors and freeze them.
    embedding_layer.set_weights([embedding_weights])
    embedding_layer.trainable = False # todo maybe make this True

    return model

def make_plots(history):
    """Plot training vs. validation curves for accuracy and loss, then show them.

    Accuracy is drawn on the current figure and loss on a new one. Training
    values are blue dots, validation values a blue line.

    Args:
        history: object whose ``history`` mapping holds per-epoch lists under
            'accuracy', 'val_accuracy', 'loss' and 'val_loss'.
    """
    # Read all four series up front so a missing key fails before any drawing.
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']

    epochs = range(1, len(acc) + 1)

    panels = (
        ('Accuracy', acc, 'Training accuracy', val_acc, 'Validation accuracy'),
        ('Loss', loss, 'Training loss', val_loss, 'Validation loss'),
    )

    for position, (title, train, train_label, val, val_label) in enumerate(panels):
        if position > 0:
            # every panel after the first gets its own figure
            plt.figure()
        plt.plot(epochs, train, 'bo', label=train_label)
        plt.plot(epochs, val, 'b', label=val_label)
        plt.title(title)
        plt.legend()

    plt.show()

In [None]:
# Build the network, then compile it with categorical cross-entropy to match
# the one-hot encoded targets in Y.
model = create_model(sample_length, vocab_size, embedding_weights)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 15 epochs in batches of 200, holding out 5% of the samples for
# validation; `history` is later passed to make_plots().
history = model.fit(X, Y, batch_size=200, epochs=15, validation_split=0.05)
# model.save_weights('model.h5')

In [None]:
# Show the accuracy and loss curves recorded during training.
make_plots(history)

## Prediction

In [None]:
# temperature sampling based on Deep Learning book
def sample(preds, temperature):
    """Draw one index from a probability vector reweighted by *temperature*.

    The probabilities are rescaled as ``softmax(log(preds) / temperature)``
    and a single index is drawn from the result. Lower temperatures
    concentrate the draw on the most likely entries.

    Args:
        preds: 1-D sequence of probabilities.
        temperature: rescaling factor. ``0`` selects the most likely index
            deterministically.

    Returns:
        The sampled index (NumPy integer).
    """
    preds = np.asarray(preds).astype('float64')

    if temperature == 0:
        # Limit of the rescaling as temperature -> 0; also avoids dividing by zero.
        return np.argmax(preds)

    # log(0) is -inf, which exp() maps back to probability 0, so the
    # divide-by-zero warning would only be noise.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature

    # Shift so the largest value is 0. The normalised result is unchanged, but
    # exp() can no longer underflow every entry to 0 (a 0/0 NaN distribution)
    # at low temperatures.
    exp_preds = np.exp(scaled - np.max(scaled))
    preds = exp_preds / np.sum(exp_preds)

    probs = np.random.multinomial(1, preds, 1)
    return np.argmax(probs)

def generate_seq(model, tokenizer, seq_length, seed_text, n_words, temperature = 0.5):
    """Generate a headline word by word, starting from *seed_text*.

    At each step the text so far is encoded, padded or truncated to
    ``seq_length`` tokens, fed to the model, and the next word is drawn with
    temperature sampling. Generation stops after ``n_words`` words or as soon
    as the ``end_of_headline`` marker is produced (the marker itself is not
    included in the output).

    Args:
        model: trained model whose ``predict`` returns next-word probabilities.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.
        seq_length: number of tokens the model expects as input.
        seed_text: starting text; may be empty.
        n_words: maximum number of words to generate.
        temperature: sampling temperature passed to ``sample``.

    Returns:
        str: the seed words followed by the generated words, space-separated.
    """
    # Invert the vocabulary once, rather than scanning every (word, index)
    # pair of word_index for each generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    result = seed_text.split()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        preds = model.predict(encoded, verbose=0)
        yhat = sample(preds[0], temperature)
        # map predicted word index to word; 'X' stands in for an index that
        # has no vocabulary entry
        out_word = index_to_word.get(int(yhat), 'X')
        if out_word == end_of_headline:
            break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [None]:
# Generate ten headlines from an empty seed, at most 20 words each, at temperature 0.5.
for i in range(0,10):
    output = generate_seq(model, tokenizer, sample_length, "", 20, .5)
    print(f"{i} {output}")

## Results
My favorite results so far
- we know your zodiac sign based on your zodiac sign
- are you more like more dog or a dog
- the 17 most important canadian celebrity moments of 2015
- this is what it's like when you have a boyfriend
- the new best thing you've ever been
- are you more best or the most mermaid
- this is what it's like to be a best of the world
- these women are actually super
- here's how to make a vampire
- can you guess your favorite '90s movie based on your favorite kitten
- are you more a canadian or taylor swift or oprah