# Generating New TED Talk Descriptions with RNN's

intro


First we'll import our libraries and data and take a peak:

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

data_path = "ted_main.csv"
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,comments,description,duration,event,film_date,languages,main_speaker,name,num_speaker,published_date,ratings,related_talks,speaker_occupation,tags,title,url,views
0,4553,Sir Ken Robinson makes an entertaining and pro...,1164,TED2006,1140825600,60,Ken Robinson,Ken Robinson: Do schools kill creativity?,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 19645}, {...","[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...",Author/educator,"['children', 'creativity', 'culture', 'dance',...",Do schools kill creativity?,https://www.ted.com/talks/ken_robinson_says_sc...,47227110
1,265,With the same humor and humanity he exuded in ...,977,TED2006,1140825600,43,Al Gore,Al Gore: Averting the climate crisis,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...","[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...",Climate advocate,"['alternative energy', 'cars', 'climate change...",Averting the climate crisis,https://www.ted.com/talks/al_gore_on_averting_...,3200520
2,124,New York Times columnist David Pogue takes aim...,1286,TED2006,1140739200,26,David Pogue,David Pogue: Simplicity sells,1,1151367060,"[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...","[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...",Technology columnist,"['computers', 'entertainment', 'interface desi...",Simplicity sells,https://www.ted.com/talks/david_pogue_says_sim...,1636292
3,200,"In an emotionally charged talk, MacArthur-winn...",1116,TED2006,1140912000,35,Majora Carter,Majora Carter: Greening the ghetto,1,1151367060,"[{'id': 3, 'name': 'Courageous', 'count': 760}...","[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...",Activist for environmental justice,"['MacArthur grant', 'activism', 'business', 'c...",Greening the ghetto,https://www.ted.com/talks/majora_carter_s_tale...,1697550
4,593,You've never seen data presented like this. Wi...,1190,TED2006,1140566400,48,Hans Rosling,Hans Rosling: The best stats you've ever seen,1,1151440680,"[{'id': 9, 'name': 'Ingenious', 'count': 3202}...","[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...",Global health expert; data visionary,"['Africa', 'Asia', 'Google', 'demo', 'economic...",The best stats you've ever seen,https://www.ted.com/talks/hans_rosling_shows_t...,12005869


That looks good, but we don't need all those variables. Let's trim our dataset to just what we need:

In [38]:
data2 = data["description"]
data2.head()

0    Sir Ken Robinson makes an entertaining and pro...
1    With the same humor and humanity he exuded in ...
2    New York Times columnist David Pogue takes aim...
3    In an emotionally charged talk, MacArthur-winn...
4    You've never seen data presented like this. Wi...
Name: description, dtype: object

## Preprocess the Data

In order for the computer to understand each word, we'll need to convert them into numbers. 

In [114]:
def create_lookup_table(data2):
    #Create two dictionaries, one that converts vocabulary to integers
    #and another that converts integers back to correct word
    
    all_words = []
    for descrip in data2:    
        for word in descrip.split(" "):
            all_words.append(word)
    

    from collections import Counter
    counts = Counter(all_words)
    vocab = sorted(counts, key = counts.get, reverse = True)
    vocab_to_int = {word: ind for ind, word in enumerate(vocab,1)}
    int_to_vocab = {ind: word for ind, word in enumerate(vocab,1)}
    input_text = [vocab_to_int[word] for word in all_words]

    return input_text, vocab_to_int, int_to_vocab

input_text, vocab_to_int, int_to_vocab = create_lookup_table(data2)


In [115]:
def punctuation_handler():
    
    punc_dict = {"." : "||Period||",
                  "," : "||Comma||",
                  '"' : "||Quotation_Mark||",
                  ";" : "||Semicolon||",
                  "!" : "||Exclamation_Mark||",
                  "?" : "||Question_Mark||",
                  "(" : "||Left_Parentheses||",
                  ")" : "||Right_Parentheses||",
                  "--" : "||Dash||",
                  "\n" : "||Return||"}
    return punc_dict


## Input

In [116]:
def get_inputs():
    #Create tf placeholders for inputs
    #Return: (inputs, targets, learning_rate)
    inputs = tf.placeholder(tf.int32, [None,None], name = "inputs")
    targets = tf.placeholder(tf.int32, [None, None], name = "targets")
    learning_rate = tf.placeholder(tf.float32, name = "learning_rate")
    
    return inputs, targets, learning_rate

### Build RNN Cell and Initialize:

In [117]:
def get_init_cell(batch_size, rnn_size):
    #Create RNN cell and initialize
    #Return: (rnn_cell, initial_state)
    
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    rnn_cell = tf.contrib.rnn.MultiRNNCell([lstm_cell])
    initial_state = rnn_cell.zero_state(batch_size, tf.float32)
    initial_state = tf.identity(initial_state, name = "initial_state")
    
    return rnn_cell, initial_state

### Embed Words:

In [118]:
def get_embed(inputs, vocab_size, embed_dims):
    #Create embedding for inputs, Return embedded_inputs
    
    embedding = tf.Variable(tf.random_uniform((vocab_size, embed_dim), -1, 1))
    embedded_inputs = tf.nn.embedding_lookup(embedding, inputs)
    
    return embedded_inputs

### Build RNN

In [119]:
def build_rnn(rnn_cell, inputs):
    #Build RNN using rnn_cell created earlier
    
    outputs, final_state = tf.nn.dynamic_rnn(rnn_cell, inputs, dtype = tf.float32)
    final_state = tf.identity(final_state, name = "final_state")
    
    return outputs, final_state

### Build the Neural Network

In [120]:
def build_network(rnn_cell, rnn_size, inputs, vocab_size, embed_dims):
    
    embed = get_embed(inputs, vocab_size, rnn_size)
    outputs, final_state = build_rnn(rnn_cell, embed)
    
    logits = tf.contrib.layers.fully_connected(outputs, vocab_size, activation_fn = None)
    
    return logits, final_state

### Batches

In [121]:
def get_batches(input_text, batch_size, seq_length):
    #Return batches as needed to the rnn
    
    n_batches = len(input_text)// (batch_size * seq_length)
    
    #We only want full batches, so we drop the last few chars:
    x = np.array(input_text[: n_batches * batch_size * seq_length])
    y = np.array(input_text[1: n_batches * batch_size * seq_length + 1])
    
    #Circle the first input to the last target:
    y[-1] = x[0]
    
    x_batches = np.split(x.reshape(batch_size, -1), n_batches, 1)
    y_batches = np.split(y.reshape(batch_size, -1), n_batches, 1)
    
    batches = np.array(list(zip(x_batches, y_batches)))
    
    return batches

### Hyperparameters:

In [125]:
# Number of Epochs
num_epochs = 4
# Batch Size
batch_size = 128
# RNN Size
rnn_size = 200
# Embedding Dimension Size
embed_dim = 200
# Sequence Length
seq_length = 25
# Learning Rate
learning_rate = 0.01
# Show stats for every n number of batches
show_every_n_batches = 5

### Build the Graph

In [126]:
from tensorflow.contrib import seq2seq

train_graph = tf.Graph()
with train_graph.as_default():
    inputs, targets, learn_rate = get_inputs()
    vocab_size = len(int_to_vocab)
    input_shape = tf.shape(inputs)
    rnn_cell, initial_state = get_init_cell(input_shape[0], rnn_size)
    logits, final_state = build_network(rnn_cell, rnn_size, inputs, vocab_size, embed_dim)
    
    #Probabilities for generating words:
    probs = tf.nn.softmax(logits, name = "probs")
    
    #Loss function:
    cost = seq2seq.sequence_loss(logits, targets, tf.ones([input_shape[0], input_shape[1]]))
    
    #Optimizer
    optimizer = tf.train.AdamOptimizer(learn_rate)
    
    #Gradient Clipping
    gradients = optimizer.compute_gradients(cost)
    capped_grads = [(tf.clip_by_value(grad, -1.0, 1.0), var) for grad, var in gradients if grad is not None]
    train_optimizer = optimizer.apply_gradients(capped_grads)

### Train

In [127]:
batches = get_batches(input_text, batch_size, seq_length)

with tf.Session(graph = train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch_i in range(num_epochs):
        state = sess.run(initial_state, {inputs: batches[0][0]})
        
        for batch_i, (x,y) in enumerate(batches):
            feed = {
                inputs: x,
                targets: y,
                initial_state: state,
                learn_rate: learning_rate
                
            }
            train_loss, state, _ = sess.run([cost, final_state, train_optimizer], feed)
            
            #Show every show_every_n_batches batches
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))
    #Save Model
    saver = tf.train.Saver()
    saver.save(sess, './save')
    print("Model Trained and Saved!")

Epoch   0 Batch    0/41   train_loss = 10.127
Epoch   0 Batch    5/41   train_loss = 8.748
Epoch   0 Batch   10/41   train_loss = 8.733
Epoch   0 Batch   15/41   train_loss = 8.413
Epoch   0 Batch   20/41   train_loss = 8.143
Epoch   0 Batch   25/41   train_loss = 8.108
Epoch   0 Batch   30/41   train_loss = 7.883
Epoch   0 Batch   35/41   train_loss = 8.079
Epoch   0 Batch   40/41   train_loss = 7.965
Epoch   1 Batch    4/41   train_loss = 7.459
Epoch   1 Batch    9/41   train_loss = 7.464
Epoch   1 Batch   14/41   train_loss = 7.430
Epoch   1 Batch   19/41   train_loss = 7.226
Epoch   1 Batch   24/41   train_loss = 7.189
Epoch   1 Batch   29/41   train_loss = 7.098
Epoch   1 Batch   34/41   train_loss = 7.073
Epoch   1 Batch   39/41   train_loss = 6.983
Epoch   2 Batch    3/41   train_loss = 6.761
Epoch   2 Batch    8/41   train_loss = 6.718
Epoch   2 Batch   13/41   train_loss = 6.853
Epoch   2 Batch   18/41   train_loss = 6.810
Epoch   2 Batch   23/41   train_loss = 6.790
Epoch   2

NameError: name 'save_dir' is not defined

In [128]:
def get_tensors(loaded_graph):
    """
    Get input, initial state, final state, and probabilities tensor from <loaded_graph>
    :param loaded_graph: TensorFlow graph loaded from file
    :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)
    """
    input_tensor = loaded_graph.get_tensor_by_name("input:0")
    initial_state_tensor = loaded_graph.get_tensor_by_name("initial_state:0")
    final_state_tensor = loaded_graph.get_tensor_by_name("final_state:0")
    probs_tensor = loaded_graph.get_tensor_by_name("probs:0")
    
    
    
    return (input_tensor, initial_state_tensor, final_state_tensor, probs_tensor)



In [129]:
def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word in the generated text
    :param probabilities: Probabilites of the next word
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
    
    word = np.random.choice(list(int_to_vocab.values()), 1, p=np.squeeze(probabilities))[0]
    
    return word

In [131]:
gen_length = 200

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    #loader = tf.train.import_meta_graph(load_dir + '.meta')
    #loader.restore(sess, load_dir)

    # Get Tensors from loaded model
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)

    # Sentences generation setup
    gen_sentences = []
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        pred_word = pick_word(probabilities[:,dyn_seq_length-1], int_to_vocab)

        gen_sentences.append(pred_word)
    
    # Remove tokens
    tv_script = ' '.join(gen_sentences)
    for key, token in token_dict.items():
        ending = ' ' if key in ['\n', '(', '"'] else ''
        tv_script = tv_script.replace(' ' + token.lower(), key)
    tv_script = tv_script.replace('\n ', '\n')
    tv_script = tv_script.replace('( ', '(')
        
    print(tv_script)

KeyError: "The name 'input:0' refers to a Tensor which does not exist. The operation, 'input', does not exist in the graph."