In [3]:
import requests
from bs4 import BeautifulSoup

import tensorflow as tf

import numpy as np
import os
import time

import re

In [11]:
transcript_url = 'http://scrapsfromtheloft.com/stand-up-comedy-scripts/'

def get_urls(url):
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    links = set()
    for link in soup.find_all('a'):
        if link.get('href').endswith('transcript/'):
            links.add(re.sub(r"https", 'http', link.get('href')))
    return list(links)
     

urls = get_urls(transcript_url)

In [12]:
#SLOW RUNTIME WARNING --- BULK OF DATA COLLECTION

def url_to_transcript(url):
    '''Returns transcript data specifically from scrapsfromtheloft.com.'''
    page = requests.get(url).text
    soup = BeautifulSoup(page, "lxml")
    text = [p.text for p in soup.find(class_="elementor-element elementor-element-74af9a5b elementor-widget elementor-widget-theme-post-content").find_all('p')]
    return text

trans = [url_to_transcript(u) for u in urls]

In [15]:
def combine_text(list_of_text):
    '''
    Take a list of texts and combine them into one large chunk of text
    
    Return a text (larger one)
    '''
    return ' '.join(list_of_text)

#combine paragraphs in each transcript
trans_comb = [combine_text(tran) for tran in trans]
#combine transcripts
trans_comb = combine_text(trans_comb)

In [16]:
import re   
import string
import unicodedata

def clean_text_re(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation,
       remove words containing numbers, additional punctuation and non-sensical text.'''
    
    
    #text = text.lower()
    
    #remove things in brackets and parenthesis
    text = re.sub('\[.*?\]', '', text)
    #text = re.sub('\(.*?\)', '', text)
    
    #remove some sensorship
    text = re.sub(r'#%@@', 'shit', text)
    text = re.sub(r'!\+\$%\$!#', 'fucking', text)
    text = re.sub(r'\*!%\*', 'fuck', text)
    text = re.sub(r'\$%%%!\$', 'fucked', text)
    text = re.sub(r'\*\$\*', 'ass', text)
    
    #text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    #text = re.sub('\w*\d\w*', '', text)
    
    #convert ` to apostrophe
    text = re.sub(r'`', "'", text)
    text = re.sub(r'’', "'", text)
    #convert underscore to space
    text = re.sub(r'_', ' ', text)
    # remove rare symbol with word representation
    #text = re.sub(r'=', ' equals ', text)
    #text = re.sub(r'\+', ' plus ', text)
    
    #remove quotes, elipses, and new lines
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    
    #normalize text
    text = ascii(unicodedata.normalize('NFKD', text))
    #remove non utf-8 bytes (what used to be special characters)
    text = re.sub(r'\\u....', '', text) #remove anything normalize missed
    text = re.sub(r'\\x..', '', text) #remove anything normalize missed
    
    #final clean up of rare/special characters
    text = re.sub(r'["#\*>\\\]~]', '', text) 
    text = re.sub(r' +',' ', text) #make all spaces single spaces
    return text

data = clean_text_re(trans_comb)

In [4]:
file_path = "data.txt"

In [18]:

file = open(file_path, "w")
file.write(data)
file.close()

In [5]:
text = open(file_path, 'rb').read().decode(encoding='utf-8')
print(f'Length of text: {len(text)} characters')

Length of text: 15875767 characters


In [6]:
print(text[:250])

Man We were waiting for you. I'm happy you're here. I'm happy all of you are here. I have so much to tell you. You're comfortable? You can talk back to me. I want you guys to feel that. This only works if we feel like family. I know the camera's here


In [7]:
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')
print(vocab)

81 unique characters
[' ', '!', '$', '%', '&', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|']


In [8]:
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

def text_from_ids(ids):
  return tf.strings.reduce_join(chars_from_ids(ids), axis=-1)

In [9]:
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))
all_ids

<tf.Tensor: shape=(15875767,), dtype=int64, numpy=array([41, 55, 68, ..., 67, 59,  1], dtype=int64)>

In [10]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

for ids in ids_dataset.take(10):
    print(chars_from_ids(ids).numpy().decode('utf-8'))

M
a
n
 
W
e
 
w
e
r


In [11]:
seq_length = 100

sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'M' b'a' b'n' b' ' b'W' b'e' b' ' b'w' b'e' b'r' b'e' b' ' b'w' b'a'
 b'i' b't' b'i' b'n' b'g' b' ' b'f' b'o' b'r' b' ' b'y' b'o' b'u' b'.'
 b' ' b'I' b"'" b'm' b' ' b'h' b'a' b'p' b'p' b'y' b' ' b'y' b'o' b'u'
 b"'" b'r' b'e' b' ' b'h' b'e' b'r' b'e' b'.' b' ' b'I' b"'" b'm' b' '
 b'h' b'a' b'p' b'p' b'y' b' ' b'a' b'l' b'l' b' ' b'o' b'f' b' ' b'y'
 b'o' b'u' b' ' b'a' b'r' b'e' b' ' b'h' b'e' b'r' b'e' b'.' b' ' b'I'
 b' ' b'h' b'a' b'v' b'e' b' ' b's' b'o' b' ' b'm' b'u' b'c' b'h' b' '
 b't' b'o' b' '], shape=(101,), dtype=string)


In [12]:
for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

b"Man We were waiting for you. I'm happy you're here. I'm happy all of you are here. I have so much to "
b"tell you. You're comfortable? You can talk back to me. I want you guys to feel that. This only works "
b"if we feel like family. I know the camera's here and it's a whole thing. It's a big night, it's a lot"
b' of pressure. That kinda thing, you know? I want you guys to feel as comfortable as I hope to be. We '
b"got a lot of shit to talk about. I'm happy you're here. I need you. I wanna talk about secrets! Secre"


In [13]:
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b"Man We were waiting for you. I'm happy you're here. I'm happy all of you are here. I have so much to"
Target: b"an We were waiting for you. I'm happy you're here. I'm happy all of you are here. I have so much to "


In [14]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE))

dataset

<PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [15]:
# Length of the vocabulary in StringLookup Layer
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [16]:
class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [17]:
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 100, 82) # (batch_size, sequence_length, vocab_size)


In [18]:
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  20992     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  84050     
                                                                 
Total params: 4,043,346
Trainable params: 4,043,346
Non-trainable params: 0
_________________________________________________________________


In [19]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
sampled_indices

array([20, 56, 44, 24, 32, 35, 73, 61, 47, 31, 60, 71, 53, 28, 31, 45, 22,
       59, 22, 15, 39, 16, 23, 69, 30, 11,  4, 10, 69,  9, 63, 25, 81, 81,
       36, 48, 30, 66, 52, 59, 30,  7,  4, 20, 56, 15, 78, 14, 21, 56, 34,
       31, 75, 46, 48, 28, 43, 32, 23, 54, 52,  1, 49, 46, 43, 59, 59, 41,
       48, 37, 39, 66, 42, 44, 56, 78, 19, 19, 12, 47, 76, 49,  7, 42, 35,
        0, 32, 72, 10, 17, 27, 42, 37, 58, 14, 34, 65, 14, 70, 31],
      dtype=int64)

In [20]:
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
print("Next Char Predictions:\n", text_from_ids(sampled_indices).numpy())

Input:
 b"ames. I don't have that knowledge anymore. I'm too old. When I think of a teen idol, there's Britney"

Next Char Predictions:
 b'6bP:DGsgSCfqY@CQ8e81K29oB-%,o+i;||HTBlXeB(%6b1x07bFCuRT@OD9ZX UROeeMTIKlNPbx55.SvU(NG[UNK]Dr,3?NId0Fk0pC'


In [21]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (64, 100, 82)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.407608, shape=(), dtype=float32)


In [22]:
tf.exp(example_batch_mean_loss).numpy()

model.compile(optimizer='adam', loss=loss)

In [23]:
#training path

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [57]:
#SLOW RUNTIME WARNING --- BULK OF MODEL TRAINING
#Cancel at any point and training is saved
#Be careful restarting training, it does NOT pick up where it left off
#and WILL overwrite previous saved training

#set high and cancel if it is taking too long?
EPOCHS = 15

history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

KeyboardInterrupt: 

In [25]:
#load from checkpoint path

model.load_weights(checkpoint_dir + '/ckpt_12')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x196ab1ef220>

In [26]:
class OneStep(tf.keras.Model):
  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [27]:
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [29]:
start = time.time()
states = None
next_char = tf.constant(['How you all doing Los Angeles '])
result = [next_char]

for n in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

How you all doing Los Angeles together? Biomin' brings it. This is what I'm going to read. And my bad place has perfuck syy bucks. I'm still distracting to seminak. What where two things I've known and the cover that happens. My bad guy, all the other day we knew I was worried about that. Like, it's crazy. You know what I mean? Because that have edge those cellphone a baby is like a job? Why would I go? I said, What happened? But the sharks were so candid. But, aren't we sconged it to you idea in a Haildrops? Ugh, most put porn movie Stevol. I did not pay it. Had my wife and If she since that's cuddling. Really cook, more ebusement, just proviculary. It's a really important requited, if we don't know. I joined a bad joke, while I used it, not the whole site of, a new language. Would this is a heaptoo of popty on this. We need somebody cos Everywhere 3 in that century Tommy. Many Republican because that didn't yeah you were forgot! I knew I was gonna wake up in the magician. He hadn't s