# This notebook is intended to:
- Ingest the entire collected works of the TV show "Friends"
- Preprocess the data by extracting text and concatenating into a single document
- Train a Natural Language Processing model to generate similar works
- Evaluate the model
- Save the model so it may be used by a Twitter bot

In [2]:
from bs4 import BeautifulSoup
import requests
from typing import List
import re

In [3]:
base_url = 'https://fangj.github.io/friends/'
links = List[str]

with requests.get(base_url) as response:
    html = BeautifulSoup(response.text)
    links = [a['href'] for a in html.find_all('a')]



In [4]:
def extract_script(i: int, path: str) -> str:
    with requests.get(base_url + path) as page_res:
        page_html = BeautifulSoup(page_res.text)
    
    try:
        first_scene_annotation = page_html.find(text=re.compile('Scene:'))
        after = first_scene_annotation.parent.find_next_siblings()

        return '\n'.join([first_scene_annotation] + [el.text for el in after])
    except:
        raise Exception('Loop failed on iteration: %d' % i)

In [5]:
entire_friends_script = [extract_script(i, link) for i, link in enumerate(links) if i not in [26, 34]]
# Episode 26 & 34 don't follow the pattern of transcription seen in other episode scripts.
# They lack the first '[Scene: ...]' stage direction
# Recommend a PR to fix this.

In [6]:
entire_friends_script = '\n'.join(entire_friends_script)

In [7]:
entire_friends_script[:248]

"[Scene: Central Perk, Chandler, Joey, Phoebe, and Monica are there.]\nMonica: There's nothing to tell! He's just some guy\nI work with!\nJoey: C'mon, you're going out with the guy! There's\ngotta be something wrong with him!\nChandler: All right Joey, b"

## Ingest is now done.
The `entire_friends_script` variable holds the concatenated scripts for all the Friends episodes

In [8]:
import numpy as np

In [9]:
vocab = sorted(set(entire_friends_script))
char_to_ind = {u:i for i, u in enumerate(vocab)}
ind_to_char = np.array(vocab)

In [10]:
encoded_scripts = np.array([char_to_ind[c] for c in entire_friends_script])

In [11]:
from tensorflow.data import Dataset

In [12]:
char_dataset = Dataset.from_tensor_slices(encoded_scripts)

In [13]:
desired_sequence_length = 140

In [14]:
sequences = char_dataset.batch(desired_sequence_length+1, drop_remainder=True)

In [15]:
def create_input_target_pairs(sequence: str) -> (str, str):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    return input_text, target_text

In [16]:
create_input_target_pairs('This is just some test text lol')

('This is just some test text lo', 'his is just some test text lol')

In [17]:
dataset = sequences.map(create_input_target_pairs)

In [18]:
batch_size = 128
shuffle_buffer_size = 10000

shuffled_dataset = dataset.shuffle(shuffle_buffer_size).batch(batch_size=batch_size, drop_remainder=True)

## Dataset created
Pairs of sequences shifted by 1 character have been shuffled into a dataset

'Hello, I am Ricoo' -> 'ello, I am Ricool'

In [19]:
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, LSTM, Dense, Embedding, Dropout

In [20]:
def sparse_cat_loss(y_true, y_pred):
    return sparse_categorical_crossentropy(y_true, y_pred, from_logits=True)

In [21]:
vocab_size = len(vocab)
embedding_dimension = 64


In [22]:
model = Sequential()

model.add(Embedding(
    vocab_size,
    embedding_dimension,
    batch_input_shape=[batch_size, None]))

model.add(GRU(
    1026,
    return_sequences=True,
    stateful=True,
    recurrent_initializer='glorot_uniform'))

model.add(Dense(vocab_size))

model.compile(optimizer='adam', loss=sparse_cat_loss)

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 64)           6976      
_________________________________________________________________
gru (GRU)                    (128, None, 1026)         3361176   
_________________________________________________________________
dense (Dense)                (128, None, 109)          111943    
Total params: 3,480,095
Trainable params: 3,480,095
Non-trainable params: 0
_________________________________________________________________


## Model built
The next bit is just to confirm the model shape and training is correct

In [24]:
for input_example, target_example in shuffled_dataset.take(1):
    example_preds = model(input_example)

    print(example_preds.shape)

(128, 140, 109)


In [25]:
from tensorflow.random import categorical
from tensorflow import squeeze

In [26]:
example_preds_categories = categorical(example_preds[0], num_samples=1)
example_preds_categories = squeeze(example_preds_categories, axis=-1).numpy()


In [27]:
''.join(ind_to_char[example_preds_categories])

'OUH�aJdéy&dÉOJm[q?f4’W:)\xa0h9,";+”:EpBÉ—égté0\xa0m}0p*É<Q/%12z"oD^”_g=—d,pTq\r\r<76r[vqPP-…K“!cçL(|UC1rJB6:.5{J%Qd—!f14C/]52\r”M6jL W,!m2”LVt,+ÉMX=”'

## Shape is correct
Now to train the model


In [29]:
model.fit(shuffled_dataset, epochs=120)

Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fcb5c14dfa0>

In [1]:
import tensorflow as tf
print("Num GPUs Available: ", tf.config.experimental.list_physical_devices('GPU'))

Num GPUs Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [30]:
model.save('../../func/resources/friends_model2.h5')

In [31]:
from tensorflow import TensorShape

In [32]:
practice_model = Sequential()

practice_model.add(Embedding(
    vocab_size,
    embedding_dimension,
    batch_input_shape=[1, None]))

practice_model.add(GRU(
    1026,
    return_sequences=True,
    stateful=True,
    recurrent_initializer='glorot_uniform'))

practice_model.add(Dense(vocab_size))

practice_model.compile(optimizer='adam', loss=sparse_cat_loss)

practice_model.load_weights('../../func/resources/friends_model2.h5')

practice_model.build(TensorShape([1, None]))

practice_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 64)             6976      
_________________________________________________________________
gru_1 (GRU)                  (1, None, 1026)           3361176   
_________________________________________________________________
dense_1 (Dense)              (1, None, 109)            111943    
Total params: 3,480,095
Trainable params: 3,480,095
Non-trainable params: 0
_________________________________________________________________


In [33]:
from tensorflow import expand_dims

def generate_text(this_model: Sequential, start_seed: str, num_chars=100, temp=1.0) -> str:
  '''
  model: Trained Model to Generate Text
  start_seed: Intial Seed text in string form
  gen_size: Number of characters to generate

  Basic idea behind this function is to take in some seed text, format it so
  that it is in the correct shape for our network, then loop the sequence as
  we keep adding our own predicted characters. Similar to our work in the RNN
  time series problems.
  '''

  # Vecotrizing starting seed text
  input_eval = [char_to_ind[s] for s in start_seed]

  # Expand to match batch format shape
  input_eval = expand_dims(input_eval, 0)

  # Empty list to hold resulting generated text
  text_generated = []

  # Temperature effects randomness in our resulting text
  # The term is derived from entropy/thermodynamics.
  # The temperature is used to effect probability of next characters.
  # Higher probability == lesss surprising/ more expected
  # Lower temperature == more surprising / less expected
 
  temperature = temp

  # Here batch size == 1
  this_model.reset_states()

  for i in range(num_chars):

      # Generate Predictions
      predictions = this_model(input_eval)

      # Remove the batch shape dimension
      predictions = squeeze(predictions, 0)

      # Use a cateogircal disitribution to select the next character
      predictions = predictions / temperature
      predicted_id = categorical(predictions, num_samples=1)[-1,0].numpy()

      # Pass the predicted charracter for the next input
      input_eval = expand_dims([predicted_id], 0)

      # Transform back to character letter
      text_generated.append(ind_to_char[predicted_id])

  return (start_seed + ''.join(text_generated))

In [41]:
print(generate_text(practice_model, 'Monica:', num_chars=280))

Monica: Okay. (Starting to use us open it and picks up.)
Ross: Hey, Mr. Geller!
Phoebe: That was your miertigion? Oh man! I- What Phoebe make looks disgusted by her wine?
Ross: No! No, no! No if you’re cheating on you, one marriage
museum is gene!
Ross: Please, don’t freak out us and Ma


In [42]:
import json

In [44]:
with open('../../func/resources/vocab.json', 'w') as f:
    json.dump(vocab, f)

In [45]:
practice_model.save('../../func/resources/friends_practice_model2.h5')