In [11]:
import os
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

In [12]:
# Open the dataset documents and store their data into a DataFrame
def load_himym_dataset():
    episodes_folder = os.path.join(os.getcwd(), "Datasets", "Sources", "HIMYM", "Episodes")
    dataframe_rows = []
    # Get number of documents and their names
    documents_n = len(os.listdir(episodes_folder))
    documents_names = os.listdir(episodes_folder)

    # Loop over documents
    for i in tqdm(range(documents_n)):
        filename = documents_names[i]
        # Open document
        file = open(os.path.join(episodes_folder, filename))
        episode_index = filename[:-4]
        # Loop over lines (= words)
        for line in file.readlines():
                dataframe_row = {
                    "episode": episode_index,
                    "line": line,
                }
                dataframe_rows.append(dataframe_row)
    # Build the dataframe from the words
    df = pd.DataFrame(dataframe_rows)
    return df

In [13]:
# Execute creation of dataset
himym_df = load_himym_dataset()
# NOTE(review): only the last expression of a cell is rendered, so the result
# of this head() call is discarded.
himym_df.head()
# Non-null count per column; this is the value the cell displays.
himym_df.count()

100%|██████████████████████████████████████████████████████████████████████████████| 139/139 [00:00<00:00, 1674.75it/s]


episode    39284
line       39284
dtype: int64

In [14]:
def process_himym_dataset(df):
    """Turn raw transcript lines into (episode, line, character) rows.

    Steps, in order:
      1. drop rows whose raw line starts with "[" or "(";
      2. strip surrounding whitespace and delete every "(...)" group;
      3. split at the first ":" into the speaker (``character``) and the
         spoken text (``line``);
      4. drop rows that contain no ":" and renumber the index from 0.

    Args:
        df: DataFrame with a string column ``line``; other columns are kept.

    Returns:
        A new DataFrame (the input is not modified) with the original columns
        plus ``character``. ``character`` has surrounding whitespace removed;
        ``line`` keeps whatever followed the ":" unchanged.
    """
    raw_lines = df['line']
    starts_with_bracket = raw_lines.str.startswith("[") | raw_lines.str.startswith("(")
    # .copy() gives an independent frame, so the column assignments below do
    # not raise SettingWithCopyWarning or write through to the caller's data.
    df = df[~starts_with_bracket].copy()

    # regex=True must be explicit: pandas >= 2.0 defaults to a literal replace.
    # [^)]* stops at the first ")", so text between two parentheticals is kept
    # (a greedy ".*" would delete it together with both groups).
    cleaned = df['line'].str.strip().str.replace(r"\([^)]*\)", "", regex=True)

    # n must be passed by keyword on pandas >= 2.0. Splitting into lists (rather
    # than expand=True) also works when no line contains ":" at all.
    parts = cleaned.str.split(":", n=1)
    # Removing a parenthetical can leave "Barney :" -> strip so that exact
    # comparisons such as character == 'Barney' still match.
    df['character'] = parts.str[0].str.strip()
    # Rows without ":" have no second element and become NaN here
    df['line'] = parts.str[1]

    df = df[df['line'].notna()]
    df = df.reset_index(drop=True)
    return df
    
# Rebinds himym_df to the processed frame. NOTE(review): re-running this cell
# on its own feeds already-processed rows back into the function.
himym_df = process_himym_dataset(himym_df)

In [15]:
# Preview the first 20 processed rows
himym_df.head(20)

Unnamed: 0,episode,line,character
0,01x01,"Kids, I'm going to tell you an incredible sto...",Narrator
1,01x01,Are we being punished for something?,Son
2,01x01,No,Narrator
3,01x01,"Yeah, is this going to take a while?",Daughter
4,01x01,"Yes. Twenty-five years ago, before I was dad...",Narrator
5,01x01,It was way back in 2005. I was twenty-seven j...,Narrator
6,01x01,Will you marry me.,Marshall
7,01x01,"Yes, perfect! And then you're engaged, you po...",Ted
8,01x01,"Got it. Thanks for helping me plan this out, ...",Marshall
9,01x01,"Dude, are you kidding? It's you and Lily! I'v...",Ted


In [16]:
# NOTE: May consider feeding one sentence and one Barney reply or multiple sentences encoded with one Barney reply
def get_barney(himym_df, level=2):
    """Pair each Barney line with the lines spoken shortly before it.

    For every offset ``k`` in ``1..level`` and every row whose ``character``
    is exactly ``'Barney'``, one output row is produced holding Barney's line
    (``reply``) and the line ``k`` rows earlier (``sentence``). All offset-1
    pairs come first, then all offset-2 pairs, and so on.

    A pair is skipped when the earlier row does not exist (Barney speaks
    within the first ``k`` rows) or, if the frame has an ``episode`` column,
    when the earlier row belongs to a different episode.

    Args:
        himym_df: DataFrame with columns ``line`` and ``character``, and
            optionally ``episode``. Rows are used in their current order;
            the index labels are ignored.
        level: how many preceding rows to pair with each reply.

    Returns:
        pd.DataFrame with columns ``reply`` and ``sentence`` (present even
        when no pair was produced).
    """
    # Work on plain lists so lookups are positional and do not depend on the
    # frame carrying a default 0..n-1 index.
    lines = himym_df['line'].tolist()
    characters = himym_df['character'].tolist()
    episodes = himym_df['episode'].tolist() if 'episode' in himym_df.columns else None
    barney_positions = [pos for pos, name in enumerate(characters) if name == 'Barney']

    dataframe_rows = []
    for offset in range(1, level + 1):
        for pos in barney_positions:
            context_pos = pos - offset
            # No earlier row to pair with at the very top of the frame
            if context_pos < 0:
                continue
            # The end of the previous episode is not context for this reply
            if episodes is not None and episodes[context_pos] != episodes[pos]:
                continue
            dataframe_rows.append({
                "reply": lines[pos],
                "sentence": lines[context_pos],
            })
    return pd.DataFrame(dataframe_rows, columns=["reply", "sentence"])
    
# Build the (reply, sentence) pairs with the default level=2
barney_df = get_barney(himym_df)

In [17]:
# Preview the first rows of the reply/sentence pairs
barney_df.head()

Unnamed: 0,reply,sentence
0,"hey, so you know how I've always had a thing...",What was I doing? Your Uncle Marshall was tak...
1,"Okay, meet me at the bar in fifteen minutes, ...","Hey, you wanna do something tonight?"
2,Where's your suit!? Just once when I say suit...,Hey.
3,It was a blazer!,I did that one time.
4,I see what this is about. Have you forgotten ...,"You know, ever since college it's been Marsha..."


In [18]:
# Write the reply/sentence pairs to <cwd>/Datasets/Characters/Barney/Barney.csv
barney_path = os.path.join(os.getcwd(), "Datasets", "Characters", "Barney")
# exist_ok=True creates missing parents and is a no-op when the folder is
# already there, without a separate (racy) existence check.
os.makedirs(barney_path, exist_ok=True)
# to_csv keeps its defaults, so the row index is written as the first column
barney_df.to_csv(os.path.join(barney_path, "Barney.csv"))

In [None]:
# gensim API to download embeddings
import gensim
import gensim.downloader as gloader

# Fetch pretrained GloVe vectors through the gensim downloader
def load_embedding_model(embedding_dimension: int = 50):
    """Load the ``glove-wiki-gigaword-<embedding_dimension>`` model.

    Args:
        embedding_dimension: number inserted into the model name.

    Returns:
        Whatever ``gloader.load`` returns for that model name.

    Raises:
        ValueError: re-raised from ``gloader.load`` after printing a short
            notice.
    """
    model_name = f"glove-wiki-gigaword-{embedding_dimension}"
    try:
        return gloader.load(model_name)
    except ValueError as err:
        print("Invalid embedding model name!")
        raise err

# Set embedding dimension to 50 and download embeddings
# (embedding_dimension stays a module-level name so later cells can reuse it)
embedding_dimension = 50
embedding_model = load_embedding_model(embedding_dimension)

In [None]:
# Build or augment the current vocabulary given a list of words
def build_vocabulary(glove_model,
                     current_embeddings,
                     current_idx_to_word,
                     embedding_dimension,
                     wordlist):
    """Add the words of ``wordlist`` to an embedding table and an index map.

    When ``current_embeddings`` is empty, a padding entry is created first:
    key ``0`` maps to a zero vector and index ``0`` maps to ``0``. Every word
    that is not yet a key of ``current_embeddings`` is then added once, in the
    order it first appears in ``wordlist``, and receives the next free index
    (``len(current_idx_to_word)``).

    Args:
        glove_model: object supporting ``glove_model[word]`` that raises
            ``KeyError`` for words it does not know.
        current_embeddings: dict word -> vector; updated in place.
        current_idx_to_word: dict index -> word; updated in place.
        embedding_dimension: length of the padding and random vectors.
        wordlist: iterable of hashable words (list or set); duplicates and
            already-known words are ignored.

    Returns:
        Tuple ``(current_embeddings, current_idx_to_word, oov_set)`` where
        ``oov_set`` holds the words added by this call that were missing from
        ``glove_model``; those get a random vector drawn uniformly from
        [-0.05, 0.05).
    """
    # To keep track of the OOV terms
    oov_set = set()
    # If no vocabulary has been built yet, add a non-word for padding
    if not current_embeddings:
        current_idx_to_word[0] = 0
        current_embeddings[0] = np.zeros(embedding_dimension)

    # dict.fromkeys de-duplicates while keeping first-appearance order, and
    # works for lists and sets alike (a plain `wordlist - set` needs a set).
    new_words = [word for word in dict.fromkeys(wordlist) if word not in current_embeddings]

    for word in tqdm(new_words):
        try:
            # If a word has a glove embedding, use it
            current_embeddings[word] = glove_model[word]
        except KeyError:
            # Otherwise, generate a random vector with small magnitude
            current_embeddings[word] = np.random.uniform(low=-0.05, high=0.05, size=embedding_dimension)
            oov_set.add(word)
        current_idx_to_word[len(current_idx_to_word)] = word
    return current_embeddings, current_idx_to_word, oov_set

In [4]:
from transformers import BlenderbotTokenizer, TFBlenderbotForConditionalGeneration
# Checkpoint name shared by the model and its tokenizer so the two stay in sync
mname = 'facebook/blenderbot-400M-distill'
# from_pretrained fetches the checkpoint files on first use (see the download
# progress output below)
model = TFBlenderbotForConditionalGeneration.from_pretrained(mname)
tokenizer = BlenderbotTokenizer.from_pretrained(mname)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1572.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1459579344.0), HTML(value='')))




All model checkpoint layers were used when initializing TFBlenderbotForConditionalGeneration.

All the layers of TFBlenderbotForConditionalGeneration were initialized from the model checkpoint at facebook/blenderbot-400M-distill.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBlenderbotForConditionalGeneration for predictions without further training.


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=126891.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=62871.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1153.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=772.0), HTML(value='')))




In [7]:
# Single-turn smoke test of the pretrained model: one input, one generated reply
UTTERANCE = "Fuck you."
print("Human: ", UTTERANCE)
# Tokenize as a batch of one and request TensorFlow tensors
inputs = tokenizer([UTTERANCE], return_tensors='tf')
# generate() is called with its default settings
reply_ids = model.generate(**inputs)
# Decode the first (only) sequence of the batch, dropping special tokens
print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])

Human:  Fuck you.
Bot:   I know, right?  I was so mad.  I don't know what I would have done if it happened to me.
