In [1]:
import os
import pandas as pd
from tqdm import tqdm
import re
import tensorflow as tf

In [2]:
# Open the dataset documents and store their data into a DataFrame
def load_himym_dataset():
    episodes_folder = os.path.join(os.getcwd(), "Datasets", "Sources", "HIMYM", "Episodes")
    dataframe_rows = []
    # Get number of documents and their names
    documents_n = len(os.listdir(episodes_folder))
    documents_names = os.listdir(episodes_folder)

    # Loop over documents
    for i in tqdm(range(documents_n)):
        filename = documents_names[i]
        # Open document
        file = open(os.path.join(episodes_folder, filename))
        episode_index = filename[:-4]
        # Loop over lines (= words)
        for line in file.readlines():
                dataframe_row = {
                    "episode": episode_index,
                    "line": line,
                }
                dataframe_rows.append(dataframe_row)
    # Build the dataframe from the words
    df = pd.DataFrame(dataframe_rows)
    return df

In [3]:
# Execute creation of dataset
himym_df = load_himym_dataset()
himym_df.head()
himym_df.count()

100%|████████████████████████████████████████████████████████████████████████████████| 139/139 [00:02<00:00, 59.14it/s]


episode    39284
line       39284
dtype: int64

In [4]:
def process_himym_dataset(df):
    df = df[~df['line'].str.startswith("[")]
    df = df[~df['line'].str.startswith("(")]
    df['line'] = df['line'].str.strip()
    df['line'] = df['line'].str.replace(r"\(.*\)","")
    df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
    df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
    df = df[~df['line'].isnull()]
    df[['character', 'line']] = df['line'].str.split(":", 1, expand=True)
    df = df.dropna()
    df['line'] = df['line'].str.strip()
    df['line'] = df['line'][df['line'].str.len() >= 2]
    df = df[~df['line'].isnull()]
    df = df.dropna()
    df = df.reset_index(drop=True)
    return df
    
himym_df = process_himym_dataset(himym_df)
print(len(himym_df))

30274


In [5]:
himym_df.head(20)

Unnamed: 0,episode,line,character
0,01x01,"Kids, I'm going to tell you an incredible stor...",Narrator
1,01x01,Are we being punished for something?,Son
2,01x01,No,Narrator
3,01x01,"Yeah, is this going to take a while?",Daughter
4,01x01,"Yes. Twenty-five years ago, before I was dad,...",Narrator
5,01x01,It was way back in 2005. I was twenty-seven ju...,Narrator
6,01x01,Will you marry me.,Marshall
7,01x01,"Yes, perfect! And then you're engaged, you pop...",Ted
8,01x01,"Got it. Thanks for helping me plan this out, Ted.",Marshall
9,01x01,"Dude, are you kidding? It's you and Lily! I've...",Ted


In [6]:
himym_df['character'].unique()

array(['Narrator', 'Son', 'Daughter', 'Marshall', 'Ted', 'Barney',
       'Yasmine', 'Lily', 'Robin', 'Cabdriver', "Robin's Dumped Friend",
       'Producer', 'Waitor', 'Ranjit', 'Lily, Marshall and Barney',
       'Son and Daughter', 'Rangit', 'Marshal', 'Carl', 'Cameraman',
       'Leroy', 'Lily and Marshall', 'Fantasy Girl', 'Tatiana',
       'Lily and Ted', 'Crowd', 'Carlos', 'Barney and Ted',
       'Marshall, Lily and Ted', 'Mashall, Lily and Ted', 'Guy 1',
       'Laura', 'Fight Attendant', 'Guy 2', 'Guy 3', 'Officer McNeil',
       'bmb Squad Guy', 'Derrick', 'Dana', 'Sascha', 'Cabdriver 2',
       'Cute Girl', 'Stefanie', 'Marshall and Ted', 'Mr. Adams',
       'Natalie', 'One Guest', 'All', 'Henry', 'Waiter', 'Claire',
       'Bradley', 'Chris', 'Austin', 'Kelly', 'Bartender', 'Phil',
       'Man on Street', 'Doorman 2', 'Woman', 'Coat Check Girl',
       'Barney, Ted and Robin', 'Future Ted', 'Lily ', 'Barney ',
       'Marshall, Lily, Barney', 'Lily, Marshall, Barney', 'Mik

In [61]:
# NOTE: May consider feeding one sentence and one Barney reply or multiple sentences encoded with one Barney reply
def get_barney(himym_df, level=2):
    dataframe_rows = []
    idxs_barney = himym_df[himym_df['character'] == 'Barney'].index
    dataframe_rows = []
    for i in idxs_barney:
        l = []
        l.append(himym_df['line'][i])
        for j in range(0,level):
            l.append(himym_df['line'][i-j-1])
        dataframe_rows.append(l)
    df = pd.DataFrame(dataframe_rows, columns=['response', 'context', 'context/0'])
    return df

barney_df = get_barney(himym_df)

In [62]:
barney_df.head()

Unnamed: 0,response,context,context/0
0,"hey, so you know how I've always had a thing f...",What was I doing? Your Uncle Marshall was taki...,"Yeah, what are you doing tonight?"
1,"Okay, meet me at the bar in fifteen minutes, a...","Hey, you wanna do something tonight?","hey, so you know how I've always had a thing f..."
2,Where's your suit!? Just once when I say suit ...,Hey.,"Okay, meet me at the bar in fifteen minutes, a..."
3,It was a blazer!,I did that one time.,Where's your suit!? Just once when I say suit ...
4,I see what this is about. Have you forgotten w...,"You know, ever since college it's been Marshal...",It was a blazer!


In [63]:
barney_path = os.path.join(os.getcwd(), "Datasets", "Characters", "Barney")
if not os.path.exists(barney_path):
    os.makedirs(barney_path)
barney_df.to_csv(os.path.join(barney_path, "Barney.csv"), index=False)

In [64]:
from datasets import load_dataset

barney_hg = load_dataset('csv', data_files=os.path.join(barney_path, "Barney.csv"))
barney_hg = barney_hg["train"].train_test_split(test_size=0.1)

Using custom data configuration default-86f70c1791bbdd2c


Downloading and preparing dataset csv/default to C:\Users\tonel\.cache\huggingface\datasets\csv\default-86f70c1791bbdd2c\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\tonel\.cache\huggingface\datasets\csv\default-86f70c1791bbdd2c\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [65]:
from transformers import TFAutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = TFAutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at microsoft/DialoGPT-medium.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [23]:
# Let's chat for 3 lines
for step in range(3):
    # encode the new user input, add the eos_token and return a tensor
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='tf')
    # append the new user input tokens to the chat history
    bot_input_ids = tf.concat([chat_history_ids, new_user_input_ids], axis=-1) if step > 0 else new_user_input_ids
    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User:Hello
DialoGPT: Hello! :D
>> User:How are you?
DialoGPT: I'm good, how are you?
>> User:I'm fine
DialoGPT: That's good!


In [67]:
print(barney_hg)

DatasetDict({
    train: Dataset({
        features: ['response', 'context', 'context/0'],
        num_rows: 4465
    })
    test: Dataset({
        features: ['response', 'context', 'context/0'],
        num_rows: 497
    })
})


In [68]:
def construct_conv(row, tokenizer):
    row = list(reversed(list(row.values())))
    model_inputs = tokenizer(row, padding="max_length", max_length=64, truncation=True)
    for i in range(len(model_inputs['input_ids'])):
        model_inputs['input_ids'][i].append(tokenizer.eos_token_id)
    for i in range(1, len(model_inputs['input_ids'])):
        model_inputs['input_ids'][0].extend(model_inputs['input_ids'][i])
        model_inputs['attention_mask'][0].extend(model_inputs['attention_mask'][i])
    model_inputs['input_ids'] = [item for sublist in model_inputs['input_ids'] for item in sublist]
    model_inputs['attention_mask'] = [item for sublist in model_inputs['attention_mask'] for item in sublist]
    return model_inputs

def preprocess_function(examples):
    tokenizer.pad_token = tokenizer.eos_token
    model_inputs = construct_conv(examples, tokenizer)
    return model_inputs

tokenized_barney_hg = barney_hg.map(preprocess_function, batched=False)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [69]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors='tf')

print(tokenized_barney_hg)

DatasetDict({
    train: Dataset({
        features: ['response', 'context', 'context/0', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4465
    })
    test: Dataset({
        features: ['response', 'context', 'context/0', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 497
    })
})


In [72]:
tf_train_set = tokenized_barney_hg["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_test_set = tokenized_barney_hg["test"].to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [73]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5))
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


Epoch 1/3




 17/279 [>.............................] - ETA: 10:44 - loss: -0.0114 - past_key_values_1_loss: -0.0114

KeyboardInterrupt: 