In [1]:
import os
import pandas as pd
from tqdm import tqdm
import re
import tensorflow as tf

In [2]:
# Open the dataset documents and store their data into a DataFrame
def load_himym_dataset():
    episodes_folder = os.path.join(os.getcwd(), "Datasets", "Sources", "HIMYM", "Episodes")
    dataframe_rows = []
    # Get number of documents and their names
    documents_n = len(os.listdir(episodes_folder))
    documents_names = os.listdir(episodes_folder)

    # Loop over documents
    for i in tqdm(range(documents_n)):
        filename = documents_names[i]
        # Open document
        file = open(os.path.join(episodes_folder, filename))
        episode_index = filename[:-4]
        # Loop over lines (= words)
        for line in file.readlines():
                dataframe_row = {
                    "episode": episode_index,
                    "line": line,
                }
                dataframe_rows.append(dataframe_row)
    # Build the dataframe from the words
    df = pd.DataFrame(dataframe_rows)
    return df

In [3]:
# Execute creation of dataset
himym_df = load_himym_dataset()
himym_df.head()
himym_df.count()

100%|███████████████████████████████████████████████████████████████████████████████| 139/139 [00:00<00:00, 149.85it/s]


episode    39284
line       39284
dtype: int64

In [4]:
def process_himym_dataset(df):
    df = df[~df['line'].str.startswith("[")]
    df = df[~df['line'].str.startswith("(")]
    df['line'] = df['line'].str.strip()
    df['line'] = df['line'].str.replace(r"\(.*\)","")
    df[['character', 'line']] = df['line'].str.split(":", 1, expand=True)
    df = df[~df['line'].isnull()]
    df = df[~df['character'].isnull()]
    df = df[~df['line'].str.isspace()]
    df = df.reset_index(drop=True)
    return df
    
himym_df = process_himym_dataset(himym_df)
print(len(himym_df))

30286


In [5]:
himym_df.head(20)

Unnamed: 0,episode,line,character
0,01x01,"Kids, I'm going to tell you an incredible sto...",Narrator
1,01x01,Are we being punished for something?,Son
2,01x01,No,Narrator
3,01x01,"Yeah, is this going to take a while?",Daughter
4,01x01,"Yes. Twenty-five years ago, before I was dad...",Narrator
5,01x01,It was way back in 2005. I was twenty-seven j...,Narrator
6,01x01,Will you marry me.,Marshall
7,01x01,"Yes, perfect! And then you're engaged, you po...",Ted
8,01x01,"Got it. Thanks for helping me plan this out, ...",Marshall
9,01x01,"Dude, are you kidding? It's you and Lily! I'v...",Ted


In [6]:
himym_df['character'].unique()

array(['Narrator', 'Son', 'Daughter', 'Marshall', 'Ted', 'Barney',
       'Yasmine', 'Lily', 'Robin', 'Cabdriver', "Robin's Dumped Friend",
       'Producer', 'Waitor', 'Ranjit', 'Lily, Marshall and Barney',
       'Son and Daughter', 'Rangit', 'Marshal', 'Carl', 'Cameraman',
       'Leroy', 'Lily and Marshall', 'Fantasy Girl', 'Tatiana',
       'Lily and Ted', 'Crowd', 'Carlos', 'Barney and Ted',
       'Marshall, Lily and Ted', 'Mashall, Lily and Ted', 'Guy#1',
       'Laura', 'Fight Attendant', 'Guy#2', 'Guy#3', 'Officer McNeil',
       'b*mb Squad Guy', 'Derrick', 'Dana', 'Sascha', 'Cabdriver#2',
       'Cute Girl', 'Stefanie', 'Marshall and Ted', 'Mr. Adams',
       'Natalie', 'One Guest', 'All', 'Henry', 'Waiter', 'Claire',
       'Bradley', 'Chris', 'Austin', 'Kelly', 'Bartender', 'Phil',
       'Man on Street', 'Doorman#2', 'Woman', 'Coat Check Girl',
       'Barney, Ted and Robin', 'Future Ted', 'Lily ', 'Barney ',
       'Marshall, Lily, Barney', 'Lily, Marshall, Barney', 'Mi

In [7]:
# NOTE: May consider feeding one sentence and one Barney reply or multiple sentences encoded with one Barney reply
def get_barney(himym_df, level=2):
    dataframe_rows = []
    idxs_barney = himym_df[himym_df['character'] == 'Barney'].index
    for i in range(-1, -level-1, -1):
        for j in idxs_barney:
            dataframe_row = {
                "reply": himym_df['line'][j],
                "sentence": himym_df['line'][j+i],
            }
            dataframe_rows.append(dataframe_row)
    df = pd.DataFrame(dataframe_rows)
    return df
    
barney_df = get_barney(himym_df)

In [8]:
barney_df.head()

Unnamed: 0,reply,sentence
0,"hey, so you know how I've always had a thing...",What was I doing? Your Uncle Marshall was tak...
1,"Okay, meet me at the bar in fifteen minutes, ...","Hey, you wanna do something tonight?"
2,Where's your suit!? Just once when I say suit...,Hey.
3,It was a blazer!,I did that one time.
4,I see what this is about. Have you forgotten ...,"You know, ever since college it's been Marsha..."


In [9]:
print(barney_df.iloc[8174]["sentence"])

 Why? Give me one good reason.


In [10]:
barney_path = os.path.join(os.getcwd(), "Datasets", "Characters", "Barney")
if not os.path.exists(barney_path):
    os.makedirs(barney_path)
barney_df.to_csv(os.path.join(barney_path, "Barney.csv"))

In [11]:
from datasets import load_dataset

barney_hg = load_dataset('csv', data_files=os.path.join(barney_path, "Barney.csv"))
barney_hg = barney_hg["train"].train_test_split(test_size=0.1)

Using custom data configuration default-6e7b55061fd540b5


Downloading and preparing dataset csv/default to C:\Users\tonel\.cache\huggingface\datasets\csv\default-6e7b55061fd540b5\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\tonel\.cache\huggingface\datasets\csv\default-6e7b55061fd540b5\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
# TEMPORARY FIX
barney_hg = barney_hg.filter(lambda x: x['sentence'] != None)

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [30]:
from transformers import AutoTokenizer, BlenderbotForConditionalGeneration
mname = 'facebook/blenderbot-400M-distill'
model = BlenderbotForConditionalGeneration.from_pretrained(mname)
tokenizer = AutoTokenizer.from_pretrained(mname)

loading configuration file https://huggingface.co/facebook/blenderbot-400M-distill/resolve/main/config.json from cache at C:\Users\tonel/.cache\huggingface\transformers\cec3df71c8d94a67ad9280220975d00732c9dbcf7adaa3afe440d6626e5fdf02.6d6f9df9c9b98c6aa3a827af7ec4aae285715ccb3673cadfe98cacb7235b5809
Model config BlenderbotConfig {
  "_name_or_path": "./",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "BlenderbotForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1280,
  "decoder_attention_heads": 32,
  "decoder_ffn_dim": 5120,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 1,
  "do_blenderbot_90_layernorm": true,
  "dropout": 0.1,
  "encoder_attention_heads": 32,
  "encoder_ffn_dim": 5120,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 2,
  "encoder_no_repeat_n

Model config BlenderbotConfig {
  "_name_or_path": "facebook/blenderbot-400M-distill",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": true,
  "architectures": [
    "BlenderbotForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1280,
  "decoder_attention_heads": 32,
  "decoder_ffn_dim": 5120,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 1,
  "do_blenderbot_90_layernorm": true,
  "dropout": 0.1,
  "encoder_attention_heads": 32,
  "encoder_ffn_dim": 5120,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 2,
  "encoder_no_repeat_ngram_size": 3,
  "eos_token_id": 2,
  "extra_layer_norm": false,
  "extra_pos_embeddings": 0,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LA

In [31]:
UTTERANCE = "Fuck you."
print("Human: ", UTTERANCE)
inputs = tokenizer([UTTERANCE], return_tensors='pt')
reply_ids = model.generate(**inputs)
print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])

Human:  Fuck you.
Bot:   I know, right?  I was so mad.  I don't know what I would have done if it happened to me.


In [15]:
print(barney_hg)

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'reply', 'sentence'],
        num_rows: 8932
    })
    test: Dataset({
        features: ['Unnamed: 0', 'reply', 'sentence'],
        num_rows: 993
    })
})


In [44]:
def preprocess_function(examples):
    inputs = examples["sentence"]
    targets = examples["reply"]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    with tokenizer.as_target_tokenizer():
        model_targets = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["decoder_input_ids"] = model_targets["input_ids"]
    model_inputs["decoder_attention_mask"] = model_targets["attention_mask"]
    return model_inputs

tokenized_barney_hg = barney_hg.map(preprocess_function, batched=True)

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [45]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

print(tokenized_barney_hg)

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'reply', 'sentence', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'],
        num_rows: 8932
    })
    test: Dataset({
        features: ['Unnamed: 0', 'reply', 'sentence', 'input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask'],
        num_rows: 993
    })
})


In [48]:
len(tokenized_barney_hg['train'][0]['input_ids'])

15

In [18]:
'''
for layer in model.layers[0].encoder.layers:
    layer.trainable = False
for i in range(10):
    model.layers[0].decoder.layers[i].trainable = False
    
model.summary()
'''

'\nfor layer in model.layers[0].encoder.layers:\n    layer.trainable = False\nfor i in range(10):\n    model.layers[0].decoder.layers[i].trainable = False\n    \nmodel.summary()\n'

In [47]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_barney_hg["train"],
    eval_dataset=tokenized_barney_hg["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set  don't have a corresponding argument in `BlenderbotForConditionalGeneration.forward` and have been ignored: sentence, reply, Unnamed: 0. If sentence, reply, Unnamed: 0 are not expected by `BlenderbotForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8932
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 559


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.

In [15]:
'''
tf_train_set = tokenized_barney_hg["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_test_set = tokenized_barney_hg["test"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5))
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3)
'''

'\ntf_train_set = tokenized_barney_hg["train"].to_tf_dataset(\n    columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"],\n    shuffle=True,\n    batch_size=16,\n    collate_fn=data_collator,\n)\n\ntf_test_set = tokenized_barney_hg["test"].to_tf_dataset(\n    columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"],\n    shuffle=False,\n    batch_size=16,\n    collate_fn=data_collator,\n)\n\nmodel.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5))\nmodel.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3)\n'

In [None]:
UTTERANCE = "My friends are cool but they eat too many carbs."
print("Human: ", UTTERANCE)
inputs = tokenizer([UTTERANCE], return_tensors='pt')
reply_ids = model.generate(**inputs)
print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0])