In [40]:
import os
import pandas as pd
from tqdm import tqdm
import re
import tensorflow as tf
import json

In [2]:
# Mount google drive (for Colab only)
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
    os.system('pip install datasets')
    os.system('pip install transformers')
else:
    base_folder = os.getcwd()

Mounted at /content/drive


# Preprocessing

In [3]:
# Open the dataset documents and store their data into a DataFrame
def load_himym_dataset():
    episodes_folder = os.path.join(base_folder, "Datasets", "Sources", "HIMYM", "Episodes")
    dataframe_rows = []
    # Get number of documents and their names
    documents_n = len(os.listdir(episodes_folder))
    documents_names = os.listdir(episodes_folder)

    # Loop over documents
    for i in tqdm(range(documents_n)):
        filename = documents_names[i]
        # Open document
        file = open(os.path.join(episodes_folder, filename))
        episode_index = filename[:-4]
        # Loop over lines (= words)
        for line in file.readlines():
                dataframe_row = {
                    "episode": episode_index,
                    "line": line,
                }
                dataframe_rows.append(dataframe_row)
    # Build the dataframe from the words
    df = pd.DataFrame(dataframe_rows)
    return df

In [4]:
# Execute creation of dataset
himym_df = load_himym_dataset()
himym_df.head()
himym_df.count()

100%|██████████| 139/139 [00:00<00:00, 208.52it/s]


episode    39284
line       39284
dtype: int64

In [5]:
def process_himym_dataset(df):
    df = df[~df['line'].str.startswith("[")]
    df = df[~df['line'].str.startswith("(")]
    df['line'] = df['line'].str.strip()
    df['line'] = df['line'].str.replace(r"\(.*\)","")
    df['line'] = df['line'].str.replace(r"[\/(){}\[\]\|@_#]|\\t|\\n"," ")
    df['line'] = df['line'].str.replace(r"[^.\',;:?!0-9a-zA-Z \-]","")
    df = df[~df['line'].isnull()]
    df[['character', 'line']] = df['line'].str.split(":", 1, expand=True)
    df = df.dropna()
    df['line'] = df['line'].str.strip()
    df['line'] = df['line'][df['line'].str.len() >= 2]
    df = df[~df['line'].isnull()]
    df = df.dropna()
    df = df.reset_index(drop=True)
    return df
    
himym_df = process_himym_dataset(himym_df)
print(len(himym_df))

  """
  
  import sys


30274


In [6]:
himym_df.head(20)

Unnamed: 0,episode,line,character
0,02x13,Some kids dream of being astronauts. Some kids...,Ted
1,02x13,That's stupid.,Boy
2,02x13,There were some obstacles along the way. But e...,Ted
3,02x13,"Morning, everyone! So, I had an idea for the a...",Ted
4,02x13,That's stupid.,Mr. Druthers
5,02x13,I... I can't believe you knocked over my model.,Ted
6,02x13,"Well, it's just... it's not exactly new, is it...",Mr Druthers
7,02x13,I know what you're thinking: Who's this jerk? ...,Ted
8,02x13,Stairs?,Mr Druthers
9,02x13,"And to be honest, I wasn't sure I liked it eit...",Ted


In [7]:
himym_df['character'].unique()

array(['Ted ', 'Boy', 'Ted', 'Mr. Druthers', 'Mr Druthers', 'Co-worker',
       'Robin', 'Barney', 'Marshall', 'Lily', 'MP', 'Employees', 'Carl',
       'Voice', 'Mr Druters', 'Man', 'Men ', 'Future Ted', 'Lily ',
       'Barney ', 'Marshall, Lily, Barney', 'Lily, Marshall, Barney',
       'Mike', 'Marshall ', 'King Costume Guy', 'Shagarats',
       'Lily, Robin', 'Hula Girl', 'Angel Guy', 'Future Ted VO',
       'Hula girl', 'Tanya', 'Victoria', 'Lily, Marshall', 'Claudia',
       'Stuart', 'Nirvana', 'Ted from 2030', 'Wendy', 'Wendy ',
       'Everybody', 'Marshal', 'Man in the street ', 'Marshall voiceover',
       'Regis Philbin ', 'Regis Philbin', 'Million Dollar', 'Robin ',
       'Waitress', 'Man in the streets', 'Regis', 'Stella ', 'Stella',
       'Sister Stella', 'Nora', 'Bartender', 'Woman', 'Tony', 'Lucy',
       'Barman', 'All', 'Shannon', 'Ranjit', 'Blah-blah', 'Marshall VO',
       'Lily VO', 'Marshall, Lily VO', 'College Marshall', 'College Lily',
       'Everyone', 'Co

In [8]:
# NOTE: May consider feeding one sentence and one Barney reply or multiple sentences encoded with one Barney reply
def get_barney(himym_df, level=2):
    dataframe_rows = []
    idxs_barney = himym_df[himym_df['character'] == 'Barney'].index
    dataframe_rows = []
    for i in idxs_barney:
        l = []
        l.append(himym_df['line'][i])
        for j in range(0,level):
            l.append(himym_df['line'][i-j-1])
        dataframe_rows.append(l)
    df = pd.DataFrame(dataframe_rows, columns=['response', 'context', 'context/0'])
    return df

barney_df = get_barney(himym_df)

In [9]:
barney_df.head()

Unnamed: 0,response,context,context/0
0,I have found that to be true.,"Well, 'cause only good-looking people can get ...",Why would you say that?
1,Mona.,Angela.,Tony.
2,Watch it more closely. Rock your world.,Mona?,Mona.
3,You mind if I charge my phone?,"Fine, Mosby can stay. But tell him he's on thi...","Mosby, sir? I, I... I hear Mosby's doing some ..."
4,"Oh, my God!",You know what? I'm gonna do it. I'm gonna fire...,"Vicki, um... I'm so sorry about this, but ther..."


In [10]:
barney_path = os.path.join(base_folder, "Datasets", "Characters", "Barney")
if not os.path.exists(barney_path):
    os.makedirs(barney_path)
barney_df.to_csv(os.path.join(barney_path, "Barney.csv"), index=False)

In [11]:
from datasets import load_dataset
os.environ["HF_DATASETS_CACHE"] = os.path.join(os.getcwd(), "cache")

barney_hg = load_dataset('csv', data_files=os.path.join(barney_path, "Barney.csv"), cache_dir=os.path.join(os.getcwd(), "cache"))
barney_hg = barney_hg["train"].train_test_split(test_size=0.1)

Using custom data configuration default-1f01bc59e0935acf


Downloading and preparing dataset csv/default to /content/cache/csv/default-1f01bc59e0935acf/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /content/cache/csv/default-1f01bc59e0935acf/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

# Bot Loading & Fine-Tuning

In [12]:
from transformers import TFAutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small", cache_dir=os.path.join(os.getcwd(), "cache"))
model = TFAutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small", cache_dir=os.path.join(os.getcwd(), "cache"))

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at microsoft/DialoGPT-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [13]:
# Let's chat for 3 lines
for step in range(3):
    # encode the new user input, add the eos_token and return a tensor
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='tf')
    # append the new user input tokens to the chat history
    bot_input_ids = tf.concat([chat_history_ids, new_user_input_ids], axis=-1) if step > 0 else new_user_input_ids
    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User:hi
DialoGPT: Hi
>> User:how are you?
DialoGPT: I'm here
>> User:what is your name?
DialoGPT: I'm here


In [14]:
print(barney_hg)

DatasetDict({
    train: Dataset({
        features: ['response', 'context', 'context/0'],
        num_rows: 4465
    })
    test: Dataset({
        features: ['response', 'context', 'context/0'],
        num_rows: 497
    })
})


In [15]:
def construct_conv(row, tokenizer):
    MAX_LENGTH = 512
    row = list(reversed(list(row.values())))
    model_inputs = tokenizer(row)
    tokenizer_pad_token_id = tokenizer.encode('#')[0]
    for i in range(len(model_inputs['input_ids'])):
        model_inputs['input_ids'][i].append(tokenizer.eos_token_id)
        model_inputs['attention_mask'][i].append(1)
    model_inputs['input_ids'] = [item for sublist in model_inputs['input_ids'] for item in sublist]
    model_inputs['attention_mask'] = [item for sublist in model_inputs['attention_mask'] for item in sublist]
    if MAX_LENGTH > len(model_inputs['input_ids']):
        model_inputs['input_ids'] += [tokenizer_pad_token_id] * (MAX_LENGTH - len(model_inputs['input_ids']))
        model_inputs['attention_mask'] += [0] * (MAX_LENGTH - len(model_inputs['attention_mask']))
    elif MAX_LENGTH < len(model_inputs['input_ids']):
        model_inputs['input_ids'] = model_inputs['input_ids'][:MAX_LENGTH-1]
        model_inputs['input_ids'][-1] = tokenizer.eos_token_id
        model_inputs['attention_mask'] = model_inputs['attention_mask'][:MAX_LENGTH-1]
        model_inputs['attention_mask'][-1] = 1
    model_inputs["labels"] = model_inputs["input_ids"]
    return model_inputs

def preprocess_function(examples):
    tokenizer.pad_token = '#'
    model_inputs = construct_conv(examples, tokenizer)
    return model_inputs

tokenized_barney_hg = barney_hg.map(preprocess_function, batched=False)

  0%|          | 0/4465 [00:00<?, ?ex/s]

  0%|          | 0/497 [00:00<?, ?ex/s]

In [16]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer, return_tensors='tf')

print(tokenized_barney_hg)

DatasetDict({
    train: Dataset({
        features: ['response', 'context', 'context/0', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4465
    })
    test: Dataset({
        features: ['response', 'context', 'context/0', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 497
    })
})


In [17]:
tf_train_set = tokenized_barney_hg["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    batch_size=2,
    collate_fn=data_collator,
)

tf_test_set = tokenized_barney_hg["test"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    batch_size=2,
    collate_fn=data_collator,
)

In [18]:
from transformers import AdamWeightDecay

model.compile(optimizer=AdamWeightDecay(learning_rate=2e-5))
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=1)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


  17/2232 [..............................] - ETA: 11:26:59 - loss: 6.4689

KeyboardInterrupt: ignored

In [19]:
# Let's chat for 3 lines
for step in range(3):
    # encode the new user input, add the eos_token and return a tensor
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='tf')
    # append the new user input tokens to the chat history
    bot_input_ids = tf.concat([chat_history_ids, new_user_input_ids], axis=-1) if step > 0 else new_user_input_ids
    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(bot_input_ids, max_length=64, pad_token_id=tokenizer.eos_token_id)
    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User:what are you wearing?
DialoGPT: I'm not wearing anything.
>> User:ok
DialoGPT: I'm not wearing anything.
>> User:no+
DialoGPT: I'm not wearing anything.


# BLEU Computation

In [20]:
import datasets
bleu_metric = datasets.load_metric('bleu')

Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

In [21]:
sample_references = tokenized_barney_hg["test"]['response']
sample_questions = tokenized_barney_hg["test"]['context']

In [37]:
predictions = list()
for x in tqdm(sample_questions):
    tokenized_question = tokenizer.encode(x + tokenizer.eos_token, return_tensors='tf')
    max_length = 128 + tokenized_question.shape[1]
    generated_answer = model.generate(tokenized_question,
                               pad_token_id=tokenizer.eos_token_id, max_length=max_length)[0].numpy().tolist()
    predictions.append(generated_answer[len(tokenized_question[0]):])

100%|██████████| 497/497 [33:23<00:00,  4.03s/it]


In [38]:
in_folder = os.path.join(base_folder, "in")
if not os.path.exists(in_folder):
    os.makedirs(in_folder)

In [41]:
# Save predictions as a JSON file
filename = 'predictions.json'

output_string = json.dumps(predictions)
with open(os.path.join(in_folder, filename), 'w') as file:
    file.write(output_string)

In [42]:
# Load history as a JSON file
filename = 'predictions.json'

with open(os.path.join(in_folder, filename), 'r') as file:
    json_string = file.read()

predictions_2 = json.loads(json_string)

In [44]:
labels = list()
for x in tqdm(sample_references):
    labels.append(tokenizer.encode(x + tokenizer.eos_token, return_tensors='tf').numpy().tolist())

100%|██████████| 497/497 [00:00<00:00, 2882.17it/s]


In [54]:
# print(predictions)
# print(labels)

bleu_metric.add_batch(predictions=predictions, references=labels)
final_score = bleu_metric.compute()

In [52]:
print(final_score)

{'bleu': 0.0037512898178157653, 'precisions': [0.15431164901664146, 0.016090935931502806, 0.003186235462800701, 0.0010368066355624676, 0.0005660377358490566], 'brevity_penalty': 0.6931391199990402, 'length_ratio': 0.7317834138486312, 'translation_length': 7271, 'reference_length': 9936}


In [55]:
print(predictions)

[[40, 1101, 407, 1654, 611, 345, 821, 852, 47037, 393, 407, 475, 314, 1101, 2495, 3967, 326, 338, 644, 339, 531, 764, 50256], [40, 1101, 407, 1654, 611, 345, 821, 29711, 393, 407, 475, 314, 1101, 2495, 1654, 326, 338, 407, 262, 1339, 764, 50256], [40, 1101, 407, 1654, 644, 345, 821, 2111, 284, 910, 994, 764, 50256], [40, 1101, 407, 1654, 611, 345, 821, 852, 47037, 393, 407, 837, 475, 314, 1101, 407, 1654, 611, 345, 821, 852, 47037, 764, 50256], [40, 1101, 257, 1263, 4336, 286, 262, 474, 4131, 282, 11729, 50256], [40, 1101, 407, 1654, 644, 345, 821, 2111, 284, 910, 994, 764, 50256], [40, 1101, 407, 534, 3516, 837, 6340, 13, 50256], [2061, 750, 345, 466, 284, 651, 503, 286, 326, 5633, 50256], [40, 1101, 407, 1654, 314, 460, 5412, 428, 764, 50256], [40, 750, 764, 50256], [5211, 345, 423, 262, 649, 2196, 286, 262, 598, 5633, 50256], [40, 1101, 407, 1016, 284, 2245, 1566, 314, 1101, 407, 1016, 284, 2245, 764, 50256], [40, 1101, 407, 257, 4336, 286, 262, 649, 804, 764, 314, 588, 262, 1468, 8