# <u>Chapter 9</u>: Generating Text in Chatbots

In [18]:
import sys
import subprocess
import pkg_resources

# Find out which packages are missing.
installed_packages = {dist.key for dist in pkg_resources.working_set}
required_packages = {'pandas', 'transformers', 'convokit', 'datasets', 'transformers[torch]'}
missing_packages = required_packages - installed_packages

# If there are missing packages install them.
if missing_packages:
    print('Installing the following packages: ' + str(missing_packages))
    python = sys.executable
    subprocess.check_call([python, '-m', 'pip', 'install', *missing_packages], stdout=subprocess.DEVNULL)

In [None]:
import os

# Check if the data directory already exists.
if not os.path.exists("data"):
    !mkdir data

## Fine-tuning the pre-trained model

We incorporate the [Cornell Movie-Dialogs Corpus](https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html), a large collection of fictional conversations extracted from raw movie scripts. The corpus is available from the _convokit_ toolkit.

In [19]:
from convokit import Corpus, download

# Load the corpus.
corpus = Corpus(download('movie-corpus'))

Downloading movie-corpus to C:\Users\tsouraki\.convokit\downloads\movie-corpus
Downloading movie-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/movie-corpus/movie-corpus.zip (40.9MB)... Done


For each conversation, we extract the sentences and store the results in a training and validation file.

In [20]:
# Extract the sentences for each dialog.
def extract_dialogs(corpus, split=None):
    dialogs = []

    # Iterate over all conversations.
    for convo in corpus.iter_conversations():
        # Consider only conversations in the specified split of the data.
        if split is None or convo.meta['split'] == split:

            dialog_str = ""

            # Get the sentences in the conversation.
            for utterance in convo.iter_utterances():                
                dialog_str = dialog_str + " " + utterance.text

            dialogs.append(dialog_str)          
    
    return dialogs

samples = extract_dialogs(corpus)

For efficiency we filter the dataset.

In [4]:
import pandas as pd

samples_df = pd.DataFrame()

# Filter text with only ten words.
samples_df['text'] = samples
samples_df['count'] = samples_df['text'].str.split().apply(len)

mask = (samples_df['count'] == 10)
samples_df = samples_df.loc[mask]

# Store the training data.
samples_df[0:1000].to_csv("./data/cornell_train.csv", columns=["text"], index=False, header=False)
# Store the validation data.
samples_df[1001:1301].to_csv("./data/cornell_val.csv", columns=["text"], index=False, header=False)

We can now load the data from the previously created files.

In [5]:
from datasets import load_dataset

# Load the data from the text files.
data = load_dataset("text", data_files={"train": "./data/cornell_train.csv", "validation": "./data/cornell_val.csv"})

data["train"][15]

Using custom data configuration default-c9cc09a4997bb90e


Downloading and preparing dataset text/default to C:\Users\tsouraki\.cache\huggingface\datasets\text\default-c9cc09a4997bb90e\0.0.0\21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad...


Downloading data files: 100%|██████████| 2/2 [00:00<00:00, 687.42it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 462.46it/s]
                            

Dataset text downloaded and prepared to C:\Users\tsouraki\.cache\huggingface\datasets\text\default-c9cc09a4997bb90e\0.0.0\21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 73.68it/s]


{'text': ' Do you know how much I missed you? Welcome home.'}

To speed up the training process, we incorporate the small version of the _DialoGPT_ model and tokenize the input data.

In [6]:
from transformers import AutoTokenizer

# Setup tokenization.
model_name = "microsoft/DialoGPT-small"    
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

def perform_tokenization(samples):
    return tokenizer(samples["text"])

tokenized_data = data.map(perform_tokenization, batched=True, num_proc=4, remove_columns=["text"])

tokenized_data["train"][15]

Downloading: 100%|██████████| 26.0/26.0 [00:00<00:00, 26.0kB/s]
Downloading: 100%|██████████| 641/641 [00:00<00:00, 576kB/s]
Downloading: 100%|██████████| 1.04M/1.04M [00:00<00:00, 1.21MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 852kB/s] 


{'input_ids': [2141, 345, 760, 703, 881, 314, 6825, 345, 30, 19134, 1363, 13],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Next, we concatenate all our texts together then split the result in small chunks of a certain _block_size_.

In [7]:
# Maximum length of the block.
block_size = 64

# Create new samples from a batch of examples.
def preprocess_text(samples):

    # Concatenate all samples.
    concatenated = {k: sum(samples[k], []) for k in samples.keys()}
    length = len(concatenated[list(samples.keys())[0]])
    length = (length // block_size) * block_size

    # Split by chunks of block_size.
    output = {
        k: [t[i : i + block_size] for i in range(0, length, block_size)]
        for k, t in concatenated.items()
    }
    
    output["labels"] = output["input_ids"].copy()

    return output

The _map_ method will send a batch of 1,000 examples to be treated by the preprocessing function.

In [8]:
new_dataset = tokenized_data.map(
    preprocess_text,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Now that the data has been cleaned, we're ready to instantiate our _Trainer_.

In [12]:
from transformers import AutoModelForCausalLM
from transformers import Trainer, TrainingArguments

# Load the model to be tuned.
model = AutoModelForCausalLM.from_pretrained(model_name)

name = model_name.split("/")[-1]

# Define the training arguments.
training_args = TrainingArguments(
    f"{name}-finetuned-cornell",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none",
)

loading configuration file config.json from cache at C:\Users\tsouraki/.cache\huggingface\hub\models--microsoft--DialoGPT-small\snapshots\f9c829d0285e7addb0667aeb6e33956916ec6cd0\config.json
Model config GPT2Config {
  "_name_or_path": "microsoft/DialoGPT-small",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "conversational": {
      "max_length": 1000
  

We pass along all of those to the _Trainer_ class.



In [13]:
# Create the trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=new_dataset["train"],
    eval_dataset=new_dataset["validation"],
)

# Start training the model.
trainer.train()

***** Running training *****
  Num examples = 237
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 90
 33%|███▎      | 30/90 [02:19<04:17,  4.28s/it]***** Running Evaluation *****
  Num examples = 72
  Batch size = 8

 33%|███▎      | 30/90 [02:31<04:17,  4.28s/it]

{'eval_loss': 7.098649978637695, 'eval_runtime': 11.6995, 'eval_samples_per_second': 6.154, 'eval_steps_per_second': 0.769, 'epoch': 1.0}


 67%|██████▋   | 60/90 [04:48<02:08,  4.28s/it]***** Running Evaluation *****
  Num examples = 72
  Batch size = 8

 67%|██████▋   | 60/90 [05:01<02:08,  4.28s/it]

{'eval_loss': 5.578792095184326, 'eval_runtime': 13.3756, 'eval_samples_per_second': 5.383, 'eval_steps_per_second': 0.673, 'epoch': 2.0}


100%|██████████| 90/90 [07:18<00:00,  4.92s/it]***** Running Evaluation *****
  Num examples = 72
  Batch size = 8

100%|██████████| 90/90 [07:31<00:00,  4.92s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 90/90 [07:31<00:00,  5.02s/it]

{'eval_loss': 5.300169944763184, 'eval_runtime': 13.2658, 'eval_samples_per_second': 5.427, 'eval_steps_per_second': 0.678, 'epoch': 3.0}
{'train_runtime': 451.8702, 'train_samples_per_second': 1.573, 'train_steps_per_second': 0.199, 'train_loss': 6.958156331380208, 'epoch': 3.0}





TrainOutput(global_step=90, training_loss=6.958156331380208, metrics={'train_runtime': 451.8702, 'train_samples_per_second': 1.573, 'train_steps_per_second': 0.199, 'train_loss': 6.958156331380208, 'epoch': 3.0})

Once the training is completed, we can evaluate our model and get its perplexity on the validation set like this:

In [15]:
import math

# Evaluate the trained model.
eval = trainer.evaluate()
ppl = math.exp(eval['eval_loss'])

print("The perplexity of the model is: %.2f" % ppl)

***** Running Evaluation *****
  Num examples = 72
  Batch size = 8
100%|██████████| 9/9 [00:12<00:00,  1.36s/it]

The perplexity of the model is: 200.37





Let's test the model.

In [16]:
# Test the model with a sample sentence.
test = "I have a question."

# Tokenize the input.
input_ids = tokenizer.encode(test+tokenizer.eos_token, return_tensors='pt')

history = model.generate(input_ids)
output = tokenizer.decode(history[0]).split("<|endoftext|>")
output = [(output[i], output[i+1]) for i in range(0, len(output)-1, 2)] 

print(output)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[('I have a question.', "I don't know.")]


## What we have learned …

| |
| --- |
| **ML concepts** <ul><li>Fine-tuning</li></ul> |
| |