In [None]:
# Authenticate the Colab user; presumably required by the gcloud/gsutil
# commands used in this notebook — TODO confirm.
from google.colab import auth
auth.authenticate_user()

# https://cloud.google.com/resource-manager/docs/creating-managing-projects
# Project id that the gcloud CLI is configured with on the next line.
project_id = 'ai5-c1-group1'
!gcloud config set project {project_id}

Updated property [core/project].


To take a quick anonymous survey, run:
  $ gcloud survey



In [None]:
# Install the Hugging Face packages into the runtime.
# NOTE(review): versions are not pinned, so a fresh run can pull releases newer
# than the ones this notebook was written against — consider pinning.
# NOTE(review): no visible cell imports `datasets`; confirm it is still needed.
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-1.14.0-py3-none-any.whl (290 kB)
[?25l[K     |█▏                              | 10 kB 17.3 MB/s eta 0:00:01[K     |██▎                             | 20 kB 7.1 MB/s eta 0:00:01[K     |███▍                            | 30 kB 5.4 MB/s eta 0:00:01[K     |████▌                           | 40 kB 5.2 MB/s eta 0:00:01[K     |█████▋                          | 51 kB 2.6 MB/s eta 0:00:01[K     |██████▊                         | 61 kB 2.9 MB/s eta 0:00:01[K     |████████                        | 71 kB 2.9 MB/s eta 0:00:01[K     |█████████                       | 81 kB 3.3 MB/s eta 0:00:01[K     |██████████▏                     | 92 kB 3.5 MB/s eta 0:00:01[K     |███████████▎                    | 102 kB 2.8 MB/s eta 0:00:01[K     |████████████▍                   | 112 kB 2.8 MB/s eta 0:00:01[K     |█████████████▌                  | 122 kB 2.8 MB/s eta 0:00:01[K     |██████████████▊                 | 133 kB 2.8 MB/s eta 0:00:01[

#### Imports

In [None]:
import os
import requests
import zipfile
import tarfile
import json
import time
import sys
import math
import logging
import numpy as np
import pandas as pd
from argparse import ArgumentParser
from subprocess import call
import textwrap

from collections import defaultdict
from multiprocessing import Pool
from tqdm.auto import tqdm, trange
from itertools import chain

import torch
import torch.nn.functional as F
from torch.cuda import amp
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler

from transformers.optimization import AdamW, get_linear_schedule_with_warmup
from transformers import GPT2Config, GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2Tokenizer

#### Setup Logger

In [None]:
# Setup Logger
# When __file__ is not defined (checked via globals()), fall back to "." so the
# logger still gets a name; the captured log lines are prefixed "INFO:.:".
if '__file__' not in globals():
  __file__ = "."
logger = logging.getLogger(__file__)

# Logger config
# Root logger emits INFO and above.
logging.basicConfig(level=logging.INFO)

#### Verify Setup

In [None]:
# Record interpreter / framework versions next to the results of this run.
logger.info('__Python VERSION: %s', sys.version)
logger.info("torch version: %s", torch.__version__)
logger.info('CUDNN VERSION: %s', torch.backends.cudnn.version())
logger.info('Number CUDA Devices: %s', torch.cuda.device_count())

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is read by later cells (model.to(device), generate_sequence, training loop).
cuda_available = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda_available else "cpu")
device_count = 0

if cuda_available:
  device_count = torch.cuda.device_count()
  logger.info('Devices:')
  logger.info('Active CUDA Device: %s', torch.cuda.current_device())
  logger.info('Available device count: %s', device_count)
  logger.info('Current cuda device: %s', torch.cuda.current_device())
else:
  logger.info('No CUDA Devices are available')

logger.info('Device: %s', device)

# nvidia-smi
# subprocess.call raises FileNotFoundError when the executable is missing
# (e.g. a CPU-only runtime); the cell already supports the CPU fallback above,
# so report the missing tool instead of aborting.
try:
  call(["nvidia-smi", "--format=csv", "--query-gpu=index,name,driver_version,memory.total,memory.used,memory.free"])
except FileNotFoundError:
  logger.info('nvidia-smi not found; skipping GPU memory report')

INFO:.:__Python VERSION: 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]
INFO:.:torch version: 1.9.0+cu111
INFO:.:CUDNN VERSION: 8005
INFO:.:Number CUDA Devices: 1
INFO:.:Devices:
INFO:.:Active CUDA Device: 0
INFO:.:Available device count: 1
INFO:.:Current cuda device: 0
INFO:.:Device: cuda:0


0

#### Utils

In [None]:
def download_file(packet_url, base_path="", extract=False, headers=None):
  """Download `packet_url` into `base_path` and optionally unpack it.

  Args:
    packet_url: URL to fetch; its basename is used as the local file name.
    base_path: Destination directory. Created (including missing parents)
      when non-empty; "" writes into the current working directory.
    extract: When True, unpack the downloaded file into `base_path`:
      names ending in ".zip" go through `zipfile`, anything else through `tarfile`.
    headers: Optional HTTP headers passed to `requests.get`.

  Raises:
    requests.HTTPError: via `raise_for_status()` on a non-success response.
  """
  if base_path != "":
    # makedirs handles nested paths and an already-existing directory,
    # which the previous exists()/mkdir() pair did not.
    os.makedirs(base_path, exist_ok=True)
  packet_file = os.path.basename(packet_url)
  packet_path = os.path.join(base_path, packet_file)

  # Stream to disk in chunks so the whole payload is never held in memory.
  with requests.get(packet_url, stream=True, headers=headers) as r:
      r.raise_for_status()
      with open(packet_path, 'wb') as f:
          for chunk in r.iter_content(chunk_size=8192):
              f.write(chunk)

  if extract:
    # NOTE(review): extractall() does not validate member paths; only use this
    # with archives from trusted sources.
    if packet_file.endswith(".zip"):
      with zipfile.ZipFile(packet_path) as zfile:
        zfile.extractall(base_path)
    else:
      with tarfile.open(packet_path) as tfile:
        tfile.extractall(base_path)

## **<font color="darkred">GPT2 Double Head Model</font>**

#### Overview

We have seen how a Question Answering model works, and we also saw how a Language generation model works. Let's attempt to combine some of these ideas from the two models into one that can both answer questions as well as generate them. For this we will extend the GPT2 model.

**Causal Transformer**: 

We saw that GPT2 is made up of only the Decoder, with stacked transformer blocks. The model also predicts words using only words from the left context. Let's look at our example on Emma.
<img src="https://storage.googleapis.com/public_colab_images/nlp/gpt2/causaltransformer02.png" width="800"/>

**Double Head Model**: 

Now how do we adapt this language model to a dialog task? In a question answering model we had to feed in a context and the model returned an answer. The language model generated text based on previous words. So we use the GPT2 model as a base, and for the input we add some context to the data, such as:
- Information about the dog, or its `persona`
- The `history` of the dialogue with the user
- The `answer` of the dog

And as a head we add:
- Language Model Head
- Multiple Choice Head

GPT2 has by default one language model head, which takes the hidden states from the final transformer block and passes them to a linear layer to compute the logits. We then add another head, called the multiple choice head, which takes the hidden states from the final transformer block and summarizes each sequence into a single vector. This could be done using `last`, which takes the last token's hidden state, `first`, which takes the first token's hidden state, or `mean`, which takes the mean of all tokens' hidden states.

<img src="https://storage.googleapis.com/public_colab_images/nlp/gpt2/gpt2doubleheadmodel.png" />


**Word Embeddings**: Word embeddings are where each word in the dataset is mapped to a numerical vector. Each of these vectors carries a sense of context between the words. So, for example, words with similar meanings or concepts come together in the vector space.

**Positional Embedding**: A transformer-based model has no sense of the sequence of an input. So to give the model some sense of order we add a piece of information to each word about its position in the sentence. A positional embedding is an n-dimensional vector that contains information about a specific position in a sentence.

**Segment Embedding**: Our input consists of persona, history, and answer. So we want to add information about each segment in the input.


**Finetuning Options**: 

There are multiple options to perform transfer learning and finetuning for our final dialog model:
<img src="https://storage.googleapis.com/public_colab_images/nlp/gpt2/gpt2dhfinetuning01.png" width="800"/>

- PERSONA-CHAT dataset size - 17,000
- Our dog dataset (small) 800

#### Dataset 

##### Example from PERSONA-CHAT dataset

PERSONA-CHAT is a large dataset of dialogs which was created by crowdsourcing personality sentences and asking paired crowd workers to chit-chat while playing the part of a given character.

```
{
   "personality":[
      "my mom is my best friend .",
      "i have four sisters .",
      "i believe that mermaids are real .",
      "i love iced tea ."
   ],
   "utterances":[
      {
         "candidates":[
            "there was one person better than me , but i will keep trying to pass",
            "oh that's a yummy jpb",
            "i take it you are not getting along with him ?",
            "...",
            "good . where are you from ?",
            "right now i am doing an ocean liner .",
            "i am spending time with my 4 sisters what are you up to"
         ],
         "history":[
            "hi , how are you doing today ?"
         ]
      },
      {
         "candidates":[
            "that would be great . what do you do on the weekends ?",
            "i am sorry to hear about that . i am not married .",
            "...",
            "ah manic depressive . how to you cope ? meditation ? any hobbies ?",
            "hey ! what kind of music are your into ?",
            "that is a good show i watch that while drinking iced tea"
         ],
         "history":[
            "hi , how are you doing today ?",
            "i am spending time with my 4 sisters what are you up to",
            "wow , four sisters . just watching game of thrones ."
         ]
      },
      {
         "candidates":[
            "very long , she was with me when i colored my hair pink",
            "i like being alone and hitchhiking",
            "hi , how are you doing today ?",
            "actually been good and better than what i expected",
            "...",
            "my name is charlie . what kind of earrings ?",
            "of course . i'm listening .",
            "i'm a researcher i'm researching the fact that mermaids are real"
         ],
         "history":[
            "hi , how are you doing today ?",
            "i am spending time with my 4 sisters what are you up to",
            "wow , four sisters . just watching game of thrones .",
            "that is a good show i watch that while drinking iced tea",
            "i agree . what do you do for a living ?"
         ]
      }
   ]
}
```

##### Example dialog dataset generated for dogs

The persona dataset for dogs was generated using some basic metadata we have about the dogs. Nothing fancy, but this helps to test out our dialog model.

```
{
   "personality":[
      "I am Emma",
      "I am a Dog",
      "My gender is Female",
      "My weight is 55.0",
      "I was born on 2009",
      "I am 11.0 years old",
      "My breed is Retriever, Yellow Labrador",
      "My color is White",
      "I am house trained"
   ],
   "utterances":[
      {
         "candidates":[
            "i do , but mostly after work with the boys",
            "not if you inherit it and then reinvest . that s what trump did . lots do .",
            "...",
            "it it not a great experience , let me tell you .",
            "woof woof . i'm feeling great!"
         ],
         "history":[
            "hi , how are you ?"
         ]
      },
      {
         "candidates":[
            "hi , how are you doing today ?",
            "what do you read ? i just graduated college . i was chicago for school .",
            "...",
            "no i'm not right now but will be soon",
            "my name is Emma"
         ],
         "history":[
            "hi , how are you ?",
            "woof woof . i'm feeling great!",
            "what is your name ?"
         ]
      },
      {
         "candidates":[
            "nice ! i live near the gulf of mexico . youre a doctor ?",
            "i work at the ymca and i'm a member too .",
            "you must be creative , people like you do well in my field at ibm .",
            "...",
            "hi , how are you today",
            "i am a Dog"
         ],
         "history":[
            "hi , how are you ?",
            "woof woof . i'm feeling great!",
            "what is your name ?",
            "my name is Emma",
            "what are you ?"
         ]
      },
      {
         "candidates":[
            "great . how are you doing ?",
            "that is a good way to put it .",
            "...",
            "it happens . i find them all the time at my office .",
            "i am Female"
         ],
         "history":[
            "hi , how are you ?",
            "woof woof . i'm feeling great!",
            "what is your name ?",
            "my name is Emma",
            "what are you ?",
            "i am a Dog",
            "what is your gender ?"
         ]
      }
   ]
}
```

#### Load Pretrained Model/Tokenizer

We already have a pretrained model that was trained on the PERSONA-CHAT dataset for 1 epoch (takes around 2 hours).

In [None]:
# Download pretrained model
# Fetches the zipped checkpoint into ./models and unpacks it there (extract=True).
model_url = "https://storage.googleapis.com/artifacts.ai5-c1-group1.appspot.com/data/transferlearning_gpt2doublehead.zip"
start_time = time.time()
download_file(model_url, base_path="models", extract=True)
# Elapsed wall-clock time, converted from seconds to minutes.
execution_time = (time.time() - start_time)/60.0
logger.info("Download execution time (mins): %s",execution_time)

INFO:.:Download execution time (mins): 0.21209884484608968


In [None]:
# Load trained model
# model = GPT2DoubleHeadsModel.from_pretrained("gpt2")
# # Convert model parameter tensors to CUDA tensors
# model.to(device)
# # Load trained Tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")





# Load trained model
# The archive was extracted under the relative "models" directory by the
# download cell, so resolve the checkpoint from there instead of assuming the
# working directory is /content (on Colab both resolve to the same path).
pretrained_model_dir = os.path.abspath(os.path.join("models", "transferlearning_gpt2doublehead"))
model = GPT2DoubleHeadsModel.from_pretrained(pretrained_model_dir)
# Convert model parameter tensors to CUDA tensors (no-op move when device is the CPU)
model.to(device)
# Load trained Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_dir)

#### Utils

In [None]:
# Tokens specific for GPT2 Double Head Model
# Order matters: build_input_from_segments unpacks SPECIAL_TOKENS[:-1] as
# (bos, eos, speaker1, speaker2), and SPECIAL_TOKENS[-1] is used as the pad token.
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
# Same tokens keyed by tokenizer attribute name; in the visible cells this is
# only referenced from the commented-out "fine tune from GPT2" code.
ATTR_TO_SPECIAL_TOKEN = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>",
    "additional_special_tokens": ["<speaker1>", "<speaker2>"],
}
# Order of the tensors in the TensorDataset; the training loop unpacks each
# batch in exactly this order.
MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"]
# Per-token inputs that pad_dataset extends to a common length.
PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]

##### Util Functions

In [None]:
# Utils for tokenization & data preparation
# Number of worker processes tokenize() passes to multiprocessing.Pool.
process_count = 1
# chunksize tokenize() passes to Pool.imap.
multiprocessing_chunksize = 500

def tokenize_multi(data):
  """Recursively replace every string inside a nested structure with token ids.

  Args:
    data: An `(obj, tokenizer)` pair packed into one tuple so the function can
      be mapped with `Pool.imap`. `obj` may be a string, a dict, or any other
      iterable of such objects.

  Returns:
    A list of ids for a string, a dict with the same keys for a dict, and a
    list of converted items for any other iterable.
  """
  payload, tok = data
  if isinstance(payload, str):
      pieces = tok.tokenize(payload)
      return tok.convert_tokens_to_ids(pieces)
  if isinstance(payload, dict):
      return {key: tokenize_multi((value, tok)) for key, value in payload.items()}
  # Anything else is treated as an iterable of nested items.
  return [tokenize_multi((item, tok)) for item in payload]

def tokenize(obj):
  """Tokenize `obj` with the notebook-level `tokenizer`.

  Strings become lists of token ids and dicts are converted value by value.
  Any other iterable is fanned out to a `multiprocessing.Pool` of
  `process_count` workers, each item being handled by `tokenize_multi`, with a
  tqdm progress bar over the items.

  Returns:
    A list of ids, a dict, or a list of converted items, matching the input kind.
  """
  if isinstance(obj, str):
    pieces = tokenizer.tokenize(obj)
    return tokenizer.convert_tokens_to_ids(pieces)
  if isinstance(obj, dict):
    return {key: tokenize(value) for key, value in obj.items()}

  # Each job carries the tokenizer along because imap passes a single argument.
  jobs = [(item, tokenizer) for item in obj]
  with Pool(process_count) as pool:
    results = pool.imap(tokenize_multi, jobs, chunksize=multiprocessing_chunksize)
    # Drain the iterator while the pool is still open.
    tokenized_data = list(tqdm(results, total=len(jobs)))
  return tokenized_data

def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=False, with_eos=True):
  """Assemble one model input from persona, dialogue history and a reply.

  Args:
    persona: List of token-id lists; they are flattened and prefixed with <bos>.
    history: List of token-id lists, one per previous utterance.
    reply: Token-id list for the candidate reply.
    tokenizer: Object providing `convert_tokens_to_ids`.
    lm_labels: When True, the reply tokens are kept as language-model targets;
      otherwise every position is set to -100.
    with_eos: When True, <eos> is appended to the reply.

  Returns:
    Dict with "input_ids", "token_type_ids", "mc_token_ids" (index of the last
    input token) and "lm_labels".
  """
  bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])

  # First segment: <bos> followed by all persona sentences.
  header = [bos]
  for sentence in persona:
      header.extend(sentence)

  if with_eos:
      closing = reply + [eos]
  else:
      closing = reply + []
  turns = history + [closing]

  # Speaker tag alternates with the segment's distance from the end of the input.
  n_segments = len(turns) + 1
  sequence = [header]
  for i, turn in enumerate(turns):
      tag = speaker2 if (n_segments - i) % 2 else speaker1
      sequence.append([tag] + turn)

  input_ids = []
  token_type_ids = []
  for position, segment in enumerate(sequence):
      input_ids.extend(segment)
      segment_type = speaker2 if position % 2 else speaker1
      token_type_ids.extend([segment_type] * len(segment))

  if lm_labels:
      # Mask everything before the reply plus the reply's own speaker tag.
      masked = sum(len(segment) for segment in sequence[:-1])
      labels = ([-100] * masked) + [-100] + sequence[-1][1:]
  else:
      labels = [-100] * len(input_ids)

  instance = {}
  instance["input_ids"] = input_ids
  instance["token_type_ids"] = token_type_ids
  instance["mc_token_ids"] = len(input_ids) - 1
  instance["lm_labels"] = labels
  return instance

def pad_dataset(dataset, padding=0):
  """Right-pad every sequence listed in PADDED_INPUTS to a common length.

  The target length is the longest entry in dataset["input_ids"]. "lm_labels"
  is padded with -100; the other padded inputs use `padding`. The padded lists
  are written back into `dataset`, which is also returned.
  """
  longest = max(map(len, dataset["input_ids"]))
  for key in PADDED_INPUTS:
      fill = -100 if key == "lm_labels" else padding
      padded_rows = []
      for row in dataset[key]:
          padded_rows.append(row + [fill] * (longest - len(row)))
      dataset[key] = padded_rows
  return dataset

def prepare_datasets(dataset, num_candidates):
  """Expand tokenized dialogs into flat lists of model inputs.

  For every dialog, every personality permutation
  (`args.personality_permutations`) and every utterance, the last
  `num_candidates` candidates are each turned into an instance by
  `build_input_from_segments`, using the last `2 * args.max_history + 1`
  history entries. Only the last candidate of the slice gets language-model
  labels, and its index (`num_candidates - 1`) is recorded in "mc_labels".

  Reads the notebook-level `args` and `tokenizer`.

  Returns:
    defaultdict(list) with one list per instance field plus "mc_labels" and
    the scalar "n_candidates".
  """
  datasets = defaultdict(list)
  for dialog in dataset:
    # Copy so rotating the persona never touches the caller's data.
    persona = dialog["personality"].copy()
    for _ in range(args.personality_permutations):
      for utterance in dialog["utterances"]:
          window = 2 * args.max_history + 1
          history = utterance["history"][-window:]
          kept_candidates = utterance["candidates"][-num_candidates:]
          for position, candidate in enumerate(kept_candidates):
              is_last = position == num_candidates - 1
              instance = build_input_from_segments(persona, history, candidate, tokenizer, is_last)
              for field, value in instance.items():
                  datasets[field].append(value)
          datasets["mc_labels"].append(num_candidates - 1)
          datasets["n_candidates"] = num_candidates
      # Rotate: the last persona sentence moves to the front for the next permutation.
      persona = [persona[-1]] + persona[:-1]
  return datasets

def top_filtering(logits, top_k=0.0, top_p=0.9, threshold=-float("Inf"), filter_value=-float("Inf")):
  """Filter a logits tensor with top-k, nucleus (top-p) and threshold rules.

  The tensor is modified in place and returned.

  Args:
    logits: Logits tensor; the caller in this notebook passes a 1-D vector.
    top_k: If > 0, keep only the `top_k` highest logits.
    top_p: If > 0.0, keep the smallest set of highest-probability tokens whose
      cumulative probability exceeds `top_p`.
    threshold: Logits below this value are filtered.
    filter_value: Value written into filtered positions.
  """
  vocab_size = logits.size(-1)
  top_k = min(top_k, vocab_size)
  if top_k > 0:
      # Everything below the k-th largest logit is dropped.
      kth_best = torch.topk(logits, top_k)[0][..., -1, None]
      logits[logits < kth_best] = filter_value

  if top_p > 0.0:
      ranked_logits, ranked_indices = torch.sort(logits, descending=True)
      cumulative = F.softmax(ranked_logits, dim=-1).cumsum(dim=-1)

      drop_mask = cumulative > top_p
      # Shift right by one so the first token that crosses top_p is kept too,
      # and never drop the single most likely token.
      drop_mask[..., 1:] = drop_mask[..., :-1].clone()
      drop_mask[..., 0] = 0

      # Translate positions in the sorted order back to original indices.
      logits[ranked_indices[drop_mask]] = filter_value

  logits[logits < threshold] = filter_value

  return logits

def generate_sequence(personality, history, tokenizer, model, current_output=None,
                      max_length=20, temperature=0.7, top_k=0, top_p=0.9,
                      do_sample=True, min_length=1):
  """Generate a reply token by token from the double-head model.

  Args:
    personality: List of token-id lists describing the persona.
    history: List of token-id lists, one per previous utterance.
    tokenizer: Tokenizer used to resolve the special token ids.
    model: Model called as `model(input_ids, token_type_ids=...)`; element 0
      of its output is used as the logits.
    current_output: Optional list of already generated ids; extended in place.
    max_length: Maximum number of tokens to generate.
    temperature: Divisor applied to the logits before filtering.
    top_k: Passed to `top_filtering`.
    top_p: Passed to `top_filtering`.
    do_sample: Sample from the filtered distribution when True, otherwise take
      the most likely token.
    min_length: While fewer than this many tokens exist, a sampled special
      token is resampled instead of ending the reply.

  Returns:
    The list of generated token ids (without the terminating special token).
  """
  if current_output is None:
      current_output = []
  special_tokens_ids = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)

  # The training cell leaves the model in train() mode; generating with dropout
  # active adds noise to the output. Switch to eval for the duration of the
  # call and restore the caller's mode afterwards.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      with amp.autocast():
        for i in range(max_length):
            instance = build_input_from_segments(
                personality, history, current_output, tokenizer, with_eos=False
            )

            input_ids = torch.tensor(instance["input_ids"], device=device).unsqueeze(0)
            token_type_ids = torch.tensor(instance["token_type_ids"], device=device).unsqueeze(0)

            logits = model(input_ids, token_type_ids=token_type_ids)
            logits = logits[0]

            # Only the distribution over the next token (last position) matters.
            logits = logits[0, -1, :] / temperature
            logits = top_filtering(logits, top_k=top_k, top_p=top_p)
            probs = F.softmax(logits, dim=-1)

            prev = torch.topk(probs, 1)[1] if not do_sample else torch.multinomial(probs, 1)
            if i < min_length and prev.item() in special_tokens_ids:
                while prev.item() in special_tokens_ids:
                    if probs.max().item() == 1:
                        break  # avoid infinite loop
                    prev = torch.multinomial(probs, num_samples=1)

            # Any special token ends the reply.
            if prev.item() in special_tokens_ids:
                break
            current_output.append(prev.item())
  finally:
    model.train(was_training)

  return current_output

#### Without finetuning

In [None]:
# Personality
# Hand-written persona sentences used to probe the model before finetuning.
test_personality=[
  'I am Yash',
  'I am a human',
  'My gender is male',
  'My weight is 53.0',
  'I was born on 2009',
  'I am 11 years old',
  'My breed is Retriever, Yellow Labrador',
  'My color is White/Yellow',
  'I am house trained','i like to play with toys']

# History
# Previous utterances, oldest first.
test_history = [
    "Hi",
    "woof woof"
]
print(test_personality)
print(test_history)

In [None]:
# New chat message
test_message = "what is your name?"

# Tokenize
# Persona sentences are lower-cased before encoding; history and the new
# message are encoded as written.
personality = [tokenizer.encode(s.lower()) for s in test_personality]
history = [tokenizer.encode(s) for s in test_history]
history.append(tokenizer.encode(test_message))
# Generate output
output = generate_sequence(personality, history, tokenizer, model)
output_text = tokenizer.decode(output, skip_special_tokens=True)

print("Question:")
print(test_message)
print("Answer:")
print(output_text)

Question:
what is your name?
Answer:
i am ryan


#### With Finetuning

In [None]:
# Setup Model Training Arguments
# An ArgumentParser is used as a typed container of defaults; nothing is read
# from the command line (see parse_args("") below).
parser = ArgumentParser()
parser.add_argument("--epochs", type=int, default=1, help="Number of training epochs")
parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training")
parser.add_argument("--validation_batch_size", type=int, default=4, help="Batch size for validation")
parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training")
parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history")
parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences")
parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Accumulate gradients on several steps")
parser.add_argument("--learning_rate", type=float, default=1e-05, help="Learning rate")
parser.add_argument("--lm_coef", type=float, default=2.0, help="LM loss coefficient")
parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
parser.add_argument("--weight_decay", type=float, default=0.0, help="Optimizer weight decay")
# warmup_steps == 0 means "derive the warmup from warmup_ratio" in the optimizer cell.
parser.add_argument("--warmup_steps", type=int, default=0, help="Number of warmup steps")
parser.add_argument("--warmup_ratio", type=float, default=0.06, help="Warmup ratio")
parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Adam optimizer epsilon")
parser.add_argument("--verbose", type=int, default=1, help="Verbose logging")
parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
parser.add_argument("--model_dir", type=str, default="model_outputs", help="Path to save model")

# The empty string yields no arguments, so every option keeps its default.
args = parser.parse_args("")
logger.info("Arguments: %s", args)

INFO:.:Arguments: Namespace(adam_epsilon=1e-08, epochs=1, gradient_accumulation_steps=1, learning_rate=4e-05, lm_coef=2.0, max_history=2, max_norm=1.0, mc_coef=1.0, model_dir='model_outputs', num_candidates=2, personality_permutations=1, train_batch_size=4, validation_batch_size=4, verbose=1, warmup_ratio=0.06, warmup_steps=0, weight_decay=0.0)


In [None]:
# # If you want to try to fine tune from GPT2 pretrained weights directly here is the code
# # Model
# model = GPT2DoubleHeadsModel.from_pretrained("gpt2")

# # Tokenizer
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# # Add special tokens to the tokenizer and model
# orig_num_tokens = len(tokenizer.encoder)
# # Add special tokens
# num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
# if num_added_tokens > 0:
#   model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)

# # Convert model parameter tensors to CUDA tensors
# model.to(device)

# print("model type:",type(model))

#### Prepare Data

In [None]:
# Download both dialog datasets into ./datasets (plain JSON, nothing to extract).
# NOTE(review): only personadogchat.json is opened by the visible cells —
# confirm the PERSONA-CHAT download is still needed.
dataset_urls = [
    "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json",
    "https://storage.googleapis.com/artifacts.ai5-c1-group1.appspot.com/data/personadogchat.json",
]
for dataset_url in dataset_urls:
  start_time = time.time()
  download_file(dataset_url, base_path="datasets", extract=False)
  execution_time = (time.time() - start_time)/60.0
  # The elapsed time used to be computed and thrown away; log it like the model download does.
  logger.info("Download execution time (mins): %s", execution_time)

In [None]:
# Parse the downloaded dog persona dialogs straight from the file handle.
personachat_file = os.path.join("datasets", "personadogchat.json")
with open(personachat_file, "r", encoding="utf-8") as f:
  personachat = json.load(f)

In [None]:
# Only the first `subset_size` dialogs are used for finetuning.
subset_size = 50
# Tokenize dataset
train_processed = tokenize(personachat[:subset_size])

print("train count:",len(train_processed))
print(train_processed[:2])

# Number of candidates available in the first utterance, capped by
# args.num_candidates when that is positive.
train_num_candidates = len(train_processed[0]["utterances"][0]["candidates"])
if args.num_candidates > 0:
  train_num_candidates = min(args.num_candidates, train_num_candidates)

# Prepare dataset inputs & outputs
# NOTE: train_processed is rebound here from a list of dialogs to a dict of input lists.
train_processed = prepare_datasets(train_processed, train_num_candidates)
print("After adding inputs/outputs:")
print("train_processed keys:", train_processed.keys())
print("input_ids:",len(train_processed["input_ids"][0]),train_processed["input_ids"][0])
print("token_type_ids:",len(train_processed["token_type_ids"][0]),train_processed["token_type_ids"][0])
print("mc_token_ids:",len(train_processed["mc_token_ids"]))
print("lm_labels:",len(train_processed["lm_labels"][0]),train_processed["lm_labels"][0])
print("mc_labels:",len(train_processed["mc_labels"]))
print("n_candidates:",train_processed["n_candidates"])

# Pad datasets
# The pad value is the id of SPECIAL_TOKENS[-1] ("<pad>").
train_processed = pad_dataset(train_processed, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
print("After Padding:")
print("input_ids:",len(train_processed["input_ids"][0]),train_processed["input_ids"][0])
print("token_type_ids:",len(train_processed["token_type_ids"][0]),train_processed["token_type_ids"][0])
print("mc_token_ids:",len(train_processed["mc_token_ids"]))
print("lm_labels:",len(train_processed["lm_labels"][0]),train_processed["lm_labels"][0])
print("mc_labels:",len(train_processed["mc_labels"]))

  0%|          | 0/50 [00:00<?, ?it/s]

train count: 50
[{'personality': [[3666, 1438, 318, 18966, 13], [3666, 2479, 318, 718, 13], [40, 716, 3290, 13], [3666, 5279, 318, 4048, 13], [40, 10164, 7192, 13, 18, 8059, 13], [40, 716, 1086, 623, 287, 3124, 13], [3666, 15939, 318, 4990, 380, 964, 13]], 'utterances': [{'candidates': [[72, 1101, 257, 7394, 764, 345, 389, 262, 352, 301, 1048, 326, 1312, 1297, 764, 1315, 837, 12877, 764], [72, 588, 477, 6982, 286, 2647, 837, 475, 716, 407, 845, 5385, 351, 27296], [72, 1101, 407, 6405, 290, 616, 3988, 389, 7334, 764, 703, 546, 345, 5633], [72, 1101, 1804, 880, 837, 703, 389, 345, 5633], [72, 18854, 374, 31562, 3881, 290, 4836, 764, 345, 5633], [72, 588, 284, 3124, 290, 1312, 5806, 257, 14335, 764], [72, 1654, 2911, 523, 5238, 588, 673, 15063, 1223, 2089], [43669, 837, 663, 825, 407, 14262, 764, 597, 3352, 329, 262, 5041, 5633], [21638, 1659, 24486, 1659, 764, 1312, 1101, 4203, 1049, 0]], 'history': [[5303, 837, 703, 389, 345, 5633]]}, {'candidates': [[1219, 764, 466, 345, 423, 597, 1725

In [None]:
# Create Tensors
train_tensor_datasets = []
# NOTE(review): validate_tensor_datasets is never filled in the visible cells.
validate_tensor_datasets = []
for input_name in MODEL_INPUTS:
  train_tensor = torch.tensor(train_processed[input_name])
  # Every input except mc_labels holds one entry per candidate, so regroup
  # them as (examples, n_candidates, ...); mc_labels already has one entry per example.
  if input_name != "mc_labels":
      train_tensor = train_tensor.view((-1, train_processed["n_candidates"]) + train_tensor.shape[1:])
  train_tensor_datasets.append(train_tensor)

# Tensor Dataset
# Tensors are stored in MODEL_INPUTS order; the training loop unpacks batches in that order.
train_tensor_dataset = TensorDataset(*train_tensor_datasets)

# Create Data Loaders
# RandomSampler draws the examples in random order.
train_data_sampler = RandomSampler(train_tensor_dataset)
train_data_loader = DataLoader(train_tensor_dataset, sampler=train_data_sampler, batch_size=args.train_batch_size)

logger.info("Train DataLoader (Batch, Candidates, Seq length): {}".format(train_tensor_dataset.tensors[0].shape))

INFO:.:Train DataLoader (Batch, Candidates, Seq length): torch.Size([400, 2, 111])


#### Train

In [None]:
# Compute number of training steps
# One optimizer step happens every `gradient_accumulation_steps` batches.
training_steps = len(train_data_loader) // args.gradient_accumulation_steps * args.epochs

# An explicit --warmup_steps wins; otherwise derive the warmup from --warmup_ratio.
warmup_steps = math.ceil(training_steps * args.warmup_ratio)
warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps
print("warmup_steps:", warmup_steps)

# Optimizer: Adam optimizer with weight decay
# --weight_decay used to be parsed but never forwarded; pass it through so the
# argument takes effect (the default of 0.0 keeps the previous behaviour).
optimizer = AdamW(
    model.parameters(),
    lr=args.learning_rate,
    eps=args.adam_epsilon,
    weight_decay=args.weight_decay,
)

# Learning rate scheduler
# Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
# a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=training_steps)

warmup_steps: 6


In [None]:
# Free Memory
torch.cuda.empty_cache()

# Progress bars are hidden when verbose logging is switched off.
disable = args.verbose == 0
global_step = 0
training_progress_scores = None
tr_loss, logging_loss = 0.0, 0.0
train_iterator = trange(int(args.epochs), desc="Epoch", disable=disable)
epoch_number = 0
logging_steps = 50

# Loss scaler for the mixed-precision (autocast) forward/backward pass below.
scaler = amp.GradScaler()

start_time = time.time()
# Set the gradients to zero before starting
model.zero_grad()
# Setup training loop
for _ in train_iterator:
    model.train()
    train_iterator.set_description(f'Epoch {epoch_number + 1} of {args.epochs}')
    # Get the batch of data
    # Use the same 1-based epoch number as the outer bar (this label used to
    # read "Running Epoch 0 of 1").
    batch_iterator = tqdm(
        train_data_loader,
        desc=f'Running Epoch {epoch_number + 1} of {args.epochs}',
        disable=disable,
        mininterval=0,
    )
    for step, batch in enumerate(batch_iterator):
        batch = tuple(t.to(device) for t in batch)
        # Same order as MODEL_INPUTS, which is how the TensorDataset was built.
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch

        with amp.autocast():
            # Get model output in forward pass
            model_outputs = model(
                input_ids,
                token_type_ids=token_type_ids,
                mc_token_ids=mc_token_ids,
                mc_labels=mc_labels,
                labels=lm_labels,
            )
            # Get multiple choice head loss
            mc_loss = model_outputs["mc_loss"]
            # Get language model loss
            lm_loss = model_outputs["loss"]
            # Combine loss as a weighted loss
            loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef

        current_loss = loss.item()
        print("\rRunning loss: %f" % current_loss, end="")

        # If gradients need be accumulated over several steps
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        scaler.scale(loss).backward()
        tr_loss += loss.item()

        if (step + 1) % args.gradient_accumulation_steps == 0:
            # Unscale first so gradient clipping sees the true gradient norm.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
            scaler.step(optimizer)
            scaler.update()

            # Update learning rate schedule
            scheduler.step()
            # Clear out the gradients
            model.zero_grad()
            global_step += 1

            if logging_steps > 0 and global_step % logging_steps == 0:
                logging_loss = tr_loss

    epoch_number += 1

# Leave the model in inference mode: the prediction cells that follow would
# otherwise generate with dropout still enabled.
model.eval()

execution_time = (time.time() - start_time)/60.0
logger.info("Execution time (mins): %s",execution_time)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/100 [00:00<?, ?it/s]

Running loss: 9.720963



Running loss: 0.998160

INFO:.:Execution time (mins): 1.6355935176213583


#### Predict

In [None]:
# NOTE(review): the saved output of this cell matches the value assigned in the
# next cell, so it was presumably executed out of order; on a fresh top-to-bottom
# run this shows the persona from the "Without finetuning" section.
test_personality

["It's Nola. Woof Woof",
 'My age is 1.',
 'I am dog.',
 'I am a girl.',
 'I weigh 36.4 pounds.',
 'I am Brindle in color.',
 'I am a Terrier.']

In [None]:
# Personality
# Use the persona of the last dialog in the loaded dog dataset.
test_personality=personachat[-1]['personality']

# History
# Reset the running conversation; chat_with_dog appends to this list.
test_history = [
    "Hi",
    "woof woof"
]

print(test_personality)
print(test_history)

["It's Nola. Woof Woof", 'My age is 1.', 'I am dog.', 'I am a girl.', 'I weigh 36.4 pounds.', 'I am Brindle in color.', 'I am a Terrier.']
['Hi', 'woof woof']


In [None]:
def chat_with_dog(test_message):
  """Send one message to the model and print the question and its answer.

  Reads the notebook-level `test_personality`, `test_history`, `tokenizer`,
  `model` and `args`. Both the message and the generated answer are appended
  to `test_history`, so consecutive calls form one conversation.

  Args:
    test_message: The user's message as plain text.
  """
  # Tokenize test inputs
  personality = [tokenizer.encode(s.lower()) for s in test_personality]

  # Training only ever showed the model the last 2 * max_history + 1 utterances
  # (see prepare_datasets). Apply the same window here so the input matches the
  # training distribution and does not grow without bound as the chat continues.
  conversation = test_history + [test_message]
  recent_turns = conversation[-(2 * args.max_history + 1):]
  history = [tokenizer.encode(s) for s in recent_turns]
  test_history.append(test_message)

  # Generate output
  output = generate_sequence(personality, history, tokenizer, model)
  output_text = tokenizer.decode(output, skip_special_tokens=True)
  test_history.append(output_text)

  print("Question:")
  print(test_message)

  print("Answer:")
  print(output_text)

In [None]:
chat_with_dog("What is your name")

Question:
What is your name
Answer:
i am nola. woof woof


In [None]:
chat_with_dog("How old are you?")

Question:
What is your age?
Answer:
1. i am 36.


In [None]:
chat_with_dog("Do you like toys?")

In [None]:
chat_with_dog("what is your dream")

Question:
what is your dream
Answer:
i want to be a dog


In [None]:
chat_with_dog("what is your color")

Question:
what is your color
Answer:
i am a girl.


In [None]:
# Write the model and tokenizer files into ./final_gpt2doublehead.
model_dir = "final_gpt2doublehead"
os.makedirs(model_dir, exist_ok=True)

model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Zip the saved directory.
!zip -r final_gpt2doublehead.zip final_gpt2doublehead

# Copy the archive to the project's GCS bucket.
!gsutil cp ./final_gpt2doublehead.zip gs://artifacts.ai5-c1-group1.appspot.com/data

  adding: final_gpt2doublehead/ (stored 0%)
  adding: final_gpt2doublehead/tokenizer_config.json (deflated 57%)
  adding: final_gpt2doublehead/vocab.json (deflated 63%)
  adding: final_gpt2doublehead/config.json (deflated 50%)
  adding: final_gpt2doublehead/special_tokens_map.json (deflated 42%)
  adding: final_gpt2doublehead/merges.txt (deflated 53%)
  adding: final_gpt2doublehead/added_tokens.json (deflated 42%)
  adding: final_gpt2doublehead/pytorch_model.bin (deflated 9%)
Copying file://./final_gpt2doublehead.zip [Content-Type=application/zip]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads suc

## **<font color="darkred">Save Model/Tokenizer</font>**

In [None]:
# Save
# Writes the same model/tokenizer objects again, this time into ./trained_model.
# NOTE(review): this repeats the save into "final_gpt2doublehead" done earlier —
# confirm both copies are wanted.
model_dir = "trained_model"
os.makedirs(model_dir, exist_ok=True)

model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

In [None]:
# Zip the ./trained_model directory written by the previous cell.
!zip -r finetuned_model_epochs_1.zip trained_model

## **<font color="darkred">References</font>**

### Research Papers
* [Attention is all you need (2017)](https://arxiv.org/abs/1706.03762)
* [GPT-2 (2019)](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)
* [Personalizing Dialogue Agents: I have a dog, do you have pets too?](http://arxiv.org/abs/1801.07243)

### Code

* [Building a State-of-the-Art Conversational AI with Transfer Learning](https://github.com/huggingface/transfer-learning-conv-ai)
* [Summary of the models](https://huggingface.co/transformers/model_summary.html)
* [Transformers source code](https://github.com/huggingface/transformers/tree/master/src/transformers)
* ComputeFest 2021


### Articles

* [How to build a State-of-the-Art Conversational AI with Transfer Learning](https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313)
* [The Illustrated GPT-2](http://jalammar.github.io/illustrated-gpt2/)
* [The Illustrated BERT, ELMo, and co.](http://jalammar.github.io/illustrated-bert/)