In [None]:
# Mount Google Drive at /content/drive; later cells write checkpoints under
# /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install Hugging Face transformers (unpinned; the recorded run resolved 4.10.2).
! pip install transformers

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 5.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 41.3 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 48.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 45.1 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.5 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3

In [None]:
import numpy as np
from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         TrainingArguments, Trainer
import torch
from torch.utils.data import Dataset


In [None]:
# Install the Hugging Face `datasets` package (unpinned), then fetch DailyDialog.
!pip install datasets

from datasets import load_dataset
# `dataset` is read as a module-level global by load_conversations.
dataset = load_dataset('daily_dialog')

def load_conversations(data, source=None):
  """Split every dialogue of one dataset split into (context, response) pairs.

  Each pair of consecutive utterances in a dialogue yields one context
  (the earlier utterance) and one response (the utterance that follows it).

  Args:
    data: name of the split to read, e.g. 'train' or 'validation'.
    source: mapping from split name to an iterable of rows, where each row
      has a 'dialog' list of utterances. Defaults to the module-level
      `dataset`.

  Returns:
    A tuple (context, response) of two equally long lists of utterances.
  """
  if source is None:
    source = dataset

  context = []
  response = []

  # Fetch each row once; re-indexing the split for every utterance repeats
  # the same row lookup many times.
  for row in source[data]:
    turns = row['dialog']
    # Dialogues with fewer than two utterances contribute nothing.
    context.extend(turns[:-1])
    response.extend(turns[1:])
  return context, response


Collecting datasets
  Downloading datasets-1.12.1-py3-none-any.whl (270 kB)
[?25l[K     |█▏                              | 10 kB 14.7 MB/s eta 0:00:01[K     |██▍                             | 20 kB 17.8 MB/s eta 0:00:01[K     |███▋                            | 30 kB 17.1 MB/s eta 0:00:01[K     |████▉                           | 40 kB 11.4 MB/s eta 0:00:01[K     |██████                          | 51 kB 5.9 MB/s eta 0:00:01[K     |███████▎                        | 61 kB 6.4 MB/s eta 0:00:01[K     |████████▌                       | 71 kB 6.1 MB/s eta 0:00:01[K     |█████████▊                      | 81 kB 6.8 MB/s eta 0:00:01[K     |███████████                     | 92 kB 5.1 MB/s eta 0:00:01[K     |████████████▏                   | 102 kB 5.5 MB/s eta 0:00:01[K     |█████████████▍                  | 112 kB 5.5 MB/s eta 0:00:01[K     |██████████████▋                 | 122 kB 5.5 MB/s eta 0:00:01[K     |███████████████▊                | 133 kB 5.5 MB/s eta 0:00:01

Downloading:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset daily_dialog/default (download: 4.27 MiB, generated: 8.23 MiB, post-processed: Unknown size, total: 12.50 MiB) to /root/.cache/huggingface/datasets/daily_dialog/default/1.0.0/c03444008e9508b8b76f1f6793742d37d5e5f83364f8d573c2747bff435ea55c...


Downloading:   0%|          | 0.00/4.48M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset daily_dialog downloaded and prepared to /root/.cache/huggingface/datasets/daily_dialog/default/1.0.0/c03444008e9508b8b76f1f6793742d37d5e5f83364f8d573c2747bff435ea55c. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Token budget handed to the tokenizer as `max_length` for every example.
MAXLEN = 60

# Extra marker tokens registered on the tokenizer; key order is kept because
# the mapping is passed as-is to `add_special_tokens`.
SPECIAL_TOKENS = dict(
    bos_token="<|BOS|>",
    eos_token="<|EOS|>",
    unk_token="<|UNK|>",
    pad_token="<|PAD|>",
    sep_token="<|SEP|>",
)


In [None]:
class myDataset(Dataset):
  """Torch dataset of (context, response) utterance pairs for LM fine-tuning.

  Each item is the string
  `<bos> context <sep> response <eos>` (markers taken from SPECIAL_TOKENS),
  tokenized, truncated and padded to MAXLEN tokens.
  """

  def __init__(self, data, tokenizer):
    """Store the pairs and the tokenizer.

    Args:
      data: mapping whose values are two-element sequences
        `[context, response]`; the keys are ignored.
      tokenizer: callable used to encode each example; it is called with
        `truncation`, `max_length` and `padding` keyword arguments and must
        return a mapping with 'input_ids' and 'attention_mask'.
    """
    pairs = list(data.values())

    self.tokenizer = tokenizer
    self.response  = [pair[1] for pair in pairs]
    self.context   = [pair[0] for pair in pairs]

  def __len__(self):
    """Return the number of (context, response) pairs."""
    return len(self.context)

  def __getitem__(self, i):
    """Encode pair `i` and return its tensors.

    Returns:
      A dict with 'input_ids', 'attention_mask' and 'label' tensors; 'label'
      is a copy of 'input_ids'.
    """
    text = SPECIAL_TOKENS['bos_token'] + self.context[i] + \
           SPECIAL_TOKENS['sep_token'] + \
           self.response[i] + SPECIAL_TOKENS['eos_token']

    # Use the tokenizer given to this instance, not whatever module-level
    # `tokenizer` happens to exist when the item is fetched.
    encodings_dict = self.tokenizer(text,
                                    truncation=True,
                                    max_length=MAXLEN,
                                    padding="max_length")

    input_ids = encodings_dict['input_ids']
    attention_mask = encodings_dict['attention_mask']

    # NOTE(review): the key is 'label' (singular) and padded positions are
    # copied into it unchanged - presumably the Trainer's collator renames it
    # to 'labels'; confirm, and confirm whether padding should be masked.
    return {'label': torch.tensor(input_ids),
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask)}

In [None]:
# Start from the stock GPT-2 tokenizer and register the custom marker tokens
# defined in SPECIAL_TOKENS.
tokenizer = AutoTokenizer.from_pretrained('gpt2')

tokenizer.add_special_tokens(SPECIAL_TOKENS)

# Record the ids of the marker tokens in the model configuration.
config = AutoConfig.from_pretrained('gpt2', 
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    output_hidden_states=False)

# The recorded output shows this resolves to GPT2LMHeadModel.
model = AutoModelForPreTraining.from_pretrained('gpt2', config=config)
# Grow the embedding table to the tokenizer's size (50262 rows in the
# recorded output) so the added tokens have embeddings.
model.resize_token_embeddings(len(tokenizer))
# Requires a CUDA device. As the cell's last expression, the returned model
# is what gets displayed as the cell output.
model.cuda()


Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50262, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [None]:
# Build parallel context/response lists for the training and validation splits.
context_train, response_train = load_conversations('train')
context_val, response_val = load_conversations('validation')

In [None]:
def _index_pairs(contexts, responses):
  """Return {0: [context, response], 1: [...], ...} for the zipped inputs."""
  return {idx: [ctx, rsp]
          for idx, (ctx, rsp) in enumerate(zip(contexts, responses))}


# One helper replaces two copy-pasted loops that each kept a manual counter.
train_data = _index_pairs(context_train, response_train)
val_data = _index_pairs(context_val, response_val)

train_dataset = myDataset(train_data, tokenizer)
val_dataset = myDataset(val_data, tokenizer)

In [None]:
# To resume from a saved checkpoint instead of the freshly loaded weights,
# uncomment the two lines below.
# load_model_path = '/content/drive/MyDrive/models/checkpoint-1000/pytorch_model.bin'
# model.load_state_dict(torch.load(load_model_path))


# Checkpoints go to Google Drive; evaluation and checkpoint saving both run
# every 2000 steps, with 500 warm-up steps. Batch size is not set here (the
# recorded run used 8 per device).
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/models",
    num_train_epochs=3, 
    eval_steps = 2000, 
    save_steps=2000, 
    warmup_steps=500,
    prediction_loss_only=True,
    learning_rate = 5e-4,
    do_eval = True,
    evaluation_strategy = 'steps'
    )

# The tokenizer is handed to the Trainer as well; the recorded logs show
# tokenizer files being written next to every checkpoint.
trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Run fine-tuning; the recorded run reports 28521 optimization steps over 3 epochs.
trainer.train()

***** Running training *****
  Num examples = 76052
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 28521


Step,Training Loss,Validation Loss
2000,1.6685,1.635943
4000,1.5317,1.585104
6000,1.4873,1.534668
8000,1.4212,1.499828
10000,1.1392,1.529146
12000,1.1228,1.494847
14000,1.0812,1.486009


***** Running Evaluation *****
  Num examples = 7069
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/models/checkpoint-2000
Configuration saved in /content/drive/MyDrive/models/checkpoint-2000/config.json
Model weights saved in /content/drive/MyDrive/models/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/models/checkpoint-2000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/models/checkpoint-2000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 7069
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/models/checkpoint-4000
Configuration saved in /content/drive/MyDrive/models/checkpoint-4000/config.json
Model weights saved in /content/drive/MyDrive/models/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/models/checkpoint-4000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/models/checkpoint-4000/specia

Step,Training Loss,Validation Loss
2000,1.6685,1.635943
4000,1.5317,1.585104
6000,1.4873,1.534668
8000,1.4212,1.499828
10000,1.1392,1.529146
12000,1.1228,1.494847
14000,1.0812,1.486009
16000,1.0407,1.475871
18000,1.0044,1.469475
20000,0.7669,1.58197


Saving model checkpoint to /content/drive/MyDrive/models/checkpoint-16000
Configuration saved in /content/drive/MyDrive/models/checkpoint-16000/config.json
Model weights saved in /content/drive/MyDrive/models/checkpoint-16000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/models/checkpoint-16000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/models/checkpoint-16000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 7069
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/models/checkpoint-18000
Configuration saved in /content/drive/MyDrive/models/checkpoint-18000/config.json
Model weights saved in /content/drive/MyDrive/models/checkpoint-18000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/models/checkpoint-18000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/models/checkpoint-18000/special_tokens_map.json
***** Running Evaluation *****
  Num examp

In [None]:
# Save the final weights to a separate Drive folder (note the space in the name).
# NOTE(review): the recorded output below lists /content/drive/MyDrive/models,
# which does not match this path - the output looks stale; re-run to confirm.
trainer.save_model('/content/drive/MyDrive/final model') 

Saving model checkpoint to /content/drive/MyDrive/models
Configuration saved in /content/drive/MyDrive/models/config.json
Model weights saved in /content/drive/MyDrive/models/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/models/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/models/special_tokens_map.json


In [None]:
# Rebuild the tokenizer and model exactly as in the training setup, then
# overwrite the weights with a saved checkpoint (path is hard-coded).
load_model_path = '/content/drive/MyDrive/models/checkpoint-26000/pytorch_model.bin'

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens(SPECIAL_TOKENS)

config = AutoConfig.from_pretrained('gpt2', 
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    output_hidden_states=False)
    
model = AutoModelForPreTraining.from_pretrained('gpt2', config=config)
# The embedding table must be resized before loading, otherwise the
# checkpoint's larger vocabulary would not fit.
model.resize_token_embeddings(len(tokenizer))

# NOTE(review): torch.load is called without map_location - confirm this is
# intended when the checkpoint was written from a GPU run.
model.load_state_dict(torch.load(load_model_path))
# Requires a CUDA device.
model.cuda()


In [None]:
# Switch the model to evaluation mode before generating.
model.eval()

In [None]:
def generate_response(text, num_return_sequences=10, min_length=5, max_length=30):
  """Sample replies to `text` from the fine-tuned model and print them.

  The prompt is `<bos> text <sep>` (markers from SPECIAL_TOKENS), matching
  the layout used for training examples. Each sampled sequence is decoded
  with special tokens stripped, so the printed line starts with the prompt
  text followed by the generated reply.

  Args:
    text: the utterance to respond to.
    num_return_sequences: how many samples to draw and print.
    min_length: minimum sequence length passed to `model.generate`.
    max_length: maximum sequence length passed to `model.generate`.

  Returns:
    None; the samples are printed, numbered from 1.
  """
  inp_context = SPECIAL_TOKENS['bos_token'] + text + SPECIAL_TOKENS['sep_token']
  generated = torch.tensor(tokenizer.encode(inp_context)).unsqueeze(0)
  # Follow the model to whatever device it lives on instead of assuming CUDA.
  generated = generated.to(model.device)

  # Sampling (not greedy/beam); top_k=0 is passed through unchanged.
  sample_outputs = model.generate(generated,
                                  do_sample=True,
                                  top_k=0,
                                  min_length=min_length,
                                  max_length=max_length,
                                  num_return_sequences=num_return_sequences
                                  )

  for i, sample_output in enumerate(sample_outputs):
    text_gen = tokenizer.decode(sample_output, skip_special_tokens=True)
    print("{}: {}\n\n".format(i+1,  text_gen))

  return



In [None]:
# Demo: print sampled replies to a prompt.
generate_response('Where have you been?')

1: Where have you been? I went to diving school. 


2: Where have you been? I have worked in the field for 2 years. 


3: Where have you been? Out? What old Dutch habit does she follow? 


4: Where have you been? Project Management. I wanted to do some ot things in the job as head of Project Management. 


5: Where have you been? I started as an accountant! 


6: Where have you been? I have been busy emptying packing. 


7: Where have you been? Sightseeing shops. 


8: Where have you been? Actually I've been promoted to department manager. 


9: Where have you been? I've been keeping pearls on the mountain these years, I really like fishing. 


10: Where have you been? studies have shown me that Americans are more easily conned, aren ’ t they? 




In [None]:
# Demo: print sampled replies to a second prompt.
generate_response('What is your favourite color?')

1: What is your favourite color? sushi. 


2: What is your favourite color?aghettiulous. What kinds of things do you like to do? 


3: What is your favourite color? Blue, yellow or something black? 


4: What is your favourite color? Well, it's yellow. It's been redecorating for some time now. 


5: What is your favourite color? Beauty costs, isn't it? 


6: What is your favourite color? Beautifully done. 


7: What is your favourite color? Tuesdays and Thursdays from seven thirty to twelve thirty. 


8: What is your favourite color? Light blue. 


9: What is your favourite color? Light pink. 


10: What is your favourite color? I like the straight legs. How do you pull it? 




In [None]:
# Demo: print sampled replies to a third prompt.
generate_response('What do you want to eat?')

1: What do you want to eat? I'd love to try a sandwich with some kind of coffee. 


2: What do you want to eat? I'm totally fed up with it. 


3: What do you want to eat? I'm trying to make a quick breakfast but the machine won't work. 


4: What do you want to eat? They are all saints. 


5: What do you want to eat? Hotel- Mormon. 


6: What do you want to eat? I want a hamburger and ice cream. 


7: What do you want to eat? catalogue and quickest. 


8: What do you want to eat? What are you feeling about? 


9: What do you want to eat? For me? 


10: What do you want to eat? My order of banana pancakes. 


