In [1]:
%load_ext autoreload
%autoreload 2

> this notebook will follow the tutorial in:
https://blog.gopenai.com/fine-tuning-dialogpt-medium-on-daily-dialog-dataset-a-step-by-step-guide-4eaecc1b9323

In [2]:
!pip install -U transformers
!pip install datasets
!pip install -U accelerate

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 KB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.26.1
    Uninstalling accelerate-0.26.1:
      Successfully uninstalled accelerate-0.26.1
Successfully installed accelerate-0.27.2


# make my own dataset

In [1]:
# https://huggingface.co/learn/nlp-course/chapter5/5
# https://huggingface.co/learn/nlp-course/chapter5/2

In [1]:
from datasets import load_dataset
from glob import glob

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
glob('data/*.json')

['data/DL103_2008.json',
 'data/0288702916.json',
 'data/DL320_2002.json',
 'data/L65_2013.json',
 'data/dlr7_2016-m.json',
 'data/0331103315.json',
 'data/DLR4_2012_A.json']

we can also set the splits

`data_files = {"train": "json_example.json", "test": "json_example.json"}`
`dataset = load_dataset("json", data_files=data_files, field="data")`

In [3]:
dataset = load_dataset("json", data_files=glob('data/*.json'))

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'page'],
        num_rows: 106
    })
})

In [6]:
dataset["train"][0]

{'text': ['8 3765 A conformidade das máquinas continua a ser certificada pelo fabricante, sendo alargada a possibilidade de escolha de procedimentos de avaliação de conformidade para o caso das máquinas definidas no anexo IV em que se exigem procedimentos específicos',
  'É ainda introduzido, no presente decreto -lei, um meca- nismo que permite a adopção de medidas específicas a nível comunitário, que exigem aos Estados membros a proibição ou a restrição da colocação no mercado de certos tipos de máquinas que apresentem os mesmos riscos para a saúde e a segurança das pessoas, quer devido a lacunas das normas har- monizadas pertinentes quer devido às suas características téc- nicas, ou submeter essas máquinas a condições especiais',
  'Foram ouvidos os órgãos de governo próprio das Re- giões Autónomas',
  'Foram ouvidas as associações representativas do sector',
  'Assim: Nos termos da alínea a) do n.º 1 do artigo 198.º da Cons- tituição, o Governo decreta o seguinte: CAPÍTULO I Disposi

In [4]:
# Concatenate all utterances within a dialogue and map to 'dialog' key
def concatenate_paragraphs(example):
    example['page'] = " ".join(example['text'])
    return example

dataset = dataset.map(concatenate_paragraphs)

> Note: not sure if this is really needed, but for simplicity will make a whole text per example

In [5]:
import numpy as np
import tempfile
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-medium')

  return self.fget.__get__(instance, owner)()


In [6]:
# Encode the dataset
# https://huggingface.co/docs/transformers/en/pad_truncation
def encode(examples):
    encoded = tokenizer(examples['page'], truncation=True, padding='max_length', max_length=128)
    encoded['labels'] = encoded['input_ids'][:]

    return encoded

encoded_dataset = dataset.map(encode, batched=True)

## Training

In [7]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=tempfile.mkdtemp(),   # output directory
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=None,                # directory for storing logs
    fp16=True                        # use floating point 16 bit precision for training
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['train']
)

# Evaluation

In [None]:
# Evaluate before fine-tuning
pre_eval_results = trainer.evaluate(encoded_dataset['train'])

In [8]:
# Get predictions for validation set before fine tuning for 10 samples
pre_val_predictions = trainer.predict(encoded_dataset['train'].select(range(10)))

TO DO: Add the gpu to this model!!!!!

In [9]:
pre_val_predictions

PredictionOutput(predictions=array([[[ -8.5       , -13.9296875 , -15.6875    , ..., -13.1484375 ,
         -13.1640625 ,  -5.1015625 ],
        [ -7.6875    , -14.734375  , -16.953125  , ..., -14.328125  ,
         -11.5078125 ,   3.6601562 ],
        [ -6.625     , -14.3046875 , -14.0234375 , ..., -10.1328125 ,
          -8.9453125 ,   5.1484375 ],
        ...,
        [ -2.2929688 , -11.546875  ,  -9.7421875 , ...,  -7.6328125 ,
          -5.421875  ,  11.125     ],
        [ -0.03117371,  -9.1875    ,  -9.3046875 , ...,  -7.5273438 ,
          -6.7578125 ,  10.5625    ],
        [  9.671875  ,  -1.6328125 ,  -1.4101562 , ...,   6.6835938 ,
           5.8554688 ,  28.234375  ]],

       [[ -8.53125   , -13.8984375 , -15.671875  , ..., -13.078125  ,
         -13.203125  ,  -5.0820312 ],
        [ -5.421875  , -11.4375    , -12.7265625 , ..., -10.6484375 ,
          -9.140625  ,   0.55078125],
        [ -8.015625  , -16.140625  , -16.1875    , ..., -14.03125   ,
         -10.046875  ,