In [1]:
%load_ext autoreload
%autoreload 2

> this notebook will follow the tutorial in:
https://blog.gopenai.com/fine-tuning-dialogpt-medium-on-daily-dialog-dataset-a-step-by-step-guide-4eaecc1b9323

In [2]:
!pip install -U transformers
!pip install datasets
!pip install -U accelerate

Collecting transformers
  Downloading transformers-4.37.2-py3-none-any.whl (8.4 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.36.2
    Uninstalling transformers-4.36.2:
      Successfully uninstalled transformers-4.36.2
Successfully installed transformers-4.37.2


# make my own dataset

In [None]:
# https://huggingface.co/learn/nlp-course/chapter5/5
# https://huggingface.co/learn/nlp-course/chapter5/2

In [9]:
from datasets import load_dataset
from glob import glob

In [10]:
glob('data/*.json')

['data/DL103_2008.json',
 'data/0288702916.json',
 'data/DL320_2002.json',
 'data/L65_2013.json',
 'data/dlr7_2016-m.json',
 'data/0331103315.json',
 'data/DLR4_2012_A.json']

we can also set the splits

`data_files = {"train": "json_example.json", "test": "json_example.json"}`
`dataset = load_dataset("json", data_files=data_files, field="data")`

In [27]:
dataset = load_dataset("json", data_files=glob('data/*.json'))

In [28]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'page'],
        num_rows: 106
    })
})

In [13]:
dataset["train"][0]

{'text': ['8 3765 A conformidade das máquinas continua a ser certificada pelo fabricante, sendo alargada a possibilidade de escolha de procedimentos de avaliação de conformidade para o caso das máquinas definidas no anexo IV em que se exigem procedimentos específicos',
  'É ainda introduzido, no presente decreto -lei, um meca- nismo que permite a adopção de medidas específicas a nível comunitário, que exigem aos Estados membros a proibição ou a restrição da colocação no mercado de certos tipos de máquinas que apresentem os mesmos riscos para a saúde e a segurança das pessoas, quer devido a lacunas das normas har- monizadas pertinentes quer devido às suas características téc- nicas, ou submeter essas máquinas a condições especiais',
  'Foram ouvidos os órgãos de governo próprio das Re- giões Autónomas',
  'Foram ouvidas as associações representativas do sector',
  'Assim: Nos termos da alínea a) do n.º 1 do artigo 198.º da Cons- tituição, o Governo decreta o seguinte: CAPÍTULO I Disposi

In [29]:
# Concatenate all utterances within a dialogue and map to 'dialog' key
def concatenate_paragraphs(example):
    example['page'] = " ".join(example['text'])
    return example

dataset = dataset.map(concatenate_paragraphs)

> Note: not sure if this is really needed, but for simplicity will make a whole text per example

In [21]:
import numpy as np
import tempfile
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-medium')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-medium')

tokenizer_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 26.0/26.0 [00:00<00:00, 51.5kB/s]
vocab.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.04M/1.04M [00:00<00:00, 2.48MB/s]
merges.txt: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 456k/456k [00:00<00:00, 1.39MB/s]
config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 642/642 [00:00<00:00, 1.72MB/s]
pytorch_model.bin: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 863M/863M [02:34<00:00, 5.57MB/s]
  return self.fget.__get__(instance, owner)()
generation_config.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 124/124 [00:00<00:0

In [30]:
# Encode the dataset
# https://huggingface.co/docs/transformers/en/pad_truncation
def encode(examples):
    encoded = tokenizer(examples['page'], truncation=True, padding='max_length', max_length=128)
    encoded['labels'] = encoded['input_ids'][:]

    return encoded

encoded_dataset = dataset.map(encode, batched=True)

## Training

In [32]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=tempfile.mkdtemp(),   # output directory
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=None,                # directory for storing logs
    fp16=True                        # use floating point 16 bit precision for training
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['train']
)

# Evaluation

In [33]:
# Evaluate before fine-tuning
pre_eval_results = trainer.evaluate(encoded_dataset['train'])

# Get predictions for validation set before fine tuning for 10 samples
pre_val_predictions = trainer.predict(encoded_dataset['train'].select(range(10)))

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.52 GiB. GPU 0 has a total capacty of 5.79 GiB of which 65.88 MiB is free. Including non-PyTorch memory, this process has 5.71 GiB memory in use. Of the allocated memory 5.46 GiB is allocated by PyTorch, and 155.49 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

TO DO: Add the gpu to this model!!!!!