In [2]:
!pip install -U transformers
!pip install datasets
!pip install -U accelerate



In [1]:
%load_ext autoreload
%autoreload 2

> this notebook will follow the tutorial in:
https://blog.gopenai.com/fine-tuning-dialogpt-medium-on-daily-dialog-dataset-a-step-by-step-guide-4eaecc1b9323

# make my own dataset

In [2]:
# https://huggingface.co/learn/nlp-course/chapter5/5
# https://huggingface.co/learn/nlp-course/chapter5/2

In [3]:
from datasets import load_dataset
from glob import glob

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
glob('data/*.json')

['data/dlr7_2016-m.json',
 'data/DL103_2008.json',
 'data/0331103315.json',
 'data/DL320_2002.json',
 'data/L65_2013.json',
 'data/DLR4_2012_A.json',
 'data/0288702916.json']

we can also set the splits

`data_files = {"train": "json_example.json", "test": "json_example.json"}`
`dataset = load_dataset("json", data_files=data_files, field="data")`

In [5]:
dataset = load_dataset("json", data_files=glob('data/*.json'))

# train test split
dataset = dataset["train"].train_test_split(test_size=0.20)

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'page'],
        num_rows: 84
    })
    test: Dataset({
        features: ['text', 'page'],
        num_rows: 22
    })
})

In [7]:
dataset["train"][0]

{'text': ['Diário da República, 1.ª série — N.º 120 — 24 de Junho de 2008 1.2.4.3 — Paragem de emergência. — A máquina deve estar equipada com um ou vários dispositivos de paragem de emergência por meio do ou dos quais possam ser evita- das situações de perigo iminentes ou existentes',
  'Estão excluídas desta obrigação: — As máquinas cujo dispositivo de paragem de emer- gência não permita reduzir o risco quer por não reduzir o tempo de obtenção da paragem normal quer por não per- mitir tomar as medidas específicas exigidas pelo risco; — As máquinas portáteis mantidas em posição e ou guiadas à mão',
  'Este dispositivo deve: — Conter dispositivos de comando claramente identi- ficáveis, bem visíveis e rapidamente acessíveis; — Provocar a paragem do processo perigoso num período de tempo tão reduzido quanto possível sem provocar riscos suplementares; — Eventualmente desencadear, ou permitir desencadear, determinados movimentos de protecção',
  'Quando se deixa de accionar o dispositivo d

In [8]:
# Concatenate all utterances within a dialogue and map to 'dialog' key
def concatenate_paragraphs(example):
    example['page'] = " ".join(example['text'])
    return example

# dataset = dataset.map(concatenate_paragraphs)

> Note: not sure if this is really needed, but for simplicity will make a whole text per example
>
> The DialogPT is based on short context, this doesn't lead to good results! Will be trying without concatenating as below

In [9]:
def flatten_list_of_dict(batch):
    return {"page": [ex_string for ex_list in batch["text"] for ex_string in ex_list]}

dataset = dataset.map(flatten_list_of_dict, batched=True, remove_columns=["text"])

Map: 100%|███████████████████████████████████████████████████████████████████████████████| 84/84 [00:00<00:00, 9002.49 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 4977.33 examples/s]


In [10]:
# re-joining the words separated by "-"
def text_processing(example):
    example['page'] = example['page'].replace("- ", "")
    return example

dataset = dataset.map(text_processing)

Map: 100%|██████████████████████████████████████████████████████████████████████████| 1485/1485 [00:00<00:00, 39928.08 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████| 367/367 [00:00<00:00, 36765.78 examples/s]


In [13]:
dataset["train"][:10]

{'page': ['Diário da República, 1.ª série — N.º 120 — 24 de Junho de 2008 1.2.4.3 — Paragem de emergência. — A máquina deve estar equipada com um ou vários dispositivos de paragem de emergência por meio do ou dos quais possam ser evitadas situações de perigo iminentes ou existentes',
  'Estão excluídas desta obrigação: — As máquinas cujo dispositivo de paragem de emergência não permita reduzir o risco quer por não reduzir o tempo de obtenção da paragem normal quer por não permitir tomar as medidas específicas exigidas pelo risco; — As máquinas portáteis mantidas em posição e ou guiadas à mão',
  'Este dispositivo deve: — Conter dispositivos de comando claramente identificáveis, bem visíveis e rapidamente acessíveis; — Provocar a paragem do processo perigoso num período de tempo tão reduzido quanto possível sem provocar riscos suplementares; — Eventualmente desencadear, ou permitir desencadear, determinados movimentos de protecção',
  'Quando se deixa de accionar o dispositivo de parage

### DETOUR!!

In [9]:
import os

os.environ["LD_LIBRARY_PATH"]=""

In [15]:
# TODO: FIX TORCH VERSION  -> this one was not the one original!!
!pip3 install torch==2.0.1

Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m50.1 MB/s[0m eta [36m0:00:0

## Encoding

In [11]:
import numpy as np
import tempfile
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-small')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-small')

In [12]:
# Encode the dataset
# https://huggingface.co/docs/transformers/en/pad_truncation
def encode(examples):
    encoded = tokenizer(examples['page'],
                        truncation=True, 
                        padding='max_length',
                        max_length=128
                       )
    encoded['labels'] = encoded['input_ids'][:]

    return encoded

encoded_dataset = dataset.map(encode, batched=True)

Map: 100%|███████████████████████████████████████████████████████████████████████████| 1477/1477 [00:00<00:00, 2606.05 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████| 375/375 [00:00<00:00, 2216.13 examples/s]


## Training

In [13]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=tempfile.mkdtemp(),   # output directory
    num_train_epochs=25,             # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=None,                # directory for storing logs
    fp16=True                        # use floating point 16 bit precision for training
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test']
)

# Evaluation

In [14]:
# Evaluate before fine-tuning
pre_eval_results = trainer.evaluate(encoded_dataset['test'])

In [15]:
# Get predictions for validation set before fine tuning for 10 samples
pre_val_predictions = trainer.predict(encoded_dataset['test'].select(range(10)))

## Fine tuning

In [16]:
# Fine-tune the model
trainer.train()

Step,Training Loss
500,3.7524
1000,1.7806
1500,1.4481
2000,1.2422
2500,1.1048
3000,1.0039
3500,0.9242
4000,0.8705
4500,0.8354


TrainOutput(global_step=4625, training_loss=1.423659377639358, metrics={'train_runtime': 684.5815, 'train_samples_per_second': 53.938, 'train_steps_per_second': 6.756, 'total_flos': 2412052070400000.0, 'train_loss': 1.423659377639358, 'epoch': 25.0})

In [17]:
# Get predictions for validation set before fine tuning for 10 samples
pre_val_predictions = trainer.predict(encoded_dataset['test'].select(range(10)))
# Evaluate after fine-tuning
post_eval_results = trainer.evaluate(encoded_dataset['test'])

# Print the evaluation losses before and after fine-tuning
print('Evaluation Results before fine-tuning :', pre_eval_results['eval_loss'])
print('Evaluation Results after fine-tuning  :', post_eval_results['eval_loss'])

# Get predictions for validation set before fine tuning for 10 samples
post_val_predictions = trainer.predict(encoded_dataset['test'].select(range(10)))

# Zip the pre and post tuning predictions
predictions = zip(pre_val_predictions.predictions, post_val_predictions.predictions)

Evaluation Results before fine-tuning : 9.061310768127441
Evaluation Results after fine-tuning  : 1.5355618000030518


## Results

In [22]:
for idx, (pre, post) in enumerate(predictions):
    pre_pred = tokenizer.decode(np.argmax(pre, axis=-1), skip_special_tokens=True)
    post_pred = tokenizer.decode(np.argmax(post, axis=-1), skip_special_tokens=True)
    ground_truth = encoded_dataset['test'][idx]["page"]
    
    print(f'Ground truth {idx} \n' + ground_truth + '\n')
    print('Pre-prediction \n' + "".join(pre_pred) + '\n')
    print('Post-prediction \n'+ "".join(post_pred) + '\n')
    print('----------------------------------------------------------------------------------------------------------------------\n')

Ground truth 0 
DAREPÚBLICA—ISÉRIE-A 8161 d) Empresa de manutenção de ascensores (EMA) a entidadequeefectuaeéresponsávelpelamanu- tenção das instalações, cujo estatuto constitui o anexo I a este diploma e que dele faz parte integrante; e) Entidade inspectora (EI) a empresa habilitada a efectuar inspecções a instalações, bem como a realizar inquéritos, peritagens, relatórios e pareceres, cujo estatuto constitui o anexo IV a este diploma e que dele faz parte integrante

Pre-prediction 
 —EPÚBLICA—ISÉRIE-A 8165 8) Apresresa de manutenção de ascensores,M) 1)idade gest souncuar)cnabilvel posutenêsehaã,as Ealações o porjo ascatuto dui- proprietexo IV aoja, que se faz parte integrante; e) Emidade queore deEMAI), eopresa dejailitar;ofetuar;petção�es peroalações, om

Post-prediction 
 —EPÚBLICA—ISÉRIE-A 8165 8) Apresresa de manutenção de ascensores,M) 1)idade gest souncuar)cnabilvel posutenêsehaã,as Ealações o porjo ascatuto dui- proprietexo IV aoja, que se faz parte integrante; e) Emidade queo

## GPT2 IN PORTUGUESpierreguillou/gpt2-small-portuguese

In [None]:
# https://huggingface.co/pierreguillou/gpt2-small-portuguese

## Encoding

In [14]:
import numpy as np
import tempfile
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('pierreguillou/gpt2-small-portuguese')
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('pierreguillou/gpt2-small-portuguese')

tokenizer_config.json: 100%|████████████████████████████████████████████████████████████████████| 92.0/92.0 [00:00<00:00, 428kB/s]
vocab.json: 100%|███████████████████████████████████████████████████████████████████████████████| 850k/850k [00:01<00:00, 583kB/s]
merges.txt: 100%|███████████████████████████████████████████████████████████████████████████████| 508k/508k [00:00<00:00, 547kB/s]
special_tokens_map.json: 100%|████████████████████████████████████████████████████████████████████| 120/120 [00:00<00:00, 761kB/s]
config.json: 100%|███████████████████████████████████████████████████████████████████████████████| 666/666 [00:00<00:00, 1.67MB/s]
pytorch_model.bin: 100%|███████████████████████████████████████████████████████████████████████| 510M/510M [00:10<00:00, 50.8MB/s]
  return self.fget.__get__(instance, owner)()


In [15]:
# Encode the dataset
# https://huggingface.co/docs/transformers/en/pad_truncation
def encode(examples):
    encoded = tokenizer(examples['page'],
                        truncation=True, 
                        padding='max_length',
                        max_length=128
                       )
    encoded['labels'] = encoded['input_ids'][:]

    return encoded

encoded_dataset = dataset.map(encode, batched=True)

Map: 100%|███████████████████████████████████████████████████████████████████████████| 1485/1485 [00:00<00:00, 2834.13 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████| 367/367 [00:00<00:00, 2197.77 examples/s]


## Training

In [20]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=tempfile.mkdtemp(),   # output directory
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir=None,                # directory for storing logs
    fp16=True                        # use floating point 16 bit precision for training
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'],
    eval_dataset=encoded_dataset['test']
)

# Evaluation

In [21]:
# Evaluate before fine-tuning
pre_eval_results = trainer.evaluate(encoded_dataset['test'])

In [22]:
# Get predictions for validation set before fine tuning for 10 samples
pre_val_predictions = trainer.predict(encoded_dataset['test'].select(range(10)))

## Fine tuning

In [23]:
# Fine-tune the model
trainer.train()

Step,Training Loss
500,1.3545
1000,0.9671
1500,0.7571


TrainOutput(global_step=1860, training_loss=0.9571244598716818, metrics={'train_runtime': 273.8264, 'train_samples_per_second': 54.231, 'train_steps_per_second': 6.793, 'total_flos': 970046668800000.0, 'train_loss': 0.9571244598716818, 'epoch': 10.0})

In [27]:
# Get predictions for validation set before fine tuning for 10 samples
pre_val_predictions = trainer.predict(encoded_dataset['test'].select(range(1)))
# Evaluate after fine-tuning
post_eval_results = trainer.evaluate(encoded_dataset['test'])

# Print the evaluation losses before and after fine-tuning
print('Evaluation Results before fine-tuning :', pre_eval_results['eval_loss'])
print('Evaluation Results after fine-tuning  :', post_eval_results['eval_loss'])

# Get predictions for validation set before fine tuning for 10 samples
post_val_predictions = trainer.predict(encoded_dataset['test'].select(range(1)))

# Zip the pre and post tuning predictions
predictions = zip(pre_val_predictions.predictions, post_val_predictions.predictions)

OutOfMemoryError: CUDA out of memory. Tried to allocate 780.00 MiB (GPU 0; 5.80 GiB total capacity; 3.86 GiB already allocated; 324.75 MiB free; 4.60 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## Results

In [None]:
for idx, (pre, post) in enumerate(predictions):
    pre_pred = tokenizer.decode(np.argmax(pre, axis=-1), skip_special_tokens=True)
    post_pred = tokenizer.decode(np.argmax(post, axis=-1), skip_special_tokens=True)
    ground_truth = encoded_dataset['test'][idx]["page"]
    
    print(f'Ground truth {idx} \n' + ground_truth + '\n')
    print('Pre-prediction \n' + "".join(pre_pred) + '\n')
    print('Post-prediction \n'+ "".join(post_pred) + '\n')
    print('----------------------------------------------------------------------------------------------------------------------\n')