In [1]:
%load_ext autoreload
%autoreload 2

> This notebook follows the tutorial at:
https://blog.gopenai.com/fine-tuning-dialogpt-medium-on-daily-dialog-dataset-a-step-by-step-guide-4eaecc1b9323

In [2]:
!pip install -U transformers
!pip install datasets
!pip install -U accelerate



# make my own dataset

In [2]:
# https://huggingface.co/learn/nlp-course/chapter5/5
# https://huggingface.co/learn/nlp-course/chapter5/2

In [12]:
from datasets import load_dataset
from glob import glob

In [13]:
glob('data/*.json')

['data/dlr7_2016-m.json',
 'data/DL103_2008.json',
 'data/0331103315.json',
 'data/DL320_2002.json',
 'data/L65_2013.json',
 'data/DLR4_2012_A.json',
 'data/0288702916.json']

We can also set the splits explicitly by passing a dictionary as `data_files`:

`data_files = {"train": "json_example.json", "test": "json_example.json"}`
`dataset = load_dataset("json", data_files=data_files, field="data")`

In [14]:
# glob() returns files in arbitrary order; sort them so the rows are always
# loaded in the same order.
dataset = load_dataset("json", data_files=sorted(glob('data/*.json')))

# train test split (80/20); the fixed seed makes the split reproducible across runs
dataset = dataset["train"].train_test_split(test_size=0.20, seed=42)

In [15]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'page'],
        num_rows: 84
    })
    test: Dataset({
        features: ['text', 'page'],
        num_rows: 22
    })
})

In [16]:
dataset["train"][0]

{'text': ['Diário da República, 1.ª série — N.º 120 — 24 de Junho de 2008 A energia residual ou acumulada que possa subsistir após o isolamento da máquina deve poder ser dissipada sem risco para as pessoas',
  'A título de excepção ao requisito previsto nos parágrafos precedentes, determinados circuitos podem não ser isola- dos da sua fonte de energia a fim de permitir, por exemplo, a manutenção de peças, a salvaguarda de informações, a iluminação das partes internas, etc. Neste caso, devem ser tomadas disposições especiais para garantir a segurança dos operadores',
  '1.6.4 — Intervenção do operador. — A máquina deve ser concebida, fabricada e equipada de forma a limitar a necessidade de intervenção dos operadores. Sempre que não for possível evitar a intervenção de um operador, esta deve poder efectuar -se facilmente e com segurança',
  '1.6.5 — Limpeza das partes internas. — A máquina deve ser concebida e construída de modo a que a limpeza das suas partes internas que tenham contido

In [7]:
# Merge the paragraphs of each example into a single string stored under 'page'
def concatenate_paragraphs(example):
    """Join the strings in ``example['text']`` with single spaces.

    The joined string is written to ``example['page']`` (replacing any value
    already stored there) and the same, mutated, example is returned.
    """
    paragraphs = example['text']
    merged = " ".join(paragraphs)
    example['page'] = merged
    return example

dataset = dataset.map(concatenate_paragraphs)

Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 84/84 [00:00<00:00, 5937.74 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 4360.19 examples/s]


> Note: not sure whether this step is really needed, but for simplicity each example is turned into one whole text.
>
> DialoGPT is built for short contexts, so this doesn't lead to good results! Will try again without concatenating.

### DETOUR!!

In [9]:
import os

os.environ["LD_LIBRARY_PATH"]=""

In [15]:
# TODO: FIX TORCH VERSION  -> this one was not the one original!!
!pip3 install torch==2.0.1

Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m52.4 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m50.1 MB/s[0m eta [36m0:00:0

## Encoding

In [8]:
import numpy as np
import tempfile
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
import os

# Restrict this process to CUDA device 0
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Load the pretrained DialoGPT-small tokenizer and language-model head
tokenizer = GPT2Tokenizer.from_pretrained('microsoft/DialoGPT-small')
# Reuse the end-of-sequence token as the padding token so that padded
# tokenizer calls have a pad token available
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained('microsoft/DialoGPT-small')

In [9]:
# Encode the dataset
# https://huggingface.co/docs/transformers/en/pad_truncation
def encode(examples):
    """Tokenize the 'page' text and attach language-modelling labels.

    Args:
        examples: a single example or (with ``batched=True``) a batch of
            examples, holding the text to tokenize under the 'page' key.

    Returns:
        The tokenizer output (truncated / padded to 128 tokens) with an added
        'labels' entry: a copy of 'input_ids' in which every padded position
        is replaced by -100.
    """
    encoded = tokenizer(examples['page'],
                        truncation=True,
                        padding='max_length',
                        max_length=128
                       )

    def mask_padding(token_ids, attention):
        # Keep real tokens, replace padded positions by the ignored label -100.
        return [tok if keep else -100 for tok, keep in zip(token_ids, attention)]

    # The pad token is the EOS token, so padding cannot be recognised by token id;
    # the attention mask is used instead. Labels equal to -100 are skipped by the
    # Hugging Face causal-LM loss, so the loss no longer includes the padding
    # (which otherwise dominates short examples).
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    if input_ids and isinstance(input_ids[0], int):
        # single, un-batched example: flat list of token ids
        encoded['labels'] = mask_padding(input_ids, attention_mask)
    else:
        # batched call: one list of token ids per example
        encoded['labels'] = [
            mask_padding(ids, attention)
            for ids, attention in zip(input_ids, attention_mask)
        ]

    return encoded

encoded_dataset = dataset.map(encode, batched=True)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 84/84 [00:00<00:00, 157.88 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 139.11 examples/s]


## Training

In [10]:
# Hyper-parameters for the fine-tuning run
training_args = TrainingArguments(
    output_dir=tempfile.mkdtemp(),   # run output goes to a fresh temporary directory
    logging_dir=None,                # no explicit directory for logs
    fp16=True,                       # train with 16-bit floating point precision
    num_train_epochs=100,            # number of passes over the training split
    warmup_steps=500,                # learning-rate scheduler warmup steps
    weight_decay=0.01,               # weight-decay strength
    per_device_train_batch_size=8,   # per-device batch size while training
    per_device_eval_batch_size=32    # per-device batch size while evaluating
)

# Trainer bound to the model and the encoded train / test splits
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=encoded_dataset['test'],
    train_dataset=encoded_dataset['train']
)

# Evaluation

In [11]:
# Evaluate before fine-tuning
pre_eval_results = trainer.evaluate(encoded_dataset['test'])

../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [50,0,0], t

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [None]:
# Get predictions for the first 10 samples of the test split before fine-tuning.
# This must run before trainer.train() so that it reflects the untuned model.
pre_val_predictions = trainer.predict(encoded_dataset['test'].select(range(10)))

## Fine tuning

In [14]:
# Fine-tune the model
trainer.train()

Step,Training Loss
500,4.8235
1000,0.7517


TrainOutput(global_step=1100, training_loss=2.563033887689764, metrics={'train_runtime': 157.4864, 'train_samples_per_second': 53.338, 'train_steps_per_second': 6.985, 'total_flos': 548713267200000.0, 'train_loss': 2.563033887689764, 'epoch': 100.0})

In [15]:
# NOTE: `pre_val_predictions` must come from the prediction cell that runs before
# `trainer.train()`. It is deliberately NOT recomputed here: calling
# `trainer.predict` at this point uses the already fine-tuned weights, which would
# make the "pre" and "post" predictions identical.

# Evaluate after fine-tuning
post_eval_results = trainer.evaluate(encoded_dataset['test'])

# Print the evaluation losses before and after fine-tuning
print('Evaluation Results before fine-tuning :', pre_eval_results['eval_loss'])
print('Evaluation Results after fine-tuning  :', post_eval_results['eval_loss'])

# Get predictions for the same 10 test samples after fine-tuning
post_val_predictions = trainer.predict(encoded_dataset['test'].select(range(10)))

# Pair the pre- and post-tuning predictions. Materialised as a list (not a bare
# zip iterator) so the results cell can be re-run without getting an empty loop.
predictions = list(zip(pre_val_predictions.predictions, post_val_predictions.predictions))

Evaluation Results before fine-tuning : 13.283446311950684
Evaluation Results after fine-tuning  : 5.027191162109375


## Results

In [16]:
# For every sampled test example, print the reference text followed by the text
# decoded from the arg-max (over the last axis) of the pre- and post-tuning
# prediction arrays.
for idx, logits_pair in enumerate(predictions):
    pre_pred, post_pred = (
        tokenizer.decode(np.argmax(logits, axis=-1), skip_special_tokens=True)
        for logits in logits_pair
    )
    ground_truth = encoded_dataset['test'][idx]["text"]

    print(f'Ground truth {idx} \n' + "; ".join(ground_truth) + '\n')
    print('Pre-prediction \n' + pre_pred + '\n')
    print('Post-prediction \n' + post_pred + '\n')
    print('----------------------------------------------------------------------------------------------------------------------\n')

Ground truth 0 
Diário da República, 1.ª série — N.º 112 — 9 de junho de 2017 Artigo 47.º Entrada em vigor O presente decreto -lei entre em vigor no dia seguinte ao da sua publicação; Visto e aprovado em Conselho de Ministros de 27 de abril de 2017. — António Luís Santos da Costa — Augusto Ernesto Santos Silva — Mário José Gomes de Freitas Centeno — Manuel de Herédia Caldeira Cabral; Promulgado em 6 de junho de 2017; Publique -se; O Presidente da República, MARCELO REBELO DE SOUSA; Referendado em 7 de junho de 2017; O Primeiro-Ministro, António Luís Santos da Costa; ANEXO I Requisitos essenciais de segurança e de saúde Observações preliminares: 1 — As obrigações previstas pelos requisitos essen- ciais de segurança e de saúde só se aplicam se existir o risco correspondente para o ascensor, ou o componente de segurança para ascensores, considerado quando este for utilizado nas condições previstas pelo instalador ou pelo fabricante; 2 — Os requisitos essenciais de segurança e de saúde do 