### Prepare the dataset
1 line per document

In [6]:
from pathlib import Path
import os

import pandas as pd

In [7]:
DirPpath = Path(os.path.abspath('')).parent
file = str(DirPpath) + "\DataCollection\corpus.csv"

CorpusDF = pd.read_csv(file)

In [8]:
CorpusDF.isna().sum().sum()

0

In [9]:
CorpusDF = CorpusDF.dropna()

CorpusDF['0'] = CorpusDF['0'].apply(lambda x: x.replace('skip to main content this site uses cookies to offer you a better browsing experience. find out more on how we use cookies. accept all cookies accept only essential cookies an official website of the european union', ''))

In [10]:
with open('corpus.txt', 'w', encoding='utf-8') as f:
    for ID, content in zip(CorpusDF['Unnamed: 0'].values, CorpusDF['0'].values):
        f.write('\n'.join([str(ID), content]))

### Extract vocabulary: 

In [11]:
from tokenizers import BertWordPieceTokenizer
import transformers
import tokenizers

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Essentials
# LOCAL_INPUT_PATH is mapped to S3 input location for covid news articles 
LOCAL_INPUT_PATH = 'corpus.txt' 
# LOCAL_OUTPUT_PATH is mapped to S3 output location where we want to save the custom vocabulary after training the tokenizer
LOCAL_OUTPUT_PATH = 'vocab'
VOCAB_SIZE = 30522

In [13]:
paths = 'corpus.txt'

tokenizer = BertWordPieceTokenizer()
tokenizer.train(files=paths, vocab_size=VOCAB_SIZE)


In [14]:
DirPpath = Path(os.path.abspath('')).parent
OutputPath = str(DirPpath) + "\Fine-tuning"

tokenizer.save_model(OutputPath)


['c:\\Users\\ecaudron001\\Documents\\GitHub\\LLM-for-Tourism\\Fine-tuning\\vocab.txt']

In [15]:
tokenizer = BertWordPieceTokenizer('vocab.txt')

# Evaluate custom tokenizer 
print('Evaluating custom tokenizer')
test_sentence = 'covid is a virus'
print(f'Test sentence: {test_sentence}')
tokens = tokenizer.encode(test_sentence).tokens
print(f'Encoded sentence: {tokens}')
token_id = tokenizer.token_to_id('covid')
print(f'Token ID for token (covid) = {token_id}')
vocab_size = tokenizer.get_vocab_size()
print(f'Vocabulary size = {vocab_size}')

Evaluating custom tokenizer
Test sentence: covid is a virus
Encoded sentence: ['[CLS]', 'covid', 'is', 'a', 'virus', '[SEP]']
Token ID for token (covid) = 1240
Vocabulary size = 30522


### Preprocess MLM custom

In [16]:
from transformers import BertTokenizerFast
from transformers import BertConfig
from pathlib import Path
import transformers 
import logging
import sys
import os
from datasets import load_dataset
from datasets import DatasetDict
import datasets

In [17]:
# Essentials
# LOCAL_INPUT_PATH is mapped to S3 input location for covid news articles 
LOCAL_INPUT_PATH = '/opt/ml/processing/input' 
# LOCAL_OUTPUT_PATH is mapped to S3 output location where we want to save the processed input data (COVID articles)
LOCAL_OUTPUT_PATH = 'c:\\Users\\ecaudron001\\Documents\\GitHub\\LLM-for-Tourism\\Fine-tuning'
MAX_LENGTH = 512
CHUNK_SIZE = 128
N_GPUS = 1

In [18]:
# Re-create BERT WordPiece tokenizer using the saved custom vocabulary from the previous job
config = BertConfig()
print(f'Re-creating BERT tokenizer using custom vocabulary from [vocab.txt]')
tokenizer = BertTokenizerFast.from_pretrained('c:\\Users\\ecaudron001\\Documents\\GitHub\\LLM-for-Tourism\\Fine-tuning', config=config)
tokenizer.model_max_length = MAX_LENGTH
tokenizer.init_kwargs['model_max_length'] = MAX_LENGTH
print(f'Tokenizer: {tokenizer}')

Re-creating BERT tokenizer using custom vocabulary from [vocab.txt]


Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


Tokenizer: BertTokenizerFast(name_or_path='c:\Users\ecaudron001\Documents\GitHub\LLM-for-Tourism\Fine-tuning', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [19]:
# Read dataset and collate to create mini batches for Masked Language Model (MLM) training
print('Reading and collating input data to create mini batches for Masked Language Model (MLM) training')
dataset = load_dataset('text', data_files='corpus.txt', split='train', cache_dir='/tmp/cache')
print(f'Dataset: {dataset}')

# Split dataset into train and validation splits 
print('Splitting dataset into train and validation splits')
train_test_splits = dataset.train_test_split(shuffle=True, seed=123, test_size=0.1)
data_splits = DatasetDict({'train': train_test_splits['train'], 
                           'validation': train_test_splits['test']})
print(f'Data splits: {data_splits}')

Reading and collating input data to create mini batches for Masked Language Model (MLM) training


Downloading data files: 100%|██████████| 1/1 [00:00<?, ?it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 158.55it/s]
Generating train split: 178246 examples [00:00, 378683.94 examples/s]


Dataset: Dataset({
    features: ['text'],
    num_rows: 178246
})
Splitting dataset into train and validation splits
Data splits: DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 160421
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 17825
    })
})


In [20]:
# Tokenize dataset
def tokenize(article, tokenizer = tokenizer):
    tokenized_article = tokenizer(article['text'])
    if tokenizer.is_fast:
        tokenized_article['word_ids'] = [tokenized_article.word_ids(i) for i in range(len(tokenized_article['input_ids']))]
    return tokenized_article


print('Tokenizing dataset splits')
num_proc = int(os.cpu_count()/N_GPUS)
print(f'Total number of processes = {num_proc}')
tokenized_datasets = data_splits.map(tokenize, batched=True, num_proc=num_proc, remove_columns=['text'])
print(f'Tokenized datasets: {tokenized_datasets}')


Tokenizing dataset splits
Total number of processes = 8


Map (num_proc=8): 100%|██████████| 160421/160421 [00:29<00:00, 5454.03 examples/s] 
Map (num_proc=8): 100%|██████████| 17825/17825 [00:18<00:00, 968.76 examples/s] 


Tokenized datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 160421
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids'],
        num_rows: 17825
    })
})


In [21]:
# Concat and chunk dataset 
def concat_and_chunk(articles, CHUNK_SIZE = CHUNK_SIZE):
    # Concatenate all texts
    concatenated_examples = {key: sum(articles[key], []) for key in articles.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(articles.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length//CHUNK_SIZE) * CHUNK_SIZE
    # Split by chunks of max_len
    chunked_articles = {key: [text[i : i+CHUNK_SIZE] for i in range(0, total_length, CHUNK_SIZE)] for key, text in concatenated_examples.items()}
    # Create a new labels column
    chunked_articles['labels'] = chunked_articles['input_ids'].copy()
    return chunked_articles
    
print('Concatenating and chunking the datasets to a fixed length')
chunked_datasets = tokenized_datasets.map(concat_and_chunk, batched=True, num_proc=num_proc)
print(f'Chunked datasets: {chunked_datasets}')

Concatenating and chunking the datasets to a fixed length


Map (num_proc=8): 100%|██████████| 160421/160421 [00:30<00:00, 5341.79 examples/s] 
Map (num_proc=8): 100%|██████████| 17825/17825 [00:16<00:00, 1059.03 examples/s]


Chunked datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 20064
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 2204
    })
})


In [22]:
# Save chunked datasets to local disk (EBS volume)
print(f'Saving chunked datasets to local disk {LOCAL_OUTPUT_PATH}')
chunked_datasets.save_to_disk(f'{LOCAL_OUTPUT_PATH}')

# Validate if datasets were saved correctly
print('Validating if datasets were saved correctly')
reloaded_dataset = datasets.load_from_disk(f'{LOCAL_OUTPUT_PATH}')
print(f'Reloaded dataset: {reloaded_dataset}')

Saving chunked datasets to local disk c:\Users\ecaudron001\Documents\GitHub\LLM-for-Tourism\Fine-tuning


Saving the dataset (0/1 shards):   0%|          | 0/20064 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 20064/20064 [00:00<00:00, 126617.64 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2204/2204 [00:00<00:00, 60287.51 examples/s]


Validating if datasets were saved correctly
Reloaded dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 20064
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 2204
    })
})


In [97]:
len(reloaded_dataset['train'][10]["labels"])

128