# DelphBERT: Create pre-training dataset

## Import Dataset

In [1]:
import torch
torch.cuda.empty_cache()
from dataclasses import dataclass, field
from datasets import load_from_disk, Sequence, Features, Value

from transformers import (PreTrainedTokenizer, HfArgumentParser, AutoTokenizer)

In [2]:
PATH_RAW_FILES = "/home/leonardovida/data/volume_1/data-histaware/merged_articles/1970s"
PATH_TOKENIZER_DIR = "/home/leonardovida/data/volume_1/data-histaware/tokenizer"
PATH_DATASET_DIR = "/home/leonardovida/data/volume_1/data-histaware/dataset"
PATH_MODEL_DIR = "/home/leonardovida/data/volume_1/data-histaware/model"

In [3]:
dataset = load_from_disk(PATH_DATASET_DIR)

In [4]:
dataset = dataset.remove_columns(['p', 'recordIdentifier', 'sentences', 'sents_len', 'subject'])

#train_dataset, test_dataset = dataset(split=['train', 'test'])
#test_dataset = test_dataset.shuffle().select(range(10000))

In [5]:
a_file = open("/home/leonardovida/data/volume_1/data-histaware/dataset/data.1970.txt")
file_contents = a_file.read()
contents_split = file_contents.splitlines()
contents_split = contents_split[1:10000]

NameError: name 'pd' is not defined

In [6]:
import pandas as pd
contents_split = pd.DataFrame(contents_split)

In [8]:
contents_split.loc[1][0]

'"Het pleidooi van de groep komt nadat onlangs een ""groep zigeuners, onder leiding van Koka Petaio, in geen enkele Rijnmondgemeente welkom bleek te zijn."'

In [9]:
contents_split.to_csv("/home/leonardovida/data/volume_1/data-histaware/dataset/data.1970_test.txt", header=None, index=None, sep='\n', mode='a')
with open("/home/leonardovida/data/volume_1/data-histaware/dataset/data.1970_test.txt") as f:
    lines = f.readlines()

['"""Het pleidooi van de groep komt nadat onlangs een """"groep zigeuners, onder leiding van Koka Petaio, in geen enkele Rijnmondgemeente welkom bleek te zijn."""\n', 'Er moet een gemeente bereid worden gevonden, bijvoorbeeld Rotterdam, om een stuk grond een tijdelijke bestemming te geven als zigeuenerkampement. Dat Rijnmond deze bestemming voorlopig in het streekplan opnemen en GS van ZuidHolland dit goed7 keuren..\n', ".' .'' . 718.00 IX draaischijf, 18.35 Plezier met Charlie Chaplin. 19.10 Plaatselijke tijd, 19.45 Journaal. .  ..'.'20.15 EEN GEVAL VOOR GORON 21.40 TOT IN HET LAATSTE DORP 22.25 DAGBOEK R.K. KERK 22.40 JOURNAAL 23.00 APROPOS FILM Actualiteiten uil de filmbranch 23.45 JOURNAAL SHBWJ\n", "APOTHEKEN Voor spoedgevallen telefoon 166845. .' TANDARTSEN L.. A. v. d. Berg, Kastanjestrafet 1012, Spijkenisse (van 17 17.30 uur).\n", "Twee ministers uit het kabinetDen Uyl hebben met wisselend succes hun eerste confrontatie met de NAVO achter de rug Vredeling (defensie) in Brussel 

In [10]:
for l in range(1, 10, 1):
    print(lines[l])

"""Het pleidooi van de groep komt nadat onlangs een """"groep zigeuners, onder leiding van Koka Petaio, in geen enkele Rijnmondgemeente welkom bleek te zijn."""

Er moet een gemeente bereid worden gevonden, bijvoorbeeld Rotterdam, om een stuk grond een tijdelijke bestemming te geven als zigeuenerkampement. Dat Rijnmond deze bestemming voorlopig in het streekplan opnemen en GS van ZuidHolland dit goed7 keuren..

.' .'' . 718.00 IX draaischijf, 18.35 Plezier met Charlie Chaplin. 19.10 Plaatselijke tijd, 19.45 Journaal. .  ..'.'20.15 EEN GEVAL VOOR GORON 21.40 TOT IN HET LAATSTE DORP 22.25 DAGBOEK R.K. KERK 22.40 JOURNAAL 23.00 APROPOS FILM Actualiteiten uil de filmbranch 23.45 JOURNAAL SHBWJ

APOTHEKEN Voor spoedgevallen telefoon 166845. .' TANDARTSEN L.. A. v. d. Berg, Kastanjestrafet 1012, Spijkenisse (van 17 17.30 uur).

Twee ministers uit het kabinetDen Uyl hebben met wisselend succes hun eerste confrontatie met de NAVO achter de rug Vredeling (defensie) in Brussel en Van der. Stoel 

Remove not useful columns

## HuggingFace way

In [16]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import RobertaTokenizerFast

tokenizer = ByteLevelBPETokenizer(
    f"{PATH_TOKENIZER_DIR}/1970/vocab.json",
    f"{PATH_TOKENIZER_DIR}/1970/merges.txt",
)

tokenizer = RobertaTokenizerFast.from_pretrained(f"{PATH_TOKENIZER_DIR}/1970", max_len=512)

## MLM

In [17]:
import logging
import math
import os
import sys
import torch
import warnings
from tqdm.notebook import tqdm
from dataclasses import dataclass, field
from typing import Optional

from datasets import load_dataset, Dataset

import transformers
from transformers import (
                          CONFIG_MAPPING,
                          MODEL_FOR_MASKED_LM_MAPPING,
                          MODEL_FOR_CAUSAL_LM_MAPPING,
                          PreTrainedTokenizer,
                          TrainingArguments,
                          AutoConfig,
                          AutoTokenizer,
                          AutoModelWithLMHead,
                          AutoModelForCausalLM,
                          AutoModelForMaskedLM,
                          LineByLineTextDataset,
                          TextDataset,
                          DataCollatorForLanguageModeling,
                          DataCollatorForWholeWordMask,
                          DataCollatorForPermutationLanguageModeling,
                          PretrainedConfig,
                          Trainer,
                          set_seed,
                          )

from transformers.trainer_utils import get_last_checkpoint, is_main_process
from transformers.utils import check_min_version

# Set seed for reproducibility,
set_seed(123)

# Look for gpu to use. Will use `cpu` by default if no gpu found.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

In [18]:
class ModelDataArguments(object):
    """Define model and data configuration needed to perform pretraining.

    Eve though all arguments are optional there still needs to be a certain 
    number of arguments that require values attributed.

    Arguments:

    train_data_file (:obj:`str`, `optional`): 
      Path to your .txt file dataset. If you have an example on each line of 
      the file make sure to use line_by_line=True. If the data file contains 
      all text data without any special grouping use line_by_line=False to move 
      a block_size window across the text file.
      This argument is optional and it will have a `None` value attributed 
      inside the function.

    eval_data_file (:obj:`str`, `optional`): 
      Path to evaluation .txt file. It has the same format as train_data_file.
      This argument is optional and it will have a `None` value attributed 
      inside the function.

    line_by_line (:obj:`bool`, `optional`, defaults to :obj:`False`): 
      If the train_data_file and eval_data_file contains separate examples on 
      each line then line_by_line=True. If there is no separation between 
      examples and train_data_file and eval_data_file contains continuous text 
      then line_by_line=False and a window of block_size will be moved across 
      the files to acquire examples.
      This argument is optional and it has a default value.

    mlm (:obj:`bool`, `optional`, defaults to :obj:`False`): 
      Is a flag that changes loss function depending on model architecture. 
      This variable needs to be set to True when working with masked language 
      models like bert or roberta and set to False otherwise. There are 
      functions that will raise ValueError if this argument is 
      not set accordingly.
      This argument is optional and it has a default value.

    whole_word_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
      Used as flag to determine if we decide to use whole word masking or not. 
      Whole word masking means that whole words will be masked during training 
      instead of tokens which can be chunks of words.
      This argument is optional and it has a default value.

    mlm_probability(:obj:`float`, `optional`, defaults to :obj:`0.15`): 
      Used when training masked language models. Needs to have mlm set to True. 
      It represents the probability of masking tokens when training model.
      This argument is optional and it has a default value.

    plm_probability (:obj:`float`, `optional`, defaults to :obj:`float(1/6)`): 
      Flag to define the ratio of length of a span of masked tokens to 
      surrounding context length for permutation language modeling. 
      Used for XLNet.
      This argument is optional and it has a default value.

    max_span_length (:obj:`int`, `optional`, defaults to :obj:`5`): 
      Flag may also be used to limit the length of a span of masked tokens used 
      for permutation language modeling. Used for XLNet.
      This argument is optional and it has a default value.

    block_size (:obj:`int`, `optional`, defaults to :obj:`-1`): 
      It refers to the windows size that is moved across the text file. 
      Set to -1 to use maximum allowed length.
      This argument is optional and it has a default value.

    overwrite_cache (:obj:`bool`, `optional`, defaults to :obj:`False`): 
      If there are any cached files, overwrite them.
      This argument is optional and it has a default value.

    model_type (:obj:`str`, `optional`): 
      Type of model used: bert, roberta, gpt2. 
      More details: https://huggingface.co/transformers/pretrained_models.html
      This argument is optional and it will have a `None` value attributed 
      inside the function.

    model_config_name (:obj:`str`, `optional`):
      Config of model used: bert, roberta, gpt2. 
      More details: https://huggingface.co/transformers/pretrained_models.html
      This argument is optional and it will have a `None` value attributed 
      inside the function.

    tokenizer_name: (:obj:`str`, `optional`)
      Tokenizer used to process data for training the model. 
      It usually has same name as model_name_or_path: bert-base-cased, 
      roberta-base, gpt2 etc.
      This argument is optional and it will have a `None` value attributed 
      inside the function.

    model_name_or_path (:obj:`str`, `optional`): 
      Path to existing transformers model or name of 
      transformer model to be used: bert-base-cased, roberta-base, gpt2 etc. 
      More details: https://huggingface.co/transformers/pretrained_models.html
      This argument is optional and it will have a `None` value attributed 
      inside the function.

    model_cache_dir (:obj:`str`, `optional`): 
      Path to cache files to save time when re-running code.
      This argument is optional and it will have a `None` value attributed 
      inside the function.

    Raises:

        ValueError: If `CONFIG_MAPPING` is not loaded in global variables.

        ValueError: If `model_type` is not present in `CONFIG_MAPPING.keys()`.

        ValueError: If `model_type`, `model_config_name` and 
          `model_name_or_path` variables are all `None`. At least one of them 
          needs to be set.

        warnings: If `model_config_name` and `model_name_or_path` are both 
          `None`, the model will be trained from scratch.

        ValueError: If `tokenizer_name` and `model_name_or_path` are both 
          `None`. We need at least one of them set to load tokenizer.

    """
    def __init__(self, train_data_file=None, eval_data_file=None, 
               line_by_line=False, mlm=False, mlm_probability=0.15, 
               whole_word_mask=False, plm_probability=float(1/6), 
               max_span_length=5, block_size=-1, overwrite_cache=False, 
               model_type=None, model_config_name=None, tokenizer_name=None, 
               model_name_or_path=None, model_cache_dir=None):
     
    # Make sure CONFIG_MAPPING is imported from transformers module.
        if 'CONFIG_MAPPING' not in globals():
            raise ValueError('Could not find `CONFIG_MAPPING` imported! Make sure' \
                           ' to import it from `transformers` module!')

        # Make sure model_type is valid.
        if (model_type is not None) and (model_type not in CONFIG_MAPPING.keys()):
            raise ValueError('Invalid `model_type`! Use one of the following: %s' %
                           (str(list(CONFIG_MAPPING.keys()))))

        # Make sure that model_type, model_config_name and model_name_or_path 
        # variables are not all `None`.
        if not any([model_type, model_config_name, model_name_or_path]):
            raise ValueError('You can`t have all `model_type`, `model_config_name`,' \
                           ' `model_name_or_path` be `None`! You need to have' \
                           'at least one of them set!')

        # Check if a new model will be loaded from scratch.
        if not any([model_config_name, model_name_or_path]):
          # Setup warning to show pretty. This is an overkill
          warnings.formatwarning = lambda message,category,*args,**kwargs: \
                                   '%s: %s\n' % (category.__name__, message)
          # Display warning.
          warnings.warn('You are planning to train a model from scratch! 🙀')

        # Check if a new tokenizer wants to be loaded.
        # This feature is not supported!
        if not any([tokenizer_name, model_name_or_path]):
          # Can't train tokenizer from scratch here! Raise error.
          raise ValueError('You want to train tokenizer from scratch! ' \
                        'That is not possible yet! You can train your own ' \
                        'tokenizer separately and use path here to load it!')

        # Set all data related arguments.
        self.train_data_file = train_data_file
        self.eval_data_file = eval_data_file
        self.line_by_line = line_by_line
        self.mlm = mlm
        self.whole_word_mask = whole_word_mask
        self.mlm_probability = mlm_probability
        self.plm_probability = plm_probability
        self.max_span_length = max_span_length
        self.block_size = block_size
        self.overwrite_cache = overwrite_cache

        # Set all model and tokenizer arguments.
        self.model_type = model_type
        self.model_config_name = model_config_name
        self.tokenizer_name = tokenizer_name
        self.model_name_or_path = model_name_or_path
        self.model_cache_dir = model_cache_dir

        return

In [19]:
def get_model_config(args: ModelDataArguments):
    """
    Get model configuration.

    Using the ModelDataArguments return the model configuration.

    Arguments:

    args (:obj:`ModelDataArguments`):
      Model and data configuration arguments needed to perform pretraining.

    Returns:

    :obj:`PretrainedConfig`: Model transformers configuration.

    Raises:

    ValueError: If `mlm=True` and `model_type` is NOT in ["bert", 
          "roberta", "distilbert", "camembert"]. We need to use a masked 
          language model in order to set `mlm=True`.
    """
 
    # Check model configuration.
    if args.model_config_name is not None:
        # Use model configure name if defined.
        model_config = AutoConfig.from_pretrained(args.model_config_name, 
                                                  cache_dir=args.model_cache_dir)

    elif args.model_name_or_path is not None:
        # Use model name or path if defined.
        model_config = AutoConfig.from_pretrained(args.model_name_or_path, 
                                                  cache_dir=args.model_cache_dir)

    else:
    # Use config mapping if building model from scratch.
        model_config = CONFIG_MAPPING[args.model_type]()

    # Make sure `mlm` flag is set for Masked Language Models (MLM).
    if (model_config.model_type in ["bert", "roberta", "distilbert", 
                                  "camembert"]) and (args.mlm is False):
        raise ValueError('BERT and RoBERTa-like models do not have LM heads')

    # Adjust block size for xlnet.
    if model_config.model_type == "xlnet":
    # xlnet used 512 tokens when training.
        args.block_size = 512
        # setup memory length
        model_config.mem_len = 1024

    return model_config

def get_model_config(args: ModelDataArguments):
    """Get model configuration.

    Using the ModelDataArguments return the model configuration.

    Arguments:

    args (:obj:`ModelDataArguments`):
      Model and data configuration arguments needed to perform pretraining.

    Returns:

    :obj:`PretrainedConfig`: Model transformers configuration.

    Raises:

    ValueError: If `mlm=True` and `model_type` is NOT in ["bert", 
          "roberta", "distilbert", "camembert"]. We need to use a masked 
          language model in order to set `mlm=True`.
    """

    # Check model configuration.
    if args.model_config_name is not None:
    # Use model configure name if defined.
        model_config = AutoConfig.from_pretrained(args.model_config_name, 
                                          cache_dir=args.model_cache_dir)

    elif args.model_name_or_path is not None:
    # Use model name or path if defined.
        model_config = AutoConfig.from_pretrained(args.model_name_or_path, 
                                          cache_dir=args.model_cache_dir)

    else:
    # Use config mapping if building model from scratch.
        model_config = CONFIG_MAPPING[args.model_type]()

    # Make sure `mlm` flag is set for Masked Language Models (MLM).
    if (model_config.model_type in ["bert", "roberta", "distilbert", 
                                  "camembert"]) and (args.mlm is False):
        raise ValueError('BERT and RoBERTa-like models do not have LM heads')

    # Adjust block size for xlnet.
    if model_config.model_type == "xlnet":
    # xlnet used 512 tokens when training.
        args.block_size = 512
        # setup memory length
        model_config.mem_len = 1024

    return model_config


def get_tokenizer(args: ModelDataArguments):
    """
    Get model tokenizer.

    Using the ModelDataArguments return the model tokenizer and change 
    `block_size` form `args` if needed.

    Arguments:

    args (:obj:`ModelDataArguments`):
      Model and data configuration arguments needed to perform pretraining.

    Returns:

    :obj:`PreTrainedTokenizer`: Model transformers tokenizer.

    """

    # Check tokenizer configuration.
    if args.tokenizer_name:
    # Use tokenizer name if define.
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, 
                                                  cache_dir=args.model_cache_dir)

    elif args.model_name_or_path:
    # Use tokenizer name of path if defined.
        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path, 
                                                  cache_dir=args.model_cache_dir)

    # Setp data block size.
    if args.block_size <= 0:
    # Set block size to maximum length of tokenizer.
    # Input block size will be the max possible for the model.
    # Some max lengths are very large and will cause a
        args.block_size = tokenizer.model_max_length
    else:
    # Never go beyond tokenizer maximum length.
        args.block_size = min(args.block_size, tokenizer.model_max_length)

    return tokenizer
  

def get_model(args: ModelDataArguments, model_config):
    """
    Get model.

    Using the ModelDataArguments return the actual model.

    Arguments:

    args (:obj:`ModelDataArguments`):
      Model and data configuration arguments needed to perform pretraining.

    model_config (:obj:`PretrainedConfig`):
      Model transformers configuration.

    Returns:

    :obj:`torch.nn.Module`: PyTorch model.

    """

    # Make sure MODEL_FOR_MASKED_LM_MAPPING and MODEL_FOR_CAUSAL_LM_MAPPING are 
    # imported from transformers module.
    if ('MODEL_FOR_MASKED_LM_MAPPING' not in globals()) and \
        ('MODEL_FOR_CAUSAL_LM_MAPPING' not in globals()):
        raise ValueError('Could not find `MODEL_FOR_MASKED_LM_MAPPING`')

    # Check if using pre-trained model or train from scratch.
    if args.model_name_or_path:
    # Use pre-trained model.
        if type(model_config) in MODEL_FOR_MASKED_LM_MAPPING.keys():
            return AutoModelForMaskedLM.from_pretrained(
                            args.model_name_or_path,
                            from_tf=bool(".ckpt" in args.model_name_or_path),
                            config=model_config,
                            cache_dir=args.model_cache_dir,
                            )
        elif type(model_config) in MODEL_FOR_CAUSAL_LM_MAPPING.keys():
            # Causal language modeling head.
            return AutoModelForCausalLM.from_pretrained(
                                              args.model_name_or_path, 
                                              from_tf=bool(".ckpt" in 
                                                            args.model_name_or_path),
                                              config=model_config, 
                                              cache_dir=args.model_cache_dir)
        else:
            raise ValueError(
                'Invalid `model_name_or_path`! It should be in %s or %s!' % 
                (str(MODEL_FOR_MASKED_LM_MAPPING.keys()), 
                 str(MODEL_FOR_CAUSAL_LM_MAPPING.keys())))
    else:
        # Use model from configuration - train from scratch.
        print("Training new model from scratch!")
        return AutoModelWithLMHead.from_config(config)


def get_dataset(dataset: Dataset, text_column_name: str, tokenizer: PreTrainedTokenizer, 
                evaluate: bool=False):
    """
    Process dataset file into PyTorch Dataset.

    Using the ModelDataArguments return the actual model.

    Arguments:

    args (:obj:`ModelDataArguments`):
    Model and data configuration arguments needed to perform pretraining.

    tokenizer (:obj:`PreTrainedTokenizer`):
    Model transformers tokenizer.

    evaluate (:obj:`bool`, `optional`, defaults to :obj:`False`):
    If set to `True` the test / validation file is being handled.
    If set to `False` the train file is being handled.

    Returns:

    :obj:`Dataset`: PyTorch Dataset that contains file's data.

    """
    def tokenize_function(examples):
        return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

    #Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
    # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
    # efficient when it receives the `special_tokens_mask`.
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        num_proc=9
    )

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of
    # max_seq_length.
    #def group_texts(examples):
    #    # Concatenate all texts.
    #    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    #    total_length = len(concatenated_examples[list(examples.keys())[0]])
    #    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    #    # customize this part to your needs.
    #    total_length = (total_length // max_seq_length) * max_seq_length
    #    # Split by chunks of max_len.
    #    result = {
    #        k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
    #        for k, t in concatenated_examples.items()
    #    }
    #    return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
    # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
    # might be slower to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

    tokenized_dataset = tokenized_dataset.map(
        #group_texts,
        #batched=True,
        num_proc=9,
    )

    train_dataset = tokenized_dataset["train"]

    return train_dataset


def get_collator(args: ModelDataArguments, model_config: PretrainedConfig, 
                 tokenizer: PreTrainedTokenizer):
    """
    Get appropriate collator function.

    Collator function will be used to collate a PyTorch Dataset object.

    Arguments:

    args (:obj:`ModelDataArguments`):
      Model and data configuration arguments needed to perform pretraining.

    model_config (:obj:`PretrainedConfig`):
      Model transformers configuration.

    tokenizer (:obj:`PreTrainedTokenizer`):
      Model transformers tokenizer.

    Returns:

    :obj:`data_collator`: Transformers specific data collator.

    """

    # Configure data for rest of model types.
    if args.mlm and args.whole_word_mask:
      # Use whole word masking.
      return DataCollatorForWholeWordMask(
                                          tokenizer=tokenizer, 
                                          mlm_probability=args.mlm_probability,
                                          )
    else:
        return DataCollatorForLanguageModeling(
            tokenizer=tokenizer, 
            mlm=args.mlm, 
            mlm_probability=args.mlm_probability,
            pad_to_multiple_of=None,
        )

In [20]:
PATH_RAW_FILES = "/home/leonardovida/data/volume_1/data-histaware/merged_articles/1970s"
PATH_TOKENIZER_DIR = "/home/leonardovida/data/volume_1/data-histaware/tokenizer"
PATH_DATASET_DIR = "/home/leonardovida/data/volume_1/data-histaware/dataset"
PATH_MODEL_DIR = "/home/leonardovida/data/volume_1/data-histaware/model"

# See comments in `ModelDataArguments` class.
model_data_args = ModelDataArguments(
                                    train_data_file=f'{PATH_DATASET_DIR}/data.1970.txt', 
                                    #eval_data_file='/content/test.txt', 
                                    line_by_line=True, 
                                    mlm=True,
                                    whole_word_mask=True,
                                    mlm_probability=0.15,
                                    plm_probability=float(1/6), 
                                    max_span_length=5,
                                    block_size=50, 
                                    overwrite_cache=False, 
                                    model_type='bert', 
                                    model_config_name='bert-base-cased', 
                                    tokenizer_name='bert-base-cased', 
                                    model_name_or_path='bert-base-cased', 
                                    model_cache_dir='/home/leonardovida/data/volume_1/huggingface_cache/',
)

# Define arguments for training
# Note: I only used the arguments I care about. `TrainingArguments` contains
# a lot more arguments. For more details check the awesome documentation:
# https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
training_args = TrainingArguments(
                          # The output directory where the model predictions 
                          # and checkpoints will be written.
                          output_dir=PATH_MODEL_DIR,

                          # Overwrite the content of the output directory.
                          overwrite_output_dir=True,

                          # Whether to run training or not.
                          do_train=True, 
                          
                          # Whether to run evaluation on the dev or not.
                          do_eval=False,
                          
                          # Batch size GPU/TPU core/CPU training.
                          per_device_train_batch_size=4,
                          
                          # Batch size  GPU/TPU core/CPU for evaluation.
                          per_device_eval_batch_size=100,

                          # evaluation strategy to adopt during training
                          # `no`: No evaluation during training.
                          # `steps`: Evaluate every `eval_steps`.
                          # `epoch`: Evaluate every end of epoch.
                          evaluation_strategy='steps',

                          # How often to show logs. I will se this to 
                          # plot history loss and calculate perplexity.
                          logging_steps=700,

                          # Number of update steps between two 
                          # evaluations if evaluation_strategy="steps".
                          # Will default to the same value as l
                          # logging_steps if not set.
                          eval_steps = None,
                          
                          # Set prediction loss to `True` in order to 
                          # return loss for perplexity calculation.
                          prediction_loss_only=True,

                          # The initial learning rate for Adam. 
                          # Defaults to 5e-5.
                          learning_rate = 5e-5,

                          # The weight decay to apply (if not zero).
                          weight_decay=0,

                          # Epsilon for the Adam optimizer. 
                          # Defaults to 1e-8
                          adam_epsilon = 1e-8,

                          # Maximum gradient norm (for gradient 
                          # clipping). Defaults to 0.
                          max_grad_norm = 1.0,
                          # Total number of training epochs to perform 
                          # (if not an integer, will perform the 
                          # decimal part percents of
                          # the last epoch before stopping training).
                          num_train_epochs = 2,

                          # Number of updates steps before two checkpoint saves. 
                          # Defaults to 500
                          save_steps = -1,
                          )

In [21]:
# Load model configuration.
print('Loading model configuration...')
config = get_model_config(model_data_args)

# Load model tokenizer.
print('Loading model`s tokenizer...')
tokenizer = get_tokenizer(model_data_args)

# Loading model.
print('Loading actual model...')
model = get_model(model_data_args, config)

# Resize model to fit all tokens in tokenizer.
model.resize_token_embeddings(len(tokenizer))

Loading model configuration...
Loading model`s tokenizer...
Loading actual model...


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(28996, 768, padding_idx=0)

In [None]:
def tokenize_function(examples):
    return tokenizer(examples["p_clean"],
                     padding=True,
                     truncation=True,
                     max_length=200,
                     return_special_tokens_mask=True)

dataset = dataset.map(tokenize_function, num_proc=9)

     

HBox(children=(FloatProgress(value=0.0, description='#8', max=465134.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='#3', max=465134.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='#7', max=465134.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='#1', max=465134.0, style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='#0', max=465135.0, style=ProgressStyle(description_width=…








In [None]:
# Setup train dataset if `do_train` is set.
print('Creating train dataset...')
#train_dataset = get_dataset(
#    dataset = dataset,
#    text_column_name = "p_clean",
#    #ModelDataArguments = model_data_args,
#    tokenizer = tokenizer, 
#    evaluate = False) if training_args.do_train else None

# Setup evaluation dataset if `do_eval` is set.
#print('Creating evaluate dataset...')
#eval_dataset = get_dataset(model_data_args, tokenizer=tokenizer, evaluate=True) if training_args.do_eval else None

# Get data collator to modify data format depending on type of model used.
data_collator = get_collator(model_data_args, config, tokenizer)

# Check how many logging prints you'll have. This is to avoid overflowing the 
# notebook with a lot of prints. Display warning to user if the logging steps 
# that will be displayed is larger than 100.
if (len(train_dataset) // training_args.per_device_train_batch_size \
    // training_args.logging_steps * training_args.num_train_epochs) > 100:
  # Display warning.
  warnings.warn('Your `logging_steps` value will will do a lot of printing!' \
                ' Consider increasing `logging_steps` to avoid overflowing' \
                ' the notebook with a lot of prints!')

## Train

In [None]:
# Initialize Trainer.
print('Loading `trainer`...')
trainer = Trainer(model=model,
                  args=training_args,
                  data_collator=data_collator,
                  train_dataset=train_dataset,
                  #eval_dataset=eval_dataset,
                  )


# Check model path to save.
if training_args.do_train:
    print('Start training...')

    # Setup model path if the model to train loaded from a local path.
    model_path = (model_data_args.model_name_or_path 
                if model_data_args.model_name_or_path is not None and 
                os.path.isdir(model_data_args.model_name_or_path) 
                else None
                )
    # Run training.
    trainer.train(model_path=model_path)
    # Save model.
    trainer.save_model()

    # For convenience, we also re-save the tokenizer to the same directory,
    # so that you can share your model easily on huggingface.co/models =).
    if trainer.is_world_process_zero():
    tokenizer.save_pretrained(training_args.output_dir)

## Plot Training

In [None]:
# Keep track of train and evaluate loss.
loss_history = {'train_loss':[], 'eval_loss':[]}

# Keep track of train and evaluate perplexity.
# This is a metric useful to track for language models.
perplexity_history = {'train_perplexity':[], 'eval_perplexity':[]}

# Loop through each log history.
for log_history in trainer.state.log_history:

    if 'loss' in log_history.keys():
        # Deal with trianing loss.
        loss_history['train_loss'].append(log_history['loss'])
        perplexity_history['train_perplexity'].append(math.exp(log_history['loss']))

    elif 'eval_loss' in log_history.keys():
        # Deal with eval loss.
        loss_history['eval_loss'].append(log_history['eval_loss'])
        perplexity_history['eval_perplexity'].append(math.exp(log_history['eval_loss']))

# Plot Losses.
plot_dict(loss_history,
          start_step=training_args.logging_steps, 
          step_size=training_args.logging_steps,
          use_title='Loss', 
          use_xlabel='Train Steps',
          use_ylabel='Values',
          magnify=2)
print()

# Plot Perplexities.
plot_dict(perplexity_history, start_step=training_args.logging_steps, 
          step_size=training_args.logging_steps, use_title='Perplexity', 
          use_xlabel='Train Steps', use_ylabel='Values', magnify=2)

# check if `do_eval` flag is set.
if training_args.do_eval:
    # capture output if trainer evaluate.
    eval_output = trainer.evaluate()
    # compute perplexity from model loss.
    perplexity = math.exp(eval_output["eval_loss"])
    print('\nEvaluate Perplexity: {:10,.2f}'.format(perplexity))
else:
    print('No evaluation needed. No evaluation data provided, `do_eval=False`!')

# Semi Auto

In [11]:
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
configuration = RobertaConfig() # standard

model = RobertaForMaskedLM(config=config)

print(model.config)

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.4.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}



In [20]:
import gc

gc.collect()

from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=f"{PATH_DATASET_DIR}/data.1970.txt",
    block_size=4,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir=f"{PATH_MODEL_DIR}",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_gpu_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

MemoryError: 

In [None]:
%%time
trainer.train()
trainer.save_model(f"{PATH_MODEL_DIR}")

In [48]:
model_name_or_path = None
model_type = "wietsedv/bert-base-dutch-cased" # BertForMaskedLM) # (RobertaConfig, RobertaForMaskedLM),
tokenizer_name = f"{PATH_TOKENIZER_DIR}/1970.json"
cache_dir = '/home/leonardovida/data/volume_1/huggingface_cache/'
use_fast_tokenizer = True
model_revision = "main"
use_auth_token = False
dataset_name = None
dataset_config_name = None
train_file = dataset["train"]
mlm_probability = 0.15
#validation_file = dataset["validation"]

# Set seed before initializing model.
set_seed(2021)

# Load model config
config_kwargs = {
    "cache_dir": cache_dir,
    "use_auth_token": use_auth_token,
}

config = AutoConfig.from_pretrained(model_type, **config_kwargs)
#config = CONFIG_MAPPING[model_type]()

# Load Tokenizer config
tokenizer_kwargs = {
    "cache_dir": cache_dir,
    "use_fast": use_fast_tokenizer,
    "use_auth_token": True if use_auth_token else None,
}

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, **tokenizer_kwargs)

# Load model
model = AutoModelForMaskedLM.from_config(
    config,
    cache_dir=cache_dir)

model.resize_token_embeddings(len(tokenizer))


# Preprocessing the datasets.
# First we tokenize all the texts.

column_names = datasets["train"].column_names

text_column_name = "text" if "text" in column_names else column_names[0]

max_seq_length = tokenizer.model_max_length

def tokenize_function(examples):
    return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

#Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
# efficient when it receives the `special_tokens_mask`.
tokenized_datasets = datasets.map(
    tokenize_function,
    #batched=True,
    num_proc=9,
    remove_columns=column_names,
)

# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // max_seq_length) * max_seq_length
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, t in concatenated_examples.items()
    }
    return result

# Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
# remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
# might be slower to preprocess.
#
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

tokenized_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=9,
)

train_dataset = tokenized_datasets["train"]

# Data collator
# This one will take care of randomly masking the tokens.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=mlm_probability,
    pad_to_multiple_of=None,
)

# Initialize our Trainer for training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=None,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Training
if True:
    checkpoint = None
    train_result = trainer.train()
    trainer.save_model()  # Saves the tokenizer too for easy upload
    metrics = train_result.metrics

    max_train_samples = len(train_dataset)
    metrics["train_samples"] = min(max_train_samples, len(train_dataset))

    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()

ValueError: Unrecognized model in /home/leonardovida/data/volume_1/data-histaware/tokenizer/1970.json. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: speech_to_text, wav2vec2, m2m_100, convbert, led, blenderbot-small, retribert, ibert, mt5, t5, mobilebert, distilbert, albert, bert-generation, camembert, xlm-roberta, pegasus, marian, mbart, mpnet, bart, blenderbot, reformer, longformer, roberta, deberta-v2, deberta, flaubert, fsmt, squeezebert, bert, openai-gpt, gpt2, transfo-xl, xlnet, xlm-prophetnet, prophetnet, xlm, ctrl, electra, encoder-decoder, funnel, lxmert, dpr, layoutlm, rag, tapas