Training a transformer on the Wolof-French parallel corpus: Test 1
--------------------------------

Tutorials:

- [google_colab_tutorial](https://colab.research.google.com/github/google/trax/blob/master/trax/intro.ipynb)

- [Trax_documentation](https://trax-ml.readthedocs.io/en/latest/index.html)

In [1]:
# # some installations
# !pip install evaluate -q
# !pip install sacrebleu -q
# !pip install transformers -q
# !pip install tokenizers -q
# !pip install nlpaug -q
# !pip install -q wandb --upgrade
# !pip install -q sentencepiece
# !pip install -q -U trax

In [4]:
# define the wandb environment without notebook
# Both variables are set through the IPython `%env` magic.
# NOTE(review): WANDB_LOG_MODEL presumably turns on model/checkpoint logging — confirm against the wandb docs.
# SECURITY NOTE(review): the API key below is hard-coded in the notebook and is also echoed in the
# saved cell output. It should be revoked and supplied from outside the notebook (shell environment,
# secret store) instead of being committed.
%env WANDB_LOG_MODEL=true
%env WANDB_API_KEY=53c099408fab02d1e4fff7386e8dfc1e759689a1

env: WANDB_LOG_MODEL=true
env: WANDB_API_KEY=53c099408fab02d1e4fff7386e8dfc1e759689a1


In [2]:
import sys

# add the main directory path
# path = "/content/drive/MyDrive/Memoire/subject2/"

# sys.path.extend([path])

import re
import trax
import evaluate
import numpy as np
import pandas as pd
from typing import *
import trax.fastmath as fsnp
from trax.supervised import training
from wolof_translate.utils.sent_corrections import *
from wolof_translate.utils.split_with_valid import split_data
from transformers import T5TokenizerFast, PreTrainedTokenizerFast
from wolof_translate.utils.improvements.end_marks import add_end_mark
from wolof_translate.utils.sent_transformers import TransformerSequences

  from .autonotebook import tqdm as notebook_tqdm


The following steps are necessary:

1. Loading data with Trax (see [tensorflow_dataset](https://www.tensorflow.org/guide/data?hl=fr))
2. Initializing the model
3. Initializing and running the training for a number of steps: choosing the optimizer and the loss function
4. Evaluating the model

## Load the data

Let us add the sentencepiece tokenizer that we will use.

In [6]:
# location of the trained SentencePiece model used by the tokenizer
tk_path = 'wolof-translate/wolof_translate/tokenizers/t5_tokenizers/tokenizer_v5.model'

# build the fast T5 tokenizer on top of that vocabulary file
tokenizer = T5TokenizerFast(vocab_file=tk_path)

Let us create below a generator which will load the tokenized sentences.

In [7]:
def load_tokens(path: str, tokenizer: "PreTrainedTokenizerFast", input_column: str = 'french',
                target_column: str = 'wolof', max_length: int = 21,
                transformations: Optional[dict] = None):
  """Yield the tokenized sentence pairs of a csv file, one pair at a time.

  Every sentence is tokenized with truncation and padding to `max_length`, so all
  yielded token arrays have the same length.

  Args:
    path (str): Path of the dataset. A csv file (anything accepted by `pandas.read_csv`).
    tokenizer (PreTrainedTokenizerFast): Callable tokenizer; called with `truncation`,
      `max_length` and `padding` and expected to return a mapping with an 'input_ids' entry.
    input_column (str): The input column. Defaults to 'french'.
    target_column (str): The target column. Defaults to 'wolof'.
    max_length (int): The max length. Defaults to 21.
    transformations (dict, optional): Maps a column name to a callable applied to each
      sentence of that column before tokenization; the first element of the callable's
      result is used. A missing column or a falsy value means "no transformation".
      Defaults to None (no transformation at all).

  Yields:
    tuple: `(input_tokens, target_tokens, weights)` where the first two are numpy arrays
      of token ids and `weights` is an array of ones with the length of `target_tokens`.
  """

  # a fresh dict per call: a dict literal used as default would be shared between calls
  if transformations is None:

    transformations = {}

  # `.get` keeps custom column names usable when no transformation is registered for them
  input_transform = transformations.get(input_column)

  target_transform = transformations.get(target_column)

  # load the data set
  data_set = pd.read_csv(path)

  # walk through the two columns in lockstep
  for source_sentence, target_sentence in zip(data_set[input_column], data_set[target_column]):

    # transform the sentences (the transformation returns a sequence, keep its first element)
    if input_transform:

      source_sentence = input_transform(source_sentence)[0]

    if target_transform:

      target_sentence = target_transform(target_sentence)[0]

    # tokenize the sentences
    input_tokens = tokenizer(source_sentence, truncation = True, max_length = max_length,
                             padding = 'max_length')['input_ids']

    target_tokens = tokenizer(target_sentence, truncation = True, max_length = max_length,
                              padding = 'max_length')['input_ids']

    # return the tokens together with unit weights
    yield (np.array(input_tokens), np.array(target_tokens), np.ones(len(target_tokens)))




We can create a data pipeline for preprocessing the sentences.

In [8]:
# directory holding the corpus and its splits.
# `path` is not assigned anywhere in this notebook (its assignment is commented out in the
# import cell), so the directory is spelled out here; it is the same location the train and
# validation csv files are read from below.
data_dir = 'data/extractions/new_data/'

# split the data between train, test and validation sets
# NOTE(review): presumably split_data writes train_set.csv / valid_set.csv into `data_directory` — confirm.
split_data(random_state = 0, data_directory = data_dir, csv_file = 'ad_sentences.csv')

# initialize the datasets' paths
train_path = f'{data_dir}train_set.csv'

valid_path = f'{data_dir}valid_set.csv'

# initialize the batch size (default 16)
batch_size = 16

# initialize the generators: every sentence pair is tokenized up front and kept in memory
# NOTE(review): both streams are finite (one pass over the csv file); if training asks for more
# batches than they contain the iterator will be exhausted — confirm the data set is large
# enough or cycle over the list.
train_generator = iter(list(load_tokens(train_path, tokenizer)))

valid_generator = iter(list(load_tokens(valid_path, tokenizer)))

# the shuffler (queue of 100 examples)
shuffler = trax.data.Shuffle(100)

# the batch sampler
batch = trax.data.Batch(batch_size)

# initialize the training pipeline: shuffle, batch, then weight the loss using the pad token id
train_pipeline = trax.data.Serial(
    shuffler,
    batch,
    trax.data.AddLossWeights(tokenizer.pad_token_id)
)

# initialize the validation pipeline: same as above without shuffling
valid_pipeline = trax.data.Serial(
    batch,
    trax.data.AddLossWeights(tokenizer.pad_token_id)
)

## The model

We will use the transformer with the following parameters:

- vocab_size: the size of the tokenizer's vocabulary
- d model: default
- d ff: default
- n heads: default
- n encoders: default
- n decoders: default
- max len: default
- drop out: default
- mode: default

In [9]:
# build the Transformer: only the vocabulary size is given (taken from the tokenizer),
# every other hyper-parameter keeps its default value
model = trax.models.Transformer(input_vocab_size=len(tokenizer))

## Training task

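Adafactor will be used as the optimizer and cross entropy as the loss function. For the evaluation we plan to use the cross entropy loss, the BLEU score and the accuracy score (the BLEU layer is defined below but is not yet passed to the evaluation task, which currently reports the loss and the accuracy only).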

In [59]:
# creating the BLEU score layer

class BLEU(trax.layers.Layer):
  """Trax layer computing the sacreBLEU score between predicted and reference sentences.

  The layer takes two inputs: the model scores (the arg-max over the last axis gives the
  predicted token ids) and the label token ids. Both are decoded back to text with the
  tokenizer before the score is computed.
  """

  def __init__(self, tokenizer):
    """Store the tokenizer and load the sacreBLEU metric.

    Args:
      tokenizer: Tokenizer used to decode token ids; must provide `batch_decode`,
        `convert_ids_to_tokens`, `all_special_ids` and `pad_token_id`.
    """

    super().__init__(n_in = 2, name = 'Bleu')

    self._tokenizer = tokenizer

    self._special_tokens = self._tokenizer.convert_ids_to_tokens(self._tokenizer.all_special_ids)

    # compiled once; the tokens are escaped so that they are matched literally even when
    # they contain regex metacharacters (e.g. '[PAD]')
    self._special_tokens_pattern = re.compile(
        '|'.join(re.escape(token) for token in self._special_tokens)
    )

    self._name = 'Bleu'

    self._metric = evaluate.load('sacrebleu')

  def postprocess_text(self, preds, labels):
    """Strip the sentences and wrap each label in a list (one reference per prediction).

    Args:
      preds (list): Decoded predictions.
      labels (list): Decoded labels.

    Returns:
      tuple: The stripped predictions and the list of single-reference lists.
    """

    preds = [pred.strip() for pred in preds]

    labels = [[label.strip()] for label in labels]

    return preds, labels

  def forward(self, preds, labels = None):
    """Compute the BLEU score of a batch.

    Trax hands all the inputs of a layer to `forward` as a single argument, so the layer
    accepts either that packaged form (`forward((preds, labels))`) or the two arrays given
    separately (`forward(preds, labels)`).

    Args:
      preds: Model scores whose last axis ranges over the vocabulary, or a pair
        `(preds, labels)` when `labels` is not given.
      labels: Label token ids; entries equal to -100 are replaced by the pad token id
        before decoding. Defaults to None.

    Returns:
      np.ndarray: The sacreBLEU score rounded to 4 decimals.
    """

    # single-argument call: unpack the two inputs
    if labels is None:

      preds, labels = preds

    preds = np.argmax(preds, axis = -1)

    decoded_preds = self.decoder(preds)

    labels = np.where(labels != -100, labels, self._tokenizer.pad_token_id)

    decoded_labels = self.decoder(labels)

    decoded_preds, decoded_labels = self.postprocess_text(decoded_preds, decoded_labels)

    result = self._metric.compute(predictions=decoded_preds, references=decoded_labels)

    return np.array(np.round(result["score"], 4))

  def decoder(self, labels):
    """Decode token ids to sentences and remove any remaining special token.

    Args:
      labels: Array of token ids; a one-dimensional array is treated as a single sentence.

    Returns:
      list: The decoded sentences.
    """

    # make a batch of one sentence out of a flat array
    if labels.ndim < 2:

      labels = labels[None, :]

    sentences = self._tokenizer.batch_decode(labels, skip_special_tokens=True)

    return [self._special_tokens_pattern.sub('', sentence) for sentence in sentences]



  and should_run_async(code)


In [63]:
# loss and metric layers
loss_fn = trax.layers.CrossEntropyLossWithLogSoftmax()
accuracy = trax.layers.Accuracy()
bleu = BLEU(tokenizer)  # instantiated here, not part of the evaluation metrics below

# Adafactor optimizer, constructed with 1e-4 as its first argument
optimizer = trax.optimizers.Adafactor(1e-4)

# training task: batches from the training pipeline, cross entropy loss, n_steps_per_checkpoint of 50
training_task = training.TrainTask(
    labeled_data=train_pipeline(train_generator),
    loss_layer=loss_fn,
    optimizer=optimizer,
    n_steps_per_checkpoint=50,
)

# validation task: loss and accuracy measured on the batches of the validation pipeline
validation_task = training.EvalTask(
    labeled_data=valid_pipeline(valid_generator),
    metrics=[loss_fn, accuracy],
)

In [64]:
# metric((np.array([[0.3, 0.7], [0.1, 0.9], [0.5, 0.5]]), np.array([[3], [1], [9]]), np.array([[1], [1], [1]])))

## Training

In [67]:
# initialize the output directory
output_dir = f'{path}training/outputs'

!rm -rf {output_dir}

# initialize the training loop
training_loop = training.Loop(model,
                              training_task,
                              eval_tasks = [validation_task],
                              checkpoint_high_metric = 'Accuracy',
                              output_dir = output_dir)

Run the training task.

In [None]:
# run the loop for 2000 training steps
training_loop.run(n_steps=2000)

  with gzip.GzipFile(fileobj=f, compresslevel=compresslevel) as gzipf:
