## Imports and setup

In [1]:
from IPython import display
!pip install evaluate
!pip install nltk
!pip install transformers
!pip install sacrebleu
!pip install sacremoses
!pip install datasets
!pip install wandb
!pip install sentencepiece
display.clear_output()

In [2]:
import nltk
nltk.download('punkt')
display.clear_output()

In [78]:
import datasets
import evaluate
import nltk
import numpy as np
import os
import pandas as pd
import random
import sentencepiece
import sacrebleu
import sacremoses
import torch.nn.functional as F
import tqdm
import transformers
import torch
import wandb

from torch.utils.data import Subset
from transformers import AdamW
from transformers import get_scheduler
from transformers import TrainingArguments, Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer, AutoModelWithLMHead
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import AutoConfig



In [4]:
torch.cuda.is_available()

True

## Configuration

Pretraining on the GPT2 Swahili model 'flax-community/gpt2-swahili'



In [173]:
# Parameters for mul-en models
config = {
    'metric_for_best_model': 'eval_loss',
    'metric_for_best_model_dir': 'min',
    'train_batch_size': 1,
    'gradient_accumulation_steps': 1,
    'max_input_length': 32,
    'max_target_length': 32,
    'eval_batch_size': 1,
    'eval_pretrained_model': True,
    'learning_rate': 1e-4,
    'num_train_epochs': 3,
    'label_smoothing_factor': 0.1,
    'use_cache': False
}

config['wandb_project'] = f'salt-monolingual'
config['wandb_entity'] = f'sunbird'
config['model_checkpoint'] = f'flax-community/gpt2-swahili'

# What training data to use

# Evaluate roughly every 10 minutes
eval_steps_interval = 1000


print(f'Evaluating every {eval_steps_interval} training steps.')



Evaluating every 1000 training steps.


In [175]:


training_args = TrainingArguments(
    weight_decay = 0.01,
    output_dir="./gpt2-luo", #The output directory
    save_total_limit = 3,
    fp16 = torch.cuda.is_available(),
    load_best_model_at_end=True,
    run_name = f'monolingual-acholi',
    eval_steps = eval_steps_interval,
    save_steps = eval_steps_interval,
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=config['num_train_epochs'], # number of training epochs
    per_device_train_batch_size=config["train_batch_size"], # batch size for training
    per_device_eval_batch_size=config["eval_batch_size"],  # batch size for evaluation
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    metric_for_best_model = config['metric_for_best_model'],
    report_to = 'none',
    logging_dir = f'monolingual-acholi',
    label_smoothing_factor = config['label_smoothing_factor'],
    #predict_with_generate = True,
    evaluation_strategy = 'steps',
    gradient_accumulation_steps = config['gradient_accumulation_steps'],
    learning_rate = config['learning_rate'],

)



PyTorch: setting up devices


In [176]:
config['training_dir'] = '/content/monolingual_acholi'
config['training_subset_ids'] = [
        'acholi-online.txt', 
        'misc.txt',
        'train_flores_luo.src', 
        #'train_mt560_luo.src', 
    ]
#'val_ach.src', 'test_ach.src' are not included in training to avoid leakage


config['valid_subset_ids'] = [
        'val_ach.src'
            ]

config['test_subset_ids'] = [
        'test_ach.src'
    ]

MT560 is much bigger than the other training sets, so oversample the rest (by 5x) to balance it out.

# Set up datasets

Download the raw text data.

In [60]:
if not os.path.exists('v7-dataset'):
    !wget https://sunbird-translate.s3.us-east-2.amazonaws.com/v7-dataset.zip
    !unzip v7-dataset.zip
    display.clear_output()

In [9]:
!git clone https://mekaneeky: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@github.com/SunbirdAI/parallel-text-EDA/

Cloning into 'parallel-text-EDA'...
remote: Enumerating objects: 207, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 207 (delta 6), reused 11 (delta 3), pack-reused 193[K
Receiving objects: 100% (207/207), 4.86 MiB | 8.15 MiB/s, done.
Resolving deltas: 100% (99/99), done.


In [10]:
!mkdir monolingual_acholi
!cp /content/parallel-text-EDA/back_translation/data/ach/* /content/monolingual_acholi/
!cp /content/v7-dataset/v7.0/supervised/mul-en/train_flores_luo.src /content/monolingual_acholi/
#!cp /content/v7-dataset/v7.0/supervised/mul-en/train_mt560_luo.src /content/monolingual_acholi/
!cp /content/v7-dataset/v7.0/supervised/mul-en/val_ach.src /content/monolingual_acholi/
!cp /content/v7-dataset/v7.0/supervised/mul-en/test_ach.src /content/monolingual_acholi/


In [79]:
def sentence_format(input):
    '''Ensure capital letter at the start and full stop at the end.'''
    input = input[0].capitalize() + input[1:]
    if input[-1] not in ['.', '!', '?']:
        input = input + '.'
    return input

def preprocess(examples):

    normalizer = sacremoses.MosesPunctNormalizer()

    inputs_ids= [sentence_format(normalizer.normalize(text))
              for text in examples["input_ids"]]
    
    examples = tokenizer(
        inputs_ids,padding="max_length", 
        max_length=config["max_input_length"], 
        truncation=True, 
        return_overflowing_tokens=True, 
        return_length=True,
        return_tensors="pt"
        )
    
    input_batch = []
    for length, input_ids in zip(examples["length"], examples["input_ids"]):
        if length == config["max_input_length"]:
            input_batch.append(input_ids)

    return {"input_ids": input_batch}
    
    #examples["labels"] = examples["input_ids"].detach().clone()

    return examples

def postprocess(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels


In [80]:
sum([sum(1 for line in open('/content/monolingual_acholi/acholi-online.txt')),
     sum(1 for line in open('/content/monolingual_acholi/misc.txt')),
     sum(1 for line in open('/content/monolingual_acholi/rupiny.txt')),
     sum(1 for line in open('/content/v7-dataset/v7.0/supervised/mul-en/train_flores_luo.src')),
     #sum(1 for line in open('/content/v7-dataset/v7.0/supervised/mul-en/train_mt560_luo.src'))
     
     ])


10642

In [177]:
random.seed(42)

def _files_to_list(path):
    lines_file = []
    with open(path) as file:
        lines_file = file.readlines()
        lines_file = [line.rstrip() for line in lines_file]
    
    return lines_file
    
def dataset_from_src_tgt_files(data_dir = config["training_dir"], validation_cutoff = 8500, test_cutoff = 9000):
    """
        validation_cutoff: use first n lines as validation
    """

    train_text= []
    valid_text = []
    test_text = []
    for file_name in config["training_subset_ids"]:
        path = os.path.join(data_dir, file_name)
        file_text  = _files_to_list(path )
        random.shuffle(file_text)
        train_text.extend( file_text[:validation_cutoff])
        valid_text.extend( file_text[validation_cutoff:test_cutoff])
        test_text.extend( file_text[test_cutoff:])

    for file_name in config["valid_subset_ids"]:
        path = os.path.join(data_dir, file_name)
        file_text  = _files_to_list(path )
        valid_text.extend( file_text )


    for file_name in config["test_subset_ids"]:
        path = os.path.join(data_dir, file_name)
        file_text  = _files_to_list(path )
        test_text.extend( file_text )


    return datasets.Dataset.from_dict({'input_ids': train_text}), \
           datasets.Dataset.from_dict({'input_ids': valid_text}), \
           datasets.Dataset.from_dict({'input_ids': test_text}), 

Pre-process the raw text datasets.

In [82]:
train_data_raw, valid_data_raw, test_data_raw =   dataset_from_src_tgt_files()

In [133]:
pretrained_config.use_cache = False

In [178]:

pretrained_config = AutoConfig.from_pretrained("flax-community/gpt2-swahili")
pretrained_config.task_specific_params["text-generation"]["max_length"] = config["max_input_length"]
pretrained_config.use_cache = config["use_cache"]
tokenizer = AutoTokenizer.from_pretrained("flax-community/gpt2-swahili")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelWithLMHead.from_pretrained("flax-community/gpt2-swahili", config=pretrained_config)
model.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False)
training_args.vocab_size = tokenizer.vocab_size
#data_collator = transformers.DataCollatorForSeq2Seq(tokenizer, model = model) 
#metric = datasets.load_metric('sacrebleu')
display.clear_output()

In [179]:
train_data  = train_data_raw.map(
    preprocess, batched=True)

validation_data  = valid_data_raw.map(
    preprocess, batched=True)

subset_validation_data = Subset(validation_data, [i for i in range(500)])

test_data = test_data_raw.map(
    preprocess, batched=True)


  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/5 [00:00<?, ?ba/s]

In [167]:
validation_data

Dataset({
    features: ['input_ids'],
    num_rows: 4356
})

In [168]:
train_data[0].keys()

dict_keys(['input_ids'])

Launch the training.

In [169]:
try:
    text = output
except:
    text = "Sina pesa "
encoded_input = tokenizer(text, return_tensors='pt')
tokens= model.generate(encoded_input["input_ids"])
output = tokenizer.decode(tokens.squeeze(), skip_special_tokens=True)
print(output)

Generate config GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "pad_token_id": 1,
  "transformers_version": "4.26.0",
  "use_cache": false
}

Input length of input_ids is 29, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


Sina pesa!""Unanidai nini?""Unanidai pesa nyingi sana, na mimi nina pesa nyingi sana, na nina pesa nyingi sana,


In [170]:
outp = model(encoded_input["input_ids"].cuda())

RuntimeError: ignored

In [None]:
print("The max model length is {} for this model, although the actual embedding size for GPT small is 768".format(tokenizer.model_max_length))
print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))

In [180]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=subset_validation_data,
    #compute_metrics= compute_metrics
    )

Using cuda_amp half precision backend


In [None]:
trainer.train()

***** Running training *****
  Num examples = 12951
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 38853
  Number of trainable parameters = 124446720
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
1000,6.2742,
2000,5.7567,


***** Running Evaluation *****
  Num examples = 500
  Batch size = 1
Saving model checkpoint to ./gpt2-luo/checkpoint-1000
Configuration saved in ./gpt2-luo/checkpoint-1000/config.json
Configuration saved in ./gpt2-luo/checkpoint-1000/generation_config.json
Model weights saved in ./gpt2-luo/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./gpt2-luo/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./gpt2-luo/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [gpt2-luo/checkpoint-100] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 500
  Batch size = 1
Saving model checkpoint to ./gpt2-luo/checkpoint-2000
Configuration saved in ./gpt2-luo/checkpoint-2000/config.json
Configuration saved in ./gpt2-luo/checkpoint-2000/generation_config.json
Model weights saved in ./gpt2-luo/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in ./gpt2-luo/checkpoint-2000/tokenizer_config.json
Special tokens file saved i

Garbage collection cell to free up VRAM, run only if you want to delete the model data from memory and start again

In [164]:

import gc
gc.collect()
torch.cuda.empty_cache()
model.cpu()
del model


In [141]:
1/0


ZeroDivisionError: ignored