In [1]:
pip install sentencepiece



In [4]:
import sentencepiece as spm
import json
from transformers import MarianMTModel, MarianTokenizer

# Creating a tokenizer and a vocabulary
def create_tokenizer(corpus_path, tokenizer_model_prefix, vocab_size=4200):
  """Train a SentencePiece model on a text corpus.

  Args:
    corpus_path: Path of the text file handed to the trainer as `input`.
    tokenizer_model_prefix: Handed to the trainer as `model_prefix`; the rest
      of this notebook loads the result as `<prefix>.model`.
    vocab_size: Number of pieces to learn. Defaults to 4200, the value that
      used to be hard-coded here; increase it if you gather more data.
  """
  spm.SentencePieceTrainer.train(
      input=corpus_path,
      model_prefix=tokenizer_model_prefix,
      vocab_size=vocab_size,
      # Pin the ids of the special pieces explicitly.
      pad_id=0,
      unk_id=1,
      bos_id=2,
      eos_id=3,
  )

def _load_piece_vocabulary(model_path):
  """Load one SentencePiece model file and return its {piece: id} mapping."""
  sp_processor = spm.SentencePieceProcessor()
  sp_processor.load(model_path)
  return {
      sp_processor.id_to_piece(piece_id): piece_id
      for piece_id in range(sp_processor.get_piece_size())
  }


def create_vocabulary(tok_prefix_source, tok_prefix_target):
  """Build the piece-to-id dictionaries of the source and target tokenizers.

  Each model is enumerated over its own piece count. The previous version
  walked both models with a single loop bounded by the source size, which
  indexed past the end of a smaller target model and silently dropped the
  tail of a larger one.

  Args:
    tok_prefix_source: Prefix of the source model; `<prefix>.model` is loaded.
    tok_prefix_target: Prefix of the target model; `<prefix>.model` is loaded.

  Returns:
    A `(vocab_source, vocab_target)` tuple of dicts mapping piece -> id.
  """
  vocab_source = _load_piece_vocabulary(tok_prefix_source + '.model')
  vocab_target = _load_piece_vocabulary(tok_prefix_target + '.model')
  return vocab_source, vocab_target

def combine_vocabularies(vocab_source, vocab_target, USE_PRETRAINED):
  """Merge two token vocabularies and write the result as JSON.

  Target tokens are numbered first, 0..len(vocab_target)-1, in the iteration
  order of `vocab_target`; the ids stored in the input dicts are ignored.
  Source tokens that are not already present are then appended with the next
  free ids, in the iteration order of `vocab_source`.

  Args:
    vocab_source: Dict whose keys are the source-side tokens.
    vocab_target: Dict whose keys are the target-side tokens.
    USE_PRETRAINED: If truthy the mapping is written to
      'vocab_pretrained.json', otherwise to 'vocab.json'.

  Returns:
    None. The merged mapping is only written to disk.
  """
  destination = 'vocab_pretrained.json' if USE_PRETRAINED else 'vocab.json'

  # Target tokens occupy the low, contiguous id range.
  merged = {token: position for position, token in enumerate(vocab_target)}

  # Every new token takes the next free id, which is the current size.
  for token in vocab_source:
    if token in merged:
      continue
    merged[token] = len(merged)

  with open(destination, 'w') as handle:
    json.dump(merged, handle, indent=4)

# Train one SentencePiece model per corpus: romani.txt yields the 'target'
# model and romanian.txt the 'source' model.
create_tokenizer(corpus_path='romani.txt', tokenizer_model_prefix='target')
create_tokenizer(corpus_path='romanian.txt', tokenizer_model_prefix='source')

# Read both trained models back and write the merged mapping to vocab.json.
vocab_source, vocab_target = create_vocabulary(
    tok_prefix_source='source',
    tok_prefix_target='target',
)
combine_vocabularies(
    vocab_source=vocab_source,
    vocab_target=vocab_target,
    USE_PRETRAINED=False,
)

In [5]:
from transformers import MarianTokenizer


# Wrap the two SentencePiece models and the merged vocabulary in a Marian tokenizer.
tokenizer_custom = MarianTokenizer(
    vocab="vocab.json",
    source_spm="source.model",
    target_spm="target.model",
)

# Persist the tokenizer; the returned tuple of written paths is the cell output.
tokenizer_custom.save_pretrained('romanian_romani_tokenizer')



('romanian_romani_tokenizer/tokenizer_config.json',
 'romanian_romani_tokenizer/special_tokens_map.json',
 'romanian_romani_tokenizer/vocab.json',
 'romanian_romani_tokenizer/source.spm',
 'romanian_romani_tokenizer/target.spm',
 'romanian_romani_tokenizer/added_tokens.json')

In [6]:
# Fetch the pretrained model and its tokenizer from the same checkpoint.
model_pretrain = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-roa-en')
tokenizer_pretrained = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-roa-en')

dict_vocab_custom = tokenizer_custom.get_vocab()
dict_vocab_pretrain = tokenizer_pretrained.get_vocab()

# The custom vocabulary is passed as the target so its tokens get the lowest
# ids; pretrained-only tokens are appended after them. The result is written
# to vocab_pretrained.json.
combine_vocabularies(
    vocab_source=dict_vocab_pretrain,
    vocab_target=dict_vocab_custom,
    USE_PRETRAINED=True,
)

tokenizer_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/800k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/779k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

In [7]:
# Same SentencePiece models as the custom tokenizer, but paired with the
# vocabulary that was merged with the pretrained one.
tokenizer_custom_pretrained = MarianTokenizer(
    vocab="vocab_pretrained.json",
    source_spm="source.model",
    target_spm="target.model",
)

# Persist the tokenizer; the returned tuple of written paths is the cell output.
tokenizer_custom_pretrained.save_pretrained('romanian_romani_tokenizer_from_pretrained')

('romanian_romani_tokenizer_from_pretrained/tokenizer_config.json',
 'romanian_romani_tokenizer_from_pretrained/special_tokens_map.json',
 'romanian_romani_tokenizer_from_pretrained/vocab.json',
 'romanian_romani_tokenizer_from_pretrained/source.spm',
 'romanian_romani_tokenizer_from_pretrained/target.spm',
 'romanian_romani_tokenizer_from_pretrained/added_tokens.json')