In [1]:
import sentencepiece as spm

In [3]:
# Sweep several BPE vocabulary sizes on the Dyula corpus and print how each
# resulting model segments one fixed sample sentence.
vocab_sizes = [8000, 16000, 32000]

# Trainer options that stay the same for every size in the sweep.
dyu_trainer_options = dict(
    input='/root/zindi/data/all_dyu.txt',
    character_coverage=1.0,
    model_type='bpe',
    input_sentence_size=1000000,
    shuffle_input_sentence=True,
)

sample_text = "an ni sɔgɔma hɛrɛ sira somɔgɔw do?"

for size in vocab_sizes:
    print("Training: ", size)
    # Writes dyula_<size>.model and dyula_<size>.vocab to the working directory.
    spm.SentencePieceTrainer.train(
        model_prefix=f'dyula_{size}',
        vocab_size=size,
        **dyu_trainer_options,
    )

    # Reload the model that was just written and segment the sample sentence.
    sp = spm.SentencePieceProcessor()
    sp.load(f'dyula_{size}.model')

    print(f"Vocab size: {size}")
    print(f"Tokenized: {sp.encode(sample_text, out_type=str)}")
    print()

Training:  8000
Vocab size: 8000
Tokenized: ['▁an', '▁ni', '▁sɔgɔma', '▁hɛrɛ', '▁sira', '▁so', 'mɔgɔw', '▁do', '?']

Training:  16000
Vocab size: 16000
Tokenized: ['▁an', '▁ni', '▁sɔgɔma', '▁hɛrɛ', '▁sira', '▁so', 'mɔgɔw', '▁do', '?']

Training:  32000
Vocab size: 32000
Tokenized: ['▁an', '▁ni', '▁sɔgɔma', '▁hɛrɛ', '▁sira', '▁somɔgɔw', '▁do', '?']



In [4]:
# Same vocabulary-size sweep as for Dyula, this time on the French corpus:
# train one BPE model per size and print its segmentation of a sample sentence.
vocab_sizes = [8000, 16000, 32000]
sample_text = "Votre opérateur téléphonique local devrait pouvoir vous en dire plus au sujet de la connexion à ce service."

for size in vocab_sizes:
    # Only the prefix and the vocabulary size change between iterations.
    fr_trainer_options = {
        'input': '/root/zindi/data/all_fr.txt',
        'model_prefix': f'fr_{size}',
        'vocab_size': size,
        'character_coverage': 1.0,
        'model_type': 'bpe',
        'input_sentence_size': 1000000,
        'shuffle_input_sentence': True,
    }
    spm.SentencePieceTrainer.train(**fr_trainer_options)

    # Reload the freshly trained model to inspect its output.
    sp = spm.SentencePieceProcessor()
    sp.load(f'fr_{size}.model')

    print(f"Vocab size: {size}")
    print(f"Tokenized: {sp.encode(sample_text, out_type=str)}")
    print()

Vocab size: 8000
Tokenized: ['▁Votre', '▁opér', 'ateur', '▁téléph', 'onique', '▁local', '▁devrait', '▁pouvoir', '▁vous', '▁en', '▁dire', '▁plus', '▁au', '▁sujet', '▁de', '▁la', '▁connexion', '▁à', '▁ce', '▁service', '.']

Vocab size: 16000
Tokenized: ['▁Votre', '▁opér', 'ateur', '▁téléphonique', '▁local', '▁devrait', '▁pouvoir', '▁vous', '▁en', '▁dire', '▁plus', '▁au', '▁sujet', '▁de', '▁la', '▁connexion', '▁à', '▁ce', '▁service', '.']

Vocab size: 32000
Tokenized: ['▁Votre', '▁opérateur', '▁téléphonique', '▁local', '▁devrait', '▁pouvoir', '▁vous', '▁en', '▁dire', '▁plus', '▁au', '▁sujet', '▁de', '▁la', '▁connexion', '▁à', '▁ce', '▁service', '.']



In [6]:

# Train the SentencePiece BPE models used downstream, one per language,
# both with vocab_size=16000. Dyula is trained first, then French; each call
# writes <prefix>.model and <prefix>.vocab to the working directory.
for model_prefix, corpus_path in (
    ('dyu', '/root/zindi/data/all_dyu.txt'),
    ('fr', '/root/zindi/data/all_fr.txt'),
):
    spm.SentencePieceTrainer.train(
        input=corpus_path,
        model_prefix=model_prefix,
        vocab_size=16000,
        character_coverage=1.0,
        model_type='bpe',
        input_sentence_size=1000000,
        shuffle_input_sentence=True,
    )

In [8]:
# Create combined vocabulary
def load_vocab(file_path, special_tokens=('<unk>', '<s>', '</s>', '<pad>')):
    """Read the pieces from a SentencePiece ``.vocab`` file.

    Each non-empty line is expected to be ``piece<TAB>score``; only the piece
    (the text before the first tab) is kept.

    Args:
        file_path: Path to the UTF-8 encoded ``.vocab`` file.
        special_tokens: Pieces to leave out of the result. Only exact matches
            are skipped, so ordinary pieces that merely begin with ``<``
            (for example ``<`` itself or ``<<``) are preserved.

    Returns:
        list[str]: The pieces in file order, without the special tokens.
    """
    skipped = set(special_tokens)
    vocab = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Strip only the line terminator: a plain strip() would also
            # remove pieces that consist of Unicode whitespace characters.
            row = line.rstrip('\r\n')
            if not row:
                continue  # blank line, nothing to read
            piece = row.split('\t')[0]
            if piece in skipped:
                continue
            vocab.append(piece)
    return vocab

dyula_vocab = load_vocab('/root/zindi/common/dyu.vocab')
french_vocab = load_vocab('/root/zindi/common/fr.vocab')

# Special tokens are written first so they occupy the first lines of the file.
special_tokens = ['<s>', '</s>', '<pad>', '<unk>']

# Combine vocabularies.
# dict.fromkeys() removes duplicates while keeping first-seen order (Dyula
# pieces, then French pieces not already present). The previous list(set(...))
# produced an order that depends on string hash randomization, so the file
# contents could differ from one kernel run to the next.
combined_pieces = dict.fromkeys(dyula_vocab + french_vocab)

# Guard against listing a special token twice if a loaded vocabulary still
# contains one of them.
combined_vocab = special_tokens + [
    piece for piece in combined_pieces if piece not in special_tokens
]

# Write combined vocabulary to file, one piece per line.
with open('combined_vocab.txt', 'w', encoding='utf-8') as f:
    for piece in combined_vocab:
        f.write(f"{piece}\n")

In [16]:
import json

from transformers import MarianTokenizer

# MarianTokenizer looks up piece -> id in the JSON file given as `vocab`.
# `src_vocab_file` / `tgt_vocab_file` are not parameters of its constructor,
# so loading "Helsinki-NLP/opus-mt-af-fr" with them swapped the SentencePiece
# models but kept that checkpoint's own vocab.json, leaving most Dyula/French
# pieces mapped to <unk>. Build the id table from combined_vocab.txt instead
# and construct the tokenizer directly from local files.
with open('combined_vocab.txt', 'r', encoding='utf-8') as f:
    vocab_lines = [line.rstrip('\n') for line in f]

# Drop blank lines and duplicates so ids are contiguous and unambiguous;
# the id of a piece is its position in the file.
unique_pieces = dict.fromkeys(piece for piece in vocab_lines if piece)
piece_to_id = {piece: index for index, piece in enumerate(unique_pieces)}

with open('dyu_fr_vocab.json', 'w', encoding='utf-8') as f:
    json.dump(piece_to_id, f, ensure_ascii=False)

# Create the tokenizer
tokenizer = MarianTokenizer(
    source_spm="dyu.model",  # Path to the Dyula SentencePiece model file
    target_spm="fr.model",  # Path to the French SentencePiece model file
    vocab="dyu_fr_vocab.json",  # JSON piece -> id table built above
    source_lang="dyu",
    target_lang="fr",
    unk_token="<unk>",
    eos_token="</s>",
    pad_token="<pad>",
)

# <s> is present in the combined vocabulary, so it resolves to a regular id.
tokenizer.bos_token = "<s>"

# Save the tokenizer
tokenizer.save_pretrained('./dyula_french_tokenizer')



('./dyula_french_tokenizer/tokenizer_config.json',
 './dyula_french_tokenizer/special_tokens_map.json',
 './dyula_french_tokenizer/vocab.json',
 './dyula_french_tokenizer/source.spm',
 './dyula_french_tokenizer/target.spm',
 './dyula_french_tokenizer/added_tokens.json')