In [7]:
# Step 1: load the text corpus and preview it.
with open("wolof.txt", mode="r", encoding="utf-8") as corpus_file:
    wolof_text = corpus_file.read()

# Print the first 500 characters, then the last 500 characters, each under
# its own heading.
for heading, excerpt in (
    (" First 500 Characters:\n", wolof_text[:500]),
    ("Last 500 characters:\n", wolof_text[-500:]),
):
    print(heading)
    print(excerpt)

 First 500 Characters:

Ndawu r√©ew boobu di Gaal de defa mel ni de√±oo naxarlu ‚Äònir√≥√≥ ak ay walakaana‚Äô
Njakkare amna si yenn Ndaw ci lu √±eel soppi seen tur boobu di MWPs ( Ndawu pencum r√©eww boobu di Gaal ).
Li jogloo jooju mbir bokk na ci pexe ngir soppi tuuru pencum ndawu r√©ew mi defaat ko Peccum r√©ew bu Gaal.
Ci seen mbootaayu politik, ndawu r√©ew boobu jakknanu ci ni mbir. mi mune dem ba soof
Kenn ci ndawu parti boobu tuddu Ligeey neena ay √±oo√±am jakk na√±u lool ndax mbir yi ‚Äòndof ak ay dof day lay niru.‚Äô
Ngir tarikat
Last 500 characters:

celot et un Chat sauvage. 
Gar yi 
Gnathostomata Super-classe Tetrapoda Classe Mammalia Sous-classe Theria Infra-classe Eutheria Ordre Carnivora Sous-ordre Caniformia Famille Canidae Genre Canis Esp√®ce Canis lupus Linnaeus, 1758 R√©partition g√©ographique Aire de r√©partition des sous-esp√®ces sauvages non f√©rales. Boy gi 
Fransois Hollande [1955] (ci Faraa√±se mooy Fran√ßois Hollande) Nekkoon na nitu guur gu Fraas, moo sos jur

### 🔹 Étape 2 : Entraîner un tokenizer adapté au wolof (avec SentencePiece)

In [8]:
pip install sentencepiece


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [9]:
import sentencepiece as spm

# Prefix of the SentencePiece model to create.
tokenizer_model_prefix = "wolof_tokenizer"

# Train the tokenizer on the `wolof.txt` file.
spm.SentencePieceTrainer.train(
    input="wolof.txt",
    model_prefix=tokenizer_model_prefix,
    model_type="bpe",  # 'unigram' is the alternative
    vocab_size=8000,  # can be adjusted to the size of the corpus
    character_coverage=1.0,  # keep every character (useful for Wolof)
    # Explicit ids for the special tokens.
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)


sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: wolof.txt
  input_format: 
  model_prefix: wolof_tokenizer
  model_type: BPE
  vocab_size: 8000
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 1
  bos_id: 2
  eos_id: 3
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ‚Åá 
  enable_differential_privacy: 0
  different

### 🔹 Étape 3 : Tokeniser tout le texte wolof et créer les datasets train / validation

In [14]:
import sentencepiece as spm

import pickle

# Load the trained tokenizer.
sp = spm.SentencePieceProcessor()
sp.load("wolof_tokenizer.model")

# Read the raw text.
with open("wolof.txt", "r", encoding="utf-8") as f:
    wolof_text = f.read()

# Encode the whole text into token ids.
token_ids = sp.encode(wolof_text, out_type=int)

# Split the token stream: the first 90% for training, the rest for validation.
train_ratio = 0.9
split_index = int(len(token_ids) * train_ratio)
train_ids, val_ids = token_ids[:split_index], token_ids[split_index:]

# Save both encoded splits as pickle files.
for pickle_path, split_ids in (
    ("train_tokens.pkl", train_ids),
    ("val_tokens.pkl", val_ids),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(split_ids, f)

print("‚úÖ Jeux de donn√©es encod√©s sauvegard√©s.")

# Size of the text in characters and in tokens.
total_chars, total_tokens = len(wolof_text), len(token_ids)

print(f"Nombre total de caract√®res : {total_chars}")
print(f"Nombre total de tokens     : {total_tokens}")

‚úÖ Jeux de donn√©es encod√©s sauvegard√©s.
Nombre total de caract√®res : 965763
Nombre total de tokens     : 236093


### 🔹 Étape 4 : Créer les DataLoaders pour l’entraînement et la validation

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader

import pickle

# Configuration
context_length = 256  # sequences of 256 tokens
batch_size = 4  # adjustable

# Load the encoded token splits from their pickle files.
with open("train_tokens.pkl", mode="rb") as train_file:
    train_ids = pickle.load(train_file)

with open("val_tokens.pkl", mode="rb") as val_file:
    val_ids = pickle.load(val_file)

# Custom dataset
class TokenDataset(Dataset):
    """Sliding-window dataset over a token sequence for next-token prediction.

    Sample ``i`` is the pair ``(x, y)``: ``x`` holds ``context_length``
    consecutive tokens starting at offset ``i * stride`` and ``y`` is the same
    window shifted one token to the right.

    Args:
        tokens: Sliceable sequence of integer token ids.
        context_length: Number of tokens in each input (and target) window.
        stride: Offset between the starts of consecutive windows. The default
            of 1 yields every possible (overlapping) window.

    Raises:
        ValueError: If ``stride`` is smaller than 1.
    """

    def __init__(self, tokens, context_length, stride=1):
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")
        self.tokens = tokens
        self.context_length = context_length
        self.stride = stride

    def __len__(self):
        """Return the number of complete windows (0 if the sequence is too short)."""
        # A window needs context_length + 1 tokens: the inputs plus the final
        # target. Clamp at 0 because len() rejects negative values.
        last_start = len(self.tokens) - self.context_length - 1
        if last_start < 0:
            return 0
        return last_start // self.stride + 1

    def __getitem__(self, idx):
        """Return the ``(x, y)`` LongTensor pair for window ``idx``.

        Negative indices count from the end. Raises ``IndexError`` when
        ``idx`` is out of range, which is also what terminates plain
        ``for x, y in dataset`` iteration.
        """
        n_windows = len(self)
        if idx < 0:
            idx = idx + n_windows
        if not 0 <= idx < n_windows:
            raise IndexError(
                f"index {idx} out of range for dataset of length {n_windows}"
            )
        start = idx * self.stride
        chunk = self.tokens[start:start + self.context_length + 1]
        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)
        return x, y

# Wrap each token split in a dataset of fixed-length windows.
train_dataset = TokenDataset(tokens=train_ids, context_length=context_length)
val_dataset = TokenDataset(tokens=val_ids, context_length=context_length)

# Training batches are shuffled; validation batches keep dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

print("‚úÖ DataLoaders cr√©√©s avec succ√®s.")


‚úÖ DataLoaders cr√©√©s avec succ√®s.


In [4]:
# ...after the DataLoaders have been created...

# --- Sanity check on the size of each split ---
# One sample needs context_length + 1 tokens (the inputs plus the shifted
# targets), so a split holding exactly context_length tokens still produces no
# sample at all; hence `<=` rather than `<`.
if len(train_ids) <= context_length:
    print("‚ö†Ô∏è Pas assez de tokens pour le train loader. "
          "Diminue 'context_length' ou augmente 'train_ratio'.")

if len(val_ids) <= context_length:
    print("‚ö†Ô∏è Pas assez de tokens pour le validation loader. "
          "Diminue 'context_length' ou diminue 'train_ratio'.")

# --- Count the tokens served by the DataLoaders ---
# Each total is the number of input-tensor elements summed over all batches.
# Consecutive windows overlap, so a token is counted once per window it
# appears in and the totals are far larger than the corpus itself.
train_tokens = 0
for input_batch, target_batch in train_loader:
    train_tokens += input_batch.numel()

val_tokens = 0
for input_batch, target_batch in val_loader:
    val_tokens += input_batch.numel()

print(f"Training tokens: {train_tokens}")
print(f"Validation tokens: {val_tokens}")
print(f"All tokens: {train_tokens + val_tokens}")

Training tokens: 54330112
Validation tokens: 5978624
All tokens: 60308736
