In [14]:
from pathlib import Path
from collections import defaultdict

caption_file = Path("../data/raw/Flickr8k_text/Flickr8k.token.txt")
captions = defaultdict(list)

with open(caption_file, "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        img_id_full, caption = line.split('\t')
        img_id = img_id_full.split('#')[0]
        captions[img_id].append(caption)

print(f"üìä Nombre d'images avec l√©gendes : {len(captions)}")

üìä Nombre d'images avec l√©gendes : 8092


In [15]:
# üì¶ Dictionnaire image_id ‚Üí [liste de l√©gendes]
captions = defaultdict(list)

with open(caption_file, "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        img_id_full, caption = line.split('\t')
        img_id = img_id_full.split('#')[0]
        captions[img_id].append(caption)

print(f"üìä Nombre d'images avec l√©gendes : {len(captions)}")

üìä Nombre d'images avec l√©gendes : 8092


In [16]:
import re

def clean_caption(caption: str) -> str:
    caption = caption.lower()
    caption = re.sub(r"[^a-z\s]", "", caption)  # supprime ponctuation, chiffres, accents
    caption = re.sub(r"\s+", " ", caption).strip()  # normalise les espaces
    return caption

In [17]:
captions_cleaned = {}

for img_id, caps in captions.items():
    cleaned = [f"<start> {clean_caption(c)} <end>" for c in caps]
    captions_cleaned[img_id] = cleaned

# Affichage d‚Äôun exemple
example_id = next(iter(captions_cleaned))
print("üñºÔ∏è Image ID :", example_id)
for c in captions_cleaned[example_id]:
    print("-", c)

üñºÔ∏è Image ID : 1000268201_693b08cb0e.jpg
- <start> a child in a pink dress is climbing up a set of stairs in an entry way <end>
- <start> a girl going into a wooden building <end>
- <start> a little girl climbing into a wooden playhouse <end>
- <start> a little girl climbing the stairs to her playhouse <end>
- <start> a little girl in a pink dress going into a wooden cabin <end>


In [18]:
from collections import Counter

# Extraire tous les mots de toutes les l√©gendes
all_captions = []
for caps in captions_cleaned.values():
    all_captions.extend(caps)

# Tokenisation simple par split()
words = []
for cap in all_captions:
    words.extend(cap.split())

# Compter les mots
word_counts = Counter(words)

# Afficher les plus fr√©quents
print("üî¢ 20 mots les plus fr√©quents :")
print(word_counts.most_common(20))

üî¢ 20 mots les plus fr√©quents :
[('a', 62989), ('<start>', 40460), ('<end>', 40460), ('in', 18975), ('the', 18419), ('on', 10744), ('is', 9345), ('and', 8852), ('dog', 8136), ('with', 7765), ('man', 7266), ('of', 6713), ('two', 5639), ('white', 3940), ('black', 3832), ('boy', 3581), ('are', 3505), ('woman', 3403), ('girl', 3328), ('to', 3173)]


In [19]:
# Hyperparam√®tre : fr√©quence minimale
min_word_freq = 5

# Vocabulaire filtr√©
vocab = [word for word, count in word_counts.items() if count >= min_word_freq]

# Ajout de tokens sp√©ciaux
vocab = ['<pad>', '<unk>'] + sorted(vocab)

# Dictionnaires
word2idx = {word: idx for idx, word in enumerate(vocab)}
idx2word = {idx: word for word, idx in word2idx.items()}

print(f"üìö Taille finale du vocabulaire : {len(vocab)}")

üìö Taille finale du vocabulaire : 2988


In [20]:
import pickle

tokenizer_path = Path("../data/vocab/tokenizer.pkl")
tokenizer_path.parent.mkdir(parents=True, exist_ok=True)

with open(tokenizer_path, "wb") as f:
    pickle.dump({
        "word2idx": word2idx,
        "idx2word": idx2word,
        "vocab": vocab,
        "min_freq": min_word_freq
    }, f)

print("‚úÖ Tokenizer sauvegard√© dans :", tokenizer_path)

‚úÖ Tokenizer sauvegard√© dans : ../data/vocab/tokenizer.pkl


# Test du vocab 

## √âtape 1 ‚Äî Charger le tokenizer sauvegard√©

In [25]:
import pickle
from pathlib import Path

with open(Path("../data/vocab/tokenizer.pkl"), "rb") as f:
    tokenizer_data = pickle.load(f)

word2idx = tokenizer_data["word2idx"]
idx2word = tokenizer_data["idx2word"]
vocab = tokenizer_data["vocab"]

print("üî¢ Taille du vocab :", len(vocab))
print("üß† Extrait :", vocab[:10])

üî¢ Taille du vocab : 2988
üß† Extrait : ['<pad>', '<unk>', '<end>', '<start>', 'a', 'abandoned', 'about', 'above', 'accordion', 'acrobatic']


## √âtape 2 ‚Äî Tester l‚Äôencodage d‚Äôune l√©gende

In [22]:
def encode_caption(caption, word2idx):
    return [word2idx.get(word, word2idx["<unk>"]) for word in caption.split()]
    
sample_caption = "<start> a man is riding a bicycle <end>"
encoded = encode_caption(sample_caption, word2idx)

print("üßæ Caption :", sample_caption)
print("üî¢ Encod√©e :", encoded)

üßæ Caption : <start> a man is riding a bicycle <end>
üî¢ Encod√©e : [3, 4, 1517, 1309, 2096, 4, 222, 2]


## √âtape 3 ‚Äî Tester le d√©codage inverse

In [23]:
def decode_caption(indices, idx2word):
    return " ".join([idx2word.get(idx, "<unk>") for idx in indices])

print("üîÅ D√©cod√©e :", decode_caption(encoded, idx2word))

üîÅ D√©cod√©e : <start> a man is riding a bicycle <end>


## √âtape 4 ‚Äî V√©rifier des cas particuliers

In [24]:
print("üîç Token '<start>' :", word2idx["<start>"])
print("üîç Token '<pad>' :", word2idx["<pad>"])
print("üîç Mot rare inconnu :", word2idx.get("dragon", word2idx["<unk>"]))

üîç Token '<start>' : 3
üîç Token '<pad>' : 0
üîç Mot rare inconnu : 761
