In [24]:
# ============================================================
# LOAD DATA
# ============================================================
import pandas as pd
import numpy as np
import re
import pickle
import os
from collections import Counter

print("="*70)
print("IMAGE CAPTIONING - TEXT PREPROCESSING")
print("="*70)

print("\nLoading Dataset")

CAPTION_FILE = r'D:\CODE\Image-Captioning-Project\data\captions.txt'
OUTPUT_DIR = r'D:\CODE\Image-Captioning-Project\features\text_preprocessing'

df = pd.read_csv(CAPTION_FILE, sep=',')
print(f"Total rows loaded: {len(df)}")
print(f"Columns: {df.columns.tolist()}")

print(f"\nFirst 5 rows:")
print(df.head())

IMAGE CAPTIONING - TEXT PREPROCESSING

Loading Dataset
Total rows loaded: 40455
Columns: ['image', 'caption']

First 5 rows:
                       image  \
0  1000268201_693b08cb0e.jpg   
1  1000268201_693b08cb0e.jpg   
2  1000268201_693b08cb0e.jpg   
3  1000268201_693b08cb0e.jpg   
4  1000268201_693b08cb0e.jpg   

                                             caption  
0  Seorang anak dengan gaun merah muda sedang men...  
1       Seorang gadis pergi ke sebuah bangunan kayu.  
2  Seorang gadis kecil memanjat ke sebuah rumah b...  
3  Seorang gadis kecil menaiki tangga ke rumah be...  
4  Seorang gadis kecil dengan gaun merah muda mas...  


In [25]:
# ============================================================
# TEXT PREPROCESSING FUNCTION
# ============================================================
print("\n" + "="*70)
print("Text Preprocessing")

def preprocess_caption(caption):
    """
    Preprocessing caption text:
    - Lowercase semua huruf
    - Hapus karakter khusus (angka, tanda baca)
    - Hapus extra spaces
    - Tambahkan start dan end tokens
    """
    # Convert to string
    caption = str(caption)

    # Convert to lowercase
    caption = caption.lower()

    # Remove special characters and numbers, keep only letters
    caption = re.sub(r'[^a-z\s]', '', caption)

    # Remove multiple spaces
    caption = re.sub(r'\s+', ' ', caption)

    # Strip whitespace
    caption = caption.strip()

    # Add start and end sequence tokens
    caption = 'startseq ' + caption + ' endseq'

    return caption

# Apply preprocessing
df['caption_clean'] = df['caption'].apply(preprocess_caption)

print("\nPreprocessing completed!")
print(f"Sample preprocessed captions:")
for i in range(min(3, len(df))):
    print(f"\nOriginal  : {df['caption'].iloc[i]}")
    print(f"Processed : {df['caption_clean'].iloc[i]}")


Text Preprocessing

Preprocessing completed!
Sample preprocessed captions:

Original  : Seorang anak dengan gaun merah muda sedang menaiki seperangkat tangga dengan jalan masuk.
Processed : startseq seorang anak dengan gaun merah muda sedang menaiki seperangkat tangga dengan jalan masuk endseq

Original  : Seorang gadis pergi ke sebuah bangunan kayu.
Processed : startseq seorang gadis pergi ke sebuah bangunan kayu endseq

Original  : Seorang gadis kecil memanjat ke sebuah rumah bermain kayu.
Processed : startseq seorang gadis kecil memanjat ke sebuah rumah bermain kayu endseq


In [26]:
# ============================================================
# BUILD VOCABULARY
# ============================================================
print("\n" + "="*70)
print("Building Vocabulary")

# Collect all words from all captions
all_captions = df['caption_clean'].tolist()
all_words = []

for caption in all_captions:
    all_words.extend(caption.split())

# Count word frequencies
word_counts = Counter(all_words)

print(f"\nTotal words (with repetition): {len(all_words)}")
print(f"Unique words: {len(word_counts)}")

# Filter words by minimum frequency
MIN_WORD_FREQ = 5

vocab = [word for word, count in word_counts.items() if count >= MIN_WORD_FREQ]
vocab = sorted(vocab)

print(f"\nVocabulary size after filtering (freq >= {MIN_WORD_FREQ}): {len(vocab)}")

# Create word to index mapping
word_to_idx = {}
word_to_idx['<PAD>'] = 0  # Padding token
idx = 1

for word in vocab:
    word_to_idx[word] = idx
    idx += 1

word_to_idx['<UNK>'] = idx  # Unknown token

# Create index to word mapping
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

vocab_size = len(word_to_idx)
print(f"Final vocabulary size: {vocab_size}")
print(f"\nTop 20 most common words:")
for word, count in word_counts.most_common(20):
    print(f"  {word}: {count} times")


Building Vocabulary

Total words (with repetition): 450153
Unique words: 7238

Vocabulary size after filtering (freq >= 5): 2564
Final vocabulary size: 2566

Top 20 most common words:
  startseq: 40455 times
  endseq: 40455 times
  di: 27859 times
  seorang: 16308 times
  dengan: 10992 times
  anjing: 10221 times
  dan: 8721 times
  pria: 7388 times
  anak: 6887 times
  dua: 5573 times
  seekor: 5233 times
  wanita: 4410 times
  hitam: 4124 times
  orang: 4025 times
  gadis: 3979 times
  putih: 3901 times
  yang: 3562 times
  bermain: 3450 times
  merah: 3378 times
  berjalan: 3349 times


In [27]:
# ============================================================
# CONVERT CAPTIONS TO SEQUENCES
# ============================================================
print("\n" + "="*70)
print("Converting Captions to Sequences")

def caption_to_sequence(caption, word_to_idx):
    """Convert caption text to sequence of word indices"""
    words = caption.split()
    sequence = []

    for word in words:
        if word in word_to_idx:
            sequence.append(word_to_idx[word])
        else:
            sequence.append(word_to_idx['<UNK>'])

    return sequence

# Apply conversion
df['caption_seq'] = df['caption_clean'].apply(
    lambda x: caption_to_sequence(x, word_to_idx)
)

# Calculate max caption length
max_length = max(len(seq) for seq in df['caption_seq'])

print(f"\nMax caption length: {max_length} tokens")
print(f"\nSample sequences:")
for i in range(min(3, len(df))):
    print(f"\nCaption: {df['caption_clean'].iloc[i]}")
    print(f"Sequence: {df['caption_seq'].iloc[i]}")


Converting Captions to Sequences

Max caption length: 32 tokens

Sample sequences:

Caption: startseq seorang anak dengan gaun merah muda sedang menaiki seperangkat tangga dengan jalan masuk endseq
Sequence: [2274, 2159, 35, 545, 712, 1587, 1640, 2100, 1388, 2565, 2326, 545, 873, 1270, 670]

Caption: startseq seorang gadis pergi ke sebuah bangunan kayu endseq
Sequence: [2274, 2159, 699, 1851, 987, 2097, 134, 986, 670]

Caption: startseq seorang gadis kecil memanjat ke sebuah rumah bermain kayu endseq
Sequence: [2274, 2159, 699, 994, 1327, 987, 2097, 2041, 304, 986, 670]


In [28]:
# ============================================================
# CREATE IMAGE-TO-CAPTIONS MAPPING
# ============================================================
print("\n" + "="*70)
print("STEP 5: Creating Image-to-Captions Mapping")

# Group captions by image
image_to_captions = {}

for idx, row in df.iterrows():
    image_name = row['image']
    caption_clean = row['caption_clean']

    if image_name not in image_to_captions:
        image_to_captions[image_name] = []

    image_to_captions[image_name].append(caption_clean)

print(f"\nTotal unique images: {len(image_to_captions)}")
print(f"Average captions per image: {len(df) / len(image_to_captions):.2f}")

# Sample mapping
sample_image = list(image_to_captions.keys())[0]
print(f"\nSample - Image: {sample_image}")
for i, cap in enumerate(image_to_captions[sample_image][:3], 1):
    print(f"  Caption {i}: {cap}")



STEP 5: Creating Image-to-Captions Mapping

Total unique images: 8091
Average captions per image: 5.00

Sample - Image: 1000268201_693b08cb0e.jpg
  Caption 1: startseq seorang anak dengan gaun merah muda sedang menaiki seperangkat tangga dengan jalan masuk endseq
  Caption 2: startseq seorang gadis pergi ke sebuah bangunan kayu endseq
  Caption 3: startseq seorang gadis kecil memanjat ke sebuah rumah bermain kayu endseq


In [29]:
# ============================================================
# SAVE PREPROCESSING ARTIFACTS
# ============================================================
print("\n" + "="*70)
print("Saving Preprocessing Artifacts")
print(f"Save location: {OUTPUT_DIR}")

# Save preprocessed dataframe
output_file = os.path.join(OUTPUT_DIR, 'captions_preprocessed.csv')
df.to_csv(output_file, index=False)
print(f"Saved: {output_file}")

# Save vocabulary mappings
with open(os.path.join(OUTPUT_DIR, 'word_to_idx.pkl'), 'wb') as f:
    pickle.dump(word_to_idx, f)
print(f"Saved: {os.path.join(OUTPUT_DIR, 'word_to_idx.pkl')}")

with open(os.path.join(OUTPUT_DIR, 'idx_to_word.pkl'), 'wb') as f:
    pickle.dump(idx_to_word, f)
print(f"Saved: {os.path.join(OUTPUT_DIR, 'idx_to_word.pkl')}")

# Save image-to-captions mapping
with open(os.path.join(OUTPUT_DIR, 'image_to_captions.pkl'), 'wb') as f:
    pickle.dump(image_to_captions, f)
print(f"Saved: {os.path.join(OUTPUT_DIR, 'image_to_captions.pkl')}")

# Save preprocessing config
preprocessing_config = {
    'vocab_size': vocab_size,
    'max_length': max_length,
    'min_word_freq': MIN_WORD_FREQ,
    'total_images': len(image_to_captions),
    'total_captions': len(df)
}

with open(os.path.join(OUTPUT_DIR, 'preprocessing_config.pkl'), 'wb') as f:
    pickle.dump(preprocessing_config, f)
print(f"Saved: {os.path.join(OUTPUT_DIR, 'preprocessing_config.pkl')}")


Saving Preprocessing Artifacts
Save location: D:\CODE\Image-Captioning-Project\features\text_preprocessing
Saved: D:\CODE\Image-Captioning-Project\features\text_preprocessing\captions_preprocessed.csv
Saved: D:\CODE\Image-Captioning-Project\features\text_preprocessing\word_to_idx.pkl
Saved: D:\CODE\Image-Captioning-Project\features\text_preprocessing\idx_to_word.pkl
Saved: D:\CODE\Image-Captioning-Project\features\text_preprocessing\image_to_captions.pkl
Saved: D:\CODE\Image-Captioning-Project\features\text_preprocessing\preprocessing_config.pkl


In [30]:
# ============================================================
# SUMMARY
# ============================================================
print("\n" + "="*70)
print("PREPROCESSING SUMMARY")
print("="*70)
print(f"Total images: {preprocessing_config['total_images']}")
print(f"Total captions: {preprocessing_config['total_captions']}")
print(f"Vocabulary size: {preprocessing_config['vocab_size']}")
print(f"Max caption length: {preprocessing_config['max_length']} tokens")
print(f"Min word frequency: {preprocessing_config['min_word_freq']}")
print(f"Special tokens: <PAD> (0), <UNK> ({word_to_idx['<UNK>']})") 
print(f"Sequence tokens: startseq, endseq")

print("\n" + "="*70)
print("PREPROCESSING COMPLETED SUCCESSFULLY!")
print("="*70)


PREPROCESSING SUMMARY
Total images: 8091
Total captions: 40455
Vocabulary size: 2566
Max caption length: 32 tokens
Min word frequency: 5
Special tokens: <PAD> (0), <UNK> (2565)
Sequence tokens: startseq, endseq

PREPROCESSING COMPLETED SUCCESSFULLY!
