## Model Training

In [None]:
# Week 5-6: Image Captioning - Model Training & Evaluation
# Notebook-friendly (no argparse). Set paths below.

import os
import pickle
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from glob import glob

import nltk
nltk.download('punkt')
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, Sequence
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

#  CONFIG - set your paths
# Input/output directories used by the rest of the notebook.
images_dir = "./images"
features_dir = "./features"
processed_dir = "./processed"
models_dir = "./models"
os.makedirs(models_dir, exist_ok=True)

# Quick-run options:
# SAMPLE > 0 restricts training to the first SAMPLE images; 0 keeps them all.
SAMPLE = 0
BATCH_SIZE = 16
EPOCHS = 5
STEPS_PER_EPOCH = None

# Load processed artifacts (fitted tokenizer + vocabulary/length metadata)
tokenizer_file = os.path.join(processed_dir, "tokenizer.pkl")
metadata_file = os.path.join(processed_dir, "metadata.json")
print("Loading tokenizer and metadata...")
with open(tokenizer_file, "rb") as f:
    tokenizer = pickle.load(f)
with open(metadata_file, "r") as f:
    meta = json.load(f)
vocab_size, max_length = meta['vocab_size'], meta['max_length']
print(f"Vocab size: {vocab_size}, Max length: {max_length}")

# Load cleaned captions into dict: image -> [captions]
cleaned_csv = os.path.join(processed_dir, "cleaned_captions.csv")
df_caps = pd.read_csv(cleaned_csv)
descriptions = {
    image_name: rows['caption'].tolist()
    for image_name, rows in df_caps.groupby('image')
}

# Prefer the pickled processed_images list; without it, fall back to every
# image that has at least one caption.
p_images_path = os.path.join(processed_dir, "processed_images.pkl")
if not os.path.exists(p_images_path):
    processed_images = list(descriptions)
else:
    with open(p_images_path, "rb") as f:
        processed_images = pickle.load(f)

if SAMPLE and SAMPLE > 0:
    processed_images = processed_images[:SAMPLE]

print(f"Number of images for training: {len(processed_images)}")

# Helper: Data generator
class DataGenerator(Sequence):
    """Generates batches of (img_features, partial_seq) -> next_word one-hot.

    A batch is formed from ``batch_size`` *images*; every caption of every
    image in the batch is expanded into one training sample per caption
    prefix, so the number of rows in a batch is usually much larger than
    ``batch_size`` and varies from batch to batch.

    Args:
        image_list: Image names; ``<features_dir>/<name>.npy`` holds the
            pre-extracted feature array of each image.
        descriptions: Mapping ``image name -> list of caption strings``.
        features_dir: Directory containing the ``.npy`` feature files.
        tokenizer: Object exposing ``texts_to_sequences``.
        max_length: Length every partial caption is padded to.
        vocab_size: Width of the one-hot target vectors.
        batch_size: Number of images per batch.
        shuffle: Whether to reshuffle the image order at construction time
            and after every epoch.
        **kwargs: Forwarded to the Keras ``Sequence`` base class.
    """

    def __init__(self, image_list, descriptions, features_dir, tokenizer, max_length,
                 vocab_size, batch_size=64, shuffle=True, **kwargs):
        # The Keras base class must be initialised; skipping this triggers the
        # "_warn_if_super_not_called" warning when the generator is passed to fit().
        super().__init__(**kwargs)
        self.image_list = image_list
        self.descriptions = descriptions
        self.features_dir = features_dir
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.indexes = np.arange(len(self.image_list))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (last batch may be partial)."""
        return int(np.ceil(len(self.image_list) / float(self.batch_size)))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``((X1, X2), y)``."""
        batch_idx = self.indexes[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_imgs = [self.image_list[i] for i in batch_idx]
        X1, X2, y = self.__data_generation(batch_imgs)
        return (X1, X2), y

    def on_epoch_end(self):
        """Reshuffle the image order in place when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, batch_imgs):
        """Expand the given images into ``(X1, X2, y)`` training arrays.

        Images whose feature file is missing are skipped. ``X1`` stacks the
        flattened image feature once per sample, ``X2`` holds the padded
        caption prefixes and ``y`` the one-hot encoded next word.
        """
        features, prefixes, targets = [], [], []
        for img_name in batch_imgs:
            feat_path = os.path.join(self.features_dir, img_name + '.npy')
            if not os.path.exists(feat_path):
                continue
            # ensure flattened 1D vector
            img_feature = np.load(feat_path).reshape(-1)
            caps = self.descriptions.get(img_name, [])
            # Tokenise all captions of the image in one call.
            for seq in self.tokenizer.texts_to_sequences(caps):
                for i in range(1, len(seq)):
                    features.append(img_feature)
                    prefixes.append(seq[:i])
                    targets.append(seq[i])

        if not features:
            # Nothing usable in this batch: return empty arrays whose trailing
            # dimensions still match the model inputs (2048 is the width of
            # the image_input layer built by define_model).
            return (np.zeros((0, 2048)),
                    np.zeros((0, self.max_length)),
                    np.zeros((0, self.vocab_size)))

        X1 = np.vstack(features)
        # Pad every prefix in a single call instead of once per sample.
        X2 = pad_sequences(prefixes, maxlen=self.max_length)

        # one-hot output; ids outside the vocabulary leave their row all-zero
        y = np.zeros((len(targets), self.vocab_size), dtype='uint8')
        target_ids = np.asarray(targets)
        in_vocab = target_ids < self.vocab_size
        y[np.nonzero(in_vocab)[0], target_ids[in_vocab]] = 1
        return X1, X2, y

# Build Model
def define_model(vocab_size, max_length):
    """Build and compile the two-input captioning network.

    One input takes a 2048-wide image feature vector, the other a padded
    word-id sequence of length ``max_length``. Both are projected to 256
    units, summed, and decoded into a softmax over ``vocab_size`` words.

    Returns:
        The compiled Keras ``Model`` (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    # Image branch: regularise the feature vector, then project it to 256 units.
    image_in = Input(shape=(2048,), name='image_input')
    image_dropped = Dropout(0.5)(image_in)
    image_vec = Dense(256, activation='relu')(image_dropped)

    # Text branch: embed word ids (0 is masked as padding) and summarise with an LSTM.
    text_in = Input(shape=(max_length,), name='seq_input')
    text_embedded = Embedding(vocab_size, 256, mask_zero=True)(text_in)
    text_dropped = Dropout(0.5)(text_embedded)
    text_vec = LSTM(256)(text_dropped)

    # Merge both branches and predict the next word.
    merged = add([image_vec, text_vec])
    hidden = Dense(256, activation='relu')(merged)
    next_word_probs = Dense(vocab_size, activation='softmax')(hidden)

    captioner = Model(inputs=[image_in, text_in], outputs=next_word_probs)
    captioner.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return captioner

print("Defining model...")
model = define_model(vocab_size, max_length)
model.summary()

#  Prepare Training and Use generator
train_gen = DataGenerator(processed_images, descriptions, features_dir, tokenizer, max_length, vocab_size, batch_size=BATCH_SIZE, shuffle=True)

# Calculate approximate steps per epoch. The generator __len__ provides batches per epoch
steps = len(train_gen)
print(f"Steps per epoch (generator length): {steps}")

# Callbacks
# checkpoint_path is the single source of truth for the best-model file, so
# the path checked later is the same one the callback writes.
checkpoint_path = os.path.join(models_dir, 'caption_model_best.keras')
# fit() below receives no validation data, so only training metrics are
# logged. Monitor 'loss' explicitly; with the default 'val_loss' monitor and
# save_best_only=True the checkpoint would never be written.
checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor='loss',
    save_best_only=True
)
early = EarlyStopping(monitor='loss', patience=5, verbose=1)

# Train
print("Starting training...")
history = model.fit(train_gen, epochs=EPOCHS, callbacks=[checkpoint, early])

# Save final model
final_model_path = os.path.join(models_dir, 'caption_model_final.keras')
model.save(final_model_path)
print(f"Model saved to {final_model_path}")

# Helper: generate caption given feature
def generate_caption(model, tokenizer, photo_feature, max_length):
    """Greedily decode a caption for one image feature vector.

    Starting from ``'<start>'``, the model is asked for the most probable
    next word until an end marker (``'end'`` or ``'<end>'``) is produced, an
    id without a vocabulary entry is predicted, or ``max_length`` words have
    been generated.

    Args:
        model: Model whose ``predict`` takes ``[image_features, padded_sequence]``.
        tokenizer: Object exposing ``texts_to_sequences`` and ``word_index``.
        photo_feature: NumPy array with the image features; it is reshaped
            to a single-row batch.
        max_length: Padding length of the input sequence and upper bound on
            the number of generated words.

    Returns:
        The generated text, including the leading ``'<start>'`` and, when one
        was predicted, the trailing end marker.
    """
    # Build the id -> word table once instead of scanning word_index for every
    # generated token. setdefault keeps the first word seen for an id, which
    # is what a first-match linear scan would return.
    index_to_word = {}
    for w, idx in tokenizer.word_index.items():
        index_to_word.setdefault(idx, w)

    # The image input is identical at every step, so shape it once.
    photo_batch = photo_feature.reshape(1, -1)

    in_text = '<start>'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([photo_batch, sequence], verbose=0)

        # map index to word; ids without an entry (e.g. padding id 0) stop decoding
        word = index_to_word.get(int(np.argmax(yhat)))
        if word is None:
            break
        in_text += ' ' + word
        if word in ('end', '<end>'):
            break
    return in_text

# Evaluate with BLEU (on small subset)
def evaluate_model_bleu(model, tokenizer, descriptions, features_dir, test_images, max_length, num_samples=200):
    """Return the mean sentence-level BLEU of generated captions.

    At most ``num_samples`` images from the front of ``test_images`` are
    scored. An image is skipped when its ``.npy`` feature file is missing or
    when it has no reference captions. ``<start>``/``<end>`` markers are
    removed from both prediction and references before scoring.

    Returns:
        The average BLEU score, or ``0.0`` when no image could be scored.
    """
    markers = ('<start>', '<end>')
    smoothing = SmoothingFunction().method4

    if len(test_images) <= num_samples:
        subset = test_images
    else:
        subset = test_images[:num_samples]

    bleu_values = []
    for img_name in subset:
        feature_file = os.path.join(features_dir, img_name + '.npy')
        if not os.path.exists(feature_file):
            continue
        photo_vector = np.load(feature_file).reshape(-1)
        predicted_text = generate_caption(model, tokenizer, photo_vector, max_length)

        # Strip the sequence markers from the hypothesis ...
        hypothesis = [tok for tok in predicted_text.split() if tok not in markers]
        # ... and from every reference caption of this image.
        reference_tokens = [
            [tok for tok in caption.split() if tok not in markers]
            for caption in descriptions.get(img_name, [])
        ]
        if not reference_tokens:
            continue

        # sentence_bleu with its default weights, smoothed with method4
        bleu_values.append(
            sentence_bleu(reference_tokens, hypothesis, smoothing_function=smoothing)
        )

    if not bleu_values:
        return 0.0
    return np.mean(bleu_values)

# Evaluate using the best checkpoint when one exists on disk, otherwise the
# final model saved after training. The path is resolved exactly once so the
# file named in the log line is the file that actually gets loaded.
# NOTE: a checkpoint left in models_dir by an earlier run is picked up too.
best_checkpoint_file = os.path.join(models_dir, 'caption_model_best.keras')
if os.path.exists(best_checkpoint_file):
    best_model_path = best_checkpoint_file
else:
    best_model_path = final_model_path
print("Loading best model for evaluation:", best_model_path)
best_model = load_model(best_model_path)

# BLEU over the first 100 images of processed_images (the list used for training).
bleu_score = evaluate_model_bleu(best_model, tokenizer, descriptions, features_dir, processed_images, max_length, num_samples=100)
print(f"BLEU (avg) on sample: {bleu_score:.4f}")

# Test on one image
test_img = processed_images[0]
feat = np.load(os.path.join(features_dir, test_img + '.npy')).reshape(-1)
caption = generate_caption(best_model, tokenizer, feat, max_length)
print("Test image:", test_img)
print("Generated caption:", caption)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bollejayanthsriteja\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!


Loading tokenizer and metadata...
Vocab size: 8832, Max length: 38
Number of images for training: 8092
Defining model...


Steps per epoch (generator length): 506
Starting training...


  self._warn_if_super_not_called()


Epoch 1/5
[1m506/506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.2097 - loss: 5.0786

  if self._should_save_model(epoch, batch, logs, filepath):


[1m506/506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1223s[0m 2s/step - accuracy: 0.2715 - loss: 4.3029
Epoch 2/5
[1m506/506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m900s[0m 2s/step - accuracy: 0.3553 - loss: 3.3226
Epoch 3/5
[1m506/506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m857s[0m 2s/step - accuracy: 0.3782 - loss: 3.0309
Epoch 4/5
[1m506/506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1816s[0m 4s/step - accuracy: 0.3938 - loss: 2.8484
Epoch 5/5
[1m506/506[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m860s[0m 2s/step - accuracy: 0.4051 - loss: 2.7092
Model saved to ./models\caption_model_final.keras
Loading best model for evaluation: ./models\caption_model_final.keras
BLEU (avg) on sample: 0.1401
Test image: 1000268201_693b08cb0e.jpg
Generated caption: <start> a child is sitting on a bench <end>
