# Image Captioning Notebook
This notebook explains the full workflow behind the image caption generator, including model loading, preprocessing, feature extraction, and caption generation.

In [None]:
import numpy as np
import pickle
from PIL import Image
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Load Tokenizer, Caption Model, and Feature Extractor

In [None]:
def _build_caption_model(vocab_size, max_length):
    """Assemble the caption decoder network; weights are loaded by the caller.

    The network has two inputs: a 2048-dimensional image feature vector and
    a sequence of ``max_length`` token ids. Each branch is projected to 256
    units, the branches are summed, and a softmax over ``vocab_size`` words
    is produced.
    """
    # Image-feature branch.
    input_img_features = Input(shape=(2048,), name="image_input")
    feat_model = Dropout(0.4)(input_img_features)
    feat_model = Dense(256, activation='relu')(feat_model)

    # Partial-caption branch; mask_zero=True makes id 0 act as padding.
    input_text = Input(shape=(max_length,), name="text_input")
    text_model = Embedding(vocab_size, 256, mask_zero=True)(input_text)
    text_model = Dropout(0.4)(text_model)
    text_model = LSTM(256)(text_model)

    # Merge both branches and predict the next word.
    decoder = add([feat_model, text_model])
    decoder = Dense(256, activation='relu')(decoder)
    output = Dense(vocab_size, activation='softmax')(decoder)

    return Model(inputs=[input_img_features, input_text], outputs=output)


def load_all(tokenizer_path='tokenizer.pkl',
             weights_path="image_captioning_model_weights.weights.h5",
             max_length=36):
    """Load the tokenizer, the caption model, and the image feature extractor.

    Args:
        tokenizer_path: Path of the pickled tokenizer. Defaults to
            ``'tokenizer.pkl'`` in the current working directory.
        weights_path: Path of the caption model's weight file. Defaults to
            ``'image_captioning_model_weights.weights.h5'``.
        max_length: Length of the token-id sequence fed to the caption
            model. It fixes the text input shape, so it presumably has to
            match the value used when the weights were trained.

    Returns:
        A tuple ``(model, feature_extractor, tokenizer, max_length)``.
    """
    # SECURITY: pickle.load can execute arbitrary code while unpickling.
    # Only point tokenizer_path at files from a trusted source.
    with open(tokenizer_path, 'rb') as f:
        tokenizer = pickle.load(f)
    # +1 because the Embedding layer needs room for id 0 (padding) in
    # addition to every id listed in word_index.
    vocab_size = len(tokenizer.word_index) + 1

    model = _build_caption_model(vocab_size, max_length)
    model.load_weights(weights_path)

    # InceptionV3 with ImageNet weights, cut at its second-to-last layer so
    # that it emits feature vectors instead of class predictions.
    base_model = InceptionV3(weights='imagenet')
    feature_extractor = Model(base_model.input, base_model.layers[-2].output)

    return model, feature_extractor, tokenizer, max_length

## Preprocess Image

In [None]:
def preprocess_pil_image(image_pil):
    """Turn a PIL image into a preprocessed single-image batch for InceptionV3.

    Args:
        image_pil: A ``PIL.Image.Image`` in any mode; it is not modified.

    Returns:
        The array produced by InceptionV3's ``preprocess_input`` for a batch
        holding the one 299x299 RGB image (presumably of shape
        ``(1, 299, 299, 3)`` under the default channels-last setting).
    """
    # Convert to RGB *before* resizing: Pillow always falls back to
    # nearest-neighbour resampling for mode "1" and "P" images, so resizing
    # first would downsample palette/bilevel inputs with visibly worse quality.
    image = image_pil if image_pil.mode == "RGB" else image_pil.convert("RGB")
    image = image.resize((299, 299))
    img_array = tf.keras.preprocessing.image.img_to_array(image)
    # Add the leading batch axis expected by the model.
    img_array = np.expand_dims(img_array, axis=0)
    return preprocess_input(img_array)

## Extract Features

In [None]:
def extract_features_from_pil(image_pil, feature_extractor):
    """Return the feature extractor's prediction for a single PIL image.

    The image is first run through ``preprocess_pil_image``; the resulting
    batch is passed to ``feature_extractor.predict`` with progress output
    disabled, and that prediction is returned unchanged.
    """
    batch = preprocess_pil_image(image_pil)
    return feature_extractor.predict(batch, verbose=0)

## Convert Token ID to Word

In [None]:
def word_for_id(integer, tokenizer):
    """Return the word whose token id is ``integer``, or ``None`` if unknown.

    Args:
        integer: Token id to look up (a Python or NumPy integer).
        tokenizer: Object exposing a ``word_index`` mapping of word -> id and,
            optionally, an ``index_word`` mapping of id -> word.

    Returns:
        The matching word, or ``None`` when no word has that id.
    """
    # Prefer the tokenizer's own reverse mapping: one dict lookup instead of
    # scanning the whole vocabulary for every generated token.
    index_word = getattr(tokenizer, "index_word", None)
    if index_word:
        return index_word.get(integer)

    # Fallback for tokenizer objects without a populated reverse mapping.
    return next(
        (word for word, index in tokenizer.word_index.items()
         if index == integer),
        None,
    )

## Generate Caption

In [None]:
def generate_caption(model, tokenizer, image_features, max_length):
    """Greedily decode a caption for the given image features.

    Starting from ``'<start>'``, the text so far is tokenized, padded to
    ``max_length`` and fed to ``model`` together with ``image_features``; the
    highest-scoring id is mapped back to a word and appended. Decoding stops
    after ``max_length`` steps, when the id has no word, or once ``'<end>'``
    has been appended.

    Returns:
        The generated text with the ``'<start>'`` and ``'<end>'`` markers
        removed and surrounding whitespace stripped.
    """
    caption = '<start>'
    for _ in range(max_length):
        token_ids = tokenizer.texts_to_sequences([caption])[0]
        padded = pad_sequences([token_ids], maxlen=max_length, padding='post')
        scores = model.predict([image_features, padded], verbose=0)
        # argmax over the flattened prediction picks the most likely id.
        next_word = word_for_id(np.argmax(scores), tokenizer)
        if next_word is None:
            break
        caption = ' '.join((caption, next_word))
        if next_word == '<end>':
            break

    without_start = caption.replace('<start>', '')
    without_markers = without_start.replace('<end>', '')
    return without_markers.strip()