In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import InceptionV3
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Input, Add, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from PIL import Image
import matplotlib.pyplot as plt
import io
import ipywidgets as widgets
from IPython.display import display, clear_output

# Image encoder: ImageNet-pretrained InceptionV3 cut off at its second-to-last
# layer, so predictions are feature vectors rather than class probabilities.
# Loading the weights can fail (e.g. the download), so the error is reported
# and then re-raised, because nothing below can work without the encoder.
try:
    base_model = InceptionV3(weights='imagenet')
    cnn_model = Model(
        inputs=base_model.input,
        outputs=base_model.layers[-2].output,
    )
except Exception as err:
    print(f"Error loading InceptionV3: {err}")
    raise

# Simple untrained caption model.
#
# Two branches are summed and decoded into a distribution over the next word:
#   * image branch: 2048-wide feature vector -> Dense(256) -> Dropout
#   * text branch:  5 token ids -> Embedding(500 -> 256) -> LSTM(256)
# Both branches end at width 256 so that Add() can combine them element-wise.
# The model is never trained here, so its predictions are arbitrary.

# Image branch
image_input = Input(shape=(2048,))
image_dense = Dense(256, activation='relu')(image_input)
image_dense = Dropout(0.5)(image_dense)

# Text branch; id 0 is reserved for padding (mask_zero=True)
sequence_input = Input(shape=(5,))  # Small fixed length
embedding = Embedding(500, 256, mask_zero=True)(sequence_input)
lstm = LSTM(256)(embedding)

# Merge and decode to a softmax over the 500-entry vocabulary
decoder = Add()([image_dense, lstm])
decoder = Dense(256, activation='relu')(decoder)
# Named `caption_output` rather than `output`: the name `output` is used for
# the notebook's widgets.Output() display area, and sharing it would silently
# rebind this tensor.
caption_output = Dense(500, activation='softmax')(decoder)

caption_model = Model(inputs=[image_input, sequence_input], outputs=caption_output)
caption_model.compile(loss='categorical_crossentropy', optimizer='adam')

# Dummy caption generator: stands in for a real tokenizer/vocabulary (no dataset needed)
def generate_caption(features):
    """Greedily decode a placeholder caption for one image.

    The caption model is queried up to five times. Each predicted class id is
    mapped through a dummy two-word vocabulary: ids below 499 become 'hi',
    id 499 becomes the end marker, which stops decoding.

    Args:
        features: Image feature array passed straight to
            ``caption_model.predict`` as the image input (presumably the
            output of ``cnn_model.predict`` for a single image).

    Returns:
        The generated words joined by single spaces, without the
        'startseq'/'endseq' markers. An empty string is returned when the
        first prediction is already the end marker.
    """
    end_token = 'endseq'
    words = []
    for _ in range(5):  # Fixed maximum caption length
        # Dummy encoding: one token id (1) per token so far; the +1 accounts
        # for the implicit 'startseq' token that begins every caption.
        sequence = pad_sequences([[1] * (len(words) + 1)], maxlen=5)
        yhat = caption_model.predict([features, sequence], verbose=0)
        word_id = int(np.argmax(yhat))
        word = 'hi' if word_id < 499 else end_token  # Dummy vocabulary
        if word == end_token:
            break
        words.append(word)
    # Joining the collected words (instead of stripping 'startseq ' from a
    # running string) ensures the start marker never leaks into the result,
    # even when no word was generated.
    return ' '.join(words)

# Notebook UI: a single-file picker limited to .jpg/.png, a button that
# triggers captioning, and an output area that the click handler writes into.
upload = widgets.FileUpload(
    accept='.jpg,.png',
    multiple=False,
)
button = widgets.Button(
    description="Generate Caption",
)
output = widgets.Output()

def process_image(image_data):
    """Decode an uploaded image and extract its InceptionV3 features.

    Args:
        image_data: Raw encoded image bytes (any bytes-like object accepted
            by ``io.BytesIO``).

    Returns:
        A ``(img, features)`` tuple where ``img`` is the 299x299 RGB PIL
        image and ``features`` is the output of ``cnn_model.predict`` for
        that image. On any failure the error is printed and
        ``(None, None)`` is returned instead of raising.
    """
    try:
        # Force three RGB channels: PNG uploads may be RGBA, palette or
        # grayscale, which would not match the (299, 299, 3) input the CNN
        # expects.
        img = Image.open(io.BytesIO(image_data)).convert('RGB').resize((299, 299))
        # Scale pixels from [0, 255] to [-1, 1], the range InceptionV3's
        # ImageNet weights were trained on (what the Keras
        # inception_v3.preprocess_input helper does), rather than [0, 1].
        img_array = np.asarray(img, dtype=np.float32) / 127.5 - 1.0
        img_array = np.expand_dims(img_array, axis=0)  # add batch dimension
        features = cnn_model.predict(img_array, verbose=0)
        return img, features
    except Exception as e:
        # Best-effort boundary for the notebook UI: report and signal failure.
        print(f"Error processing image: {e}")
        return None, None

def on_button_clicked(b):
    """Handle a click on the button: caption and display the uploaded image.

    Everything printed or plotted is captured by the ``output`` widget, which
    is cleared first so that results from earlier clicks do not pile up.

    Args:
        b: The clicked button instance supplied by ipywidgets (unused).
    """
    with output:
        clear_output()
        uploaded = upload.value
        if not uploaded:
            print("Please upload an image.")
            return

        # ipywidgets 7 exposes `value` as a dict keyed by file name, while
        # ipywidgets 8 exposes a tuple of per-file dicts; support both.
        if isinstance(uploaded, dict):
            files = list(uploaded.values())
        else:
            files = list(uploaded)
        image_data = files[0]['content']

        img, features = process_image(image_data)
        if img is None or features is None:
            print("Failed to process image.")
            return

        caption = generate_caption(features)
        plt.figure(figsize=(8, 8))
        plt.imshow(img)
        plt.title(f"Caption: {caption}")
        plt.axis('off')
        plt.show()

# Register the click handler, then render the three widgets in the cell output.
button.on_click(on_button_clicked)
display(upload, button, output)



FileUpload(value={}, accept='.jpg,.png', description='Upload')

Button(description='Generate Caption', style=ButtonStyle())

Output()