Importing the dependencies

In [1]:
from transformers import pipeline
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from gtts import gTTS
import IPython.display as ipd

Install Required Libraries

In [2]:
!pip install gTTS
!pip install pyttsx3

Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gTTS)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: click, gTTS
  Attempting uninstall: click
    Found existing installation: click 8.3.0
    Uninstalling click-8.3.0:
      Successfully uninstalled click-8.3.0
Successfully installed click-8.1.8 gTTS-2.5.4
Collecting pyttsx3
  Downloading pyttsx3-2.99-py3-none-any.whl.metadata (6.2 kB)
Downloading pyttsx3-2.99-py3-none-any.whl (32 kB)
Installing collected packages: pyttsx3
Successfully installed pyttsx3-2.99


Complete PipeLine

In [5]:

# IMAGE ➜ CAPTION ➜ STORY ➜ AUDIO  (Full Pipeline)

# STEP 1 — LOAD BLIP (pretrained)

device = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)

print("BLIP loaded successfully!")


# STEP 2 — FUNCTION: Generate Caption

def generate_caption(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(image, return_tensors="pt").to(device)

    with torch.no_grad():
        output = model.generate(**inputs, max_length=40)

    caption = processor.decode(output[0], skip_special_tokens=True)
    return caption



# STEP 3 — Function: Convert Caption → Story

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

story_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
story_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small").to("cpu")

def generate_story(caption):
    prompt = (
        f"Write an engaging, creative story (5-6 sentences) based on this image description:\n"
        f"{caption}"
    )

    inputs = story_tokenizer(prompt, return_tensors="pt", truncation=True).to("cpu")
    outputs = story_model.generate(
        **inputs,
        max_new_tokens=180,
        temperature=0.9,
        do_sample=True,
    )

    story = story_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return story


# STEP 4 — Convert Story → AUDIO (TTS)

from gtts import gTTS
from IPython.display import Audio

def text_to_audio(text, filename="story_audio.mp3"):
    tts = gTTS(text=text, lang="en")
    tts.save(filename)
    return Audio(filename, autoplay=True)


# RUN FULL PIPELINE

IMAGE_PATH = "/content/image(2).jpg"   # image

caption = generate_caption(IMAGE_PATH)
print("Generated Caption:\n", caption)

story = generate_story(caption)
print("\nGenerated Story:\n", story)

audio_file = text_to_audio(story)

audio_file


BLIP loaded successfully!
Generated Caption:
 children playing in the playground

Generated Story:
 The kids are playing at the playground in the park. They are in a stroller in front of a teenager. They can see some of the kids play in the playground because of a boy that's out of the t-shirt. The kids play on a playground after a little while because it's very busy. There is an old man playing on a playground and his friends are playing outside. They are a little bit more interested in the children. This kid is playing on the playground because he has a boyfriend who plays in the swing. They have fun with the kids, though they don't seem to like the kids. The children are very excited to be playing indoors. If they are playing indoors it would be fun playing indoors. The kids are very excited about the kids playing outside as well. They can see the


Gradio interface

In [6]:
import gradio as gr

def pipeline(image_path):
    caption = generate_caption(image_path)
    story = generate_story(caption)
    audio = text_to_audio(story, "temp.mp3")
    return caption, story, "temp.mp3"

gr.Interface(fn=pipeline,
             inputs=gr.Image(type="filepath"),
             outputs=[gr.Textbox(), gr.Textbox(), gr.Audio()]).launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b082f56336bafb84db.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


