In [16]:
import os
import torch
import nest_asyncio
from dotenv import load_dotenv
from IPython.display import Markdown, display
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

from llama_index.core import Settings, VectorStoreIndex, PromptTemplate
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.schema import Document  # ✅ Import Document class


In [17]:
nest_asyncio.apply()


In [18]:
input_dir_path = '/teamspace/studios/this_studio/screenshots_output'

# Check if directory exists
if not os.path.exists(input_dir_path):
    raise ValueError(f"Directory {input_dir_path} does not exist!")


In [19]:
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)


In [20]:
def generate_image_description(image_path):
    """Generates a more Pokémon Red-accurate caption using BLIP."""
    image = Image.open(image_path).convert("RGB")  # Open image
    inputs = processor(image, return_tensors="pt").to(device)  # Process image

    # ✅ Give BLIP a Pokémon-specific instruction
    prompt = (
        "This is a screenshot from Pokémon Red, a classic Game Boy RPG. "
        "It might show battles, routes, towns, menus, or important events. "
        "Describe the scene as if explaining it to a Pokémon player."
    )

    out = model.generate(**inputs, max_new_tokens=75)  # Generate caption with a longer response
    caption = processor.decode(out[0], skip_special_tokens=True)  # Convert to text

    # Combine Pokémon context + BLIP caption
    pokemon_caption = f"Pokémon Red Scene: {caption}"
    
    return pokemon_caption


In [21]:
docs = []
image_files = [f for f in os.listdir(input_dir_path) if f.endswith(".png")]

if len(image_files) == 0:
    raise ValueError(f"No PNG files found in {input_dir_path}.")


In [22]:
for image_file in image_files:
    image_path = os.path.join(input_dir_path, image_file)
    caption = generate_image_description(image_path)

    # ✅ Print for debugging
    print(f"Processing: {image_file}")
    print(f"Generated Caption: {caption}\n{'-'*50}")

    # ✅ Store caption & metadata correctly
    doc = Document(
        text=f"Pokémon Red Scene: {caption}",
        metadata={
            "image_path": image_path,
            "generated_caption": caption  # Ensure easy retrieval
        }
    )
    docs.append(doc)


Processing: screenshot_0136.png
Generated Caption: Pokémon Red Scene: the diagram shows the different types of the different types of the different types of the different types of the different
--------------------------------------------------
Processing: screenshot_0272.png
Generated Caption: Pokémon Red Scene: a screenshot of a computer screen with a bunch of flowers
--------------------------------------------------
Processing: screenshot_0574.png
Generated Caption: Pokémon Red Scene: a black and white pattern with a black and white background
--------------------------------------------------
Processing: screenshot_0737.png
Generated Caption: Pokémon Red Scene: a screenshot of a game with a bunch of birds
--------------------------------------------------
Processing: screenshot_0205.png
Generated Caption: Pokémon Red Scene: a screenshot of a game with a bunch of birds
--------------------------------------------------
Processing: screenshot_0342.png
Generated Caption: Pokémon Red 

In [23]:
llm = Ollama(model="deepseek-v2:16b", request_timeout=120.0)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5", trust_remote_code=True)


In [24]:
Settings.embed_model = embed_model
index = VectorStoreIndex.from_documents(docs, show_progress=True)


Parsing nodes:   0%|          | 0/1000 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

In [25]:
Settings.llm = llm
query_engine = index.as_query_engine()


In [26]:
qa_prompt_tmpl_str = (
    "You are analyzing images from Pokémon Red, a classic Game Boy RPG. "
    "Each image has an automatically generated caption that briefly describes it. "
    "Your task is to expand this caption into a **detailed, immersive** description that includes game locations, Pokémon, characters, and player actions.\n"
    "For example, if the caption mentions a battle, provide details on the Pokémon involved, their moves, and the setting.\n"
    "If the caption describes a town, expand on what buildings are there, which NPCs might appear, and what the player might be doing.\n"
    "\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------a\n"
    "Query: {query_str}\n"
    "Answer: "
)



In [27]:
qa_prompt_tmpl = PromptTemplate(qa_prompt_tmpl_str)
query_engine.update_prompts(
    {"response_synthesizer:text_qa_template": qa_prompt_tmpl}
)


In [28]:
query_text = "Expand the descriptions of three Pokémon Red images, providing detailed locations, characters, and possible player actions."
response = query_engine.query(query_text)


HTTPStatusError: Client error '404 Not Found' for url 'http://localhost:11434/api/chat'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/404

In [None]:
print(response)
display(Markdown(str(response)))


<think>
Alright, I'm trying to figure out how to help this user. They provided some query about expanding Pokémon Red image captions into more immersive descriptions. Hmm, they gave examples with specific image paths but both had the same generic caption. 

First, maybe they're a fan of Pokémon Red and wants better descriptions for their images. They might be creating content like fan art, guides, or just enhancing their gaming experience by adding more details.

Looking at their query again, it says "expand" the captions into detailed ones including game locations, Pokémon, characters, and player actions. The examples they gave were not very descriptive—just repeating the same sentence with similar phrases. So, I need to come up with more creative and vivid descriptions based on that.

But wait, without seeing the actual images or the specific details from them, it's a bit challenging. They didn't provide much context beyond the image paths and captions. Maybe they expect me to genera

<think>
Alright, I'm trying to figure out how to help this user. They provided some query about expanding Pokémon Red image captions into more immersive descriptions. Hmm, they gave examples with specific image paths but both had the same generic caption. 

First, maybe they're a fan of Pokémon Red and wants better descriptions for their images. They might be creating content like fan art, guides, or just enhancing their gaming experience by adding more details.

Looking at their query again, it says "expand" the captions into detailed ones including game locations, Pokémon, characters, and player actions. The examples they gave were not very descriptive—just repeating the same sentence with similar phrases. So, I need to come up with more creative and vivid descriptions based on that.

But wait, without seeing the actual images or the specific details from them, it's a bit challenging. They didn't provide much context beyond the image paths and captions. Maybe they expect me to generate possible expansions based on common elements in Pokémon Red scenes—like forests, towns, battle arenas, etc.

I should think about typical game locations: starter towns, forests near cities, maybe hidden villages. Also, considering popular areas like the Fossil Beach or the underground caves. For characters, there are the main Pokémon like Pikachu, Jigglypuff, and Eevee, plus other regional badges holders. Player actions could involve exploring, battling, or just relaxing in a town.

I need to create three examples, each with a different scene: maybe one from a forest area, one from a bustling town, and another with some hidden features like caves. Each expansion should highlight specific Pokémon and locations that players might encounter or interact with.

Also, I should make sure each description includes actions the player could take, like battling, exploring, or just relaxing to give a sense of immersion.

Alright, time to draft some detailed descriptions for each image path they provided, even though they didn't specify. Hopefully, this will give them a good starting point and inspiration for creating their own expansions.
</think>

Certainly! It seems like you're working on enhancing Pokémon Red screenshots with more immersive details. However, I notice that the captions you've shared are very generic: "Pokémon Red Scene: Pokémon Red Scene: the game's map, with the location of the location" or "Pokémon Red Scene: a screenshot of a game with a map and a map." Could you clarify if these screenshots have specific details about locations, characters, or events? If so, I'd be happy to help expand them into detailed descriptions!