## BLIP image captioning

In [2]:
import torch
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests
from io import BytesIO

In [13]:
# To run this code, first install the dependencies:
# pip install transformers torch torchvision pillow



def download_image(url: str) -> Image.Image:
    """
    Downloads an image from a URL and returns a PIL Image.
    """
    response = requests.get(url)
    image = Image.open(BytesIO(response.content)).convert("RGB")
    return image

def main():
    # Option 1: Use a local image by providing a path
    image = Image.open("../data/tqa_train_val_test/train/teaching_images/biomes_6557.png").convert("RGB")
    
    # Option 2: Download an image from a URL (example image URL)
    # image_url = "https://raw.githubusercontent.com/salesforce/BLIP/main/demo.jpg"
    # image = download_image(image_url)
    
    # Initialize BLIP processor and model with a custom cache directory
    cache_directory = "../models"
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base", cache_dir=cache_directory)
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base", cache_dir=cache_directory)
    
    # Preprocess the image
    inputs = processor(image, return_tensors="pt", max_length=200, )
    
    # Generate caption (you can adjust parameters like max_length or num_beams if needed)
    output_ids = model.generate(**inputs)
    
    # Decode the generated ids to text
    caption = processor.decode(output_ids[0], skip_special_tokens=True)
    
    print("Generated Caption:")
    print(caption)



In [14]:
if __name__ == "__main__":
    main()


Generated Caption:
a pyramid with the names of different types of plants and animals


## LLAVA 


In [1]:
import os
# Optionally restrict to one GPU if needed:
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

In [2]:

# Load a local image
image = Image.open("../data/tqa_train_val_test/train/teaching_images/acid_rain_formation_6507.png").convert("RGB")

# Define your model name or local checkpoint path
model_name = "llava-hf/llava-1.5-7b-hf"
cache_directory = "../models"

# Load processor
processor = AutoProcessor.from_pretrained(model_name, cache_dir=cache_directory)

# ✅ Set required processing attributes to avoid deprecation errors
if not hasattr(processor, "patch_size"):
    processor.patch_size = 14  # Adjust based on the model version
if not hasattr(processor, "vision_feature_select_strategy"):
    processor.vision_feature_select_strategy = "default"  # Options: "default", "first", "max"

# Force model to load sequentially on one GPU
model = AutoModelForVision2Seq.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="sequential",
    cache_dir=cache_directory
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:

# Use structured prompt format
prompt = "### Human: <image>\nPlease describe this diagram in detail.\n### Assistant:"

# Process image and text
inputs = processor(image, prompt, return_tensors="pt")

# Ensure inputs are on the correct device
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Generate response
output_ids = model.generate(**inputs, max_new_tokens=500, num_beams=5)

# Decode response
response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]

print("LLaVA Response:")
print(response)

LLaVA Response:
### Human:  
Please describe this diagram in detail.
### Assistant:

In the image, there is a diagram illustrating the process of acid rain formation. Acid rain is formed when pollutants, such as sulfur dioxide and nitrogen oxides, are released into the atmosphere. These pollutants react with moisture in the air to form sulfuric and nitric acids, which then fall to the ground as acid rain.

The diagram shows the various stages of acid rain formation, including the release of pollutants into the atmosphere, the formation of sulfuric and nitric acids in the presence of moisture, and the precipitation of these acids as acid rain. Additionally, the diagram highlights the impact of acid rain on the environment, such as the corrosion of buildings, statues, and other structures, as well as the harm it can cause to plants and aquatic life.

Overall, the image provides a comprehensive overview of the process of acid rain formation and its consequences on the environment.


## Image captioning folder wise

In [2]:
import os
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from tqdm import tqdm

# Optionally restrict to a single GPU. Adjust the device number as needed.
# os.environ["CUDA_VISIBLE_DEVICES"] = "2"

# Define paths and model/checkpoint names.
model_name = "llava-hf/llava-1.5-7b-hf"
cache_directory = "../models"
input_folder = "../data/tqa_train_val_test/test/teaching_images"  # Folder with images
output_folder = "../data/tqa_train_val_test/test/teaching_images_llava_captions"  # Folder to save txt outputs

# Create the output folder if it doesn't exist.
os.makedirs(output_folder, exist_ok=True)

In [3]:
# Load processor and set required attributes to avoid deprecation warnings.
processor = AutoProcessor.from_pretrained(model_name, cache_dir=cache_directory)
if not hasattr(processor, "patch_size"):
    processor.patch_size = 14  # Adjust as needed for your model version.
if not hasattr(processor, "vision_feature_select_strategy"):
    processor.vision_feature_select_strategy = "default"  # Options: "default", "first", "max"

# Force the model to load sequentially on one GPU.
model = AutoModelForVision2Seq.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="sequential",
    cache_dir=cache_directory
)

# Use a structured prompt format.
prompt = "### Human: <image>\nPlease describe this diagram in detail.\n### Assistant:"

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
def generate_description(image_path: str) -> str:
    """
    Given an image path, load the image, generate a response with LLaVA,
    and return the cleaned description (with prompt tokens and instructions removed).
    """
    try:
        image = Image.open(image_path).convert("RGB")
    except Exception as e:
        print(f"Error loading {image_path}: {e}")
        return ""
    
    inputs = processor(image, prompt, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate response.
    output_ids = model.generate(**inputs, max_new_tokens=256, num_beams=5)
    response = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
    
    # Clean up the response by removing the structured prompt parts.
    lines = response.splitlines()
    clean_lines = []
    for line in lines:
        if line.strip().startswith("###"):
            continue
        if "Please describe this diagram in detail." in line:
            line = line.replace("Please describe this diagram in detail.", "")
        if line.strip():
            clean_lines.append(line.strip())
    clean_response = "\n".join(clean_lines).strip()
    
    return clean_response

def process_folder(input_dir: str, output_dir: str):
    """
    Process all image files in input_dir, generate a description for each,
    and save the output in output_dir as a text file (one per image),
    while displaying a progress bar.
    """
    image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff"}
    files = [f for f in os.listdir(input_dir) if os.path.splitext(f)[1].lower() in image_extensions]
    
    print(f"Found {len(files)} images in {input_dir}.")
    
    for filename in tqdm(files, desc="Processing images", dynamic_ncols=True):
        image_path = os.path.join(input_dir, filename)
        description = generate_description(image_path)
        
        output_filename = os.path.splitext(filename)[0] + ".txt"
        output_path = os.path.join(output_dir, output_filename)
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(description)


In [5]:
# Run the processing over the entire folder.
process_folder(input_folder, output_folder)


Found 31 images in ../data/tqa_train_val_test/test/teaching_images.


Expanding inputs for image tokens in LLaVa should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.50.
Processing images: 100%|██████████| 31/31 [05:38<00:00, 10.93s/it]
