<a href="https://colab.research.google.com/github/TimHBSWFL/UCSD-ML-Capstone/blob/main/images_to_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import torch
import pandas as pd
from PIL import Image
from transformers import pipeline
from transformers import BlipProcessor, BlipForConditionalGeneration

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
main_image_dir = '/content/drive/My Drive/Capstone Data Collection/test photos/'

image_paths = []

for subfolder in ['fast food', 'upscale']:
    subfolder_path = os.path.join(main_image_dir, subfolder)

    image_paths.extend([os.path.join(subfolder_path, img) for img in os.listdir(subfolder_path) if img.endswith(".jpg")])

print(len(image_paths))

50


In [4]:
models_to_test = [
    {"type": "pipeline", "model": "nlpconnect/vit-gpt2-image-captioning"},
    {"type": "blip", "model": "Salesforce/blip-image-captioning-large"},
    {"type": "blip", "model": "Salesforce/blip-image-captioning-base"}
]


loaded_models = []

for model_info in models_to_test:
    if model_info["type"] == "pipeline":
        loaded_models.append({
            "type": "pipeline",
            "model_name": model_info["model"],
            "model": pipeline("image-to-text", model=model_info["model"])
        })
    elif model_info["type"] == "blip":
        processor = BlipProcessor.from_pretrained(model_info["model"])
        model = BlipForConditionalGeneration.from_pretrained(model_info["model"]).to("cuda")
        loaded_models.append({
            "type": "blip",
            "model_name": model_info["model"],
            "processor": processor,
            "model": model
        })


results = []


def caption_image_with_models(image_path):
    raw_image = Image.open(image_path).convert("RGB")
    image_id = os.path.basename(image_path)

    for model_data in loaded_models:
        if model_data["type"] == "pipeline":

            result = model_data["model"](image_path)
            caption_text = result[0]['generated_text']

        elif model_data["type"] == "blip":
            processor = model_data["processor"]
            model = model_data["model"]

            inputs = processor(raw_image, return_tensors="pt").to("cuda")
            out = model.generate(**inputs)
            caption_text = processor.decode(out[0], skip_special_tokens=True)

        results.append({
            "photo_id": image_id,
            "model_name": model_data["model_name"],
            "caption": caption_text
        })


for image_path in image_paths:
    caption_image_with_models(image_path)

df_results = pd.DataFrame(results)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]



preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [10]:
filename = "Images_to_Text_Results.csv"

outpath_dir = '/content/drive/My Drive/Capstone Data Collection/' + filename

df_results.to_csv(outpath_dir, index=False)

In [11]:
df_results[0:10]

Unnamed: 0,photo_id,model_name,caption
0,Ax5PLwfU94uEXMafFdXrtw.jpg,nlpconnect/vit-gpt2-image-captioning,a sandwich with meat and cheese on a cutting b...
1,Ax5PLwfU94uEXMafFdXrtw.jpg,Salesforce/blip-image-captioning-large,a close up of a sandwich on a piece of paper o...
2,Ax5PLwfU94uEXMafFdXrtw.jpg,Salesforce/blip-image-captioning-base,"a hamburger with cheese, bacon and cheese on it"
3,bFNqVruIW3AXjgSuLHq4kg.jpg,nlpconnect/vit-gpt2-image-captioning,a person holding a sandwich in their hand
4,bFNqVruIW3AXjgSuLHq4kg.jpg,Salesforce/blip-image-captioning-large,someone is grabbing a sausage patty out of a b...
5,bFNqVruIW3AXjgSuLHq4kg.jpg,Salesforce/blip-image-captioning-base,a person putting a piece of cheese on top of a...
6,-BCem2FTrQu6q9atFuFZiA.jpg,nlpconnect/vit-gpt2-image-captioning,a sandwich and fries on a table
7,-BCem2FTrQu6q9atFuFZiA.jpg,Salesforce/blip-image-captioning-large,a close up of a hamburger and fries on a table
8,-BCem2FTrQu6q9atFuFZiA.jpg,Salesforce/blip-image-captioning-base,a burger with a cup of fries and a cup of fries
9,AP09Q5qM1tLbK6HPXTWmkA.jpg,nlpconnect/vit-gpt2-image-captioning,a bowl of food with vegetables and a fork


In [12]:
upscale_image_dir = '/content/drive/My Drive/Capstone Data Collection/test photos/upscale/'

image_paths = [os.path.join(upscale_image_dir, img) for img in os.listdir(upscale_image_dir) if img.endswith(".jpg")]

print(len(image_paths))

25


In [14]:
results = []

for image_path in image_paths:
    caption_image_with_models(image_path)

df_upscale = pd.DataFrame(results)



In [22]:
df_upscale[61:75]

Unnamed: 0,photo_id,model_name,caption
61,9j3DRqJxxer-ktIyujogsw.jpg,Salesforce/blip-image-captioning-large,arafed archway with a wall of pictures and a b...
62,9j3DRqJxxer-ktIyujogsw.jpg,Salesforce/blip-image-captioning-base,a hallway with pictures and pictures on the wall
63,9jc81-kHyxAeJRmvH5iWaQ.jpg,nlpconnect/vit-gpt2-image-captioning,a plate of food with a fish and vegetables
64,9jc81-kHyxAeJRmvH5iWaQ.jpg,Salesforce/blip-image-captioning-large,there is a plate of shrimp with a parsley on it
65,9jc81-kHyxAeJRmvH5iWaQ.jpg,Salesforce/blip-image-captioning-base,a plate with a cooked shrimp and garni
66,10d52ZZ0chw-ABihHQQ5Dw.jpg,nlpconnect/vit-gpt2-image-captioning,a piece of meat is being cooked on a stove top
67,10d52ZZ0chw-ABihHQQ5Dw.jpg,Salesforce/blip-image-captioning-large,there is a sushi on a black plate with a knife
68,10d52ZZ0chw-ABihHQQ5Dw.jpg,Salesforce/blip-image-captioning-base,a black plate topped with su - style su - te
69,28cmi3WB2RH-vJmBluJ-eQ.jpg,nlpconnect/vit-gpt2-image-captioning,a piece of food that is sitting on a table
70,28cmi3WB2RH-vJmBluJ-eQ.jpg,Salesforce/blip-image-captioning-large,there is a small cupcake with shrimp and veget...
