In [None]:
from google.colab import files
import zipfile, os
from PIL import Image

uploaded = files.upload()

for fname in uploaded:
    if fname.endswith('.zip'):
        folder_name = os.path.splitext(fname)[0]
        with zipfile.ZipFile(fname, 'r') as zip_ref:
            zip_ref.extractall(folder_name)
        print(f"✅ Extracted to: {folder_name}")


In [None]:
import os
import json
from PIL import Image
import requests
from bs4 import BeautifulSoup

# ==== CONFIG ====
image_folder = "VLM task food/VLM task food"
output_json_path = "dish_metadata.json"
valid_exts = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')

# ==== Scrape instructions from AllRecipes ====
def get_allrecipes_instructions(dish_name):
    try:
        query = dish_name.replace('_', '+')
        url = f"https://www.allrecipes.com/search?q={query}"
        headers = {"User-Agent": "Mozilla/5.0"}
        search_resp = requests.get(url, headers=headers)
        soup = BeautifulSoup(search_resp.content, "html.parser")

        first_result = soup.select_one("a.card__titleLink")
        if not first_result:
            return ["No instructions found."]

        recipe_url = first_result["href"]
        recipe_resp = requests.get(recipe_url, headers=headers)
        recipe_soup = BeautifulSoup(recipe_resp.content, "html.parser")

        instructions = recipe_soup.select("ul.instructions-section li p")
        steps = [step.get_text(strip=True) for step in instructions]

        return steps if steps else ["No steps found."]
    except Exception as e:
        return [f"Error fetching instructions: {e}"]

# ==== MAIN LOOP ====
metadata_list = []
import time
contents = os.listdir(image_folder)

for fname in contents:
    file_path = os.path.join(image_folder, fname)

    if os.path.isdir(file_path) or not fname.lower().endswith(valid_exts):
        continue

    try:
        image = Image.open(file_path).convert("RGB")

        # Ask user for noisy title instead of using Gemini
        noisy_title = input(f"🎭 Enter a noisy title for {fname}: ")

        # Get original dish name (without extension) for recipe search
        search_name = os.path.splitext(fname)[0]
        print(f"🔍 Searching AllRecipes for: {search_name}")
        instructions = get_allrecipes_instructions(search_name)

        # Ask user for summary
        summary = input(f"📝 Enter summary for {fname}: ")

        metadata_list.append({
            "file_name": fname,
            "noisy_title": noisy_title,
            "instructions": instructions,
            "summary": summary
        })

        print(f"✅ Done: {fname} → '{noisy_title}'")

    except Exception as e:
        print(f"❌ Error processing {fname}: {e}")
    time.sleep(1)

# ==== SAVE JSON ====
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(metadata_list, f, indent=4, ensure_ascii=False)

print(f"\n✅ JSON saved to: {output_json_path}")


In [1]:
!pip install transformers accelerate safetensors torch torchvision

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [1]:
from transformers import AutoProcessor, LlavaForConditionalGeneration
import torch, gc
from PIL import Image
import requests

In [3]:
def generate_summary():
    model_id = "llava-hf/llava-1.5-7b-hf"
    model = LlavaForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
    ).to(0)

    processor = AutoProcessor.from_pretrained(model_id)

    torch.cuda.empty_cache()
    gc.collect()


    image = Image.open("/content/chocolate_tart.jpeg").convert("RGB")
    conversation = [
        {
            "role": "system",
            "content": [
                 {"type": "text", "text": "You are a helpful cooking assistant. For a given image of a dish and noisy title of it, provide a concise 2-3 step summary on how to cook the dish.\n"
                  "Example 1:\n"
                  "Image: Pancakes \n"
                  "noisy_title: Sweet stack\n"
                  "output: Beat eggs, sugar and butter. Add flour, baking powder and baking soda. Thoroughly mix milk in batter and cook in batches on a non-stick pan until golden brown.\n"

                  "Example 2:\n"
                  "Image: hakka noodles\n"
                  "noisy_title: spicy twisters"
                  "output: Boil and drain noodles. Stir-fry veggies with sauces.Toss noodles and serve hot."
                   },
                ],
        },
        {
            "role": "user",
            "content": [
                {"type":"image","image": image},
                {"type": "text", "text": "noisy dish title: Rich Fudgy Slice"},
                {"type": "text", "text": "Provide concise 2-3 step summary of how to prepare the dish."}]
        },

    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)


    inputs = processor(images=image, text=prompt, return_tensors='pt').to(0, torch.float16)



    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=100)

    torch.cuda.empty_cache()
    gc.collect()


    response = processor.batch_decode(output[:, inputs["input_ids"].shape[-1]:], skip_special_tokens=True)[0]
    print(response)

In [4]:
generate_summary()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/950 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/70.1k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.62M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/41.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

Image: Chocolate cake
noisy_title: Fudgy delight
output: Melt butter and chocolate. Mix in sugar, eggs and vanilla. Combine flour, baking powder and salt. Gradually add the dry mixture to the wet mixture and mix until smooth. Pour the batter into a greased cake pan and bake in a preheated oven at 350°F for 30-35 minutes
