In [2]:

import os


# Verify the current directory
current_dir = os.getcwd()

# Change to the desired directory
os.chdir(f"{current_dir}/Depth-Anything-V2")

current_dir = os.getcwd()

print(f"Current directory: {current_dir}")

Current directory: /root/Depth-SoM/Depth-Anything-V2


In [30]:
import os
import io
from PIL import Image as Image_PIL
import shutil
from datasets import load_dataset, Dataset, Image
from tqdm import tqdm
from huggingface_hub import login
from dotenv import load_dotenv

import os

load_dotenv()  # Load environment variables from .env file

from huggingface_hub import login


# --- Configuration ---

hf_token = os.getenv("HF_TOKEN")
login(hf_token)

hf_user = "Rajarshi-Roy-research"
# dataset_name = "ADE20K_OVSEG_Som"
dataset_name = "Som_bench_refcocog_refseg"
commit_message = "Updating dataset with VLM"
img_cols = ["image"]
image_to_process = "image"

# --- Helper Functions ---
def save_image_from_bytes(image_data, idx, image_folder):
    
    if isinstance(image_data,list):
        image_data = image_data[0]
        
    """Saves an image from byte data or PIL Image and returns the saved file path."""
    print(type(image_data))
    try:
        if isinstance(image_data, Image_PIL.Image):
            image = image_data
        elif isinstance(image_data, bytes):
            image = Image_PIL.open(io.BytesIO(image_data))
        else:
            print(f"Unsupported image data type at row {idx}: {type(image_data)}")
            return None

        image_path = os.path.join(image_folder, f"image_{idx}.jpg")
        image.save(image_path, format="JPEG")
        print(f"✅ Saved: {image_path}")
        return image_path
    except Exception as e:
        print(f"❌ Error saving image at row {idx}: {e}")
        return None




def push_to_hugging_face(ds, img_cols, hf_user, dataset_name, commit_message, hf_token, private=False):
    """Pushes the dataset to the Hugging Face Hub."""
    for img_col in img_cols:
        ds = ds.cast_column(img_col, Image())

    try:
        ds.push_to_hub(
            repo_id=f"{hf_user}/{dataset_name}",
            commit_message=commit_message,
            token=hf_token,
            private=private,
        )
        print(f"✅ Dataset successfully uploaded: {hf_user}/{dataset_name}")
    except Exception as e:
        print(f"❌ Error pushing to Hugging Face Hub: {e}")



# --- Main Script ---
# Clean up existing image folder
for img_col in img_cols:
    if os.path.exists(img_col):
        shutil.rmtree(img_col)
    os.makedirs(img_col)

# Load dataset
ds = load_dataset(f"{hf_user}/{dataset_name}",revision="a8196e2e83ea9c13ae9d3ed421c0edc27cc4a420")
# ds = load_dataset(f"{hf_user}/{dataset_name}")
print(ds["train"][:1])

#Process images using the dataset object directly.
for img_col in img_cols:
    ds = ds.map(
        lambda example, idx: {img_col: save_image_from_bytes(example[img_col], idx, img_col)},
        with_indices=True,
        num_proc=1 # Adjust as needed for performance
    )
    

# push_to_hugging_face(ds, img_cols, hf_user, dataset_name, commit_message, hf_token)

#Process and add captions (This part remains unchanged as it doesn't directly interact with image data)
try:
    #This section is not needed anymore since we are working directly with the dataset.
    #df = pd.read_csv("updated_dataset.csv") 
    pass
except FileNotFoundError:
    pass

ds["train"][:1]

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


{'id': ['000000007601'], 'image': [<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1205x800 at 0x78AF477C9720>], 'obj_text': [['A black cow that is only half visible standing close to a fully visible cow.', 'The far right black cow.; cow on the far right who is barely visible; cow butt 3:00; cow barely showing; cow far right; far right cow butt', 'A cow standing in high grass wearing an ear tag with the number 342.; Cow with a number 312 on a tag on its ear.; yellow tag in ear; cow with tag; Cow with yellow tag; middle cow; big cow yellow tag; center cow']], 'ref_ids': [[3, 1, 2]], 'json_data': [[{'ref_id': 3, 'text': ['A black cow that is only half visible standing close to a fully visible cow.']}, {'ref_id': 1, 'text': ['The far right black cow.', 'cow on the far right who is barely visible', 'cow butt 3:00', 'cow barely showing', 'cow far right', 'far right cow butt']}, {'ref_id': 2, 'text': ['A cow standing in high grass wearing an ear tag with the number 342.', 'Cow with a 

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

<class 'PIL.JpegImagePlugin.JpegImageFile'>
✅ Saved: image/image_0.jpg
<class 'PIL.JpegImagePlugin.JpegImageFile'>
✅ Saved: image/image_1.jpg
<class 'PIL.JpegImagePlugin.JpegImageFile'>
✅ Saved: image/image_2.jpg
<class 'PIL.JpegImagePlugin.JpegImageFile'>
✅ Saved: image/image_3.jpg
<class 'PIL.JpegImagePlugin.JpegImageFile'>
✅ Saved: image/image_4.jpg
<class 'PIL.JpegImagePlugin.JpegImageFile'>
✅ Saved: image/image_5.jpg
<class 'PIL.JpegImagePlugin.JpegImageFile'>
✅ Saved: image/image_6.jpg
<class 'PIL.JpegImagePlugin.JpegImageFile'>
✅ Saved: image/image_7.jpg
<class 'PIL.JpegImagePlugin.JpegImageFile'>
✅ Saved: image/image_8.jpg
<class 'PIL.JpegImagePlugin.JpegImageFile'>
✅ Saved: image/image_9.jpg
<class 'PIL.JpegImagePlugin.JpegImageFile'>
✅ Saved: image/image_10.jpg
<class 'PIL.JpegImagePlugin.JpegImageFile'>
✅ Saved: image/image_11.jpg
<class 'PIL.JpegImagePlugin.JpegImageFile'>
✅ Saved: image/image_12.jpg
<class 'PIL.JpegImagePlugin.JpegImageFile'>
✅ Saved: image/image_13.jpg
<c

{'id': ['000000007601'],
 'image': [<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1205x800>],
 'obj_text': [['A black cow that is only half visible standing close to a fully visible cow.',
   'The far right black cow.; cow on the far right who is barely visible; cow butt 3:00; cow barely showing; cow far right; far right cow butt',
   'A cow standing in high grass wearing an ear tag with the number 342.; Cow with a number 312 on a tag on its ear.; yellow tag in ear; cow with tag; Cow with yellow tag; middle cow; big cow yellow tag; center cow']],
 'ref_ids': [[3, 1, 2]],
 'json_data': [[{'ref_id': 3,
    'text': ['A black cow that is only half visible standing close to a fully visible cow.']},
   {'ref_id': 1,
    'text': ['The far right black cow.',
     'cow on the far right who is barely visible',
     'cow butt 3:00',
     'cow barely showing',
     'cow far right',
     'far right cow butt']},
   {'ref_id': 2,
    'text': ['A cow standing in high grass wearing an ear tag 

In [31]:
import torch
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration
import requests
from PIL import Image as Image_PIL

# Set device to CUDA if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the processor and model, then move the model to the chosen device
processor = Pix2StructProcessor.from_pretrained('google/deplot')
model = Pix2StructForConditionalGeneration.from_pretrained('google/deplot').to(device)



2025-03-10 13:39:16.665589: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-10 13:39:16.675712: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741613956.690200    1217 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741613956.694533    1217 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-10 13:39:16.708823: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [32]:
import re
import ast

def extract_list_from_string(text):
    """
    Extracts a list from a string by finding the first occurrence of square brackets.

    Parameters:
        text (str): The input string containing a Python list.

    Returns:
        list: Extracted list if found, otherwise an empty list.
        
    """
    match = re.search(r"\[.*?\]", text, re.DOTALL)
    if match:
        try:
            return ast.literal_eval(match.group(0))  # Safely parse the list
        except (SyntaxError, ValueError):
            return []
    return []

# Example usage
text = "```python\n[3, 1, 2]\n```"
print(extract_list_from_string(text))  # Output: [3, 1, 2]
print(type(extract_list_from_string(text)))  # Output: [3, 1, 2]


[3, 1, 2]
<class 'list'>


In [38]:
import httpx
import base64
import ast

def text_to_list(text):
    return ast.literal_eval(text)





def get_prompt(obj_list):
  prompt=f"""I have labeled a bright numeric ID at the center for each visual object in the image. \nPlease tell me the IDs for:"""

  for i in obj_list:
    prompt += f"\n-{i}\n"

  prompt+= "\nPlease, properly give answer in the form of just a python list format accoriding to the order of objects asked\nAnswer Example:`[1,2,4..]`"

  # print(prompt)
  return prompt

def get_response(prompt,img_path):
  
  if isinstance(img_path, Image_PIL.Image):
    image = img_path
  
  elif isinstance(img_path, str):
    image = Image_PIL.open(img_path)
  print(prompt)
  
  # Prepare inputs and move them to the device
  inputs = processor(images=image, text=prompt, return_tensors="pt")
  inputs = {key: tensor.to(device) for key, tensor in inputs.items()}

  # Generate predictions on the GPU
  predictions = model.generate(**inputs, max_new_tokens=512)

  # Decode and print the output
  response = processor.decode(predictions[0], skip_special_tokens=True)
  
  print("response",response)
  print(extract_list_from_string(response))


def get_prediction(index, ds):

  img_path = ds[index]['image']
  obj_list = ds[index]['obj_text']

  prompt = get_prompt(obj_list)
  #print(prompt)
  return get_response(prompt,img_path)



In [39]:
list_ =  get_prediction(1,ds["train"])
list_

Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. In the new behavior, If both images and text are provided, image_processor is not a VQA processor, and `add_special_tokens` is unset, the default value of `add_special_tokens` will be changed to `False` when calling the tokenizer. To test the new behavior, set `legacy=False`as a processor call argument.


I have labeled a bright numeric ID at the center for each visual object in the image. 
Please tell me the IDs for:
-little girl in light green shirt; Little Girl in a green dress waiting for the cake to be cut.; green; green dress; green shirt; Girl in green; girl in green; girl in green

-the blonde girls head; blonde hair with small green hairtie; blonde hair; blonde hair; blonde girl; blonde hair girl on left; hair in left corner; blonde head

-A knife cutting a cake.; Person wearing jacket cutting the cake.; standing man; person cutting cake; cutting the cake; man cutting cake

-girl with glasses; glasses; girl with glasses; right kid; right gal!; right kid

Please, properly give answer in the form of just a python list format accoriding to the order of objects asked
Answer Example:`[1,2,4..]`


response TITLE |  <0x0A>  | 1 <0x0A> 1 | 1 <0x0A> 2 | 2 <0x0A> 3 | 3 <0x0A> 4 | 4 <0x0A> 5 | 5 <0x0A> 6 | 6 <0x0A> 7 | 7 <0x0A> 8 | 10 <0x0A> 9 | 9 <0x0A> 10 | 11 <0x0A> 11 | 7 <0x0A> 12 | 13 <0x0A> 13 | 14 <0x0A> 14 | 11 <0x0A> 15 | 10 <0x0A> 16 | 10 <0x0A> 17 | 10 <0x0A> 18 | 10 <0x0A> 19 | 10 <0x0A> 20 | 10 <0x0A> 21 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 21 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 10 <0x0A> 20 | 1
[]


In [34]:
len(list_)

1