<a href="https://colab.research.google.com/github/Tushar-Nagar-64/GW_echoes/blob/master/Florence_v2_large.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Set up functions to run Florence

In [None]:
!pip install transformers requests torch pillow
!pip install timm flash_attn

In [None]:
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import requests
import copy
import torch
%matplotlib inline

# Hugging Face Hub identifier of the checkpoint shared by the model and its processor.
model_id = 'microsoft/Florence-2-large'

# Load the weights (the checkpoint ships custom code, hence trust_remote_code),
# then switch to inference mode and move the model onto the GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype='auto',
)
model = model.eval().cuda()

# The processor is loaded from the same checkpoint as the model.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

In [None]:
def run_example(image, task_prompt, text_input=None):
    """
    Run a single Florence task on one image and return the post-processed answer.

    Relies on the module-level `model` and `processor` objects.

    Parameters:
    - image: Image to analyse; must expose `width` and `height` (e.g. a PIL image),
      which are passed to the post-processing step as the image size.
    - task_prompt: Task token understood by the model, e.g. '<OPEN_VOCABULARY_DETECTION>'.
    - text_input: Optional extra text appended directly after the task prompt.

    Returns:
    - Whatever `processor.post_process_generation` produces for this task
      (callers in this notebook index it with `task_prompt`).
    """
    prompt = task_prompt if text_input is None else task_prompt + text_input

    # Move/cast the inputs to wherever the model actually lives instead of
    # hard-coding 'cuda' + float16, so they always match the loaded weights.
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device, model.dtype)

    # Deterministic beam search; the tensors are already on the model's device.
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )

    # Special tokens are kept on purpose: they are handed to the post-processing step untouched.
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(image.width, image.height)
    )

    return parsed_answer

In [None]:
def convert_to_od_format(data):
    """
    Repackage a detection result into the plain object-detection layout.

    Parameters:
    - data: Mapping that may contain 'bboxes' and 'bboxes_labels' entries;
      any other keys (e.g. 'polygons', 'polygons_labels') are ignored.

    Returns:
    - A new dictionary with the boxes under 'bboxes' and their labels under 'labels'.
      An entry missing from `data` is replaced by an empty list.
    """
    return {
        'bboxes': data.get('bboxes', []),
        'labels': data.get('bboxes_labels', []),
    }

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
def plot_bbox(image, data):
    """
    Display an image with its labelled bounding boxes drawn on top.

    Parameters:
    - image: Image to show (anything accepted by `Axes.imshow`).
    - data: Dictionary with a 'bboxes' list of (x1, y1, x2, y2) boxes and a
      parallel 'labels' list.

    Returns:
    - None. The figure is rendered via `plt.show()`.
    """
    # Create a figure and axes
    fig, ax = plt.subplots()

    # Display the image
    ax.imshow(image)

    # Plot each bounding box
    for bbox, label in zip(data['bboxes'], data['labels']):
        # Unpack the bounding box coordinates
        x1, y1, x2, y2 = bbox
        # Create a Rectangle patch anchored at the top-left corner
        rect = patches.Rectangle((x1, y1), x2-x1, y2-y1, linewidth=1, edgecolor='r', facecolor='none')
        # Add the rectangle to the Axes
        ax.add_patch(rect)
        # Annotate the label on this Axes explicitly (not via the pyplot
        # "current axes"), so the text always lands on the figure created here
        ax.text(x1, y1, label, color='white', fontsize=8, bbox=dict(facecolor='red', alpha=0.5))

    # Remove the axis ticks and labels
    ax.axis('off')

    # Show the plot
    plt.show()

In [None]:
import cv2
import numpy as np

def extract_faces(image, bboxes):
    """
    Crop one region per bounding box out of an image.

    Parameters:
    - image: Image convertible with `np.array` (e.g. a PIL image or an ndarray),
      indexed as [row, column, ...].
    - bboxes: Iterable of (x1, y1, x2, y2) boxes in pixel coordinates; values
      are truncated to integers.

    Returns:
    - List of numpy arrays, one crop per box, in the same order as `bboxes`.
      A box lying entirely outside the image yields an empty array.
    """
    # Convert once up front instead of on every loop iteration
    pixels = np.array(image)

    faces = []
    for bbox in bboxes:
        x1, y1, x2, y2 = bbox
        # Clamp at 0: a negative index would wrap around to the far edge of the
        # array and silently return an empty or wrong crop. Upper bounds need no
        # clamping because slicing already stops at the array edge.
        left, top = max(int(x1), 0), max(int(y1), 0)
        right, bottom = max(int(x2), 0), max(int(y2), 0)
        # Crop the face from the image
        faces.append(pixels[top:bottom, left:right])
    return faces


In [None]:
from PIL import Image
import numpy as np
import cv2

def preprocess_faces(faces, size=(160, 160)):
    """
    Preprocess a list of face crops: resize, scale to [0, 1], and add a batch axis.

    Parameters:
    - faces (list of numpy.ndarray): Face crops. The channel order is used as-is;
      no BGR/RGB conversion is performed, so pass the order the consumer expects
      (crops taken from a PIL RGB image are already RGB).
    - size (tuple): Desired output size (width, height) for resizing.

    Returns:
    - List of numpy arrays, one per face, each with a leading batch dimension of 1
      (shape (1, height, width, channels) for colour crops) and values divided by 255.
    """
    processed_faces = []

    for face in faces:
        # Go through PIL so its resize implementation can be used
        pil_image = Image.fromarray(face)

        # Resize image
        pil_image = pil_image.resize(size)

        # Convert PIL Image to numpy array and normalize
        face_array = np.array(pil_image) / 255.0

        # Add a batch dimension
        face_array = np.expand_dims(face_array, axis=0)

        processed_faces.append(face_array)

    return processed_faces


Run Florence

In [None]:
def run(url, task_prompt:str='<OPEN_VOCABULARY_DETECTION>', search_term:str='human head'):
    """
    Download an image, detect `search_term` with Florence, and crop the detections.

    Parameters:
    - url: Address of the image to download.
    - task_prompt: Florence task token passed to `run_example`.
    - search_term: Text describing what to detect; appended to the task prompt.

    Returns:
    - Tuple `(bbox_results, faces, processed_faces)`:
      the detection dictionary with 'bboxes' and 'labels', the raw crops, and
      the crops after `preprocess_faces`.

    Raises:
    - requests.HTTPError if the download returns an error status.
    """
    from io import BytesIO  # local import: only needed to decode the downloaded bytes

    # Fail fast on HTTP errors or a stalled server instead of handing a broken
    # stream to PIL; `.content` is also transparently decompressed, unlike `.raw`.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Normalise to 3-channel RGB so palette/alpha/greyscale images yield
    # consistently shaped crops downstream.
    image = Image.open(BytesIO(response.content)).convert('RGB')

    # Honour the caller's search term (previously hard-coded to "human head")
    results = run_example(image, task_prompt, text_input=search_term)
    bbox_results = convert_to_od_format(results[task_prompt])

    # For a visual check of the detections, call: plot_bbox(image, bbox_results)

    # Crop once and reuse the crops for preprocessing
    faces = extract_faces(image, bbox_results['bboxes'])
    processed_faces = preprocess_faces(faces)

    return (bbox_results, faces, processed_faces)

In [None]:
# Run the detection pipeline on a sample image and keep its three results.
bbox, faces, processed_faces = run(
    url="https://th.bing.com/th/id/OIP.kjOAjQunGxvlajiw77c_XQHaE8?rs=1&pid=ImgDetMain"
)

FaceNet embeddings