In [1]:
import torch
import pandas as pd
from PIL import Image, ImageDraw
from transformers import AutoProcessor, AutoModelForCausalLM
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import numpy as np
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [2]:
# Device settings: probe CUDA once, then derive both the device string and
# the tensor dtype from that single answer (half precision only on GPU).
_has_cuda = torch.cuda.is_available()
device, torch_dtype = (
    ("cuda:0", torch.float16) if _has_cuda else ("cpu", torch.float32)
)
device

'cuda:0'

In [3]:
# Loading model
def load_model():
    """Load the Florence-2 fine-tuned base checkpoint and its processor.

    The model is moved to the notebook-level ``device`` and cast to the
    notebook-level ``torch_dtype`` before being returned.

    Returns
    -------
    tuple
        ``(model, processor)`` for the same checkpoint.
    """
    checkpoint_id = "microsoft/Florence-2-base-ft"
    loaded_model = AutoModelForCausalLM.from_pretrained(checkpoint_id, trust_remote_code=True)
    loaded_model = loaded_model.to(device, dtype=torch_dtype)
    loaded_processor = AutoProcessor.from_pretrained(checkpoint_id, trust_remote_code=True)
    return loaded_model, loaded_processor

In [4]:
# Build the model/processor pair once and bind it to notebook-level names.
model, processor = load_model()

Florence2LanguageForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [5]:
def model_predict(model,processor, img, task_prompt, phrase='', max_area_frac=0.7):
    """Run one Florence-2 task prompt on an image and return the kept boxes.

    Parameters
    ----------
    model
        Generation model exposing ``generate``, ``device`` and ``dtype``.
    processor
        Processor for the same checkpoint; used to build the inputs, decode
        the generated ids and post-process the decoded text.
    img
        Image exposing ``width`` and ``height`` (e.g. a PIL image).
    task_prompt : str
        Task token such as ``"<OD>"``; also the key looked up in the
        post-processed answer.
    phrase : str, optional
        Text appended to ``task_prompt`` (used by grounding-style tasks).
    max_area_frac : float, optional
        A box is kept only if its area is strictly below this fraction of
        the image area. Defaults to 0.7.

    Returns
    -------
    pandas.DataFrame
        Columns ``x1, y1, x2, y2, object``, one row per kept box. Empty when
        the task produces no box dictionary (for example a plain caption).
    """
    # Use the model's own placement rather than notebook globals, so the
    # function does not depend on which cells were executed beforehand.
    inputs = processor(text=task_prompt + phrase, images=img, return_tensors="pt").to(
        model.device, model.dtype
    )

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=512,
            num_beams=3,
            do_sample=False
        )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task_prompt,
        image_size=(img.width, img.height))

    detections = parsed_answer.get(task_prompt)
    if isinstance(detections, dict):
        bboxes = detections.get("bboxes", [])
        labels = detections.get("labels")
        if labels is None:
            # Open-vocabulary detection reports its labels under
            # "bboxes_labels" instead of "labels".
            labels = detections.get("bboxes_labels", [])
    else:
        # Text-only tasks (e.g. captioning) return a string, not boxes.
        bboxes, labels = [], []

    area_limit = max_area_frac * (img.width * img.height)

    data = []
    for bbox, label in zip(bboxes, labels):
        x_min, y_min, x_max, y_max = map(int, bbox)
        bbox_area = (x_max - x_min) * (y_max - y_min)
        # Drop boxes that cover most of the frame.
        if bbox_area < area_limit:
            data.append([x_min, y_min, x_max, y_max, label])

    return pd.DataFrame(data, columns=["x1", "y1", "x2", "y2", "object"])

In [6]:
task = "Object Detection (OD)"
# task = "Phrase Grounding (PG)"
# task = "Image Captioning (IC)"
# task = "Open Vocabulary Detection (OVD)"

# Task name -> (prompt token, phrase appended to the prompt).
_TASK_SETTINGS = {
    "Object Detection (OD)": ("<OD>", ''),
    "Phrase Grounding (PG)": ("<CAPTION_TO_PHRASE_GROUNDING>", 'tomato'),
    "Image Captioning (IC)": ("<CAPTION>", ''),
}

# Any other task name falls back to open-vocabulary detection with no phrase
# (an alternative phrase for that mode would be 'Tomato').
task_prompt, phrase = _TASK_SETTINGS.get(task, ("<OPEN_VOCABULARY_DETECTION>", ''))

task , task_prompt , phrase

('Object Detection (OD)', '<OD>', '')

In [8]:
# Loading dataset images and labels
img_folder = 'Images'
labels_folder = 'labels'

# Pick the images up front so the progress bar total equals the number of
# images actually sent through the model (other directory entries are skipped).
image_files = sorted(name for name in os.listdir(img_folder) if name.endswith('.JPG'))

result = []

for file in tqdm(image_files):
    image_path = os.path.join(img_folder, file)
    # convert() returns a separate RGB image, so the source file can be
    # closed as soon as the block exits.
    with Image.open(image_path) as source_img:
        img = source_img.convert('RGB')

    # splitext swaps only the trailing extension; str.replace would rewrite
    # every '.JPG' occurrence inside the file name.
    label_name = os.path.splitext(file)[0] + '.txt'
    label_path = os.path.join(labels_folder, label_name)
    try:
        label_file = pd.read_csv(label_path, header = None,names = ['class', 'x', 'y', 'w', 'h'],sep=" ")
        gt_count = len(label_file)
    except pd.errors.EmptyDataError:
        # An empty label file is treated as "no annotated objects" instead
        # of aborting the whole inference run.
        gt_count = 0

    model_pred_bbox = model_predict(model,processor, img, task_prompt, phrase)

    # One row per image: file name, ground-truth box count, predicted box count.
    result.append([file, gt_count, len(model_pred_bbox)])

100%|██████████| 520/520 [11:23<00:00,  1.31s/it]


In [9]:
# Raw per-image rows collected by the inference loop.
result_df = pd.DataFrame(result)

# Order by the first column (the image file name) and renumber the rows;
# chaining keeps result_df itself untouched.
_sort_key = result_df.columns[0]
results_df = (
    result_df
    .sort_values(by=_sort_key)
    .reset_index(drop=True)
)
results_df.columns = ['image', 'GT', 'Prediction']
results_df

Unnamed: 0,image,GT,Prediction
0,0000.JPG,8,0
1,0001.JPG,15,12
2,0002.JPG,61,40
3,0003.JPG,12,10
4,0004.JPG,14,0
...,...,...,...
515,0515.JPG,31,30
516,0516.JPG,20,17
517,0517.JPG,21,16
518,0518.JPG,9,9


In [10]:
# Per-image counts: annotated boxes vs. boxes returned by the model.
y_true, y_pred = results_df['GT'], results_df['Prediction']

# Evaluate the three regression metrics on the same pair of series.
mae, mse, r2 = (
    metric_fn(y_true, y_pred)
    for metric_fn in (mean_absolute_error, mean_squared_error, r2_score)
)

print(f"MAE (Mean Absolute Error): {mae:.4f}")
print(f"MSE (Mean Squared Error): {mse:.4f}")
print(f"R2 Score: {r2:.4f}")

MAE (Mean Absolute Error): 5.3635
MSE (Mean Squared Error): 71.6942
R2 Score: 0.2789


In [11]:
# Saving results.
# The destination can be redirected with the OD_COUNT_CSV environment
# variable; when it is unset the original location is used unchanged.
output_csv = os.environ.get(
    "OD_COUNT_CSV",
    "/home/tanutiwari/Documents/coco/Tomato_count/Data_tomatoes/OD_count.csv",
)
results_df.to_csv(output_csv, index=False)