In [15]:
import torch
import pandas as pd
from PIL import Image, ImageDraw
from transformers import AutoProcessor, AutoModelForCausalLM
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import numpy as np
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [16]:
# Device settings: run on the first CUDA GPU in half precision when one is
# available, otherwise fall back to the CPU in full precision.
device, torch_dtype = (
    ("cuda:0", torch.float16)
    if torch.cuda.is_available()
    else ("cpu", torch.float32)
)
device

'cuda:0'

In [17]:
# Loading model
def load_model(checkpoint="microsoft/Florence-2-base-ft"):
    """Load a causal-LM checkpoint and its processor from the Hugging Face hub.

    Args:
        checkpoint: Hub id or local path of the checkpoint to load. Defaults
            to the Florence-2 base fine-tuned checkpoint used by this notebook.

    Returns:
        ``(model, processor)``. The model has been moved to the notebook-level
        ``device`` and cast to the notebook-level ``torch_dtype``.
    """
    # NOTE(review): trust_remote_code=True executes Python code shipped with
    # the checkpoint repository; only point this at checkpoints you trust.
    model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)
    model = model.to(device, dtype=torch_dtype)
    processor = AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)
    return model, processor

In [18]:
# Load the model and processor once; the prediction cells below reuse both.
model, processor = load_model()

In [19]:
def model_predict(model,processor, img, task_prompt, phrase='', max_area_frac=0.7):
    """Run one Florence-2 style task on an image and return its boxes as a table.

    Args:
        model: Generation model exposing ``generate``, ``device`` and ``dtype``.
        processor: Matching processor (callable, with ``batch_decode`` and
            ``post_process_generation``).
        img: Image object exposing ``width`` and ``height`` (e.g. a PIL image).
        task_prompt: Task token such as ``"<OD>"`` or
            ``"<CAPTION_TO_PHRASE_GROUNDING>"``.
        phrase: Extra text appended to the task token (the grounding phrase).
        max_area_frac: Boxes whose area is not strictly below this fraction of
            the image area are dropped (filters out "whole image" detections).

    Returns:
        ``pandas.DataFrame`` with columns ``x1, y1, x2, y2, object`` — one row
        per kept box, coordinates truncated to ``int``. The frame is empty when
        the task produces no box output (e.g. plain captioning).
    """
    # Follow the model's own placement instead of notebook globals so the
    # inputs always land on the same device / precision as the weights.
    inputs = processor(text=task_prompt+phrase, images=img, return_tensors="pt").to(model.device, model.dtype)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=512,
            num_beams=3,
            do_sample=False
        )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

    parsed_answer = processor.post_process_generation(
            generated_text,
            task=task_prompt,
            image_size=(img.width, img.height))

    # The parsed answer is keyed by the task that was run, so look the result
    # up under task_prompt rather than a fixed phrase-grounding key.
    detections = parsed_answer.get(task_prompt)
    if not isinstance(detections, dict):
        # Text-only tasks (or a missing key) carry no boxes.
        detections = {}
    bboxes = detections.get("bboxes", [])
    labels = detections.get("labels", [])

    data = []
    area_limit = max_area_frac * (img.width * img.height)

    for bbox, label in zip(bboxes, labels):
        x_min, y_min, x_max, y_max = map(int, bbox)
        bbox_area = (x_max - x_min) * (y_max - y_min)
        if bbox_area < area_limit:
            data.append([x_min, y_min, x_max, y_max, label])

    return pd.DataFrame(data, columns=["x1", "y1", "x2", "y2", "object"])

In [20]:
# task = "Object Detection (OD)"
task = "Phrase Grounding (PG)"
# task = "Image Captioning (IC)"

# Map each task to its (prompt token, grounding phrase) pair; any task not
# listed here falls back to plain captioning with no phrase.
task_prompt, phrase = {
    "Object Detection (OD)": ("<OD>", ''),
    "Phrase Grounding (PG)": ("<CAPTION_TO_PHRASE_GROUNDING>", 'Red and green tomatoes'),
}.get(task, ("<CAPTION>", ''))

task , task_prompt

('Phrase Grounding (PG)', '<CAPTION_TO_PHRASE_GROUNDING>')

In [21]:
# Loading dataset images and labels.
# For every .JPG image, count the ground-truth label rows and the boxes the
# model predicts, and collect [file name, GT count, predicted count].
img_folder = 'Images'
labels_folder = 'labels'

result = []

# Sorted so the processing order is reproducible (os.listdir order is arbitrary).
for file in tqdm(sorted(os.listdir(img_folder))):
    if not file.endswith('.JPG'):
        continue

    image_path = os.path.join(img_folder, file)
    # Context manager releases the file handle once the pixels are converted.
    with Image.open(image_path) as raw_img:
        img = raw_img.convert('RGB')

    # Swap only the extension; str.replace would also rewrite a '.JPG' that
    # happens to appear earlier in the file name.
    label_name = os.path.splitext(file)[0] + '.txt'
    label_path = os.path.join(labels_folder, label_name)
    try:
        label_file = pd.read_csv(label_path, header = None,names = ['class', 'x', 'y', 'w', 'h'],sep=" ")
        gt_count = len(label_file)
    except pd.errors.EmptyDataError:
        # presumably an empty label file means "no annotated objects" — counted
        # as 0 instead of aborting the whole run; verify against the dataset
        gt_count = 0

    model_pred_bbox = model_predict(model,processor, img, task_prompt, phrase)

    result.append([file,gt_count,len(model_pred_bbox)])

100%|██████████| 520/520 [12:51<00:00,  1.48s/it]


In [22]:
# Raw per-image rows, then a copy ordered by file name with readable headers.
result_df = pd.DataFrame(result)
results_df = (
    result_df
    .sort_values(by=result_df.columns[0])
    .reset_index(drop=True)
)
results_df.columns = ['image', 'GT', 'Prediction']
results_df

Unnamed: 0,image,GT,Prediction
0,0000.JPG,8,0
1,0001.JPG,15,1
2,0002.JPG,61,0
3,0003.JPG,12,1
4,0004.JPG,14,0
...,...,...,...
515,0515.JPG,31,2
516,0516.JPG,20,4
517,0517.JPG,21,1
518,0518.JPG,9,1


In [None]:
# Compare predicted box counts with ground-truth label counts per image.
y_true, y_pred = results_df['GT'], results_df['Prediction']

mae, mse, r2 = (
    metric(y_true, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(f"MAE (Mean Absolute Error): {mae:.4f}")
print(f"MSE (Mean Squared Error): {mse:.4f}")
print(f"R2 Score: {r2:.4f}")

MAE (Mean Absolute Error): 14.0577
MSE (Mean Squared Error): 304.6154
R2 Score: -2.0639


In [24]:
# Saving results: write the per-image count table to CSV without the index.
# NOTE(review): this is an absolute, machine-specific path, and the file name
# repeats the grounding phrase by hand — confirm both before re-running.
results_csv_path = "/home/tanutiwari/Documents/coco/Tomato_count/Data_tomatoes/results(phrase-Red and green tomatoes).csv"
results_df.to_csv(results_csv_path, index=False)