In [None]:
!pip install transformers 
!pip install torchvision
!pip install torch
!pip install pycocotools 
!pip install tqdm 
!pip install Pillow

In [None]:
# Importieren der notwendigen Module
import os
import torch
import json
from tqdm import tqdm
from PIL import Image
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from transformers import AutoProcessor, Owlv2ForObjectDetection

In [None]:
# Load the processor and the detection model from the same pretrained checkpoint.
checkpoint_name = "google/owlv2-large-patch14-ensemble"
processor = AutoProcessor.from_pretrained(checkpoint_name)
model = Owlv2ForObjectDetection.from_pretrained(checkpoint_name)

In [None]:
# Adjust these paths before running the notebook.
# Both values are blank placeholders and have to be replaced:
#   image_folder    - directory that is listed (os.listdir) to collect the images
#   annotation_file - annotation file passed to pycocotools' COCO(...) as ground truth
image_folder = "   " 
annotation_file = "   " 

In [None]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to that device. Assigning the result (Module.to returns the module itself)
# keeps the cell from ending in a bare expression, which would print the entire module
# repr as cell output.
model = model.to(device)

In [None]:
# Collect the image files.
# - Extensions are compared case-insensitively so that files such as "photo.JPG" are not
#   silently left out.
# - The listing is sorted because os.listdir() returns entries in arbitrary order; sorting
#   makes the processing order (and the order of the written results) reproducible.
image_extensions = (".jpg", ".jpeg", ".png")
image_paths = [
    os.path.join(image_folder, entry)
    for entry in sorted(os.listdir(image_folder))
    if entry.lower().endswith(image_extensions)
]

# Load the ground-truth annotations.
coco_gt = COCO(annotation_file)

# Map every annotated file name to its image id.
filename_to_image_id = {img['file_name']: img['id'] for img in coco_gt.dataset['images']}

In [None]:
# Prepare inference.
# Accumulator for the raw detections; the inference loop appends one dict per image.
results_owlv2 = []
# Text queries that are passed to the processor together with each image.
text_prompts = ["animal"]

In [None]:
# Run the detector on every collected image and gather the raw detections.

# Start from an empty list so that re-running this cell does not append a second copy of
# every detection to the results of an earlier run.
results_owlv2 = []

for image_path in tqdm(image_paths, desc="Verarbeitung von Bildern"):
    image_filename = os.path.basename(image_path)
    image_id = filename_to_image_id.get(image_filename)

    # Images without a ground-truth entry cannot be evaluated; skip them before decoding.
    if image_id is None:
        print(f"Bild-ID für {image_filename} nicht gefunden. Überspringe dieses Bild.")
        continue

    # The context manager closes the underlying file; convert() loads the pixel data and
    # returns a new image, so `image` stays usable after the file is closed.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")

    # Preprocessing
    inputs = processor(images=image, text=text_prompts, return_tensors="pt").to(device)

    # Inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Post-processing.
    # The OWLv2 image processor pads each image to a square (padding on the bottom/right)
    # before resizing, so the predicted boxes are normalised to that padded square. Scaling
    # both axes by the longer image side therefore yields pixel coordinates in the original
    # image. Releases of transformers that already compensate for the padding internally
    # produce the same result for a square target size.
    padded_side = max(image.size)
    target_sizes = torch.tensor([[padded_side, padded_side]], device=device)
    results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)[0]

    # Keep boxes, scores and labels as plain Python lists.
    results_owlv2.append({
        "image_id": image_id,
        "boxes": results["boxes"].tolist(),
        "scores": results["scores"].tolist(),
        "labels": results["labels"].tolist(),
    })

In [None]:
# Convert the collected detections into COCO result records.
# NOTE: a function with this exact name is defined again in a later cell; when the notebook
# is run top to bottom, that later definition replaces this one.
def convert_to_coco_format_owlv2(results, ground_truth_category_id):
    """Flatten per-image detections into a list of COCO-style result dicts.

    Args:
        results: Iterable of dicts with the keys "image_id", "boxes" and "scores".
            Each box is a 4-element sequence (x_min, y_min, x_max, y_max).
        ground_truth_category_id: Value written to "category_id" of every record.

    Returns:
        A list with one dict per (box, score) pair, holding "image_id", "category_id",
        "bbox" as [x_min, y_min, width, height] and "score".
    """
    records = []
    for detection in results:
        img_id = detection["image_id"]
        records.extend(
            {
                "image_id": img_id,
                "category_id": ground_truth_category_id,
                # Corner format -> [x, y, width, height].
                "bbox": [left, top, right - left, bottom - top],
                "score": confidence,
            }
            for (left, top, right, bottom), confidence in zip(detection["boxes"], detection["scores"])
        )
    return records

In [None]:
def convert_to_coco_format_owlv2(results, ground_truth_category_id):
    """Convert per-image detections into a flat list of COCO result records.

    Every detection is assigned the same category id, so the predicted label indices are
    not needed and a "labels" entry in the input is ignored.

    Args:
        results: Iterable of dicts with the keys "image_id", "boxes" and "scores".
            Each box is a 4-element sequence (x_min, y_min, x_max, y_max).
        ground_truth_category_id: Category id written to every record.

    Returns:
        A list with one dict per (box, score) pair, holding "image_id", "category_id",
        "bbox" and "score".
    """
    coco_results = []
    for result in results:
        image_id = result["image_id"]
        for box, score in zip(result["boxes"], result["scores"]):
            x_min, y_min, x_max, y_max = box
            coco_results.append({
                "image_id": image_id,
                "category_id": ground_truth_category_id,
                # COCO expects [x, y, width, height] rather than two corner points.
                "bbox": [x_min, y_min, x_max - x_min, y_max - y_min],
                "score": score,
            })
    return coco_results

In [None]:
# Convert the detections and write them to disk as JSON.
# The category id is hard-coded; per the original author's note it corresponds to the
# ground-truth annotations.
animal_category_id = 1
coco_results_owlv2 = convert_to_coco_format_owlv2(
    results=results_owlv2,
    ground_truth_category_id=animal_category_id,
)

with open("owlv2_results.json", "w") as results_file:
    results_file.write(json.dumps(coco_results_owlv2))


In [None]:
# Evaluate the detections against the ground truth (bounding-box metrics).
if not coco_results_owlv2:
    # pycocotools' loadRes() inspects the first result record and therefore fails on an
    # empty result list; report the situation instead of raising.
    print("Keine Detektionen vorhanden. Überspringe die COCO-Evaluation.")
else:
    coco_dt_owlv2 = coco_gt.loadRes("owlv2_results.json")
    coco_eval_owlv2 = COCOeval(coco_gt, coco_dt_owlv2, iouType='bbox')
    coco_eval_owlv2.evaluate()
    coco_eval_owlv2.accumulate()
    coco_eval_owlv2.summarize()