# SwinT Pre-Trained model

In [None]:
# # ✅ Step 1: Install dependencies
# !pip install -q torch torchvision transformers timm pillow matplotlib supervision

# # ✅ Step 2: Imports
# import torch
# from PIL import Image
# import matplotlib.pyplot as plt
# import matplotlib.patches as patches
# from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

# # ✅ Step 3: Load model + processor
# model_id = "IDEA-Research/grounding-dino-tiny"
# processor = AutoProcessor.from_pretrained(model_id)
# model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)

# # ✅ Step 4: Load image from Drive
# image_path = "/content/drive/MyDrive/CAPESTONE/my room ideas.jpeg"  # your path
# image = Image.open(image_path).convert("RGB")

# # ✅ Step 5: Prompts
# texts = ["Candle", "glasses", "watch", "Pen", "Books", "lamp" ]


# # ✅ Step 6: Preprocess + forward pass
# inputs = processor(images=image, text=texts, return_tensors="pt")
# with torch.no_grad():
#     outputs = model(**inputs)

# # ✅ Step 7: Post-process (use correct function)
# target_sizes = [image.size[::-1]]
# results = processor.post_process_grounded_object_detection(
#     outputs=outputs,
#     input_ids=inputs.input_ids,
#     target_sizes=target_sizes,
#     threshold=0.5
# )

# # ✅ Step 8: Plot detections
# fig, ax = plt.subplots(1, figsize=(10, 8))
# ax.imshow(image)

# for box, score, label in zip(results[0]["boxes"], results[0]["scores"], results[0]["labels"]):
#     x1, y1, x2, y2 = box.tolist()
#     w, h = x2 - x1, y2 - y1

#     ax.add_patch(patches.Rectangle((x1, y1), w, h,
#                                    linewidth=2, edgecolor="yellow", facecolor="none"))
#     ax.text(x1, y1, f"{label} ({score:.2f})",
#             color="yellow", fontsize=12, weight="bold")

# plt.axis("off")
# plt.show()


# SwinB Pre-Trained Model


In [None]:
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
from PIL import Image
import torch
import matplotlib.pyplot as plt
import matplotlib.patches as patches

!pip install torchmetrics -q
from torchmetrics.detection.mean_ap import MeanAveragePrecision


# Load the Grounding DINO "base" checkpoint together with its processor
model_id = "IDEA-Research/grounding-dino-base"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)

# Load image and convert it to RGB
image_path = "/content/drive/MyDrive/Rough Work/Data/ash-v0_MCllHY9M-unsplash.jpg"
image = Image.open(image_path).convert("RGB")

# Prompts (be descriptive for better results)
texts = ["wolf", "dog"]

# Preprocess
inputs = processor(images=image, text=texts, return_tensors="pt")

# Forward pass (inference only, so gradients are not tracked)
with torch.no_grad():
    outputs = model(**inputs)

# Postprocess (boxes back to original image size)
# PIL reports size as (W, H); the processor is given [[H, W]].
target_sizes = torch.tensor([[image.size[1], image.size[0]]])  # [[H, W]]
results = processor.post_process_grounded_object_detection(
    outputs=outputs,
    input_ids=inputs.input_ids,
    target_sizes=target_sizes,
    threshold=0.3
)


# Draw every detection on top of the image
fig, ax = plt.subplots(1, figsize=(10, 8))
ax.imshow(image)

for box, score, label in zip(results[0]["boxes"], results[0]["scores"], results[0]["labels"]):
    x1, y1, x2, y2 = box.tolist()
    w, h = x2 - x1, y2 - y1  # Rectangle takes (x, y, width, height), not two corners

    ax.add_patch(patches.Rectangle((x1, y1), w, h,
                                   linewidth=2, edgecolor="red", facecolor="none"))
    ax.text(x1, y1 - 10, f"{label} {score:.2f}", color="red",
            bbox=dict(facecolor="white", alpha=0.7, edgecolor="none"))

plt.axis("off")
plt.savefig("detections.png", dpi=300, bbox_inches="tight")  # save output
plt.show()


# Evaluation metric

metric = MeanAveragePrecision()

# Ground truth example (format: xyxy)
target = [
    dict(
        boxes=torch.tensor([[50, 40, 200, 180]]),   # true box
        labels=torch.tensor([0])                    # class id (index into `texts`)
    )
]

# Predictions from Grounding DINO.
# A class id has to identify the predicted *class*, not the detection: map
# each predicted label back to its position in `texts`, so that id 0 means
# "wolf" just like in the ground-truth example above. Labels that are not one
# of the prompts (e.g. a merged phrase) get a separate "unknown" id so they
# can never count as a correct class.
# NOTE(review): assumes results[0]["labels"] holds the predicted phrases as
# strings - confirm for the installed transformers version.
class_ids = {name.lower(): idx for idx, name in enumerate(texts)}
unknown_id = len(texts)
pred_label_ids = torch.tensor(
    [class_ids.get(str(lbl).strip().lower(), unknown_id) for lbl in results[0]["labels"]],
    dtype=torch.long,
)

preds = [
    dict(
        boxes=results[0]["boxes"],                  # predicted boxes
        scores=results[0]["scores"],                # confidence
        labels=pred_label_ids                       # class id per detection
    )
]

metric.update(preds, target)
final_map = metric.compute()
print(final_map)



# Model working on multiple images



In [1]:
import os
import torch
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
from torch import tensor
import numpy as np

# -----------------------------
# Step 1: Load pretrained model + processor
# -----------------------------
model_id = "IDEA-Research/grounding-dino-base"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)

# -----------------------------
# Step 2: Paths
# -----------------------------
images_dir = "/content/drive/MyDrive/GroundingDino--/archive (1)/Images"  # input image folder
results_dir = "/content/drive/MyDrive/GroundingDino--/results_groundingdino"

# Check the input folder *before* creating the output folder, so that a
# missing Drive mount fails early and nothing gets created under the mount
# point by os.makedirs().
if not os.path.isdir(images_dir):
    raise FileNotFoundError(
        f"Image folder not found: {images_dir!r}. "
        "Mount Google Drive and check the path before running this cell."
    )
os.makedirs(results_dir, exist_ok=True)

# -----------------------------
# Step 3: Prompts to test
# -----------------------------
prompts = ["phone", "pen", "book", "laptop", "cow","cat","table", "chair","cycle","men",
           "human","a boat", "plant", "dog", "puppy","sofa", "couch","car",
           "bottle", "Bus", "dinning table", "Sheep", "a train", "sports bike",
           "TV", "Television", "air plane", "flower" ]

# =============================
# Milestone 1: Select & preprocess images
# =============================
# os.listdir() returns entries in arbitrary order; sort them so the subset
# picked by the [:100] slice is the same on every run.
image_files = sorted(
    f for f in os.listdir(images_dir) if f.lower().endswith((".jpg", ".jpeg", ".png"))
)
image_files = image_files[:100]  # limit to at most 100 images

# =============================
# Milestone 2: Inference + Save Predictions
# =============================
all_preds = []
max_inline_previews = 5  # only "a few" results are shown inline; all are saved to disk

for img_index, img_name in enumerate(image_files):
    img_path = os.path.join(images_dir, img_name)
    # convert() returns a new image, so the file handle can be released right away
    with Image.open(img_path) as raw_image:
        image = raw_image.convert("RGB")

    print(f"ðŸ”Ž Processing: {img_name}")
    W, H = image.size
    target_sizes = tensor([[H, W]])  # PIL gives (W, H); the processor is given [[H, W]]

    # Run detection (inference only, so gradients are not tracked)
    inputs = processor(images=image, text=prompts, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    results = processor.post_process_grounded_object_detection(
        outputs=outputs,
        input_ids=inputs.input_ids,
        target_sizes=target_sizes,
        threshold=0.25
    )

    # One pass over the detections: record each one for the CSV and draw it
    fig, ax = plt.subplots(1, figsize=(10, 8))
    ax.imshow(image)
    for box, score, label in zip(results[0]["boxes"], results[0]["scores"], results[0]["labels"]):
        x1, y1, x2, y2 = box.tolist()
        all_preds.append({
            "image_id": img_name,
            "label": label,
            "score": float(score),
            "bbox": [x1, y1, x2, y2]
        })

        w, h = x2 - x1, y2 - y1  # Rectangle takes (x, y, width, height)
        ax.add_patch(patches.Rectangle((x1, y1), w, h, linewidth=2, edgecolor="red", facecolor="none"))
        ax.text(x1, y1 - 5, f"{label} ({score:.2f})", color="red", fontsize=10, weight="bold",
                bbox=dict(facecolor="white", alpha=0.7, edgecolor="none"))
    plt.axis("off")
    save_path = os.path.join(results_dir, f"detected_{img_name}")
    plt.savefig(save_path, bbox_inches="tight")
    plt.close(fig)  # free the figure; otherwise one figure per image stays in memory
    print(f"âœ… Saved visualization at: {save_path}")

    # ---- Show a few results inline ----
    if img_index < max_inline_previews:
        with Image.open(save_path) as preview:
            display(preview)


# Save predictions CSV
# Explicit columns keep the header present even when nothing was detected,
# so the file can always be read back with pd.read_csv().
pred_csv = os.path.join(results_dir, "predictions.csv")
pd.DataFrame(all_preds, columns=["image_id", "label", "score", "bbox"]).to_csv(pred_csv, index=False)
print(f"âœ… Saved all predictions to: {pred_csv}")


# =============================
# Milestone 3: Quantitative Evaluation (mAP)
# =============================
def compute_iou(box1, box2):
    """Return the intersection-over-union (IoU) of two axis-aligned boxes.

    Args:
        box1: First box as ``[x1, y1, x2, y2]`` (top-left, bottom-right).
        box2: Second box in the same format.

    Returns:
        ``intersection / union`` of the two box areas, or ``0`` when the
        union has no area (e.g. two degenerate boxes).
    """
    # Intersection rectangle: the larger of the two top-left corners and the
    # smaller of the two bottom-right corners.
    ix1 = max(box1[0], box2[0])
    iy1 = max(box1[1], box2[1])
    ix2 = min(box1[2], box2[2])
    iy2 = min(box1[3], box2[3])

    # Clamp at 0 so boxes that do not overlap give an empty intersection
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - inter
    return inter / union if union > 0 else 0

def evaluate_map(pred_csv, gt_csv, iou_thresh=0.5):
    """Compute mean average precision (mAP) of predictions against ground truth.

    Both CSV files need a ``label`` column and a ``bbox`` column holding
    ``"[x1, y1, x2, y2]"`` text. Two optional columns refine the result:

    * ``score`` (predictions): detections are ranked by descending score
      before true/false positives are accumulated.
    * ``image_id`` (both files): a prediction is only compared with ground
      truth boxes of the same image. If either file lacks the column, boxes
      are compared regardless of image.
      NOTE(review): assumes both files use identical image ids - confirm
      against the ground-truth file.

    Each ground-truth box can be matched by at most one prediction; further
    predictions on the same box count as false positives. Per-class AP is the
    area under the monotonically non-increasing envelope of the
    precision/recall curve; mAP is the mean over the classes present in the
    ground truth.

    Args:
        pred_csv: Path to the predictions CSV.
        gt_csv: Path to the ground-truth CSV.
        iou_thresh: Minimum IoU for a prediction to count as a true positive.

    Returns:
        The mAP as a float; ``0.0`` when the ground truth contains no classes.
    """
    preds = pd.read_csv(pred_csv)
    gts = pd.read_csv(gt_csv)

    # AP is defined over the confidence-ranked list of detections
    if "score" in preds.columns:
        preds = preds.sort_values("score", ascending=False, kind="stable")

    per_image = "image_id" in preds.columns and "image_id" in gts.columns

    aps = []
    for label in gts["label"].unique():
        gt_label = gts[gts["label"] == label]
        pred_label = preds[preds["label"] == label]

        # Parse the ground-truth boxes once instead of once per prediction
        gt_boxes = [_parse_bbox(cell) for cell in gt_label["bbox"]]
        if not gt_boxes:
            # e.g. a NaN label: nothing to match against, recall is undefined
            continue

        # Group ground-truth indices by image (a single group when ids are absent)
        gt_by_image = {}
        for gt_idx in range(len(gt_boxes)):
            key = gt_label["image_id"].iloc[gt_idx] if per_image else None
            gt_by_image.setdefault(key, []).append(gt_idx)
        matched = [False] * len(gt_boxes)

        tp = []
        for _, pred in pred_label.iterrows():
            pred_box = _parse_bbox(pred["bbox"])
            candidates = gt_by_image.get(pred["image_id"] if per_image else None, [])

            # Best still-unmatched ground-truth box for this prediction
            best_iou, best_idx = 0.0, None
            for gt_idx in candidates:
                if matched[gt_idx]:
                    continue
                iou = compute_iou(pred_box, gt_boxes[gt_idx])
                if iou > best_iou:
                    best_iou, best_idx = iou, gt_idx

            if best_idx is not None and best_iou >= iou_thresh:
                matched[best_idx] = True
                tp.append(1.0)
            else:
                tp.append(0.0)

        tp = np.asarray(tp, dtype=float)
        tp_cum = np.cumsum(tp)
        precision = tp_cum / np.arange(1, len(tp) + 1)
        recall = tp_cum / len(gt_boxes)

        # Pad the curve to recall 0 and 1, make precision monotonically
        # non-increasing, then sum the rectangles where recall changes.
        mrec = np.concatenate(([0.0], recall, [1.0]))
        mpre = np.concatenate(([0.0], precision, [0.0]))
        mpre = np.maximum.accumulate(mpre[::-1])[::-1]
        steps = np.where(mrec[1:] != mrec[:-1])[0]
        aps.append(float(np.sum((mrec[steps + 1] - mrec[steps]) * mpre[steps + 1])))

    # np.mean([]) would give NaN plus a RuntimeWarning
    mean_ap = float(np.mean(aps)) if aps else 0.0
    print(f"ðŸ“Š mAP@{iou_thresh}: {mean_ap:.4f}")
    return mean_ap


def _parse_bbox(cell):
    """Parse a ``"[x1, y1, x2, y2]"`` CSV cell into a list of floats."""
    return [float(value) for value in str(cell).strip("[]").split(",")]
plt.show()
# Run the evaluation only if a ground-truth CSV is actually available

gt_csv = "/content/drive/MyDrive/GroundingDino--/archive (1)/Images Data"

if os.path.isfile(gt_csv):
    mAP = evaluate_map(pred_csv, gt_csv)
    print(f"ðŸ“Š mAP@0.5: {mAP:.4f}")
else:
    # Keep the name defined so later cells can test for a skipped evaluation
    mAP = None
    print(f"Ground-truth CSV not found at {gt_csv!r}; skipping mAP evaluation.")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/457 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/933M [00:00<?, ?B/s]

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/GroundingDino--/archive (1)/Images'