In [1]:
!pip install git+https://github.com/autodistill/autodistill-grounded-sam-2 -q
!pip install supervision opencv-python-headless ipywidgets -q
!pip install git+https://github.com/IDEA-Research/GroundingDINO.git

  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting git+https://github.com/IDEA-Research/GroundingDINO.git
  Cloning https://github.com/IDEA-Research/GroundingDINO.git to /tmp/pip-req-build-2rka4wx1
  Running command git clone --filter=blob:none --quiet https://github.com/IDEA-Research/GroundingDINO.git /tmp/pip-req-build-2rka4wx1
  Resolved https://github.com/IDEA-Research/GroundingDINO.git to commit 856dde20aee659246248e20734ef9ba5214f5e44
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [2]:
import os
import cv2
import numpy as np
import random
import torch
import supervision as sv
import matplotlib.pyplot as plt
from io import BytesIO
from google.colab import files
from IPython.display import display

from autodistill_grounded_sam_2 import GroundedSAM2
from autodistill.detection import CaptionOntology
from autodistill.utils import plot

Importing from timm.models.layers is deprecated, please import via timm.layers


In [3]:
# Mapping handed to CaptionOntology.
# NOTE(review): in autodistill the dict KEY is presumably the caption sent to
# the detector and the VALUE the resulting class label - confirm against the
# autodistill documentation before adding or reordering entries.
ontology_dict = {}
ontology_dict["VK logo"] = "логотип социальной сети VK, выполненный в характерных синих и белых тонах, с узнаваемой стилизацией букв 'VK'"

# Further logos can be registered the same way, for example Telegram:
# ontology_dict["Telegram logo"] = "логотип мессенджера Telegram, представляющий собой стилизованный белый самолет на синем фоне"

ontology = CaptionOntology(ontology_dict)

In [4]:
# Detector configuration. The box threshold of 0.25 was chosen empirically.
grounded_sam_kwargs = dict(
    ontology=ontology,
    model="Grounding DINO",
    grounding_dino_box_threshold=0.25,
)
base_model = GroundedSAM2(**grounded_sam_kwargs)

trying to load grounding dino directly


torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3595.)


final text_encoder_type: bert-base-uncased



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allow

In [5]:
def process_image(image_path, detection_threshold=0.3, clip_threshold=None):
    """
    Detect logo candidates in an image and verify them against reference logos.

    Steps:
      1. Embed every readable file of the ``vk_reference`` folder with
         ``get_clip_embedding``.
      2. Run ``base_model.predict`` on ``image_path``, apply NMS and keep the
         detections whose confidence exceeds ``detection_threshold``.
      3. Crop each candidate box, embed the crop and take the largest dot
         product with the reference embeddings (the embeddings are
         L2-normalised by ``get_clip_embedding``, so this is a cosine
         similarity). A box is verified when that value exceeds
         ``clip_threshold``.

    Args:
        image_path: Path of the image file to analyse.
        detection_threshold: Minimum detector confidence for a candidate.
        clip_threshold: Minimum similarity for a candidate to be verified.
            ``None`` (default) reuses ``detection_threshold``, which is the
            historical behaviour of this function.

    Returns:
        Tuple ``(is_logo_found, annotated_image, log, candidates)``:
        whether at least one box was verified; a copy of the image with the
        verified boxes drawn (or the unmodified image when none was verified);
        a human-readable log string; the list of ``[x1, y1, x2, y2]`` boxes of
        all candidates, clamped to the image bounds.

    Raises:
        ValueError: If the image cannot be read or the reference folder holds
            no readable image.
    """
    # Local import: the notebook only imports `glob` in a later cell, so the
    # function must not rely on that cell having been executed.
    import glob

    if clip_threshold is None:
        clip_threshold = detection_threshold

    # Load the reference embeddings.
    reference_folder = "vk_reference"  # folder with sample logos
    reference_embeddings = []
    for ref_path in glob.glob(os.path.join(reference_folder, "*")):
        ref_image = cv2.imread(ref_path)
        if ref_image is None:
            # cv2.imread returns None for sub-folders and non-image files.
            continue
        reference_embeddings.append(get_clip_embedding(ref_image))

    if not reference_embeddings:
        raise ValueError(
            f"В папке '{reference_folder}' нет читаемых изображений-образцов."
        )
    reference_embeddings = np.vstack(reference_embeddings)  # one row per reference

    # Read the image up front so an unreadable file fails with a clear error.
    image_orig = cv2.imread(image_path)
    if image_orig is None:
        raise ValueError(f"Не удалось прочитать изображение: {image_path}")
    img_h, img_w = image_orig.shape[:2]

    # Candidate detection.
    results = base_model.predict(image_path).with_nms()
    results = results[results.confidence > detection_threshold]

    log = f"Найдено кандидатов (после NMS, порог {detection_threshold}): {len(results)}\n"

    candidates = []
    verified_boxes = []

    for box in results:
        coords = box[0]  # first element of each detection record is the box

        if len(coords) < 4:
            continue

        try:
            x1, y1, x2, y2 = [int(float(c)) for c in coords[:4]]
        except Exception as e:
            log += f"Ошибка при конвертации координат {coords[:4]}: {e}\n"
            continue

        # Clamp to the image: a negative index would otherwise wrap around
        # when slicing and yield a wrong (or empty) crop.
        x1, x2 = min(max(x1, 0), img_w), min(max(x2, 0), img_w)
        y1, y2 = min(max(y1, 0), img_h), min(max(y2, 0), img_h)

        # Every detected candidate is reported, verified or not.
        candidates.append([x1, y1, x2, y2])

        candidate_crop = image_orig[y1:y2, x1:x2]
        if candidate_crop.size == 0:
            continue

        # Similarity between the candidate crop and every reference logo.
        candidate_emb = get_clip_embedding(candidate_crop)
        sims = np.dot(reference_embeddings, candidate_emb.T).squeeze()
        max_sim = np.max(sims)
        log += f"Кандидат [{x1}, {y1}, {x2}, {y2}] - макс. сходство: {max_sim:.3f}\n"
        if max_sim > clip_threshold:
            verified_boxes.append([x1, y1, x2, y2])

    is_logo_found = len(verified_boxes) > 0
    log += f"Верифицированных кандидатов: {len(verified_boxes)}\n"

    # Draw the verified boxes on a copy of the image.
    if is_logo_found:
        detections = sv.Detections(xyxy=np.array(verified_boxes))
        annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
        annotated_image = annotator.annotate(image_orig.copy(), detections=detections)
    else:
        annotated_image = image_orig
        log += "Не было найдено логотипов.\n"

    return is_logo_found, annotated_image, log, candidates

In [6]:
import ipywidgets as widgets
from IPython.display import clear_output
import glob

# Slider that sets the confidence threshold of the detection stage.
ground_slider = widgets.FloatSlider(
    description="Ground thresh:",
    value=0.3,
    min=0.1,
    max=0.9,
    step=0.05,
    continuous_update=False,
)

# Upload control accepting one or more image files.
upload_button = widgets.FileUpload(
    description="Загрузить изображения",
    accept='image/*',
    multiple=True,
)

# Button that starts the detection stage.
process_button = widgets.Button(
    button_style="success",
    description="Запустить обработку",
)

# Shared output pane for logs and figures.
output_area = widgets.Output()

# Candidate boxes found by the detection stage; read later by the CLIP stage.
candidates = list()

def process_uploaded_images(b):
    """
    Button callback: run the detection stage on every uploaded image.

    Each upload is written to ``temp_<filename>``, passed to ``process_image``
    with the current slider threshold, and its log and annotated image are
    shown in ``output_area``. The globals ``candidates`` and ``image_orig``
    are overwritten for every image that yields candidates, so after the call
    they hold the data of the last such image.

    Args:
        b: The clicked button (supplied by ipywidgets, unused).
    """
    global candidates, image_orig
    with output_area:
        clear_output()
        uploads = upload_button.value
        if len(uploads) == 0:
            print("Пожалуйста, загрузите хотя бы одно изображение.")
            return

        # ipywidgets 7 exposes uploads as {filename: {"content": ...}},
        # ipywidgets 8 as a tuple of {"name": ..., "content": ...} records.
        if isinstance(uploads, dict):
            named_contents = [(name, item['content']) for name, item in uploads.items()]
        else:
            named_contents = [(item['name'], item['content']) for item in uploads]

        for filename, content in named_contents:
            temp_path = f"temp_{filename}"
            with open(temp_path, "wb") as f:
                f.write(content)
            print(f"Обработка изображения: {filename}")
            try:
                is_logo_found, ann_img, log, detected_candidates = process_image(temp_path, ground_slider.value)
            except ValueError as e:
                print(f"Ошибка обработки изображения: {e}")
                continue

            if len(detected_candidates) == 0:
                print("Не было найдено кандидатов на изображении.")
                # Move on to the next upload; returning here would silently
                # drop every remaining image of the batch.
                continue

            # Keep the result available for the CLIP verification stage.
            image_orig = cv2.imread(temp_path)
            candidates = detected_candidates

            print(log)
            plt.figure(figsize=(6,6))
            ann_img_rgb = cv2.cvtColor(ann_img, cv2.COLOR_BGR2RGB)
            plt.imshow(ann_img_rgb)
            plt.axis("off")
            plt.show()


# Hook the detection callback to its button and render the detection controls.
process_button.on_click(process_uploaded_images)

detection_controls = [ground_slider, upload_button, process_button, output_area]
display(widgets.VBox(detection_controls))

VBox(children=(FloatSlider(value=0.3, continuous_update=False, description='Ground thresh:', max=0.9, min=0.1,…

In [7]:
import ipywidgets as widgets
from IPython.display import clear_output
import cv2
import numpy as np
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import supervision as sv
import matplotlib.pyplot as plt

# Slider that sets the CLIP similarity threshold.
clip_slider = widgets.FloatSlider(
    description="CLIP thresh:",
    value=0.75,
    min=0.1,
    max=1.0,
    step=0.05,
    continuous_update=False,
)

# Pick the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained CLIP model and its processor, then put the model on the
# selected device in inference mode.
clip_checkpoint = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(clip_checkpoint)
clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)
clip_model.to(device)
clip_model.eval()

# Embedding helper.
def get_clip_embedding(image):
    """
    Compute the L2-normalised CLIP image embedding of an OpenCV image.

    Args:
        image: Image array in BGR channel order (it is converted to RGB
            before being handed to the CLIP processor).

    Returns:
        NumPy array with the image features, scaled to unit norm along the
        last axis.
    """
    rgb_array = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    model_inputs = clip_processor(
        images=Image.fromarray(rgb_array),
        return_tensors="pt",
    ).to(device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        features = clip_model.get_image_features(**model_inputs)
    unit_features = features / features.norm(dim=-1, keepdim=True)
    return unit_features.cpu().numpy()

# Build the matrix of reference-logo embeddings used by the CLIP verification.
reference_folder = "vk_reference"
ref_paths = glob.glob(os.path.join(reference_folder, "*"))
reference_embeddings = []

for ref_path in ref_paths:
    ref_image = cv2.imread(ref_path)
    if ref_image is None:
        # cv2.imread returns None for sub-folders and non-image files;
        # skip them instead of crashing inside get_clip_embedding.
        print(f"Пропущен файл, который не удалось прочитать как изображение: {ref_path}")
        continue
    reference_embeddings.append(get_clip_embedding(ref_image))

if not reference_embeddings:
    # Fail with an actionable message rather than np.vstack's generic
    # complaint about an empty sequence.
    raise FileNotFoundError(
        f"В папке '{reference_folder}' нет читаемых изображений-образцов."
    )

reference_embeddings = np.vstack(reference_embeddings)  # one row per reference image

# CLIP verification of detected candidates.
def process_image_with_clip(image, candidates, clip_threshold=0.3):
    """
    Keep only the candidate boxes whose crop resembles a reference logo.

    Every box is cropped from ``image`` and embedded with
    ``get_clip_embedding``; the largest dot product with the rows of the
    global ``reference_embeddings`` is compared against ``clip_threshold``.
    Boxes producing an empty crop are skipped silently.

    Args:
        image: Image array the boxes refer to (BGR, as used by OpenCV).
        candidates: Iterable of ``[x1, y1, x2, y2]`` integer boxes.
        clip_threshold: Similarity a box must exceed to be accepted.

    Returns:
        Tuple ``(is_logo_found, annotated_image, log)``: whether any box was
        accepted; a copy of ``image`` with the accepted boxes drawn, or
        ``image`` itself when none was accepted; the log text.
    """
    accepted = []
    log_parts = []

    for x1, y1, x2, y2 in candidates:
        crop = image[y1:y2, x1:x2]
        if crop.size == 0:
            continue
        crop_embedding = get_clip_embedding(crop)
        similarities = np.dot(reference_embeddings, crop_embedding.T).squeeze()
        best = np.max(similarities)
        log_parts.append(f"Кандидат [{x1}, {y1}, {x2}, {y2}] - макс. сходство: {best:.3f}\n")
        if best > clip_threshold:
            accepted.append([x1, y1, x2, y2])

    log_parts.append(f"Верифицированных кандидатов: {len(accepted)}\n")

    if not accepted:
        # Nothing passed verification: hand back the image untouched.
        log_parts.append("Не было найдено логотипов.\n")
        return False, image, "".join(log_parts)

    # Draw the accepted boxes on a copy of the image.
    box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
    drawn = box_annotator.annotate(
        image.copy(),
        detections=sv.Detections(xyxy=np.array(accepted)),
    )
    return True, drawn, "".join(log_parts)

# CLIP stage UI callback.
def process_uploaded_images_with_clip(b):
    """
    Button callback: run CLIP verification on the stored detection result.

    The verification works on the globals ``image_orig`` and ``candidates``
    that the detection callback stores, using the current ``clip_slider``
    value as threshold. If no detection result has been stored yet, a hint is
    printed and nothing else happens.

    Args:
        b: The clicked button (supplied by ipywidgets, unused).
    """
    with output_area:
        clear_output()
        uploads = upload_button.value
        if len(uploads) == 0:
            print("Пожалуйста, загрузите хотя бы одно изображение.")
            return

        # `image_orig` only exists once the detection stage has stored a
        # result; look it up defensively instead of failing with a NameError.
        stored_image = globals().get("image_orig")
        if stored_image is None:
            print("Сначала запустите детекцию: нет сохранённых кандидатов для проверки CLIP.")
            return

        # ipywidgets 7 exposes uploads as {filename: {"content": ...}},
        # ipywidgets 8 as a tuple of {"name": ..., "content": ...} records.
        if isinstance(uploads, dict):
            named_contents = [(name, item['content']) for name, item in uploads.items()]
        else:
            named_contents = [(item['name'], item['content']) for item in uploads]

        for filename, content in named_contents:
            temp_path = f"temp_{filename}"
            with open(temp_path, "wb") as f:
                f.write(content)
            print(f"Обработка изображения: {filename}")

            # NOTE(review): the stored image/candidates of the detection stage
            # are verified here, not the file just written, so every upload
            # shows the same result. Storing per-file candidates in the
            # detection callback would be needed to change that.
            is_logo_found, ann_img, log = process_image_with_clip(stored_image, candidates, clip_threshold=clip_slider.value)
            print(log)
            plt.figure(figsize=(6,6))
            ann_img_rgb = cv2.cvtColor(ann_img, cv2.COLOR_BGR2RGB)
            plt.imshow(ann_img_rgb)
            plt.axis("off")
            plt.show()

# Button that starts the CLIP verification stage, plus the stage's controls.
process_button_clip = widgets.Button(
    button_style="info",
    description="Запустить обработку с CLIP",
)
process_button_clip.on_click(process_uploaded_images_with_clip)

clip_controls = [clip_slider, upload_button, process_button_clip, output_area]
display(widgets.VBox(clip_controls))

VBox(children=(FloatSlider(value=0.75, continuous_update=False, description='CLIP thresh:', max=1.0, min=0.1, …