We will use a zero-shot baseline: [OVSeg](https://github.com/facebookresearch/ov-seg), with [SAM](https://github.com/facebookresearch/segment-anything) as the mask proposal generator in place of the MaskFormer used in the original setup.

Let's download the models and install the necessary packages.

In [None]:
!git lfs install
!git clone https://huggingface.co/spaces/facebook/ov-seg 

In [None]:
cd ov-seg

In [None]:
%%capture
!pip install --upgrade pip
!pip -q install -r requirements.txt
!pip -q install typing-extensions --upgrade
!pip -q install scipy --upgrade
!pip -q install gradio timm ftfy wandb open_clip_torch==1.3.0 git+https://github.com/facebookresearch/segment-anything.git
!pip -q install git+https://github.com/facebookresearch/detectron2.git

In [None]:
# import multiprocessing as mp
import requests
from tqdm.auto import tqdm

import numpy as np
import pandas as pd
from PIL import Image
from io import BytesIO

import gradio as gr

from detectron2.structures import BitMasks
from open_vocab_seg.modeling.clip_adapter.adapter import PIXEL_MEAN, PIXEL_STD
from open_vocab_seg.modeling.clip_adapter.utils import crop_with_mask
import cv2
import torch
import open_clip
from segment_anything import SamAutomaticMaskGenerator, sam_model_registry 
from torch.nn import functional as F

import warnings
warnings.filterwarnings("ignore")

In [None]:
def get_iou(bb1, bb2):
    """
    Calculate the Intersection over Union (IoU) of two axis-aligned boxes.

    Adapted from https://stackoverflow.com/a/42874377

    Parameters
    ----------
    bb1 : dict
        Keys: {'x1', 'x2', 'y1', 'y2'}
        The (x1, y1) position is at the top left corner,
        the (x2, y2) position is at the bottom right corner
    bb2 : dict
        Keys: {'x1', 'x2', 'y1', 'y2'}
        The (x1, y1) position is at the top left corner,
        the (x2, y2) position is at the bottom right corner

    Returns
    -------
    float
        in [0, 1]. Boxes that do not overlap, or whose union has zero area
        (both boxes degenerate), yield 0.0.

    Raises
    ------
    ValueError
        If a box has its corners swapped (x1 > x2 or y1 > y2).
    """
    # Validate with real exceptions: `assert` is stripped under `python -O`.
    # Zero-width/height boxes are accepted on purpose, because a box derived
    # from a one-pixel-wide mask has x1 == x2.
    for label, box in (('bb1', bb1), ('bb2', bb2)):
        if box['x1'] > box['x2'] or box['y1'] > box['y2']:
            raise ValueError(f'{label} has swapped corners: {box}')

    # determine the coordinates of the intersection rectangle
    x_left = max(bb1['x1'], bb2['x1'])
    y_top = max(bb1['y1'], bb2['y1'])
    x_right = min(bb1['x2'], bb2['x2'])
    y_bottom = min(bb1['y2'], bb2['y2'])

    if x_right < x_left or y_bottom < y_top:
        return 0.0

    # The intersection of two axis-aligned bounding boxes is always an
    # axis-aligned bounding box
    intersection_area = (x_right - x_left) * (y_bottom - y_top)

    # compute the area of both AABBs
    bb1_area = (bb1['x2'] - bb1['x1']) * (bb1['y2'] - bb1['y1'])
    bb2_area = (bb2['x2'] - bb2['x1']) * (bb2['y2'] - bb2['y1'])

    # union = sum of both areas minus the intersection counted twice
    union_area = float(bb1_area + bb2_area - intersection_area)
    if union_area <= 0.0:
        # Both boxes are degenerate; avoid dividing by zero.
        return 0.0

    return intersection_area / union_area


def get_image_array(url, timeout=30.0):
    """
    Download an image and decode it into a NumPy array with OpenCV.

    Parameters
    ----------
    url : str
        Address of the encoded image (any format ``cv2.imdecode`` supports).
    timeout : float, optional
        Seconds to wait for the server before giving up, so that one
        unresponsive host cannot stall a long batch run.

    Returns
    -------
    numpy.ndarray
        The decoded image, kept exactly as stored in the file
        (``cv2.IMREAD_UNCHANGED``), so channel count and bit depth are not
        normalised here.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an
        error status.
    ValueError
        If the downloaded bytes cannot be decoded as an image.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of handing an HTML error page to the decoder.
    response.raise_for_status()

    encoded = np.frombuffer(response.content, dtype=np.uint8)
    img = cv2.imdecode(encoded, cv2.IMREAD_UNCHANGED)
    if img is None:
        # imdecode reports failure by returning None rather than raising.
        raise ValueError(f'Could not decode an image from {url}')
    return img


def get_models(sam_path, ovsegclip_path, model_type):
    """
    Build the SAM mask generator and the CLIP model, both moved to CUDA.

    Parameters
    ----------
    sam_path : str
        Path of the SAM checkpoint file.
    ovsegclip_path : str
        Path of the checkpoint passed as ``pretrained`` for the ViT-L-14
        CLIP model.
    model_type : str
        Key into ``sam_model_registry`` selecting the SAM variant.

    Returns
    -------
    tuple
        ``(predictor, clip_model)``: the automatic mask generator and the
        CLIP model.
    """
    # Look up the SAM constructor first, then instantiate it on the GPU.
    build_sam = sam_model_registry[model_type]
    sam_model = build_sam(checkpoint=sam_path).cuda()
    mask_generator = SamAutomaticMaskGenerator(sam_model, points_per_batch=16)

    # Only the model is needed; the two returned transforms are discarded.
    clip_model, _train_transform, _val_transform = open_clip.create_model_and_transforms(
        'ViT-L-14',
        pretrained=ovsegclip_path,
    )
    clip_model.cuda()

    return mask_generator, clip_model


def get_mask(url, question, granularity, predictor, clip_model):
    """
    Segment the region(s) of an image that best match a text query.

    The mask generator proposes class-agnostic masks. Each masked crop is
    scored by CLIP against the prompts "a photo of <question>" and
    "a photo of others", and the selected proposals are merged into a single
    binary mask.

    Parameters
    ----------
    url : str
        Address of the image; fetched with ``get_image_array``.
    question : str
        Text describing the object to locate.
    granularity : float
        With a value of 1 or more, only the proposal with the highest query
        score is kept. With a smaller value, every proposal whose query
        score exceeds ``granularity * best_score`` is kept.
    predictor
        Mask generator exposing ``generate(image)``; every returned item
        must provide a ``'segmentation'`` array.
    clip_model
        Model exposing ``encode_text`` and ``encode_image``; inputs are moved
        to CUDA before being encoded.

    Returns
    -------
    numpy.ndarray
        Integer array with the spatial size of the proposals: 1 where the
        merged query score is non-zero (or the "others" channel wins),
        0 elsewhere.

    Raises
    ------
    ValueError
        If the mask generator returns no proposals.
    """
    # "others" is a background prompt, so the softmax over prompts has
    # something for the query to compete against.
    class_names = [question, 'others']
    img = get_image_array(url)
    image = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    with torch.no_grad(), torch.cuda.amp.autocast():
        masks = predictor.generate(image)
    if not masks:
        raise ValueError(f'No mask proposals were generated for {url}')

    # Stack the per-proposal masks along a new leading axis.
    # (np.row_stack, used previously, is deprecated in NumPy 2.0.)
    pred_masks = BitMasks(np.stack([m['segmentation'] for m in masks]))
    bboxes = pred_masks.get_bounding_boxes()

    # Pixels outside a proposal are painted with the CLIP mean colour.
    mask_fill = [255.0 * c for c in PIXEL_MEAN]
    image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))

    regions = []
    for bbox, mask in zip(bboxes, pred_masks):
        region, _ = crop_with_mask(
            image,
            mask,
            bbox,
            fill=mask_fill,
        )
        regions.append(region.unsqueeze(0))
    regions = [F.interpolate(r.to(torch.float), size=(224, 224), mode="bicubic") for r in regions]

    pixel_mean = torch.tensor(PIXEL_MEAN).reshape(1, -1, 1, 1)
    pixel_std = torch.tensor(PIXEL_STD).reshape(1, -1, 1, 1)
    imgs = torch.cat([(r / 255.0 - pixel_mean) / pixel_std for r in regions])

    txts = [f'a photo of {cls_name}' for cls_name in class_names]
    text = open_clip.tokenize(txts)

    # Encode the crops in chunks of 32 to bound GPU memory use.
    img_batches = torch.split(imgs, 32, dim=0)

    with torch.no_grad(), torch.cuda.amp.autocast():
        text_features = clip_model.encode_text(text.cuda())
        text_features /= text_features.norm(dim=-1, keepdim=True)
        image_features = []
        for img_batch in img_batches:
            image_feat = clip_model.encode_image(img_batch.cuda().half())
            image_feat /= image_feat.norm(dim=-1, keepdim=True)
            image_features.append(image_feat.detach())
        image_features = torch.cat(image_features, dim=0)
        # Rows are proposals; column 0 is the query, column 1 is "others".
        class_preds = (100.0 * image_features @ text_features.T).softmax(dim=-1)
    select_cls = torch.zeros_like(class_preds)

    # Only the query column drives the selection; "others" is never selected.
    max_scores, best_proposal = torch.max(class_preds, dim=0)
    if granularity < 1:
        threshold = max_scores[0] * granularity
        selected = torch.where(class_preds[:, 0] > threshold)[0].tolist()
    else:
        selected = best_proposal[:1].tolist()
    for idx in selected:
        select_cls[idx] = class_preds[idx]

    # Weight every selected proposal mask by its class scores and sum them.
    semseg = torch.einsum("qc,qhw->chw", select_cls.float(), pred_masks.tensor.float().cuda())

    blank_area = semseg[0] == 0
    # Apply the mask while both tensors are on the same device. Indexing a
    # CPU tensor with a CUDA mask may be rejected by newer PyTorch versions.
    pred_mask = semseg.argmax(dim=0)
    pred_mask[~blank_area] = 1
    return np.array(pred_mask.to('cpu'), dtype=int)


def get_bounding_box(mask: np.ndarray) -> np.ndarray:
    """
    Get the bounding box of a 2-D segmentation mask.

    Args:
        mask (NumPy array): The segmentation mask; any non-zero / True entry
            counts as foreground.

    Returns:
        A NumPy array of the bounding box in the format
        [left, top, right, bottom]. All four values are indices of foreground
        pixels, so ``right`` and ``bottom`` are inclusive.

    Raises:
        ValueError: If the mask has no foreground pixel, so no box exists.
    """
    # Indices of the rows / columns that contain at least one foreground pixel.
    row_idx = np.flatnonzero(np.any(mask, axis=1))
    col_idx = np.flatnonzero(np.any(mask, axis=0))

    if row_idx.size == 0 or col_idx.size == 0:
        # Taking the first and last index of an empty array would otherwise
        # fail with an uninformative IndexError.
        raise ValueError('Cannot compute a bounding box: the mask is empty')

    left, right = col_idx[0], col_idx[-1]
    top, bottom = row_idx[0], row_idx[-1]

    return np.array([left, top, right, bottom])

The models require a CUDA-capable GPU.

In [None]:
# Load the SAM ("vit_h") mask generator and the OVSeg CLIP model from the
# checkpoint files expected in the current working directory.
predictor, clip_model = get_models('./sam_vit_h_4b8939.pth', './ovseg_clip_l_9a1909.pth', 'vit_h')

Let's take a look at the data. We have eight columns: image URL, its width and height in pixels, positions of the top-left and bottom-right corners of a bounding box and the question to be answered.

In [None]:
# Read the private test split directly from the WSDM Cup 2023 GitHub repository.
test_private = pd.read_csv('https://raw.githubusercontent.com/Toloka/WSDMCup2023/main/test_private.csv')
# Preview the first rows as the cell output.
test_private.head()

`image` and `question` are input columns. A human asked this question about some object located in this image. So, our goal is to find this object. In other words, `left`, `top`, `right`, and `bottom` are *target variables* we want to predict.

Let's run prediction on the private test set!

In [None]:
# Predict a box for every test row and track the running mean IoU against
# the ground-truth box.
predictions = []
failed_urls = []  # images skipped because a processing step raised
n_imgs = 0
total_iou = 0.0
progress = tqdm(test_private.iterrows(), total=len(test_private))
for _, row in progress:
    img_url = row['image']
    question = row['question']
    gt_bb = {'x1': row['left'], 'y1': row['top'], 'x2': row['right'], 'y2': row['bottom']}
    try:
        pred_mask = get_mask(img_url, question, 1.0, predictor, clip_model)
        left, top, right, bottom = get_bounding_box(pred_mask)
        iou = get_iou(gt_bb, {'x1': left, 'y1': top, 'x2': right, 'y2': bottom})
    except Exception as err:
        # Batch boundary: one bad image must not abort the whole run, but the
        # failure is reported and recorded instead of being silently dropped.
        failed_urls.append(img_url)
        print(f'Skipping {img_url}: {type(err).__name__}: {err}')
        continue

    total_iou += iou
    n_imgs += 1
    progress.set_description(f'IoU: {round(total_iou / n_imgs * 100, 2)}')

    predictions.append([img_url, left, top, right, bottom])
predictions = pd.DataFrame(predictions, columns=['image', 'left', 'top', 'right', 'bottom'])

if failed_urls:
    print(f'{len(failed_urls)} of {len(test_private)} images were skipped; see failed_urls')

Let's look at the results...

In [None]:
# Show the predicted boxes as the cell output.
predictions

...and save them

In [None]:
# Write the predictions to a CSV file in the current working directory.
predictions.to_csv('ovseg_sam_result.csv', index=None)