## Download the Grounding DINO repository


In [1]:
# Sanity check: print the python3 executable that the notebook's shell resolves to.
!which python3


/home/nikolay/Documents/Projects/Diploma/My repos/Pseudo_label_Grounding_DINO/.venv/bin/python3


In [2]:
# One-time setup, kept commented out: clone GroundingDINO and install it
# together with its requirements.
# %cd ..
# !git clone https://github.com/IDEA-Research/GroundingDINO.git
# %cd GroundingDINO
# !pip install -e .
# !pip install -r requirements.txt
# Move the working directory one level up so the relative paths used in the
# cells below resolve from there.
# NOTE: `..` is relative to the current directory, so running this cell a
# second time in the same kernel moves up one more level.
%cd ..


/home/nikolay/Documents/Projects/Diploma/My repos/Pseudo_label_Grounding_DINO


## Imports

In [3]:
from groundingdino.util.inference import load_model, predict
import os
import supervision as sv
import torch
import cv2
from torch.utils.data import Dataset, DataLoader
import glob
import pandas as pd
from utils import AnnotationDF
import tqdm



## Data


In [4]:
class ImageDataset(Dataset):
    """Dataset over the image files located directly inside one directory.

    Files are matched (non-recursively) by suffix and kept sorted in
    ``self.files``. Indexing returns ``(image, img_name)`` where ``image`` is
    a channel-first RGB tensor scaled by 1/255 and ``img_name`` is the file
    name without its extension.

    NOTE(review): the only preprocessing applied is the 1/255 scaling; confirm
    that the model consuming these tensors does not expect additional
    normalisation or resizing.
    """

    def __init__(self, img_dir, extensions=('.jpeg',)):
        """Collect the image files of ``img_dir``.

        Args:
            img_dir: Directory that holds the images (``str`` or path-like).
            extensions: A single suffix or an iterable of suffixes
                (e.g. ``['.jpg', '.png']``) used to select files.
        """
        self.img_dir = img_dir
        # A bare string would otherwise be iterated character by character.
        if isinstance(extensions, str):
            extensions = [extensions]
        # Copy so the instance never shares the caller's (or the default) sequence.
        self.ext = list(extensions)
        self.files = []
        self._search_files()

    def _search_files(self):
        """Rebuild ``self.files`` as the sorted, duplicate-free list of matches."""
        # Escape the directory so characters such as '[' or '*' in its name
        # are not interpreted as glob wildcards.
        base_dir = glob.escape(os.fspath(self.img_dir))
        matches = set()
        for ext in self.ext:
            matches.update(glob.glob(os.path.join(base_dir, '*' + ext)))
        # The set removes files matched by more than one (overlapping) suffix.
        self.files = sorted(matches)

    def __len__(self):
        """Return the number of image files found."""
        return len(self.files)

    @staticmethod
    def _image_name(img_path):
        """Return the file name of ``img_path`` without directory and extension."""
        # splitext strips only the last suffix, so dots inside the name survive.
        return os.path.splitext(os.path.basename(img_path))[0]

    def __getitem__(self, idx):
        """Load one image.

        Returns:
            ``(image, img_name)``: the RGB image as a channel-first tensor
            scaled by 1/255, and the file name without extension.

        Raises:
            OSError: If OpenCV cannot read or decode the file.
        """
        img_path = self.files[idx]
        img_name = self._image_name(img_path)
        img_source = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising.
        if img_source is None:
            raise OSError(f"Could not read image file: {img_path}")
        img_source = cv2.cvtColor(img_source, cv2.COLOR_BGR2RGB)
        # HWC -> CHW
        image = torch.Tensor(img_source / 255).permute(2, 0, 1)
        return image, img_name
    
# voc_dataset = ImageDataset(img_dir='data/VOC2007/train2007', extensions=['.jpg'])
# voc_dataloader = DataLoader(voc_dataset, batch_size=1, shuffle=False)

## Inference functions

In [5]:
### Run predictions
def run_pseudo_labelling(model, dataloader, box_threshold, text_threshold, classes):
    """Pseudo-label every item of ``dataloader`` by prompting the model once per class.

    For each ``(img, filename)`` pair, ``predict`` is called with every entry
    of ``classes`` as the caption. The detections of all prompts are merged
    and stored with one ``add_annotation`` call per item.

    Args:
        model: Passed as ``model`` to ``predict``.
        dataloader: Iterable yielding ``(img, filename)`` pairs. The leading
            dimension of ``img`` is squeezed away before prediction
            (presumably a batch of size 1 - confirm against the caller).
        box_threshold: Forwarded unchanged to ``predict``.
        text_threshold: Forwarded unchanged to ``predict``.
        classes: Text prompts, one per class; also used to initialise the
            ``AnnotationDF``. Iterated once per image, so it must be
            re-iterable (e.g. a list).

    Returns:
        The populated ``AnnotationDF``.
    """
    # Init annotation table
    ann_df = AnnotationDF(classes)

    for img, filename in tqdm.tqdm(dataloader):
        # Same image for every prompt, so drop the leading dimension only once.
        image = img.squeeze(0)
        boxes_per_cls, logits_per_cls, phrases_all = [], [], []

        for cls_prompt in classes:
            boxes, logits, phrases = predict(
                model=model,
                image=image,
                caption=cls_prompt,
                box_threshold=box_threshold,
                text_threshold=text_threshold
            )
            boxes_per_cls.append(boxes)
            logits_per_cls.append(logits)
            phrases_all.extend(phrases)

        if boxes_per_cls:
            # Single concatenation instead of growing the tensors prompt by prompt.
            boxes_all = torch.cat(boxes_per_cls, dim=0)
            logits_all = torch.cat(logits_per_cls, dim=0)
            detections = list(zip(boxes_all, logits_all, phrases_all))
        else:
            # No prompts at all: record the image with no detections instead
            # of failing on unbound accumulators.
            detections = []
        ann_df.add_annotation(filename, detections)

    return ann_df
    

## Run Code

In [6]:
# Model configuration file, addressed relative to the working directory.
CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinB_cfg.py"

# Checkpoint file name and its location inside the local weights folder.
WEIGHTS_NAME = "groundingdino_swinb_cogcoor.pth"
WEIGHTS_PATH = os.path.join("GroundingDINO_weights", WEIGHTS_NAME)

In [7]:
# The 20 class names; each one is later used on its own as a text prompt.
classes = [
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat',
    'chair', 'cow', 'diningtable', 'dog',
    'horse', 'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor',
]

In [8]:
# Build the model from the config file and checkpoint paths defined above.
model = load_model(CONFIG_PATH, WEIGHTS_PATH)



final text_encoder_type: bert-base-uncased


In [9]:
# Image folder and annotation output folder for each split.
val_img_path = 'datasets/VOC/images/val'
val_save_path= 'data/VOC/val'

test_img_path = 'datasets/VOC/images/test'
test_save_path= 'data/VOC/test'

train_img_path = 'datasets/VOC/images/train'
train_save_path= 'data/VOC/train'


###########
# Split processed by this run and the maximum number of images taken from it.
MAX_IMAGES = 100
save_ann_path = test_save_path
voc_dataset_full = ImageDataset(img_dir=test_img_path, extensions=['.jpg'])
# Clamp the subset size: indexing past the end of a smaller dataset would
# otherwise fail with an IndexError once the dataloader is iterated.
voc_dataset = torch.utils.data.Subset(
    voc_dataset_full, range(min(MAX_IMAGES, len(voc_dataset_full)))
)

# One image per step, in file order.
voc_dataloader = DataLoader(voc_dataset, batch_size=1, shuffle=False)

In [10]:
### Run predictions
# Label every image served by the dataloader, one prompt per class.
ann_df = run_pseudo_labelling(
    model=model,
    dataloader=voc_dataloader,
    classes=classes,
    box_threshold=0.1,
    text_threshold=0.1,
)
# Persist the collected annotations to the output folder chosen above.
ann_df.save_annotations(save_ann_path)

100%|██████████| 100/100 [03:51<00:00,  2.32s/it]
