**KOD WYMAGA OPTYMALIZACJI**

Zauważone błędy:
* kod pomija niewielką ilość klatek, z niezanych powodów (14/333) - prawdopodobnie przez za wysoki threshold


**DO ZROBIENIA**

* wizualizacja wyników - możliwość szybkiego przelecenia po klatkach i oceny jakości detekcji
  * możliwość odrzucenia detekcji - klatka wpada do folderu z wadliwie ocenionymi klatkami do manualnej oceny
  * wlatują tam też klatki z za małą pewnością
* możliwość dodania samego wideo
* dodanie podglądu - wyświetla się losowy obraz z predykcją - można dobrać threshold

In [1]:
import torch
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
import numpy as np
import matplotlib.pyplot as plt 
import cv2
import glob
import pandas as pd
import os
import time
import warnings

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class DataLoader:
    def __init__(self, path):
        self.path = path
        self.dir_name = os.path.basename(path)
        self.height = 480
        self.width = 640
        self.channels = 3
        
    def load_folder(self):
        image_files = glob.glob(f"{self.path}/*.jpg")
        image_names = [os.path.basename(image) for image in image_files]
        length = np.shape(image_files)[0]
        image_array = np.zeros((length, self.height, self.width, self.channels))
        for i, image in enumerate(image_files):
            img = cv2.imread(image)
            if img is not None:
                image_array[i,:,:,:] = img
        return image_names, image_array
        
    def load_video(self):
        raise NotImplementedError

In [3]:
loader = DataLoader(r"C:\Users\Piotr\PycharmProjects\VID2FRAME\video_2024_11_19_11-04-50")

In [4]:
names, images = loader.load_folder()

In [5]:
class PredictWithDino:
    def __init__(self):
        model_id = "IDEA-Research/grounding-dino-tiny"
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.device != torch.device("cuda"):
            warnings.warn("WARNING: Cuda not available.")
        self.processor = AutoProcessor.from_pretrained(model_id)
        self.model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(self.device)
        self.columns = ["name", "label", "score", "x_min", "y_min", "x_max", "y_max"]


    def __make_df(self, data):
        return pd.DataFrame(data, columns=self.columns)
        
    
    def predict_and_save(self, data_loader: DataLoader, box_threshold, text_threshold, text):
        data = []
        print("LOADING DATA")
        image_names, image_array = data_loader.load_folder()
        height = data_loader.height
        width = data_loader.width
        channels = data_loader.channels
        
        print("PROCESSING DATA")
        start_time = time.time()
        for i, image in enumerate(image_array):
            vid_name = image_names[i]
            inputs = self.processor(images=image, text=text, return_tensors="pt").to(self.device)
            with torch.no_grad():
                outputs = self.model(**inputs)
            
            
            results_test = self.processor.post_process_grounded_object_detection(
                outputs,
                inputs.input_ids,
                box_threshold=box_threshold,
                text_threshold=text_threshold,
                target_sizes=[(height, width)]
            )
            results_test = results_test[0]
            if len(results_test["scores"]) != 0:
                for j in range(len(results_test["scores"])):
                    print(vid_name)
                    x_min, y_min, x_max, y_max = map(int, results_test["boxes"].cpu().numpy()[j])
                    label = results_test["labels"][j].replace("[SEP]", "").replace(".", "").strip()
                    data.append([vid_name, label, results_test["scores"].cpu().numpy()[j], x_min, y_min, x_max, y_max])
        end_time = time.time()
        df = self.__make_df(data)
        print(end_time-start_time)
        return df


    def preview(self, loader, text):
        return
        

In [8]:
dino = PredictWithDino()

In [9]:
dframe = dino.predict_and_save(loader, 0.4, 0.001, "stairs .")

LOADING DATA
PROCESSING DATA
frame_0000.jpg
frame_0001.jpg
frame_0002.jpg
frame_0003.jpg
frame_0004.jpg
frame_0005.jpg
frame_0006.jpg
frame_0007.jpg
frame_0008.jpg
frame_0009.jpg
frame_0010.jpg
frame_0011.jpg
frame_0012.jpg
frame_0013.jpg
frame_0014.jpg
frame_0015.jpg
frame_0016.jpg
frame_0017.jpg
frame_0018.jpg
frame_0019.jpg
frame_0020.jpg
frame_0021.jpg
frame_0022.jpg
frame_0023.jpg
frame_0024.jpg
frame_0025.jpg
frame_0026.jpg
frame_0027.jpg
frame_0028.jpg
frame_0029.jpg
frame_0030.jpg
frame_0031.jpg
frame_0032.jpg
frame_0033.jpg
frame_0034.jpg
frame_0035.jpg
frame_0036.jpg
frame_0037.jpg
frame_0038.jpg
frame_0039.jpg
frame_0040.jpg
frame_0041.jpg
frame_0042.jpg
frame_0043.jpg
frame_0044.jpg
frame_0045.jpg
frame_0046.jpg
frame_0047.jpg
frame_0048.jpg
frame_0049.jpg
frame_0050.jpg
frame_0051.jpg
frame_0052.jpg
frame_0053.jpg
frame_0054.jpg
frame_0055.jpg
frame_0056.jpg
frame_0057.jpg
frame_0058.jpg
frame_0059.jpg
frame_0060.jpg
frame_0061.jpg
frame_0062.jpg
frame_0063.jpg
frame_0064.

In [10]:
dframe.head(20)

Unnamed: 0,name,label,score,x_min,y_min,x_max,y_max
0,frame_0000.jpg,stairs,0.792086,174,185,558,478
1,frame_0001.jpg,stairs,0.777504,172,185,560,479
2,frame_0002.jpg,stairs,0.800503,172,185,560,478
3,frame_0003.jpg,stairs,0.80676,169,185,560,478
4,frame_0004.jpg,stairs,0.76984,163,184,559,479
5,frame_0005.jpg,stairs,0.781672,164,184,559,479
6,frame_0006.jpg,stairs,0.630473,169,184,562,479
7,frame_0007.jpg,stairs,0.77296,175,184,566,479
8,frame_0008.jpg,stairs,0.785416,171,183,569,478
9,frame_0009.jpg,stairs,0.735279,171,183,569,478


In [12]:
# dframe.to_csv(f"{loader.dir_name}.csv", index=False)

In [8]:
class Visualize:
    def __init__(self, path):
        loader = DataLoader(path)
        image_names, image_array = loader.load_folder()
        self.dir_name = loader.dir_name
        self.dataframe = pd.read_csv(f"{loader.dir_name}.csv")
        height = loader.height
        width = loader.width
        channels = loader.channels
        return