In [1]:
import torch
import cv2
import numpy as np
from ultralytics import YOLO
from deep_sort.deep_sort.tracker import Tracker as DeepSortTracker
from deep_sort.tools import generate_detections as gdet
from deep_sort.deep_sort import nn_matching
from deep_sort.deep_sort.detection import Detection
import torchreid
from torchvision import transforms
from PIL import Image



In [2]:
# Disabled prototype: everything in this cell is a single triple-quoted string
# literal, so the `Tracker` / `Track` wrapper classes spelled out inside it are
# never defined or executed; the cell only evaluates to (and displays) the text.
# The text sketches a wrapper around DeepSortTracker that uses the
# 'model_data/mars-small128.pb' box encoder for appearance features.
"""class Tracker:
    tracker = None
    encoder = None
    tracks = None

    def __init__(self):
        max_cosine_distance = 0.4
        nn_budget = None

        encoder_model_filename = 'model_data/mars-small128.pb'

        metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
        self.tracker = DeepSortTracker(metric)
        self.encoder = gdet.create_box_encoder(encoder_model_filename, batch_size=1)

    def update(self, frame, detections):

        if len(detections) == 0:
            self.tracker.predict()
            self.tracker.update([])  
            self.update_tracks()
            return

        bboxes = np.asarray([d[:-1] for d in detections])
        bboxes[:, 2:] = bboxes[:, 2:] - bboxes[:, 0:2]
        scores = [d[-1] for d in detections]

        features = self.encoder(frame, bboxes)

        dets = []
        for bbox_id, bbox in enumerate(bboxes):
            dets.append(Detection(bbox, scores[bbox_id], features[bbox_id]))

        self.tracker.predict()
        self.tracker.update(dets)
        self.update_tracks()

    def update_tracks(self):
        tracks = []
        for track in self.tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 1:
                continue
            bbox = track.to_tlbr()

            id = track.track_id

            tracks.append(Track(id, bbox))

        self.tracks = tracks


class Track:
    track_id = None
    bbox = None

    def __init__(self, id, bbox):
        self.track_id = id
        self.bbox = bbox"""

'class Tracker:\n    tracker = None\n    encoder = None\n    tracks = None\n\n    def __init__(self):\n        max_cosine_distance = 0.4\n        nn_budget = None\n\n        encoder_model_filename = \'model_data/mars-small128.pb\'\n\n        metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)\n        self.tracker = DeepSortTracker(metric)\n        self.encoder = gdet.create_box_encoder(encoder_model_filename, batch_size=1)\n\n    def update(self, frame, detections):\n\n        if len(detections) == 0:\n            self.tracker.predict()\n            self.tracker.update([])  \n            self.update_tracks()\n            return\n\n        bboxes = np.asarray([d[:-1] for d in detections])\n        bboxes[:, 2:] = bboxes[:, 2:] - bboxes[:, 0:2]\n        scores = [d[-1] for d in detections]\n\n        features = self.encoder(frame, bboxes)\n\n        dets = []\n        for bbox_id, bbox in enumerate(bboxes):\n            dets.append(Detection(bbox

In [3]:
# --- Input / output locations -------------------------------------------------
# Input video to run the tracker on.
video_path = "C:/Users/julie/Documents/Unif/Mémoire/tracking_video_test2.mp4"
# Text file that receives the per-frame bounding-box annotations.
file_improve_tracking_path = "C:/Users/julie/Documents/Unif/Mémoire/tracking_improvement.txt"
# Annotated video produced by the tracking run.
output_path = "tracking_test_output7.mp4"

# --- Detector -----------------------------------------------------------------
# YOLO model loaded from trained weights.
model = YOLO("C:/Users/julie/OneDrive - UCL/Master_2/Mémoire/ChimpRec/Code/Body_detection/YOLO_small/runs/detect/train9/weights/best.pt")

# --- DeepSORT -----------------------------------------------------------------
# Maximum cosine distance allowed when matching an object (lower = stricter).
max_cosine_distance = 0.3
# Maximum size of the matching model's sample buffer (None = no explicit limit given).
nn_budget = None
metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
tracker = DeepSortTracker(metric)

In [4]:
def draw_bbox(image, color, bbox, label):
    x1, y1, x2, y2, score = bbox
    x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
    factor = 0.65 if label == "Face" else 0.3
    font_scale = max(0.4, (x2 - x1 + y2 - y1) / 300) * factor

    cv2.rectangle(image, (x1, y1), (x2, y2), color, 4)
    label_text = f"{label}"#: {score:.2f}"
    (w, h), _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_COMPLEX, font_scale, 1)

    overlay = image.copy()
    cv2.rectangle(overlay, (x2 - w - 10, y2 - h - 10), (x2, y2), color, -1)
    cv2.addWeighted(overlay, 0.5, image, 0.5, 0, image)

    cv2.putText(image, label_text, (x2 - w - 5, y2 - 5), cv2.FONT_HERSHEY_COMPLEX, font_scale, (255,255,255), 1)
    return image

In [5]:
# Appearance encoder: OSNet (osnet_x1_0) built with a 751-way classifier head and
# switched to evaluation mode. Per this cell's recorded output, the weights that
# get loaded are the ImageNet-pretrained ones and the classifier layers are
# discarded because their sizes do not match.
model_feature_extraction = torchreid.models.build_model(
    name='osnet_x1_0',
    num_classes=751,
    pretrained=True,
).eval()

# Preprocessing applied to every crop before it is fed to OSNet: resize to
# 256x128, convert to a tensor, then normalise each channel with the given
# mean / standard deviation.
transform = transforms.Compose([
    transforms.Resize((256, 128)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

def extract_features_osnet(frame, bbox):
    """Compute an OSNet appearance feature vector for one box of a frame.

    The box is clamped to the frame and widened to cover at least one pixel, so
    a box that truncates to zero width/height or that starts outside the frame
    still yields a non-empty crop (an empty crop cannot be colour-converted, and
    a negative start index would silently slice from the wrong side).

    Args:
        frame: BGR image array indexed as ``frame[row, col]``.
        bbox: ``(x1, y1, x2, y2)`` corner coordinates in pixels; values are
            truncated to integers.

    Returns:
        A 1-D NumPy array holding the flattened model output for the crop
        (expected to be 512-D for osnet_x1_0 in eval mode -- not verified here).

    Raises:
        ValueError: If ``frame`` has zero height or width.
    """
    frame_h, frame_w = frame.shape[:2]
    if frame_h == 0 or frame_w == 0:
        raise ValueError("Cannot extract features from an empty frame")

    x1, y1, x2, y2 = map(int, bbox)

    # Keep the top-left corner inside the frame, then force the bottom-right
    # corner to lie at least one pixel beyond it without leaving the frame.
    x1 = min(max(x1, 0), frame_w - 1)
    y1 = min(max(y1, 0), frame_h - 1)
    x2 = min(max(x2, x1 + 1), frame_w)
    y2 = min(max(y2, y1 + 1), frame_h)

    chimp_img = frame[y1:y2, x1:x2]  # region of interest

    # OpenCV frames are BGR; the PIL/torchvision pipeline expects RGB.
    chimp_img = cv2.cvtColor(chimp_img, cv2.COLOR_BGR2RGB)
    # Apply the preprocessing pipeline and add a leading batch dimension.
    chimp_img = transform(Image.fromarray(chimp_img)).unsqueeze(0)

    with torch.no_grad():  # inference only, no gradients needed
        feature = model_feature_extraction(chimp_img)

    # Flatten the model output into a plain 1-D vector.
    return feature.cpu().numpy().flatten()



Successfully loaded imagenet pretrained weights from "C:\Users\julie/.cache\torch\checkpoints\osnet_x1_0_imagenet.pth"
** The following layers are discarded due to unmatched keys or layer size: ['classifier.weight', 'classifier.bias']


In [6]:
def _fallback_color(track_id):
    """Return a deterministic 3-tuple of 0-255 ints for ``track_id`` (used when the palette is empty)."""
    seed = int(track_id)
    return ((seed * 37) % 256, (seed * 17 + 85) % 256, (seed * 29 + 170) % 256)


def track(colors, colors_used, video_path, file_improve_tracking_path, output_path, model, tracker,
          min_confidence=0.2):
    """Detect and track objects in a video, writing an annotated video and a text log.

    For every frame the detector is run, detections at or above
    ``min_confidence`` are turned into DeepSORT ``Detection`` objects (with an
    OSNet appearance feature each), the tracker is advanced, and every confirmed,
    recently updated track is drawn on the frame and appended to the log.

    Log format: one ``#`` line per frame, followed by one
    ``"<track_id> <x1> <y1> <x2> <y2>"`` line per drawn track.

    Args:
        colors: List of available colours; mutated in place (colours are taken
            from the front and handed back when a track stops being drawn).
        colors_used: Dict ``track_id -> colour`` of colours currently assigned;
            mutated in place.
        video_path: Path of the input video.
        file_improve_tracking_path: Path of the text log to (over)write.
        output_path: Path of the annotated output video.
        model: Detector exposing ``predict(frame, verbose=False)`` whose first
            result has ``boxes.data`` rows ``(x1, y1, x2, y2, score, class)``.
        tracker: DeepSORT tracker exposing ``predict()``, ``update()``,
            ``tracks`` and ``metric.samples``.
        min_confidence: Minimum detection score to keep (default 0.2).

    Raises:
        IOError: If the input video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Unable to open input video: {video_path}")

    out = None
    try:
        ret, frame = cap.read()
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))

        # `with` guarantees the log is flushed and closed even if the run is
        # interrupted part-way through the video.
        with open(file_improve_tracking_path, "w") as file_improve_tracking:
            while ret:
                file_improve_tracking.write("#\n")
                predictions = model.predict(frame, verbose=False)[0]

                # Keep only detections whose confidence reaches the threshold.
                detections = []
                for x1, y1, x2, y2, score, _ in predictions.boxes.data.tolist():
                    if score < min_confidence:
                        continue
                    # DeepSORT expects top-left corner + width and height.
                    bbox = np.array([x1, y1, x2 - x1, y2 - y1])
                    # The appearance feature lets DeepSORT match on looks as
                    # well as on motion.
                    feature = extract_features_osnet(frame, [x1, y1, x2, y2])
                    detections.append(Detection(bbox, float(score), feature))

                # Advance the tracker on EVERY frame, including frames without
                # detections: otherwise tracks never age, are never marked as
                # missed, and stale boxes keep being drawn.
                tracker.predict()
                tracker.update(detections)

                for trk in tracker.tracks:
                    # Make sure a confirmed track always has an entry in the
                    # metric's sample store (kept from the original code;
                    # presumably guards against a KeyError -- TODO confirm).
                    if trk.is_confirmed() and trk.track_id not in tracker.metric.samples:
                        tracker.metric.samples[trk.track_id] = []

                    # Skip tracks that are unconfirmed or have not been updated
                    # recently, and give their colour back to the palette.
                    if not trk.is_confirmed() or trk.time_since_update > 1:
                        if trk.track_id in colors_used:
                            colors.append(colors_used.pop(trk.track_id))
                        continue

                    bbox = trk.to_tlbr()  # [x1, y1, x2, y2]
                    track_id = trk.track_id
                    class_id = "Chimp"
                    if track_id not in colors_used:
                        # Fall back to a generated colour instead of crashing
                        # when more tracks are visible than palette entries.
                        colors_used[track_id] = colors.pop(0) if colors else _fallback_color(track_id)
                    color = colors_used[track_id]

                    # Log the box, then draw it with the track's identifier.
                    str_bbox = ' '.join(map(str, bbox))
                    file_improve_tracking.write(f"{track_id} {str_bbox}\n")
                    draw_bbox(frame, color, (*bbox, 1), f"{class_id} {track_id}")

                out.write(frame)
                ret, frame = cap.read()
    finally:
        # Always release the video handles, even on errors or interruption.
        cap.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()

In [7]:
# track_id -> colour currently assigned to that track (filled in by `track`).
colors_used = dict()

# Pool of colour triples available for drawing tracks; `track` takes colours
# from the front of this list and appends them back when a track is dropped.
colors = [
    (120, 50, 99), (180, 25, 16), (73, 89, 176), (200, 158, 18),
    (199, 214, 152), (181, 37, 229), (118, 73, 165), (136, 3, 53),
    (40, 47, 142), (246, 26, 168), (33, 83, 190), (151, 220, 243),
    (156, 122, 217), (173, 0, 128), (61, 242, 230), (37, 10, 125),
    (64, 229, 201), (64, 137, 49), (136, 225, 85), (146, 80, 77),
    (255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0),
    (0, 255, 255), (255, 0, 255), (255, 165, 0), (255, 255, 255),
    (0, 0, 0), (128, 0, 0), (0, 128, 0), (128, 128, 0),
    (0, 128, 128), (128, 0, 128), (255, 105, 180), (255, 69, 0),
    (34, 139, 34), (70, 130, 180), (255, 228, 225), (218, 165, 32),
]

# Run the full detection + tracking pass over the configured video.
track(
    colors=colors,
    colors_used=colors_used,
    video_path=video_path,
    file_improve_tracking_path=file_improve_tracking_path,
    output_path=output_path,
    model=model,
    tracker=tracker,
)

KeyboardInterrupt: 