Commit 483f288

Added normalized bboxes for tracker and object detector
Also added tests for ObjectDetection and several bug fixes
BrennoCaldato committed Jul 29, 2020
1 parent 88a0c37 commit 483f288
Showing 19 changed files with 377 additions and 124 deletions.
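
The central change: bounding boxes are now stored as normalized floats instead of pixel-space integers, so tracked and detected boxes survive resolution changes. Note the packing convention used throughout this commit: the normalized top-left corner goes in the x/y fields of a cv::Rect_<float>, and the normalized bottom-right corner goes in the width/height fields, i.e. a stored box is really (x1, y1, x2, y2) in [0, 1]. A minimal sketch of that round trip (helper names are illustrative, not part of the commit):

#include <opencv2/core.hpp>

// Pack a pixel-space box as normalized corners (x1, y1, x2, y2),
// mirroring the convention in CVObjectDetection::postprocess below.
cv::Rect_<float> normalizeBox(const cv::Rect &box, const cv::Size &frameDims) {
    cv::Rect_<float> n;
    n.x = box.x / (float)frameDims.width;                       // normalized x1
    n.y = box.y / (float)frameDims.height;                      // normalized y1
    n.width = (box.x + box.width) / (float)frameDims.width;     // normalized x2
    n.height = (box.y + box.height) / (float)frameDims.height;  // normalized y2
    return n;
}

// Recover a pixel-space box for any target frame size.
cv::Rect denormalizeBox(const cv::Rect_<float> &n, const cv::Size &frameDims) {
    int x1 = (int)(n.x * frameDims.width);
    int y1 = (int)(n.y * frameDims.height);
    int x2 = (int)(n.width * frameDims.width);    // width field holds x2
    int y2 = (int)(n.height * frameDims.height);  // height field holds y2
    return cv::Rect(x1, y1, x2 - x1, y2 - y1);
}
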
12 changes: 10 additions & 2 deletions include/CVObjectDetection.h
@@ -44,11 +44,13 @@
#include "Clip.h"
#include "objdetectdata.pb.h"

+#include "../src/sort_filter/sort.hpp"

using google::protobuf::util::TimeUtil;

struct CVDetectionData{
CVDetectionData(){}
-    CVDetectionData(std::vector<int> _classIds, std::vector<float> _confidences, std::vector<cv::Rect> _boxes, size_t _frameId){
+    CVDetectionData(std::vector<int> _classIds, std::vector<float> _confidences, std::vector<cv::Rect_<float>> _boxes, size_t _frameId){
classIds = _classIds;
confidences = _confidences;
boxes = _boxes;
@@ -57,7 +59,7 @@ struct CVDetectionData{
size_t frameId;
std::vector<int> classIds;
std::vector<float> confidences;
-    std::vector<cv::Rect> boxes;
+    std::vector<cv::Rect_<float>> boxes;
};

class CVObjectDetection{
@@ -74,6 +76,8 @@ class CVObjectDetection{
std::string processingDevice;
std::string protobuf_data_path;

+    SortTracker sort;

uint progress;

size_t start;
@@ -86,6 +90,8 @@

void DetectObjects(const cv::Mat &frame, size_t frame_number);

+    bool iou(cv::Rect pred_box, cv::Rect sort_box);

// Remove the bounding boxes with low confidence using non-maxima suppression
void postprocess(const cv::Size &frameDims, const std::vector<cv::Mat>& out, size_t frame_number);

@@ -100,6 +106,8 @@

void detectObjectsClip(openshot::Clip &video, size_t start=0, size_t end=0, bool process_interval=false);

+    CVDetectionData GetDetectionData(size_t frameId);

/// Protobuf Save and Load methods
// Save protobuf file
bool SaveTrackedData();
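
The new public GetDetectionData accessor pairs with detectObjectsClip: process a clip once, then query results per frame. A hypothetical caller sketch (the clip, JSON config, and controller setup are assumptions, not shown in this diff):

#include <cstdio>
#include <string>
#include "CVObjectDetection.h"

// Run detection over a whole clip, then print one frame's boxes.
void printDetections(openshot::Clip &clip, const std::string &processInfoJson,
                     ProcessingController &controller, size_t frameId) {
    CVObjectDetection detector(processInfoJson, controller);
    detector.detectObjectsClip(clip);  // start=0, end=0 processes the full clip

    CVDetectionData d = detector.GetDetectionData(frameId);
    for (size_t i = 0; i < d.boxes.size(); i++)
        std::printf("class %d conf %.2f box (%.3f, %.3f)-(%.3f, %.3f)\n",
                    d.classIds[i], d.confidences[i], d.boxes[i].x, d.boxes[i].y,
                    d.boxes[i].width, d.boxes[i].height);  // normalized corners
}
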
3 changes: 1 addition & 2 deletions include/CVStabilization.h
@@ -86,8 +86,7 @@ class CVStabilization {
size_t end;

cv::Mat last_T;
-    cv::Mat cur, cur_grey;
-    cv::Mat prev, prev_grey;
+    cv::Mat prev_grey;
std::vector <TransformParam> prev_to_cur_transform; // Previous to current
std::string protobuf_data_path;

24 changes: 7 additions & 17 deletions include/CVTracker.h
@@ -28,10 +28,10 @@ using google::protobuf::util::TimeUtil;
struct FrameData{
size_t frame_id = -1;
float rotation = 0;
-    int x1 = -1;
-    int y1 = -1;
-    int x2 = -1;
-    int y2 = -1;
+    float x1 = -1;
+    float y1 = -1;
+    float x2 = -1;
+    float y2 = -1;

// Constructors
FrameData()
@@ -40,7 +40,7 @@ struct FrameData{
FrameData( size_t _frame_id)
{frame_id = _frame_id;}

-    FrameData( size_t _frame_id , float _rotation, int _x1, int _y1, int _x2, int _y2)
+    FrameData( size_t _frame_id , float _rotation, float _x1, float _y1, float _x2, float _y2)
{
frame_id = _frame_id;
rotation = _rotation;
Expand All @@ -51,24 +51,14 @@ struct FrameData{
}
};

-class RemoveJitter{
-private:
-    std::vector<cv::Rect2d> bboxTracker;
-    int boxesInterval;
-    int boxesInVector;
-
-public:
-    RemoveJitter(int boxesInterval);
-    void update(cv::Rect2d bbox, cv::Rect2d &out_bbox);
-};

class CVTracker {
private:
std::map<size_t, FrameData> trackedDataById; // Save tracked data
std::string trackerType; // Name of the chosen tracker
cv::Ptr<cv::Tracker> tracker; // Pointer of the selected tracker

-    cv::Rect2d bbox; // Bounding box coords
+    SortTracker sort;

std::string protobuf_data_path; // Path to protobuf data file

@@ -86,7 +76,7 @@ class CVTracker {
bool initTracker(cv::Mat &frame, size_t frameId);

// Update the object tracker according to frame
-    bool trackFrame(cv::Mat &frame, size_t frameId, SortTracker &sort, RemoveJitter &removeJitter);
+    bool trackFrame(cv::Mat &frame, size_t frameId);

public:

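
The hand-rolled RemoveJitter smoother is gone; CVTracker now owns a SortTracker, and smoothing comes from SORT's Kalman filters instead. Judging from the call sites in this commit (see the commented-out block in CVObjectDetection.cpp below), the SortTracker flow looks roughly like this sketch; signatures are inferred from usage, not taken from sort.hpp:

#include <cmath>
#include <vector>
#include <opencv2/core.hpp>
#include "../src/sort_filter/sort.hpp"

// Inferred SortTracker flow: feed raw boxes in, read smoothed boxes back.
std::vector<cv::Rect> smoothBoxes(SortTracker &sort,
                                  std::vector<cv::Rect> detections,
                                  size_t frameId, cv::Size frameDims) {
    double frameDiag = std::sqrt(std::pow(frameDims.width, 2) +
                                 std::pow(frameDims.height, 2));
    sort.update(detections, frameId, frameDiag);  // advance the Kalman filters

    std::vector<cv::Rect> smoothed;
    for (auto TBox : sort.frameTrackingResult)    // results tagged per frame
        if (TBox.frame == frameId)
            smoothed.push_back(TBox.box);
    return smoothed;
}
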
4 changes: 2 additions & 2 deletions include/effects/ObjectDetection.h
@@ -43,7 +43,7 @@

struct DetectionData{
DetectionData(){}
-    DetectionData(std::vector<int> _classIds, std::vector<float> _confidences, std::vector<cv::Rect> _boxes, size_t _frameId){
+    DetectionData(std::vector<int> _classIds, std::vector<float> _confidences, std::vector<cv::Rect_<float>> _boxes, size_t _frameId){
classIds = _classIds;
confidences = _confidences;
boxes = _boxes;
@@ -52,7 +52,7 @@ struct DetectionData{
size_t frameId;
std::vector<int> classIds;
std::vector<float> confidences;
-    std::vector<cv::Rect> boxes;
+    std::vector<cv::Rect_<float>> boxes;
};

namespace openshot
10 changes: 5 additions & 5 deletions include/effects/Tracker.h
@@ -52,10 +52,10 @@ using google::protobuf::util::TimeUtil;
struct EffectFrameData{
size_t frame_id = -1;
float rotation = 0;
-    int x1 = -1;
-    int y1 = -1;
-    int x2 = -1;
-    int y2 = -1;
+    float x1 = -1;
+    float y1 = -1;
+    float x2 = -1;
+    float y2 = -1;

// Constructors
EffectFrameData()
@@ -64,7 +64,7 @@ struct EffectFrameData{
EffectFrameData( int _frame_id)
{frame_id = _frame_id;}

-    EffectFrameData( int _frame_id , float _rotation, int _x1, int _y1, int _x2, int _y2)
+    EffectFrameData( int _frame_id , float _rotation, float _x1, float _y1, float _x2, float _y2)
{
frame_id = _frame_id;
rotation = _rotation;
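
With FrameData and EffectFrameData switching from int to float, tracker keyframes can carry the normalized corner coordinates directly, keeping saved data independent of rendering resolution. A sketch of the hand-off in both directions (helper names are illustrative, not part of the commit):

#include <opencv2/core.hpp>
#include "effects/Tracker.h"

// Build an EffectFrameData record from a packed normalized box
// (the Rect_'s width/height fields hold x2/y2, as noted above).
EffectFrameData toFrameData(int frameId, float rotation,
                            const cv::Rect_<float> &n) {
    return EffectFrameData(frameId, rotation, n.x, n.y, n.width, n.height);
}

// At render time, scale the normalized corners to the current frame size.
cv::Rect toPixels(const EffectFrameData &fd, int frameWidth, int frameHeight) {
    int x1 = (int)(fd.x1 * frameWidth), y1 = (int)(fd.y1 * frameHeight);
    int x2 = (int)(fd.x2 * frameWidth), y2 = (int)(fd.y2 * frameHeight);
    return cv::Rect(x1, y1, x2 - x1, y2 - y1);
}
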
110 changes: 108 additions & 2 deletions src/CVObjectDetection.cpp
@@ -40,7 +40,6 @@
CVObjectDetection::CVObjectDetection(std::string processInfoJson, ProcessingController &processingController)
: processingController(&processingController), processingDevice("CPU"){
SetJson(processInfoJson);
-    setProcessingDevice();
}

void CVObjectDetection::setProcessingDevice(){
@@ -70,7 +69,10 @@ void CVObjectDetection::detectObjectsClip(openshot::Clip &video, size_t _start,
nmsThreshold = 0.1;

// Load the network
+    if(classesFile == "" || modelConfiguration == "" || modelWeights == "")
+        return;
net = cv::dnn::readNetFromDarknet(modelConfiguration, modelWeights);
+    setProcessingDevice();

size_t frame_number;
if(!process_interval || end == 0 || end-start <= 0){
@@ -164,7 +166,100 @@ void CVObjectDetection::postprocess(const cv::Size &frameDims, const std::vector
std::vector<int> indices;
cv::dnn::NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);

-    detectionsData[frameId] = CVDetectionData(classIds, confidences, boxes, frameId);
+    // std::vector<cv::Rect> sortBoxes;
+    // for(auto box : boxes)
+    //     sortBoxes.push_back(box);
+    // sort.update(sortBoxes, frameId, sqrt(pow(frameDims.width,2) + pow(frameDims.height, 2)));
+
+    // sortBoxes.clear();
+    // for(auto TBox : sort.frameTrackingResult)
+    //     if(TBox.frame == frameId){
+    //         sortBoxes.push_back(TBox.box);
+    //     }
+
+    // for(int i = 0; i<boxes.size(); i++){
+    //     bool found = false;
+    //     for(int j = 0; j<sortBoxes.size(); j++){
+    //         if( iou(boxes[i], sortBoxes[j]) ){
+    //             boxes[i] = sortBoxes[j];
+    //             sortBoxes.erase(sortBoxes.begin() + j);
+    //             found = true;
+    //             break;
+    //         }
+    //     }
+    //     if(!found){
+    //         boxes.erase(boxes.begin() + i);
+    //         confidences.erase(confidences.begin() + i);
+    //         classIds.erase(classIds.begin() + i);
+    //     }
+    // }
+
+    // std::map<int, std::vector<cv::Rect> > rectAndClasses;
+    // for(int i=0; i<boxes.size(); i++){
+    //     if(rectAndClasses.find(classIds[i]) == rectAndClasses.end()){
+    //         std::vector<cv::Rect> bboxes;
+    //         rectAndClasses[classIds[i]] = bboxes;
+    //     }
+
+    //     rectAndClasses[classIds[i]].push_back(boxes[i]);
+    // }
+
+    // for(std::map<int, std::vector<cv::Rect> >::iterator it = rectAndClasses.begin(); it != rectAndClasses.end(); it++){
+    //     if(sort.find(it->first) == sort.end()){
+    //         SortTracker classTracker;
+    //         sort[it->first] = classTracker;
+    //     }
+    //     sort[it->first].update(it->second, frameId, sqrt(pow(frameDims.width,2) + pow(frameDims.height, 2)));
+    // }
+
+    // classIds.clear(); boxes.clear(); confidences.clear();
+
+    // for(std::map<int, SortTracker>::iterator it = sort.begin(); it != sort.end(); it++){
+    //     for(auto TBox : it->second.frameTrackingResult){
+    //         boxes.push_back(TBox.box);
+    //         classIds.push_back(it->first);
+    //         confidences.push_back(1);
+    //     }
+    // }
+
+    std::vector<cv::Rect_<float>> normalized_boxes;
+    for(auto box : boxes){
+        cv::Rect_<float> normalized_box;
+        normalized_box.x = (box.x)/(float)frameDims.width;
+        normalized_box.y = (box.y)/(float)frameDims.height;
+        normalized_box.width = (box.x+box.width)/(float)frameDims.width;
+        normalized_box.height = (box.y+box.height)/(float)frameDims.height;
+        normalized_boxes.push_back(normalized_box);
+    }
+
+    detectionsData[frameId] = CVDetectionData(classIds, confidences, normalized_boxes, frameId);
}

+bool CVObjectDetection::iou(cv::Rect pred_box, cv::Rect sort_box){
+    // determine the (x, y)-coordinates of the intersection rectangle
+    int xA = std::max(pred_box.x, sort_box.x);
+    int yA = std::max(pred_box.y, sort_box.y);
+    int xB = std::min(pred_box.x + pred_box.width, sort_box.x + sort_box.width);
+    int yB = std::min(pred_box.y + pred_box.height, sort_box.y + sort_box.height);
+
+    // compute the area of the intersection rectangle
+    int interArea = std::max(0, xB - xA + 1) * std::max(0, yB - yA + 1);
+    // compute the area of both the prediction and tracker rectangles
+    int boxAArea = (pred_box.width + 1) * (pred_box.height + 1);
+    int boxBArea = (sort_box.width + 1) * (sort_box.height + 1);
+    // compute the intersection over union: intersection area divided by
+    // the union (sum of both areas minus the intersection area)
+    float iou = interArea / (float)(boxAArea + boxBArea - interArea);
+
+    if(iou > 0.75)
+        return true;
+    return false;
+}

// Get the names of the output layers
@@ -185,6 +280,17 @@ std::vector<cv::String> CVObjectDetection::getOutputsNames(const cv::dnn::Net& n
return names;
}

+CVDetectionData CVObjectDetection::GetDetectionData(size_t frameId){
+    // Check if the detection data for the requested frame exists
+    if ( detectionsData.find(frameId) == detectionsData.end() ) {
+        return CVDetectionData();
+    } else {
+        return detectionsData[frameId];
+    }
+}

bool CVObjectDetection::SaveTrackedData(){
// Create tracker message
libopenshotobjdetect::ObjDetect objMessage;
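
The new iou() helper gates the (currently commented-out) matching of detector boxes against SORT boxes above: two boxes count as the same object only when their intersection over union exceeds 0.75. It operates on pixel-space cv::Rect, i.e. before normalization, and the +1 terms treat boxes as inclusive pixel grids. A standalone worked example of that rule (a copy of the logic for illustration, not the class method itself):

#include <algorithm>
#include <cassert>
#include <opencv2/core.hpp>

// Same IoU rule as CVObjectDetection::iou, returning the raw ratio.
static float iouValue(cv::Rect a, cv::Rect b) {
    int xA = std::max(a.x, b.x), yA = std::max(a.y, b.y);
    int xB = std::min(a.x + a.width, b.x + b.width);
    int yB = std::min(a.y + a.height, b.y + b.height);
    int interArea = std::max(0, xB - xA + 1) * std::max(0, yB - yA + 1);
    int areaA = (a.width + 1) * (a.height + 1);
    int areaB = (b.width + 1) * (b.height + 1);
    return interArea / (float)(areaA + areaB - interArea);
}

int main() {
    // Identical boxes: IoU = 1.0, passes the 0.75 gate.
    assert(iouValue(cv::Rect(0, 0, 10, 10), cv::Rect(0, 0, 10, 10)) == 1.0f);
    // Boxes offset by half their width: 66 / (121 + 121 - 66) ≈ 0.375, rejected.
    assert(iouValue(cv::Rect(0, 0, 10, 10), cv::Rect(5, 0, 10, 10)) < 0.75f);
    return 0;
}
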
41 changes: 30 additions & 11 deletions src/CVStabilization.cpp
@@ -82,6 +82,12 @@ void CVStabilization::stabilizeClip(openshot::Clip& video, size_t _start, size_t

// Track current frame features and find the relative transformation
void CVStabilization::TrackFrameFeatures(cv::Mat frame, size_t frameNum){
+    std::cout<<"frame "<<frameNum<<"\n";
+    if(cv::countNonZero(frame) < 1){
+        last_T = cv::Mat();
+        prev_grey = cv::Mat();
+        return;
+    }

if(prev_grey.empty()){
prev_grey = frame;
@@ -93,7 +99,6 @@ void CVStabilization::TrackFrameFeatures(cv::Mat frame, size_t frameNum){
std::vector <cv::Point2f> prev_corner2, cur_corner2;
std::vector <uchar> status;
std::vector <float> err;
-
// Extract new image features
cv::goodFeaturesToTrack(prev_grey, prev_corner, 200, 0.01, 30);
// Track features
@@ -105,23 +110,37 @@
cur_corner2.push_back(cur_corner[i]);
}
}
+    // In case no feature was detected
+    if(prev_corner2.empty() || cur_corner2.empty()){
+        last_T = cv::Mat();
+        prev_grey = cv::Mat();
+        return;
+    }

// Translation + rotation only
-    cv::Mat T = estimateRigidTransform(prev_corner2, cur_corner2, false); // false = rigid transform, no scaling/shearing
+    cv::Mat T = estimateAffinePartial2D(prev_corner2, cur_corner2); // partial affine: translation, rotation, uniform scale

-    // If no transformation is found, just use the last known good transform.
-    if(T.data == NULL) {
-        last_T.copyTo(T);
-    }
+    double da, dx, dy;
+    if(T.size().width == 0 || T.size().height == 0){
+        dx = 0;
+        dy = 0;
+        da = 0;
+    }
+    else{
+        // If no transformation is found, just use the last known good transform.
+        if(T.data == NULL && !last_T.empty())
+            last_T.copyTo(T);
+        // Decompose T
+        dx = T.at<double>(0,2);
+        dy = T.at<double>(1,2);
+        da = atan2(T.at<double>(1,0), T.at<double>(0,0));
+    }

T.copyTo(last_T);
-    // Decompose T
-    double dx = T.at<double>(0,2);
-    double dy = T.at<double>(1,2);
-    double da = atan2(T.at<double>(1,0), T.at<double>(0,0));

prev_to_cur_transform.push_back(TransformParam(dx, dy, da));

-    cur.copyTo(prev);
+    std::cout<<"10\n";
frame.copyTo(prev_grey);

// Show processing info
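
The switch from estimateRigidTransform to estimateAffinePartial2D tracks OpenCV's API: the former was deprecated in OpenCV 4 and has since been removed, while the calib3d replacement fits a 2x3 partial affine (translation, rotation, uniform scale) with RANSAC and returns an empty matrix when no consensus is found, hence the new size check above. A minimal sketch of the fit-and-decompose step in isolation:

#include <cmath>
#include <vector>
#include <opencv2/calib3d.hpp>

struct RigidMotion { double dx, dy, da; };

// Fit a partial affine between matched points and decompose it into
// translation (dx, dy) and rotation (da), as TrackFrameFeatures does.
RigidMotion frameMotion(const std::vector<cv::Point2f> &prev,
                        const std::vector<cv::Point2f> &cur) {
    cv::Mat T = cv::estimateAffinePartial2D(prev, cur);  // RANSAC by default
    if (T.empty())                 // no consensus between the point sets
        return {0.0, 0.0, 0.0};
    return {T.at<double>(0, 2),    // dx: translation in x
            T.at<double>(1, 2),    // dy: translation in y
            std::atan2(T.at<double>(1, 0), T.at<double>(0, 0))};  // da: rotation
}
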
