OpenPecha · eroux · Jan 23, 2023 · Jan 23, 2023 · Jan 23, 2023 · Jan 23, 2023
@@ -66,3 +66,4 @@ pre-commit install
 PYTHONPATH=.:$PYTHONPATH pytest tests
 ```
 <!-- This section must link to the docs which are in the root of the repository in /docs -->
+
@@ -150,14 +150,15 @@ def dict_to_bbox(self, word):
     def get_width_of_vertices(vertices):
         if len(vertices) < 4:
             return None
-        smallest_x = -1
+        # Google sometimes returns a vertex with x=-1, so -1 cannot mark "unset"; use None.
+        smallest_x = None
         largest_x = -1
         for v in vertices:
             if "x" not in v or "y" not in v:
                 continue
-            smallest_x = v["x"] if smallest_x == -1 else min(v["x"], smallest_x)
+            smallest_x = v["x"] if smallest_x is None else min(v["x"], smallest_x)
             largest_x = max(v["x"], largest_x)
-        if smallest_x == -1:
+        if smallest_x is None:
             return None
         return largest_x - smallest_x
 
@@ -187,7 +188,7 @@ def get_char_base_bboxes_and_avg_width(self, response):
                             if symbolunicat in UNICODE_CHARCAT_FOR_WIDTH:
                                 vertices = symbol['boundingBox']['vertices']
                                 width = GoogleVisionFormatter.get_width_of_vertices(vertices)
-                                if width > 0:
+                                if width is not None and width > 0:
                                     widths.append(width)
                             cur_word += symbol['text']
                             if self.has_space_attached(symbol):

@@ -0,0 +1,41 @@
+from pathlib import Path
+
+from openpecha.formatters.ocr.ocr import OCRFormatter
+from openpecha.formatters.ocr.google_vision import GoogleVisionFormatter, GoogleVisionBDRCFileProvider
+from openpecha.utils import load_yaml
+
+
+ocr_path = Path(__file__).parent / "data" / "36940497.json"  # data/36940497.json next to this file
+
+def negative_x_vertex():
+    """Load data/36940497.json and run it through GoogleVisionFormatter and OCRFormatter.build_page."""
+    # The json file has a vertex with an x coordinate of -1
+
+    state = {
+        "base_layer_len": 0,
+        "base_layer": "",
+        "low_confidence_annotations": {},
+        "language_annotations": [],
+        "pagination_annotations": {},
+        "word_confidences": [],
+        "latest_language_annotation": None,
+        "latest_low_confidence_annotation": None,
+        "page_low_confidence_annotations": []
+    }
+
+    ocr_object = load_yaml(ocr_path)
+
+    google_formatter = GoogleVisionFormatter()
+
+    bboxes, avg_char_width = google_formatter.get_char_base_bboxes_and_avg_width(response=ocr_object)
+
+    ocr_formatter = OCRFormatter()
+    ocr_formatter.remove_duplicate_symbols = True
+    ocr_formatter.same_line_ratio_threshold = 0.2
+
+    ocr_formatter.build_page(bboxes, 1, "36940497", state, avg_char_width)
+    base = state['base_layer']  # NOTE(review): `base` is never checked; presumably the goal is only "does not raise" - confirm
+
+
+if __name__ == "__main__":  # run the check when this file is executed as a script
+    negative_x_vertex()
Original file line number	Diff line number	Diff line change
Expand Up		@@ -66,3 +66,4 @@ pre-commit install
		PYTHONPATH=.:$PYTHONPATH pytest tests
		```
		<!-- This section must link to the docs which are in the root of the repository in /docs -->