Merge pull request #141 from VikParuchuri/dev
Fix transformers bug
VikParuchuri committed Jun 30, 2024
2 parents 0d7c170 + 7301718 commit f7c6c04
Showing 11 changed files with 525 additions and 481 deletions.
12 changes: 8 additions & 4 deletions README.md
@@ -39,16 +39,20 @@ Surya is named for the [Hindu sun god](https://en.wikipedia.org/wiki/Surya), who
| Scanned Form | [Image](static/images/funsd.png) | [Image](static/images/funsd_text.jpg) | [Image](static/images/funsd_layout.jpg) | [Image](static/images/funsd_reading.jpg) |
| Textbook | [Image](static/images/textbook.jpg) | [Image](static/images/textbook_text.jpg) | [Image](static/images/textbook_layout.jpg) | [Image](static/images/textbook_order.jpg) |

+# Hosted API
+
+There is a hosted API for all surya models available [here](https://www.datalab.to/):
+
+- Works with PDF, images, word docs, and powerpoints
+- Consistent speed, with no latency spikes
+- High reliability and uptime
+
# Commercial usage

I want surya to be as widely accessible as possible, while still funding my development/training costs. Research and personal usage is always okay, but there are some restrictions on commercial usage.

The weights for the models are licensed `cc-by-nc-sa-4.0`, but I will waive that for any organization under $5M USD in gross revenue in the most recent 12-month period AND under $5M in lifetime VC/angel funding raised. If you want to remove the GPL license requirements (dual-license) and/or use the weights commercially over the revenue limit, check out the options [here](https://www.datalab.to).

-# Hosted API
-
-There is a hosted API for all surya models available [here](https://www.datalab.to/). It's currently in beta, and I'm working on optimizing speed.
-
# Installation

You'll need python 3.9+ and PyTorch. You may need to install the CPU version of torch first if you're not using a Mac or a GPU machine. See [here](https://pytorch.org/get-started/locally/) for more details.
2 changes: 1 addition & 1 deletion ocr_text.py
@@ -1,3 +1,4 @@
+import os
import argparse
import json
from collections import defaultdict
@@ -13,7 +14,6 @@
from surya.ocr import run_ocr
from surya.postprocessing.text import draw_text_on_image
from surya.settings import settings
-import os


def main():
964 changes: 498 additions & 466 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "surya-ocr"
version = "0.4.12"
version = "0.4.14"
description = "OCR, layout, reading order, and line detection in 90+ languages"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
readme = "README.md"
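The version bump above pairs with the transformers compatibility changes below. To confirm you picked up the fixed release, a quick check of the installed distribution (assuming it was installed under the `surya-ocr` name declared in this file):

```python
from importlib.metadata import version

# Prints the installed surya-ocr version; 0.4.14 or later includes this fix.
print(version("surya-ocr"))
```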
2 changes: 1 addition & 1 deletion reading_order.py
@@ -1,3 +1,4 @@
+import os
import argparse
import copy
import json
@@ -12,7 +13,6 @@
from surya.ordering import batch_ordering
from surya.postprocessing.heatmap import draw_polys_on_image
from surya.settings import settings
-import os


def main():
3 changes: 2 additions & 1 deletion surya/model/detection/segformer.py
@@ -435,7 +435,7 @@ def forward(
        return encoder_outputs

class SegformerForRegressionMask(SegformerForSemanticSegmentation):
-    def __init__(self, config):
+    def __init__(self, config, **kwargs):
        super().__init__(config)
        self.segformer = SegformerModel(config)
        self.decode_head = SegformerForMaskDecodeHead(config)
@@ -446,6 +446,7 @@ def __init__(self, config):
    def forward(
        self,
        pixel_values: torch.FloatTensor,
+        **kwargs
    ) -> Union[Tuple, SemanticSegmenterOutput]:

        encoder_hidden_states = self.segformer(
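This is the core of the transformers fix, and the same signature change repeats in the ordering and recognition models below: every overridden `__init__` and `forward` gains a `**kwargs` catch-all so that extra keyword arguments passed by newer transformers releases are absorbed instead of raising a `TypeError`. A minimal sketch of the failure mode, using toy classes rather than Surya's actual modules (the `cache_position` keyword is only an illustrative stand-in for an argument an upstream caller might add):

```python
# Toy illustration (hypothetical classes): a keyword the subclass never declared
# breaks the strict signature but is silently absorbed by the **kwargs one.
class StrictHead:
    def forward(self, pixel_values):
        return pixel_values


class TolerantHead:
    def forward(self, pixel_values, **kwargs):
        # Unknown keywords (e.g. an argument added in a newer library release) are ignored.
        return pixel_values


extra = {"cache_position": None}  # stand-in for a newly introduced upstream argument
print(TolerantHead().forward([1, 2, 3], **extra))  # works: [1, 2, 3]
try:
    StrictHead().forward([1, 2, 3], **extra)
except TypeError as exc:
    print(exc)  # forward() got an unexpected keyword argument 'cache_position'
```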
3 changes: 2 additions & 1 deletion surya/model/ordering/decoder.py
@@ -487,7 +487,7 @@ class MBartOrder(MBartForCausalLM):
    config_class = MBartOrderConfig
    _tied_weights_keys = []

-    def __init__(self, config):
+    def __init__(self, config, **kwargs):
        config = copy.deepcopy(config)
        config.is_decoder = True
        config.is_encoder_decoder = False
@@ -515,6 +515,7 @@ def forward(
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
+        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
6 changes: 3 additions & 3 deletions surya/model/ordering/encoder.py
@@ -15,7 +15,7 @@ class VariableDonutSwinEmbeddings(DonutSwinEmbeddings):
    Construct the patch and position embeddings. Optionally, also the mask token.
    """

-    def __init__(self, config, use_mask_token=False):
+    def __init__(self, config, use_mask_token=False, **kwargs):
        super().__init__(config, use_mask_token)

        self.patch_embeddings = DonutSwinPatchEmbeddings(config)
@@ -37,7 +37,7 @@ def __init__(self, config, use_mask_token=False):
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
-        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None
+        self, pixel_values: Optional[torch.FloatTensor], bool_masked_pos: Optional[torch.BoolTensor] = None, **kwargs
    ) -> Tuple[torch.Tensor]:

        embeddings, output_dimensions = self.patch_embeddings(pixel_values)
@@ -68,7 +68,7 @@ def forward(

class VariableDonutSwinModel(DonutSwinModel):
    config_class = VariableDonutSwinConfig
-    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
+    def __init__(self, config, add_pooling_layer=True, use_mask_token=False, **kwargs):
        super().__init__(config)
        self.config = config
        self.num_layers = len(config.depths)
3 changes: 2 additions & 1 deletion surya/model/recognition/decoder.py
@@ -436,7 +436,7 @@ class MBartMoE(MBartForCausalLM):
    config_class = MBartMoEConfig
    _tied_weights_keys = ["lm_head.weight"]

-    def __init__(self, config):
+    def __init__(self, config, **kwargs):
        config = copy.deepcopy(config)
        config.is_decoder = True
        config.is_encoder_decoder = False
@@ -467,6 +467,7 @@ def forward(
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
+        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

3 changes: 2 additions & 1 deletion surya/model/recognition/encoder.py
@@ -391,7 +391,7 @@ def forward(

class VariableDonutSwinModel(DonutSwinModel):
    config_class = VariableDonutSwinConfig
-    def __init__(self, config, add_pooling_layer=True, use_mask_token=False):
+    def __init__(self, config, add_pooling_layer=True, use_mask_token=False, **kwargs):
        super().__init__(config)
        self.config = config
        self.num_layers = len(config.depths)
@@ -413,6 +413,7 @@ def forward(
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
+        **kwargs
    ) -> Union[Tuple, DonutSwinModelOutput]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
6 changes: 5 additions & 1 deletion surya/postprocessing/heatmap.py
@@ -108,7 +108,11 @@ def detect_boxes(linemap, text_threshold, low_text):
        segmap[labels == k] = 255
        x, y = stats[k, cv2.CC_STAT_LEFT], stats[k, cv2.CC_STAT_TOP]
        w, h = stats[k, cv2.CC_STAT_WIDTH], stats[k, cv2.CC_STAT_HEIGHT]
-        niter = int(math.sqrt(size * min(w, h) / (w * h)) * 2)
+        try:
+            niter = int(math.sqrt(size * min(w, h) / (w * h)) * 2)
+        except ValueError:
+            # Overflow when size is too large
+            niter = 0
        sx, ex, sy, ey = x - niter, x + w + niter + 1, y - niter, y + h + niter + 1

        # boundary checks
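The new `try`/`except` guards the dilation-radius computation in `detect_boxes`. A plausible reproduction of the failure it catches, assuming the component stats arrive as 32-bit integers (as `cv2.connectedComponentsWithStats` returns them), so the intermediate product can overflow, go negative, and make `math.sqrt` raise `ValueError`:

```python
import math

import numpy as np

# Sketch under the assumption of int32 component stats: the product
# size * min(w, h) wraps around to a negative value, so math.sqrt raises
# ValueError ("math domain error") and the patched code falls back to niter = 0.
size, w, h = np.int32(70_000), np.int32(40_000), np.int32(30_000)
try:
    niter = int(math.sqrt(size * min(w, h) / (w * h)) * 2)
except ValueError:
    niter = 0  # same fallback as the patched code
print(niter)  # 0
```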
