In [10]:
!pip install surya-ocr



In [1]:
from PIL import Image
from surya.ocr import run_ocr
from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor


  from .autonotebook import tqdm as notebook_tqdm





## Extract a few pages from the pdf

In [33]:
from PyPDF2 import PdfReader, PdfWriter

def extract_pages(input_pdf, output_pdf, pages_to_extract):
    # Create a PDF reader and writer
    reader = PdfReader(input_pdf)
    writer = PdfWriter()
    
    # Extract the specified pages
    for page_num in pages_to_extract:
        writer.add_page(reader.pages[page_num])

    # Write the extracted pages to a new PDF
    with open(output_pdf, 'wb') as output_file:
        writer.write(output_file)

    print(f"New PDF created: {output_pdf}")


input_pdf = r'D:\Programming\AI\Basics\intern-assignments\SuryaOCR\Unified-Directives-2080-Circular-Final-Published.pdf' 
output_pdf = 'sample5.pdf'       
pages_to_extract = [0,1,2,3,4,5,6]  

extract_pages(input_pdf, output_pdf, pages_to_extract)


New PDF created: sample5.pdf


### Convert the few extracted pages into images

In [34]:
from pdf2image import convert_from_path

pdf_path = r'D:\Programming\AI\Basics\intern-assignments\SuryaOCR\sample6.pdf'

images = convert_from_path(pdf_path)
image_paths = []
for i, image in enumerate(images):
    # Save each page as an image file
    image_path = f"page_{i + 11}.png"
    image.save(image_path, 'PNG')
    image_paths.append(image_path)

## Testing Surya OCR

In [2]:
image = Image.open('page_2.png')
langs = ["en",'ne'] # Replace with your languages - optional but recommended
det_processor, det_model = load_det_processor(), load_det_model()
rec_model, rec_processor = load_rec_model(), load_rec_processor()

predictions2 = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32


Detecting bboxes: 100%|██████████| 1/1 [00:08<00:00,  8.95s/it]
Recognizing Text: 100%|██████████| 1/1 [00:17<00:00, 17.01s/it]


In [22]:
output_path = "temp.txt"
with open(output_path, "w", encoding="utf-8") as f:    
    out_text = format_text(predictions)    
    f.write(out_text)

In [10]:
class MyBBox():
    def __init__(self,x_min,y_min,x_max,y_max):
        self.x_min = x_min  
        self.y_min = y_min
        self.x_max = x_max
        self.y_max = y_max
        
    def __str__(self):
        return f'[{self.x_min},{self.y_min},{self.x_max},{self.y_max}]'
    # Make the class subscriptable
    def __getitem__(self, index):
        # Define a list of attributes to return for indexing
        bbox_values = [self.x_min, self.y_min, self.x_max, self.y_max]
        return bbox_values[index]

    def __setitem__(self, index, value):
        # Update the corresponding attribute based on the index
        if index == 0:
            self.x_min = value
        elif index == 1:
            self.y_min = value
        elif index == 2:
            self.x_max = value
        elif index == 3:
            self.y_max = value
        else:
            raise IndexError("Index out of range for MyBBox")

class MyTextLine():
    def __init__(self,bbox: MyBBox,text: str):        
        self.bbox = bbox
        self.text = text
    def __repr__(self):
        return f'MyTextLine(bbox = {self.bbox}, text = {self.text})'

In [11]:
def my_predictions(data):
    my_data = []
    for pred in data:
        bbox = MyBBox(pred.bbox[0],pred.bbox[1],pred.bbox[2],pred.bbox[3])
        my_data.append(MyTextLine(bbox,pred.text))
    return my_data

### Testing the predictions variable

In [3]:
list(predictions[0])[0][1][1].text

'बालुवाटार, काठमाडाँ ।'

In [30]:
saved_predictions = []
def detect_text(image_path):      
    image = Image.open(image_path)
    langs = ["en",'ne'] 
    det_processor, det_model = load_det_processor(), load_det_model()
    rec_model, rec_processor = load_rec_model(), load_rec_processor()
    predictions = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)
    saved_predictions.append(predictions)
    return predictions   

In [7]:
list(predictions2[0])[0][1]

[TextLine(polygon=[[381.0, 161.0], [1284.0, 164.0], [1284.0, 226.0], [381.0, 223.0]], confidence=0.994456946849823, text='नेपाल राष्ट्र बैंकबाट “क”, “ख” र “ग” वर्गका', bbox=[381.0, 161.0, 1284.0, 226.0]),
 TextLine(polygon=[[402.0, 236.0], [1256.0, 230.0], [1257.0, 287.0], [402.0, 293.0]], confidence=0.9924792647361755, text=' इजाजतपत्रप्राप्त  संस्थाहरुलाई  जारी  गरिएको', bbox=[402.0, 236.0, 1256.0, 287.0]),
 TextLine(polygon=[[496.0, 470.0], [1163.0, 458.0], [1166.0, 544.0], [498.0, 556.0]], confidence=0.9814775586128235, text='एकीकृत  निर्देशन,  २०८०', bbox=[496.0, 470.0, 1163.0, 544.0]),
 TextLine(polygon=[[209.0, 605.0], [1454.0, 609.0], [1454.0, 657.0], [209.0, 654.0]], confidence=0.998428225517273, text='(एकीकृत निर्देशन, २०७९ र सो पश्चात २०८० असोज मसान्त सम्म जारी भएका', bbox=[209.0, 605.0, 1454.0, 657.0]),
 TextLine(polygon=[[369.0, 661.0], [1292.0, 661.0], [1292.0, 705.0], [369.0, 705.0]], confidence=0.9764374494552612, text='परिपत्र / निर्देशन  समेतलाई  समावेश  गरी  परिमार्ज

In [28]:
def format_text(predictions):
    predictions = my_predictions(list(predictions[0])[0][1])
    data = sorted(predictions,key=lambda x: (x.bbox[1]))
    prev_y_temp = 0    
    for pred in data:
        if pred.bbox[1] - prev_y_temp < 30:            
            pred.bbox[1] = prev_y_temp            
        else:
            prev_y_temp = pred.bbox[1]    
    data = sorted(data,key=lambda x: (x.bbox[1],x.bbox[0]))
    formatted_text = ''
    min_x = min(data,key=lambda x: x.bbox[0]).bbox[0]
    min_y = data[0].bbox[1]
    prev_x_max = 0    
    prev_y_max = 0
    prev_y = 0    
    for pred in data:
        current_x = pred.bbox[0] - min_x
        current_y = pred.bbox[1] - min_y        
        current_y_max = pred.bbox[3] - min_y
        line_breaks = '\n'*int(((current_y-prev_y_max)/60))
        if line_breaks:
            prev_x_max = 0                        
        #blanks = ' '*int(((current_x-prev_x_max)/8))
        formatted_text += line_breaks
        #formatted_text += blanks
        formatted_text += ' ' *int(current_x/15)
        formatted_text += pred.text +'\n'
        prev_x_max = pred.bbox[2]
        prev_y = current_y
        prev_y_max = current_y_max
    return formatted_text

In [29]:
this = format_text(predictions2)
print(this)

           नेपाल राष्ट्र बैंकबाट “क”, “ख” र “ग” वर्गका
             इजाजतपत्रप्राप्त  संस्थाहरुलाई  जारी  गरिएको



                   एकीकृत  निर्देशन,  २०८०

(एकीकृत निर्देशन, २०७९ र सो पश्चात २०८० असोज मसान्त सम्म जारी भएका
          परिपत्र / निर्देशन  समेतलाई  समावेश  गरी  परिमार्जन  गरिएको)

















                              नेपाल  राष्ट्र  बैंक
                              केन्द्रीय  कार्यालय
                बैंक  तथा  वित्तीय  संस्था  नियमन  विभाग
                                २०८०  असोज



In [90]:
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

# Register a custom font (optional, for Nepali text)
pdfmetrics.registerFont(TTFont("Mangal", r"D:\Programming\AI\Basics\intern-assignments\SuryaOCR\Mangal Regular\Mangal Regular.otf"))

# Get text width
def get_text_width(text, font_name, font_size):
    return pdfmetrics.stringWidth(text, font_name, font_size)

# Example usage
text = " "
font_name = "Mangal"  # Use a registered font
font_size = 20       # Specify the font size

text_width = get_text_width(text, font_name, font_size)
print(f"Text width: {text_width} points")

Text width: 10.0 points


In [44]:
sorted(list(predictions[0])[0][1],key=lambda x: (x.bbox[1],x.bbox[0]))[0].bbox[1]

167.0

In [35]:
output_path = "final.txt"
#image_paths = ['D:\Programming\AI\Basics\intern-assignments\SuryaOCR\page_8.png']
with open(output_path, "w", encoding="utf-8") as f:    
    for page,image_path in enumerate(image_paths):
        f.write(f'\n----------------------------------- Page {page+1} -----------------------------------\n')
        predictions = detect_text(image_path)
        formatted_text = format_text(predictions)
        f.write(formatted_text)

Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32


Detecting bboxes: 100%|██████████| 1/1 [00:08<00:00,  8.43s/it]
Recognizing Text: 100%|██████████| 1/1 [00:58<00:00, 58.09s/it]


Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32


Detecting bboxes: 100%|██████████| 1/1 [00:08<00:00,  8.08s/it]
Recognizing Text: 100%|██████████| 1/1 [00:17<00:00, 17.40s/it]


Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32


Detecting bboxes: 100%|██████████| 1/1 [00:07<00:00,  7.85s/it]
Recognizing Text: 100%|██████████| 1/1 [00:21<00:00, 22.00s/it]


Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32


Detecting bboxes: 100%|██████████| 1/1 [00:10<00:00, 10.68s/it]
Recognizing Text: 100%|██████████| 2/2 [02:02<00:00, 61.37s/it]


Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32


Detecting bboxes: 100%|██████████| 1/1 [00:10<00:00, 10.19s/it]
Recognizing Text: 100%|██████████| 2/2 [01:32<00:00, 46.03s/it]


Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32


Detecting bboxes: 100%|██████████| 1/1 [00:09<00:00,  9.02s/it]
Recognizing Text: 100%|██████████| 2/2 [01:42<00:00, 51.05s/it]


Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32


Detecting bboxes: 100%|██████████| 1/1 [00:08<00:00,  8.74s/it]
Recognizing Text: 100%|██████████| 3/3 [02:52<00:00, 57.59s/it]


Loaded detection model vikp/surya_det3 on device cpu with dtype torch.float32
Loaded recognition model vikp/surya_rec2 on device cpu with dtype torch.float32


Detecting bboxes: 100%|██████████| 1/1 [00:09<00:00,  9.71s/it]
Recognizing Text: 100%|██████████| 2/2 [01:59<00:00, 59.94s/it]


In [17]:
print(formatted_text)

                                                              केन्द्रीय कार्यालय
                                                            बालुवाटार, काठमाडाँ ।
                                                          फोन नं.: ४४९९८०४ /५
नंपाल राष्ट्र बैंक
                                                           Web Site: www.nrb.org.np
                                                              पोष्ट बक्स:७३
बैंक तथा वित्तीय संस्था नियमन विभाग
पत्र  संख्या:  बै.वि.नि.वि. / नीति / परिपत्र / 0 ४ / ०८० / ८१
                                                              मिति: २०८०/०६/३०
इजाजतपत्रप्रप्राप्त “क”, “ख” र “ग” वर्गका बैंक तथा वित्तीय संस्थाहरु,
महाशय.
  यस वैंकवाट “क” “ख” र “ग” वर्गका इजाजतपत्रप्राप्त बैंक तथा वित्तीय संस्थाहरुलाई जारी
गरिएको एकीकृत निर्देशन, २०७९ तथा सो पश्चीत २०८० असोज मसान्तसम्म जारी गरिएका परिपत्र
तथा संशोधनहरु एकीकृत गरी तयार पारिएको यसै साथ संलग्न एकीकृत निर्देशन, २०५० जारी गरिएको
हुँदा सोहीबमोजिम गर्नु/गराउनु हुन नेपाल राष्ट्र बैंक ऐन, २०५८ को दफा 