
Task : 

Extract the total amount of the receipts using OCR


Hypotheses:

1- The receipts are properly scanned
2- The value we want to extract is always on the same line as our keyword ('total' in our case), and to its right
3- The OCR makes no mistakes when recognizing characters


Steps:

1- We will scan our receipts with the tesseract open-source OCR.

2- We will extract all the boxes (coordinates) that contain our keyword

3- As we only focus on getting the total amount in this mini project, we will only build the lines that contain our keyword ('total')

    We will suppose that a box belongs to the line of our keyword's box if and only if it lies to the right of the keyword's box and the difference between the Y coordinates of the centers of gravity of the two boxes is below a chosen threshold (a fraction of the keyword box's height).

4- We will then extract the amounts from the string using Regex

5- Finally, we will take the maximum of the extracted amounts as our answer and print it for the user.



In [3]:
import pytesseract
from pytesseract import Output
import cv2
import re


class OCR_get_total:
    """Extract the total amount printed on a receipt image using Tesseract OCR.

    The pipeline is: OCR the image into word boxes, locate every box whose
    text contains the keyword, collect the words lying to the right of each
    keyword box on (roughly) the same horizontal line, pull number-like
    tokens out of those words and keep the largest one.

    Parameters
    ----------
    img_path : str
        Path of the receipt image, read with ``cv2.imread``.
    keyword : str, optional
        Word that marks the line holding the amount (default ``'total'``).
        Matching is case-insensitive and substring-based, so ``'total'``
        also matches e.g. ``'subtotal'``.
    seuil : float, optional
        Line-membership threshold ("seuil" is French for threshold),
        expressed as a fraction of the keyword box's height (default 0.5).

    Raises
    ------
    FileNotFoundError
        If ``cv2.imread`` could not load the image.
    """

    # Number-like tokens: digits optionally mixed with '.'/',' separators,
    # or a bare fraction such as ".50".
    _AMOUNT_PATTERN = re.compile(r'[\d]+[.,\d]+|[\d]*[.][\d]+|[\d]+')

    def __init__(self, img_path, keyword='total', seuil=0.5):
        self.img_path = img_path
        # chosen keyword
        self.keyword = keyword

        self.img = cv2.imread(self.img_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail early with a clear message rather than deep inside pytesseract.
        if self.img is None:
            raise FileNotFoundError(f'Could not read image: {self.img_path!r}')

        # the OCR outputs: a dict of parallel lists, one entry per box
        self.d = pytesseract.image_to_data(self.img, output_type=Output.DICT)
        # lower-case every recognized word so keyword matching ignores case
        self.d['text'] = [s.lower() for s in self.d['text']]
        # the chosen threshold
        self.seuil = seuil

        self.n_boxes = len(self.d['text'])
        # box index -> [left, top, width, height] of each keyword box
        self.keyword_boxes = {}
        # box index -> text found to the right of that keyword box
        self.dic_lines = {}
        # the final result: a float, or '?' when nothing was found
        self.result = None

    def get_keyword_boxes(self):
        """Store the coordinates of every box containing the keyword.

        Fills ``self.keyword_boxes`` with ``{box_index: [left, top, width,
        height]}`` for each OCR box whose text contains the keyword.
        """
        # The OCR text was lower-cased in __init__, so compare in lower case.
        needle = self.keyword.lower()

        for i in range(self.n_boxes):
            if needle in self.d['text'][i]:
                self.keyword_boxes[i] = [
                    self.d['left'][i],
                    self.d['top'][i],
                    self.d['width'][i],
                    self.d['height'][i],
                ]

    def get_keyword_lines(self):
        """Build, for each keyword box, the text lying to its right.

        A word belongs to a keyword's line when its box starts to the right
        of the keyword box's right edge and the vertical distance between
        the two box centers is at most ``seuil`` times the keyword box's
        height. The result is stored in ``self.dic_lines`` as
        ``{box_index: ' word1 word2 ...'}`` (words in OCR output order).
        """
        texts = self.d['text']
        lefts = self.d['left']
        tops = self.d['top']
        heights = self.d['height']

        lines = {}
        for key, (left, top, width, height) in self.keyword_boxes.items():
            threshold = height * self.seuil
            right_edge = left + width
            center_y = top + (height / 2)

            words = [
                texts[i]
                for i in range(self.n_boxes)
                if texts[i] != ''
                and lefts[i] > right_edge
                and abs((tops[i] + (heights[i] / 2)) - center_y) <= threshold
            ]
            lines[key] = ''.join(' ' + word for word in words)

        self.dic_lines = lines

    @staticmethod
    def _parse_amount(token):
        """Convert a number-like token to ``float``; return ``None`` if invalid.

        A single separator ('.' or ',') is read as the decimal mark. With
        several separators the token must look like a grouped number
        ("1,234.56", "1.234,56", "1,234,567"): thousands groups of exactly
        three digits, optionally followed by a decimal part introduced by
        the *other* separator. Anything else (e.g. a date "12.05.2020") is
        rejected instead of raising.
        """
        # Trailing separators are OCR/punctuation noise ("12." -> "12").
        cleaned = token.rstrip('.,')
        parts = re.split(r'[.,]', cleaned)

        if len(parts) <= 2:
            try:
                return float('.'.join(parts))
            except ValueError:
                return None

        # Consecutive separators ("1..2") cannot be a valid number.
        if not all(parts):
            return None

        separators = [ch for ch in cleaned if ch in '.,']
        decimal_mark = separators[-1]
        grouping_marks = set(separators[:-1])
        if len(grouping_marks) != 1:
            return None

        if decimal_mark in grouping_marks:
            # Same separator throughout: thousands grouping only.
            whole, fraction = parts, None
        else:
            whole, fraction = parts[:-1], parts[-1]

        if any(len(group) != 3 for group in whole[1:]):
            return None

        number = ''.join(whole)
        if fraction is not None:
            number += '.' + fraction
        return float(number)

    def get_amount_from_lines(self):
        """Extract amounts from the keyword lines and keep the largest.

        Sets ``self.result`` to the maximum amount found (a ``float``), or
        to ``'?'`` when no line yields a parsable amount.
        """
        amounts = []
        for line in self.dic_lines.values():
            for match in self._AMOUNT_PATTERN.finditer(line):
                amount = self._parse_amount(match[0])
                if amount is not None:
                    amounts.append(amount)

        self.result = max(amounts) if amounts else '?'

    def get_total(self):
        """Run the whole pipeline and print the detected total.

        Prints the amount when one was found, otherwise a failure message.
        The value itself remains available in ``self.result``.
        """
        self.get_keyword_boxes()
        self.get_keyword_lines()
        self.get_amount_from_lines()

        if self.result != '?':
            print(f'Total amount for this receipt is : {self.result} $')
        else:
            print('The OCR process failed to detect the total amount')

        


   



           


       

    


    

In [4]:
# Run the extraction on receipts ./data/1.jpg through ./data/39.jpg,
# printing a header line followed by the outcome for each one.
for k in range(1, 40):
    image_path = f'./data/{k}.jpg'
    a = OCR_get_total(image_path)
    print( f'receipt  {k} : ')
    a.get_total()

    

receipt  1 : 
Total amount for this receipt is : 125.23 $
receipt  2 : 
Total amount for this receipt is : 15.81 $
receipt  3 : 
Total amount for this receipt is : 82.27 $
receipt  4 : 
Total amount for this receipt is : 41.14 $
receipt  5 : 
Total amount for this receipt is : 49.64 $
receipt  6 : 
Total amount for this receipt is : 44.687 $
receipt  7 : 
Total amount for this receipt is : 13.78 $
receipt  8 : 
Total amount for this receipt is : 30.37 $
receipt  9 : 
Total amount for this receipt is : 52.04 $
receipt  10 : 
Total amount for this receipt is : 30.37 $
receipt  11 : 
Total amount for this receipt is : 12.48 $
receipt  12 : 
The OCR process failed to detect the total amount
receipt  13 : 
Total amount for this receipt is : 30.37 $
receipt  14 : 
Total amount for this receipt is : 27.1 $
receipt  15 : 
Total amount for this receipt is : 35.66 $
receipt  16 : 
Total amount for this receipt is : 107.45 $
receipt  17 : 
Total amount for this receipt is : 17.77 $
receipt  18 : 