In [1]:
# standard library
import os
import re

# for manipulating the PDF
import fitz

# for OCR using PyTesseract
import cv2                              # pre-processing images
import pytesseract                      # extracting text from images
import numpy as np
import matplotlib.pyplot as plt         # displaying output images
import pandas as pd
from PIL import Image
# Location of the Tesseract executable used by pytesseract. The TESSERACT_CMD
# environment variable overrides the default Windows install path, so the
# notebook can run on another machine without editing this cell.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'C:\Program Files\Tesseract-OCR\tesseract.exe'
)

In [3]:
class PDF_to_csv:
    """Turn a scanned PDF containing a customer table into a pandas DataFrame.

    The pipeline is split into steps that are called one after another:

    1. ``set_number_of_pages``      - render every page of the PDF to a PNG file.
    2. ``preprocess_image``         - load the PNGs, convert to grayscale, binarize.
    3. ``perform_ocr_return_text``  - run Tesseract on the images, dump and return the text.
    4. ``clean_data``               - parse the OCR text into a six-column DataFrame.
    """

    # Scale factors handed to fitz.Matrix when rasterizing a page.
    ZOOM_X: float = 3.0
    ZOOM_Y: float = 3.0

    # Directory receiving the page PNGs and the raw OCR text dump.
    OUTPUT_DIR = "./pages_split"

    # Column order of the resulting table.
    COLUMNS = ['Zákazník', 'Adresa', 'Město', 'Telefon', 'Email', 'Datum prodeje']

    # For a line with N recognized fields: the target column index of each field.
    # Lines with any other number of fields are discarded.
    _FIELD_SLOTS = {
        4: (0, 2, 3, 5),
        5: (0, 1, 2, 3, 5),
        6: (0, 1, 2, 3, 4, 5),
    }

    # Non-alphanumeric characters that survive cleaning.
    _KEPT_PUNCTUATION = " .,:@/"

    def __init__(self, scanned_file_name):
        """Open the scanned document.

        Args:
            scanned_file_name: path of the PDF to process.
        """
        self.raw_pdf_name = scanned_file_name
        # NOTE(review): OpenCV has no PDF decoder, so this is expected to be
        # None for PDF input. The attribute is kept so existing code that
        # reads it does not break; nothing in this class uses it.
        self.pdf = cv2.imread(scanned_file_name)
        self.doc = fitz.open(scanned_file_name)
        self.scaling_matrix = fitz.Matrix(PDF_to_csv.ZOOM_X, PDF_to_csv.ZOOM_Y)
        self.pages = 0
        self.page_names = []

    @staticmethod
    def _file_stem(file_name):
        """Return the file name without directories and without its extension.

        Both ``/`` and ``\\`` are treated as directory separators so the result
        is the same regardless of the platform the path was written on.
        """
        base_name = re.split(r"[\\/]", file_name)[-1]
        return os.path.splitext(base_name)[0]

    def set_number_of_pages(self):
        """Render every page of the document to a PNG and record the results.

        Fills ``self.page_names`` with the paths of the written PNG files and
        sets ``self.pages`` to their count. Calling it again re-renders the
        pages and replaces (rather than extends) the recorded list.
        """
        os.makedirs(PDF_to_csv.OUTPUT_DIR, exist_ok=True)
        # The previous split('\\')/split('.') chain returned an empty stem for
        # relative paths such as "./file.pdf", so all documents shared the same
        # PNG names and overwrote each other.
        stem = PDF_to_csv._file_stem(self.raw_pdf_name)

        self.page_names = []
        for page in self.doc:
            pix = page.get_pixmap(matrix=self.scaling_matrix)
            png = PDF_to_csv.OUTPUT_DIR + '/input-' + stem + 'page-%i.png' % page.number
            pix.save(png)
            self.page_names.append(png)
        self.pages = len(self.page_names)

    def preprocess_image(self):
        """Load the rendered page PNGs and binarize them for OCR.

        Returns:
            A list with one Otsu-thresholded grayscale image per entry of
            ``self.page_names``, in the same order.
        """
        images = []
        for page in self.page_names:
            image = cv2.imread(page)
            gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
            # With THRESH_OTSU the threshold value is chosen automatically; the
            # first return value (the chosen threshold) is not needed.
            _, threshold_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY)
            images.append(threshold_image)
        return images

    @staticmethod
    def perform_ocr_return_text(list_of_images, name):
        """Run Tesseract on each image, write the text to disk and return it.

        Args:
            list_of_images: images to recognize, processed in order.
            name: label used in the dump file name
                ``./pages_split/predictions-{name}.txt``.

        Returns:
            The recognized text of all images concatenated in order.
        """
        os.makedirs(PDF_to_csv.OUTPUT_DIR, exist_ok=True)
        page_texts = []
        # Explicit UTF-8: the platform default encoding may not be able to
        # represent every character the Czech OCR model can emit.
        with open(f"{PDF_to_csv.OUTPUT_DIR}/predictions-{name}.txt", "w", encoding="utf-8") as f:
            for image in list_of_images:
                # preserve_interword_spaces keeps wide gaps in the output;
                # clean_data splits columns on runs of four spaces.
                text = pytesseract.image_to_string(image, lang='ces', config='-c preserve_interword_spaces=1 --oem 3 --psm 6')
                f.write(text)
                page_texts.append(text)
        return "".join(page_texts)

    @staticmethod
    def clean_data(predictions) -> pd.DataFrame:
        """Parse OCR text into a table with the columns in ``COLUMNS``.

        Each text line is split into fields on runs of four spaces. Inside a
        field, whitespace runs are collapsed to one space and every character
        that is not alphanumeric or one of `` .,:@/`` is removed. Lines with
        4, 5 or 6 fields become rows (columns absent from a shorter line are
        filled with ``0``); all other lines are dropped, as are rows that
        repeat the table header.

        Args:
            predictions: OCR output, lines separated by ``"\\n"``.

        Returns:
            DataFrame with one row per accepted line.
        """
        rows = []
        for line in predictions.split("\n"):
            fields = []
            for fragment in line.split("    "):
                fragment = re.sub(r"\s\s+", " ", fragment).strip()
                # Drop fragments that are empty once stripped. Previously the
                # emptiness check ran before stripping, so a whitespace-only
                # fragment (e.g. trailing spaces or a stray "\r") counted as a
                # field and shifted the column mapping.
                if not fragment:
                    continue
                cleaned = "".join(
                    ch for ch in fragment
                    if ch.isalnum() or ch in PDF_to_csv._KEPT_PUNCTUATION
                )
                fields.append(cleaned.strip())

            slots = PDF_to_csv._FIELD_SLOTS.get(len(fields))
            if slots is None:
                continue
            # 0 marks a column missing from the line; kept as-is so existing
            # consumers of the CSV see the same placeholder.
            row = [0] * len(PDF_to_csv.COLUMNS)
            for slot, value in zip(slots, fields):
                row[slot] = value
            rows.append(row)

        data = pd.DataFrame(rows, columns=PDF_to_csv.COLUMNS)
        # The header line is recognized on every page; remove those rows.
        return data[data["Zákazník"] != "Zákazník"]
                

In [8]:
# Path of the scanned PDF that the pipeline below converts to CSV.
SCANNED_FILE = "./adresy_Ostrava.pdf"

In [9]:
# Run the whole pipeline: render pages -> binarize -> OCR -> tabulate -> CSV.
# Output names are derived from SCANNED_FILE, so pointing the notebook at a
# different document does not overwrite the results of the previous one.
CSV_OUTPUT_DIR = "./csv_final"
document_name = os.path.splitext(os.path.basename(SCANNED_FILE))[0]

pdf = PDF_to_csv(SCANNED_FILE)
pdf.set_number_of_pages()
images = pdf.preprocess_image()
final_output = PDF_to_csv.perform_ocr_return_text(images, document_name)
dataframe = PDF_to_csv.clean_data(final_output)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(CSV_OUTPUT_DIR, exist_ok=True)
dataframe.to_csv(f"{CSV_OUTPUT_DIR}/output_ocr_file_{document_name}.csv", index=False)

In [None]:
# Preview the first 50 rows of the table produced by clean_data.
dataframe.head(50)