In [1]:
import textract

from os import listdir
from os.path import isfile, join

import pandas as pd

import pdf2image
import tempfile
import pytesseract

# Directory holding the original documents (PDFs and images) to be processed.
path_files = '../documentos-originais'
# Directory where the resulting CSV files are written.
path_processed = '../dados-processados'
# Scratch directory for temporary page images and the per-file CSVs of Part 2.
path_temp = '../temporario'

## Part 1 - Extract text from pdfs and imgs without problems

In [3]:
# Run textract over every regular file in `path_files` and sort the outcome
# into three buckets:
#   * `dataframe`    - files whose text was extracted (columns: file, text)
#   * `ignore_files` - PDFs that yielded 10 bytes of output or fewer; they are
#                      reprocessed page-by-page as images in Part 2
#   * `error_files`  - files on which textract raised an exception
# The listing is sorted so the row order is reproducible between runs.
files = sorted(f for f in listdir(path_files) if isfile(join(path_files, f)))

error_files = []
ignore_files = []
records = []

for file in files:
    try:
        text = textract.process(f'{path_files}/{file}', language='por')
    except Exception:
        # Deliberate best-effort: a file textract cannot handle is recorded and
        # skipped. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt / SystemExit still stop a long run.
        error_files.append(file)
        continue

    # `text` is still bytes here, so the threshold is measured in bytes.
    # NOTE(review): 'PDF' is matched anywhere in the file name, not only in
    # the extension - confirm that is intended.
    if 'PDF' in file.upper() and len(text) <= 10:
        ignore_files.append(file)
        continue

    records.append({'file': file, 'text': text.decode('UTF-8')})

# Build the frame once: DataFrame.append was removed in pandas 2.0 and was
# quadratic when called per row. The index is now a unique 0..n-1 range
# instead of a 0 repeated on every row.
dataframe = pd.DataFrame(records, columns=['file', 'text'])

In [4]:
# Persist the outcome of the extraction pass under `path_processed`:
# the extracted text plus the two lists of file names that were set aside.
dataframe.to_csv(f'{path_processed}/images-and-pdf-with-text.csv')

name_lists = (
    (ignore_files, f'{path_processed}/ignored-files.csv'),
    (error_files, f'{path_processed}/error-files.csv'),
)
for names, csv_path in name_lists:
    # Each list becomes a one-column frame named 'files'.
    pd.DataFrame(names, columns=['files']).to_csv(csv_path)

## Part 2 - Extract text from PDFs with problems

- PDFs with images
- PDFs that textract can't extract text from

In [12]:
# NOTE: disabled single-process draft of the Part 2 extraction, kept for
# reference only. The cell that follows performs the same steps (PDF -> page
# images -> textract) through multiprocessing.Pool.
# NOTE(review): this draft reads the file name via `file[0]` when building the
# path but stores the whole `file` row in the 'file' column, and it hard-codes
# '../temporario' instead of using `path_temp` - fix both before re-enabling.

# dataframe2 = pd.DataFrame(columns=['file', 'text'])

# ignored_files_dataframe = pd.read_csv(f'{path_processed}/ignored-files.csv', index_col=0)

# for i, file in ignored_files_dataframe.iterrows():
#     text = ''

#     with tempfile.TemporaryDirectory(dir='../temporario') as temp:
#         images = pdf2image.convert_from_path(f'{path_files}/{file[0]}', output_folder=temp, paths_only=True, fmt='png')

#         for image in images:
#             text += textract.process(image, language='por').decode('UTF-8') + '\n'

#     dataframe2 = dataframe2.append(pd.DataFrame({'file': [file], 'text': [text]}))

# dataframe2.head()

In [2]:
import multiprocessing

# Reload the list of files that was saved to ignored-files.csv, so this cell
# can run from the file on disk; the first CSV column is used as the index.
ignored_files_dataframe = pd.read_csv(f'{path_processed}/ignored-files.csv', index_col=0)


def process_file(file):
    """Extract the text of one PDF by rendering its pages to images.

    The PDF at ``{path_files}/{file}`` is converted to PNG pages inside a
    temporary directory created under ``path_temp``. textract is run on each
    page image and the page texts are concatenated, each followed by a
    newline. The result is written as a one-row CSV (columns ``file`` and
    ``text``) to ``{path_temp}/{file}.csv``.

    Args:
        file: Name of a PDF located in ``path_files``.

    Returns:
        None. The output is the CSV written to ``path_temp``.
    """
    with tempfile.TemporaryDirectory(dir=path_temp) as temp:
        # paths_only=True: the pages are handed to textract as file paths.
        images = pdf2image.convert_from_path(f'{path_files}/{file}', output_folder=temp, paths_only=True, fmt='png')

        # Must run inside the `with`: the page images are deleted as soon as
        # the temporary directory is closed.
        text = ''.join(
            textract.process(image, language='por').decode('UTF-8') + '\n'
            for image in images
        )

    # Build the one-row frame directly; DataFrame.append was removed in
    # pandas 2.0. The written CSV is the same as before.
    dataframe = pd.DataFrame({'file': [file], 'text': [text]}, columns=['file', 'text'])
    dataframe.to_csv(f'{path_temp}/{file}.csv')

# Number of worker processes used to handle the ignored PDFs in parallel.
cpus = 3

files = list(ignored_files_dataframe.files.values)

# `pool.map` blocks until every file has been processed, so leaving the
# `with` block afterwards is safe; the context manager then shuts the worker
# processes down instead of leaving them alive for the rest of the session.
with multiprocessing.Pool(processes=cpus) as pool:
    # process_file returns None, so `result` is only a list of Nones; the
    # real output is the per-file CSVs written to `path_temp`.
    result = pool.map(process_file, files)

In [13]:
# Gather the one-row CSVs that process_file wrote to `path_temp` and merge
# them into a single file. The listing is sorted so the row order is
# reproducible between runs.
files = sorted(f for f in listdir(path_temp) if isfile(join(path_temp, f)))

# Match on the extension: a substring test would also pick up names such as
# 'x.csv.bak'.
dataframe_files = [
    pd.read_csv(f'{path_temp}/{file}', index_col=0)
    for file in files
    if file.endswith('.csv')
]

if dataframe_files:
    pdf_without_text = pd.concat(dataframe_files)
else:
    # pd.concat raises on an empty list; when there was nothing to process,
    # write an empty table with the expected columns instead.
    pdf_without_text = pd.DataFrame(columns=['file', 'text'])

pdf_without_text.to_csv(f'{path_processed}/pdf-without.csv')

In [15]:
# Final step: stack the two partial results (text extracted directly and
# text recovered from the page images) and save them as one dataset.
data1 = pd.read_csv(
    f'{path_processed}/images-and-pdf-with-text.csv',
    index_col=0,
)
data2 = pd.read_csv(
    f'{path_processed}/pdf-without.csv',
    index_col=0,
)

all_data = pd.concat((data1, data2))
all_data.to_csv(f'{path_processed}/all-data.csv')