In [1]:
%%capture
!pip install pypdfium2
!pip install fastprogress

In [2]:
import pypdfium2 as pdfium
import os, boto3
from fastprogress.fastprogress import master_bar, progress_bar
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
# Name of the S3 bucket the PDFs are listed and downloaded from; matching
# files are uploaded back into this same bucket.
BUCKET = "vijaydev"
# Key prefix inside BUCKET under which matching PDFs are uploaded.
STORAGE = "democra_files"

In [4]:
def return_all_files(bucket, folder_location):
    '''
    List the objects stored under a key prefix of an S3 bucket.

    Input:
        bucket: name of the S3 bucket to list
        folder_location: key prefix ("folder") used to filter the listing
    Output:
        files_list: list of (key, file_name) tuples, where key is the full
            object key and file_name is the base name of that key. Keys whose
            base name is empty (e.g. a "folder/" placeholder object) are
            skipped because they cannot be saved to a local file.
    '''

    session = boto3.Session()
    s3 = session.resource('s3')
    my_bucket = s3.Bucket(bucket)

    files_list = []
    for file in my_bucket.objects.filter(Prefix = folder_location):
        file_name = os.path.basename(file.key)
        if not file_name:
            # Key ends with "/": a directory marker, not a real file.
            continue
        files_list.append((file.key, file_name))
    return files_list

In [5]:
# S3 key prefixes that are passed to return_all_files to build the work list.
process_folders = [
    "esg_reports_batch_1",
    "esg_reports_batch_2",
    "esg_reports_batch_3",
    "esg_reports_batch_4",
]

In [6]:
# Flatten the per-folder listings into one list of (key, file_name) pairs,
# keeping the folder order given in process_folders.
curr_files = [
    entry
    for folder in process_folders
    for entry in return_all_files(BUCKET, folder)
]

In [7]:
def _pdf_mentions(pdf_path, needle, parent_bar):
    '''
    Return True as soon as any page of the PDF contains `needle`.

    Input:
        pdf_path: path of a local PDF file
        needle: lower-case substring searched for in each page's text
        parent_bar: master_bar the per-page progress bar is attached to
    Output:
        True if some page's lower-cased text contains needle, else False
    '''
    pdf = pdfium.PdfDocument(pdf_path)
    try:
        for i in progress_bar(range(len(pdf)), parent = parent_bar):
            page = pdf[i]
            try:
                textpage = page.get_textpage()
                try:
                    text_all = textpage.get_text_range()
                finally:
                    textpage.close()
            finally:
                page.close()
            if needle in text_all.lower():
                # One hit is enough; no need to read the remaining pages.
                return True
        return False
    finally:
        # Release the document handle before the file is uploaded or deleted.
        pdf.close()


# A single client is reused for every download and upload.
s3_client = boto3.client('s3')
mb = master_bar(curr_files)
print(f"Processing files from folders {process_folders}")
for path, name in mb:
    # The object is saved in the working directory under its base name.
    s3_client.download_file(BUCKET, path, name)
    try:
        if _pdf_mentions(name, "democra", mb):
            # Upload once per matching file. S3 keys always use "/" as the
            # separator, independent of the local operating system.
            s3_client.upload_file(name, BUCKET, f"{STORAGE}/{name}")
    finally:
        # Remove the local copy even if parsing or uploading failed.
        os.remove(name)
print("Completed!")

Processing files from folders ['esg_reports_batch_1', 'esg_reports_batch_2', 'esg_reports_batch_3', 'esg_reports_batch_4']


Completed!
