In [13]:
import os
import urllib.request
from urllib.parse import urljoin, urlsplit

import fitz
import requests
from bs4 import BeautifulSoup

In [26]:
# URL of the page with the annual reports
url = 'https://www.ago.gov.sg/publications/annual-reports/'
# Site root; no longer needed to build download URLs (urljoin below handles
# that) but still defined in case other cells reference it.
url_root = 'https://www.ago.gov.sg'

# Seconds to wait on a single HTTP request. requests has no default timeout,
# so without this a stalled connection would hang the cell indefinitely.
REQUEST_TIMEOUT = 30

# Create a folder to store the downloaded PDFs
data_folder = '../data/documents'
os.makedirs(data_folder, exist_ok=True)

# Fetch the content of the webpage
response = requests.get(url, timeout=REQUEST_TIMEOUT)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all links to PDF files, skipping infographics. The extension test
    # looks only at the URL path so "report.PDF" and "report.pdf?v=2" match too.
    pdf_links = [
        link['href'] for link in soup.find_all('a', href=True)
        if urlsplit(link['href']).path.lower().endswith('.pdf')
        and 'infographic' not in link['href'].lower()
    ]
    # The same report may be linked more than once; keep the first occurrence
    # of each href, preserving page order.
    pdf_links = list(dict.fromkeys(pdf_links))

    # Download each PDF
    for pdf_link in pdf_links:
        # urljoin resolves root-relative, page-relative and absolute hrefs
        # alike; plain string concatenation only worked for "/..." hrefs.
        pdf_url = urljoin(url, pdf_link)
        file_name = os.path.join(data_folder,
                                 os.path.basename(urlsplit(pdf_url).path))

        # Already fetched on a previous run: skip so re-running the cell is cheap.
        if os.path.exists(file_name):
            continue

        try:
            pdf_response = requests.get(pdf_url, timeout=REQUEST_TIMEOUT)
            pdf_response.raise_for_status()
        except requests.RequestException as error:
            # One bad link should not abort the remaining downloads.
            print(f'Failed to download {pdf_url}: {error}')
            continue

        # Written only after a complete, successful response, so a failed
        # request never leaves a truncated file behind.
        with open(file_name, 'wb') as pdf_file:
            pdf_file.write(pdf_response.content)
else:
    print(f'Failed to retrieve the webpage. Status code: {response.status_code}')


In [22]:
# Extracts the contents pages of a PDF document into a separate PDF file
def get_content_page_pdf(pdf_path):
    """
    Copy the contents pages of a PDF into "<pdf_path minus .pdf>_content_pages.pdf".

    Pages are scanned in order. Collection starts at the first page whose text
    mentions "content" (case-insensitive) and stops at the first later page
    that mentions "overview" without mentioning "content"; that page is not
    copied. An "overview" mention before any contents page is ignored.

    @param pdf_path: str, Path to document

    @return content_pdf: str, path of the PDF that was written, or None when
            no page mentions "content" (nothing is written in that case)
    """
    # using fitz from PyMuPDF library (superior)

    # Strip only a trailing ".pdf"; splitting on the first ".pdf" anywhere in
    # the path would truncate paths such as "a.pdf.d/report.pdf".
    if pdf_path.lower().endswith('.pdf'):
        base_path = pdf_path[:-len('.pdf')]
    else:
        base_path = pdf_path
    content_pdf = base_path + "_content_pages.pdf"

    # Context managers close both documents even if reading or saving fails.
    with fitz.open(pdf_path) as pdf_document, fitz.open() as new_pdf:
        add = False

        # Iterate through each page
        for page_num in range(len(pdf_document)):
            page_text = pdf_document.load_page(page_num).get_text().lower()

            if "content" in page_text:
                # Checked first: a contents page usually lists "Overview" too.
                add = True
            elif add and "overview" in page_text:
                break

            if add:
                new_pdf.insert_pdf(pdf_document, from_page=page_num, to_page=page_num)

        # PyMuPDF refuses to save a document with zero pages, so bail out
        # instead of raising when no contents page was found.
        if not add:
            return None

        new_pdf.save(content_pdf)

    return content_pdf
        

In [24]:
def get_files_in_directory(directory):
    """
    Recursively collect the path of every file located under a directory.

    @param directory: str, Root folder to walk

    @return list of str, one path per file (folder joined with file name),
            in the order os.walk visits them
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    ]

# Specify the directory
directory = data_folder

# Get the list of file paths
file_paths = get_files_in_directory(directory)

# Keep only source PDFs. Files ending in "_content_pages.pdf" are outputs of
# get_content_page_pdf from an earlier run; feeding them back in would create
# "*_content_pages_content_pages.pdf" on every re-run. Non-PDF files found in
# the folder are skipped as well.
CONTENT_PAGES_SUFFIX = "_content_pages.pdf"
source_pdf_paths = [
    path for path in file_paths
    if path.lower().endswith('.pdf') and not path.endswith(CONTENT_PAGES_SUFFIX)
]

# Extract the contents pages of each downloaded report
for path in source_pdf_paths:
    get_content_page_pdf(path)