In [8]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project-local `utils` package imported below) are reloaded
# before each cell runs and edits to them do not require a kernel restart.
%load_ext autoreload
%autoreload 2

In [18]:
import ast
import json
import os
import sys

import numpy as np
from openai import OpenAI


# Add the parent directory (Auditbot_backend) to the system path so that the
# project-local `utils` package can be imported from this notebook.
# The notebook is assumed to run with its own folder as the working directory,
# so the parent is simply one level above os.getcwd().
PARENT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Only append once: re-running this cell must not keep growing sys.path.
if PARENT_DIR not in sys.path:
    sys.path.append(PARENT_DIR)


from utils.preprocessing import pdf_to_text, pdf_to_pages, get_true_pages
from utils.custom_print import pretty_print_dictionary, pretty_print_list

# Directory that holds the audit reports and their extracted content pages;
# it is the folder walked by build_complete_tree. The path is relative to the
# notebook's working directory.
DOCUMENT_DIR = "../data/documents"

In [4]:
# Example document to try out first. Both paths are derived from DOCUMENT_DIR
# so the data location is configured in exactly one place.
CONTENTS_PATH = f"{DOCUMENT_DIR}/ar_fy2021_22_content_pages.pdf"
DOCUMENT_PATH = f"{DOCUMENT_DIR}/ar_fy2021_22.pdf"

# converts the content-pages pdf to pure text
text = pdf_to_text(CONTENTS_PATH)

# converts the full report pdf to a list where each element is a page text
pages = pdf_to_pages(DOCUMENT_PATH)

# creates a dictionary which matches each page text to its true page number
# (as stated in the pdf doc)
true_pages = get_true_pages(pages)

In [20]:
# Preview the first 300 characters of the extracted contents-page text.
print(text[:300])


i
CONTENTS
Page
OVERVIEW .
......................................................................................................             1
P
ART  
I  
A
 
 
 
:  AUDIT  OF  GOVERNMENT 
 
 
FINANCIAL 
 
STATEMENTS              13
P
ART  I  B  :  AUDIT  OF  GOVERNMENT  MINISTRIES,  ORGANS
      


In [21]:
# Show the first three extracted pages of the report (list indices 0-2).
pretty_print_list(pages[:3])

idx: 0

REPORT
OF  THE
AUDITOR-GENERAL
FOR  THE  FINANCIAL  YEAR
2021/22
REPUBLIC  OF  SINGAPORE


----------------------------------------------------------

idx: 1

4 July 2022
Madam Halimah Yacob 
President
Republic of Singapore
Dear Madam President
In accordance with the provisions of the Audit Act 
1966,  I am pleased to submit my Report on the 
audits carried out for the financial year 2021/22. 
Yours sincerely
Goh Soon Poh 
Auditor-General
AUDITOR-GENERAL’S OFFICE
SINGAPORE


----------------------------------------------------------

idx: 2

REPORT
OF  THE
AUDITOR-GENERAL
FOR  THE  FINANCIAL  YEAR
2021/22


----------------------------------------------------------



In [15]:
# Display the page-number -> page-text mapping returned by get_true_pages.
# NOTE(review): this prints the entire mapping (every page's text); consider
# previewing only a few entries to keep the notebook output small.
pretty_print_dictionary(true_pages)

{
    "1": "1\nOVERVIEW\nI am pleased to present my Report on the audits carried out by the Auditor-General\u2019s \nOffice (AGO) for the financial year 2021/22.\nThe audits give assurance to the President and Parliament on the proper accounting, \nmanagement and use of public resources.  In the process, they help strengthen financial \ngovernance of the public service and enhance the accountability of public sector \nentities as custodians and stewards of public resources.\nAudit Authority\nThe Auditor-General\u2019s authority to audit and report is provided for in legislation. \n \nThe key legislation that governs AGO\u2019s work are the Constitution of the Republic of \nSingapore and the Audit Act 1966.  \nThe details of AGO\u2019s audit authority are inAnnex I.\nAGO audits the accounts of all Government departments and offices.  AGO also \naudits public authorities and bodies administering public funds as prescribed by law, \nor upon request and with the approval of the Minister fo

In [22]:
# largest true page number present in the pdf document
# (iterating a dictionary yields its keys, i.e. the page numbers)
max_page = max(true_pages)
print("max page:", max_page)

max page: 94


In [58]:
# feeds content page as raw text into LLM
# LLM outputs a nice hierarchical tree data structure (nested dictionary)
def request_tree(max_page, text, model="gpt-4o"):
    """Ask the LLM to convert a raw table of contents into a nested tree.

    Args:
        max_page: Highest page number of the document (an integer); the prompt
            tells the model to treat ``max_page + 1`` as the last page.
        text: Raw text of the contents page(s).
        model: Name of the chat model to query. Defaults to "gpt-4o", the
            value that used to be hard-coded.

    Returns:
        The chat completion object returned by the OpenAI client; the generated
        answer is read elsewhere via ``completion.choices[0].message.content``.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset.
    """
    # Credentials must never live in the notebook source: read the key from the
    # environment instead. Any key that was previously committed in this cell
    # should be treated as compromised and revoked.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY environment variable is not set; "
            "export it before calling request_tree()."
        )

    client = OpenAI(api_key=api_key)

    instructions = f""" Below is a table of contents. Convert it to a tree structure.
    Headings are the children of the root node. Subheadings are the children of the corresponding heading nodes.
    Make the leaf nodes a list of integers, ```[current page number, next page number mentioned]```.
    For heading nodes with indicated page numbers, add a child, SUMMARY, with a leaf node as value.
    Let the last page be {max_page+1}. 
    There is no need for a parent node called "Page".
    Give your tree structure as a python dictionary. 

    {text}
    """

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": instructions},
        ],
    )

    return completion


In [23]:
# helper functions to fill in page numbers into the tree the LLM creates
def edit_output(prior_tree):
    """Expand every ``[start, end]`` leaf of the tree into a page list, in place.

    The tree is a nested dictionary. Each leaf is a list (or tuple) whose first
    two numbers are a start page and an exclusive end page; it is replaced by
    ``list(range(start, end))``. When start equals end, or the leaf holds a
    single page number, the result is that single page. An empty leaf stays an
    empty list. Nested leaves such as ``[[13, 15]]`` are flattened first.

    The function mutates ``prior_tree`` and returns None. It is not idempotent:
    a second call would re-read an already expanded page list as a
    ``[start, end]`` pair, so call it exactly once per tree.

    Args:
        prior_tree: Nested dictionary produced from the LLM output.

    Raises:
        TypeError: If a value is neither a dictionary nor a list/tuple leaf.
    """
    # this function is recursive to iterate through the nested tree depth first
    for child, value in prior_tree.items():

        # Tree's leaves are a list of pages so check if leaves are reached
        if isinstance(value, (list, tuple)):
            bounds = np.array(value).flatten()

            # nothing to expand for an empty leaf
            if bounds.size == 0:
                prior_tree[child] = []
                continue

            start = bounds[0]
            # a leaf with a single page number has no explicit end page
            end = bounds[1] if bounds.size > 1 else start
            if start == end:
                end += 1

            # assigning to an existing key while iterating is safe: the
            # dictionary does not change size
            prior_tree[child] = list(range(start, end))

        # if not a leaf, enter the child node (recursion)
        elif isinstance(value, dict):
            edit_output(value)

        else:
            raise TypeError(
                f"unexpected value for node {child!r}: expected a dictionary "
                f"or a list of page numbers, got {type(value).__name__}"
            )

#----------------------------------------------------------------------------

# helper functions to convert LLM output (string) into an actual dictionary
def clean_output(completion):
    """Extract the dictionary from an LLM reply and expand its page ranges.

    Args:
        completion: Either an OpenAI chat completion (text is read from
            ``completion.choices[0].message.content``) or any object exposing
            the reply text directly as ``completion.content``.

    Returns:
        The parsed nested dictionary, after ``edit_output`` has been applied
        to it.

    Raises:
        ValueError: If the reply has no text, contains no ``{...}`` literal, or
            the literal is not a dictionary.
        SyntaxError: If the extracted text is not valid Python literal syntax
            (raised by ``ast.literal_eval``).
    """
    # get LLM output
    try:
        api_output = completion.choices[0].message.content
    except (AttributeError, IndexError, TypeError):
        # not shaped like a chat completion: fall back to a plain `.content`
        api_output = completion.content

    if not isinstance(api_output, str):
        raise ValueError("LLM response contains no text content")

    # find where LLM's output dictionary is located. The LLM usually places
    # code in between ``` fences, but the closing fence may be missing.
    start = api_output.find("{")
    if start == -1:
        raise ValueError("no dictionary found in LLM output")

    fence = api_output.find("```", start)
    if fence == -1:
        # without this, the -1 would be used as a slice end and silently cut
        # off the final closing brace
        fence = len(api_output)

    # keep everything up to the last closing brace before the fence
    last_brace = api_output.rfind("}", start, fence)
    if last_brace == -1:
        raise ValueError("dictionary in LLM output is not terminated")
    dict_string = api_output[start:last_brace + 1]

    # converts string to dictionary
    # NOTE(security): this used to be eval(), which executes arbitrary code
    # coming back from the LLM. literal_eval only accepts Python literals
    # (dicts, lists, strings, numbers, ...), which is what the prompt asks for.
    dict_actual = ast.literal_eval(dict_string)
    if not isinstance(dict_actual, dict):
        raise ValueError("LLM output did not evaluate to a dictionary")

    edit_output(dict_actual)

    return dict_actual

In [59]:
# main function that builds the whole tree data structure
def build_complete_tree(DOCUMENT_DIR):
    """Build one contents tree per audit report found under ``DOCUMENT_DIR``.

    The directory is walked recursively. Every file named like
    ``ar_fy<year>_content_pages...`` is treated as the extracted contents pages
    of the report ``ar_fy<year>.pdf`` in the same folder. For each such pair
    the contents text and the report's highest page number are sent to the LLM
    (``request_tree``) and the reply is parsed with ``clean_output``.

    Args:
        DOCUMENT_DIR: Root directory containing the reports and content pages.

    Returns:
        Dictionary mapping the year string taken from the file name
        (e.g. "2021_22") to the parsed tree of that report.
    """
    prefix = "ar_fy"
    marker = "_content_pages"
    complete_tree = {}

    for root, _dirs, files in os.walk(DOCUMENT_DIR):
        # sorted so documents are processed in a reproducible order
        for file in sorted(files):
            # documents directory contains content pages and audit reports
            # only consider content pages. The FILE NAME is inspected, not the
            # full path, so directory names cannot confuse the parsing.
            marker_at = file.find(marker)
            if marker_at == -1:
                continue

            # skip files that do not follow the ar_fy<year>_content_pages
            # convention instead of storing them under a garbage year key
            prefix_at = file.find(prefix)
            if prefix_at == -1 or prefix_at > marker_at:
                continue

            year = file[prefix_at + len(prefix):marker_at]

            # Get the full path of the content-pages file
            file_path = os.path.join(root, file)
            text = pdf_to_text(file_path)

            # the matching full report lives next to the content pages
            doc_file_path = os.path.join(root, file[:marker_at] + ".pdf")
            pages = pdf_to_pages(doc_file_path)
            true_pages = get_true_pages(pages)

            max_page = max(true_pages.keys())
            print("year of audit report:", year)
            print("number of pages:", max_page)

            completion = request_tree(max_page, text)

            print(completion.choices[0].message.content)

            complete_tree[year] = clean_output(completion)

    return complete_tree


In [None]:
# Build the tree for every report (one LLM request per document).
complete_tree = build_complete_tree(DOCUMENT_DIR)

# `save_json` is not imported or defined anywhere in this notebook, so on a
# fresh kernel the original call raised NameError after all the API requests
# had already been made. Write the file with the standard library instead.
OUTPUT_PATH = "../data/parsed_documents/complete_tree.json"
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)
with open(OUTPUT_PATH, "w", encoding="utf-8") as json_file:
    json.dump(complete_tree, json_file, indent=4)

In [7]:
# Example Tree just for 2022_23
# Shape of the structure: nested dictionaries keyed by heading / sub-heading,
# whose leaves are lists of page numbers written as list(range(start, end))
# (end is exclusive). "SUMMARY" entries hold the pages of a heading itself
# and were added by hand (see the "added manually" markers).
tree = {
    "CONTENTS": {
        "OVERVIEW": list(range(1, 13)),
        "PART I A: AUDIT OF GOVERNMENT FINANCIAL STATEMENTS": list(range(13, 15)),
        "PART I B: AUDIT OF GOVERNMENT MINISTRIES, ORGANS OF STATE AND GOVERNMENT FUNDS": {
            "SUMMARY": list(range(15, 17)), # added manually
            "MINISTRY OF COMMUNICATIONS AND INFORMATION": {
                "Tenderers Appointed Despite Not Meeting Evaluation Criteria": list(range(17, 18))
            },
            "MINISTRY OF DEFENCE": {
                "Overpayments for Vehicle Maintenance Services": list(range(18, 19))
            },
            "SAVER-Premium Fund": {
                "Incorrect Contributions to SAVER-Premium Fund Accounts": list(range(19, 21))
            },
            "PRIME MINISTER’S OFFICE": {
                "Public Service Division": {
                    "Weak Controls over the Most Privileged Operating System and Database Accounts": list(range(21, 23)),
                    "Weaknesses in Review of Accounts and Administrators’ Activities in IT System": list(range(23, 25)),
                    "Excessive Access Rights Granted to Encryption/Decryption Keys and Data Files": list(range(25, 27))
                }
            },
            "JUDICATURE": {
                "State Courts": {
                    "Development Project Savings Not Promptly Declared": list(range(27, 28)),
                    "Possible Irregularities in Quotations for Star Rate Items": list(range(28, 29))
                }
            }
        },
        "PART II: AUDIT OF STATUTORY BOARDS": {
            "SUMMARY": list(range(29, 31)), # added manually
            "MINISTRY OF CULTURE, COMMUNITY AND YOUTH": {
                "People’s Association": {
                    "Lapses in Evaluation and Award of Maintenance and Cleaning Services Tenders": list(range(31, 33)),
                    "Contracts Awarded to Debarred Contractors": list(range(33, 34)),
                    "Inappropriate Practices in Management of Moneys for Welfare Assistance Schemes": list(range(34, 36)),
                    "No Formal Agreements with External Service Partners": list(range(36, 38)),
                    "Weaknesses in Management of Accounts in IT Systems": list(range(38, 40))
                }
            },
            "MINISTRY OF SUSTAINABILITY AND THE ENVIRONMENT": {
                "Singapore Food Agency": {
                    "Inadequate Monitoring of Managing Agents’ Work": list(range(40, 41)),
                    "Possible Irregularities Noted in Quotations": list(range(41, 42))
                }
            },
            "MINISTRY OF TRANSPORT": {
                "Civil Aviation Authority of Singapore": {
                    "Lapses in Management of Grants": list(range(42, 45))
                }
            }
        },
        "PART III: THEMATIC AUDIT – COVID-19 RELATED GRANTS": {
            "SUMMARY": list(range(45, 56)), # added manually
            "MINISTRY OF FINANCE": {
                # NOTE(review): here an agency name maps directly to a page list
                # while the schemes below are its siblings; elsewhere agencies
                # are dictionaries of findings. Possibly mis-nested - confirm
                # against the contents page.
                "Inland Revenue Authority of Singapore": list(range(56, 64)),
                "Jobs Support Scheme": {
                    "Inadequate Documentation on Risk Assessments": list(range(64, 65)),
                    "Cut-off Dates Used for Jobs Support Scheme Allotment Not in Accordance with Approved Dates": list(range(65, 66)),
                    "Evaluation and Approval of Appeals could be Improved": list(range(66, 67)),
                    "Anti-gaming Checks could be Improved": list(range(67, 68)),
                    "Lapses in Blocking Entities from Receiving Payouts": list(range(68, 69)),
                    "Lapses in Recovery of Payouts from Government-funded Entities": list(range(69, 70)),
                    # NOTE(review): this scheme is nested inside "Jobs Support
                    # Scheme" rather than next to it - verify this is intended.
                    "Rental Cash Grant/Rental Support Scheme": {
                        "No Documentary Evidence of Approval on Risk Assessment for Rental Cash Grant": list(range(70, 71)),
                        "Evaluation of Rental Grants could be Improved": list(range(71, 72)),
                        "Grant Eligibility Not Assessed in Accordance with Legislation or Requirements": list(range(72, 73)),
                        "Follow-up on Return of Grants could be Improved": list(range(73, 75)),
                        "Gaps in Granting of User Access Rights": list(range(75, 79))
                    }
                }
            },
            "MINISTRY OF TRADE AND INDUSTRY": {
                "Singapore Tourism Board": {
                    "SingapoRediscovers Vouchers Scheme": {
                        "Inadequate Controls to Ensure Only Eligible Merchants and Products were Onboarded": list(range(79, 81)),
                        "Disbursements Made for Ineligible Persons": list(range(81, 82)),
                        "Potential Exceptions Not Followed Up": list(range(82, 83)),
                        "No Monitoring on Merchants’ Compliance with Terms and Conditions": list(range(83, 84)),
                        "Weaknesses in IT Access Controls": list(range(84, 85)),
                        "Weaknesses in IT General Controls": list(range(85, 89))
                    }
                }
            }
        },
        "PART IV: AUDIT OF GOVERNMENT-OWNED COMPANIES AND OTHER ACCOUNTS": list(range(89, 91)),
        "ANNEXES": {
            "Annex I: AGO’s Audit Authority": list(range(91, 96)),
            "Annex II: Criteria for Appointment of Auditors": list(range(96, 99))
        }
    }
}
