In [55]:
import os, time, requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm import tqdm 
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException

In [74]:
# --- Site endpoints ---------------------------------------------------------
BASE_URL = "https://www.legislation.vic.gov.au"
ACTS_LIST_URL = BASE_URL + "/in-force/acts/"
STATUTORY_LIST_URL = BASE_URL + "/in-force/statutory-rules/"

# --- HTTP settings ----------------------------------------------------------
HEADERS = {"User-Agent": "Mozilla/5.0"}
HTTP_TIMEOUT = 20  # seconds for HTTP requests
DELAY = 3  # seconds between requests

# --- Output -----------------------------------------------------------------
# The directory is created up front so the notebook can be re-run safely.
SAVE_DIR = "vic_laws_pdfs"
os.makedirs(SAVE_DIR, exist_ok=True)

In [88]:
# Lower-case phrases; the scraping cells keep a title when any of these occurs
# as a substring of the lower-cased link text. Order is unchanged.
keywords = [
    # Roads and transport
    "road safety", "transport", "traffic", "vehicles",
    "driver", "licensing", "public transport",
    # Work and employment
    "occupational health and safety", "workplace", "employment",
    "employment standards", "industrial relations", "wages",
    "remuneration", "working conditions",
    # Building
    "building", "building regulations",
    # Public health
    "public health", "public health regulations",
    # Evidence and criminal law
    "evidence", "crimes", "summary offences",
    # Information law
    "privacy", "freedom of information",
]

In [89]:
# Accumulators for the page URLs collected by the scraping cells
# (Acts and statutory rules respectively). Two independent lists.
keywords_url, statutory_urls = [], []

# Headless Chrome session, exposed as the global `driver` that the scraper
# defined below reads.
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

def find_matching_acts_paginated():
    """Collect in-force Acts whose link text contains any of the ``keywords``.

    Loads ``ACTS_LIST_URL`` in the module-level Selenium ``driver`` and walks
    the listing page by page, clicking the "Go to next page" button until that
    button is missing or disabled.

    Returns:
        list[tuple[str, str, str]]: One ``(title, slug, href)`` tuple per
        matching link, in the order encountered. ``title`` is the stripped
        link text, ``href`` the link target and ``slug`` the last path
        segment of ``href``.
    """
    next_selector = 'button[aria-label="Go to next page"]'

    driver.get(ACTS_LIST_URL)
    time.sleep(2)

    results = []
    page_count = 1  # Start from page 1

    while True:
        print(f"Scraping page {page_count}...")
        time.sleep(1.5)  # Wait for page content

        for link in driver.find_elements(By.CSS_SELECTOR, "table td a"):
            title = link.text.strip()
            if not any(k in title.lower() for k in keywords):
                continue
            # Only matching links pay for the extra attribute lookup.
            href = link.get_attribute("href")
            if not href:
                # An anchor without a target cannot be visited later, and
                # calling .strip() on None would abort the whole scrape.
                continue
            slug = href.strip("/").split("/")[-1]
            results.append((title, slug, href))

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, next_selector)
        except NoSuchElementException:
            next_button = None

        # The last page is signalled either by the button being absent or by
        # it being disabled; a disabled button would not advance the listing,
        # so clicking it would keep this loop running forever.
        if next_button is None or not next_button.is_enabled():
            print("No more pages.")
            break

        next_button.click()
        page_count += 1

    print(f"✅ Finished scraping. Total pages visited: {page_count}")
    return results

# Run the Acts scrape. The browser is shut down in `finally` so a failure
# part-way through the listing does not leave a headless Chrome process behind.
try:
    matches = find_matching_acts_paginated()
    for title, slug, url in matches:
        print(f"{title}, url: {url}")
        keywords_url.append(url)
finally:
    driver.quit()

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
No more pages.
✅ Finished scraping. Total pages visited: 21
Accident Compensation (Occupational Health and Safety) Act 1996, url: https://www.legislation.vic.gov.au/in-force/acts/accident-compensation-occupational-health-and-safety-act-1996
Building Act 1993, url: https://www.legislation.vic.gov.au/in-force/acts/building-act-1993
Building and Construction Industry Security of Payment Act 2002, url: https://www.legislation.vic.gov.au/in-force/acts/building-and-construction-industry-security-payment-act-2002
Bus Services Act 1995
Former title:
Public Transport Competition Act 1995, ur

In [93]:
def download_pdf_from_act_url(act_url, target_folder="vic_acts_laws_pdfs"):
    """Download every PDF linked from an Act's page.

    The page is opened in the module-level Selenium ``driver``. Each
    ``a.rpl-document__link`` anchor whose ``href`` ends in ``.pdf`` is fetched
    with ``requests`` and written to ``target_folder`` under the URL's
    basename.

    Args:
        act_url: URL of the Act page to open.
        target_folder: Directory for the downloaded files; created on demand.

    Returns:
        bool: True if at least one PDF was saved. False if the page had no
        PDF link or if any step raised (the error is printed, not re-raised).
    """
    try:
        # Kept inside the try so a page-load failure is reported like any
        # other error instead of aborting the caller's loop.
        driver.get(act_url)
        time.sleep(2)  # Wait for page content to load

        # Find all document links
        doc_links = driver.find_elements(By.CSS_SELECTOR, "a.rpl-document__link")

        success = False

        for link in doc_links:
            href = link.get_attribute("href")
            if not (href and href.lower().endswith(".pdf")):
                continue

            # Found a valid PDF link
            pdf_url = href
            fname = os.path.join(target_folder, os.path.basename(pdf_url))
            os.makedirs(target_folder, exist_ok=True)
            print("URL: ", act_url)
            print(f"Downloading from: {pdf_url}")

            # The timeout stops a stalled server from hanging the run, and
            # the context manager releases the connection when done.
            with requests.get(pdf_url, stream=True, timeout=HTTP_TIMEOUT) as resp:
                # Fail before opening the file so an HTTP error page is never
                # written to disk under a .pdf name.
                resp.raise_for_status()
                with open(fname, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        f.write(chunk)

            print(f"Saved: {fname}")
            success = True

        if not success:
            print(f"[!] No PDF found on page: {act_url}")
        return success

    except Exception as e:
        print(f"[!] Error at {act_url}: {e}")
        return False

# Fresh headless browser for the download pass (the scraping cell already
# quit its own driver). `webdriver` and `Options` come from the import cell.
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

# `keywords_url` is filled by the Acts scraping cell. The browser is closed
# in `finally` so an interrupted or failed run does not leak a Chrome process.
try:
    for url in keywords_url:
        download_pdf_from_act_url(url)
finally:
    driver.quit()

URL:  https://www.legislation.vic.gov.au/in-force/acts/accident-compensation-occupational-health-and-safety-act-1996
Downloading from: https://content.legislation.vic.gov.au/sites/default/files/96028276-71c6-3afa-aafc-200def3f3a74_96-13a002.pdf
Saved: vic_acts_laws_pdfs\96028276-71c6-3afa-aafc-200def3f3a74_96-13a002.pdf
URL:  https://www.legislation.vic.gov.au/in-force/acts/building-act-1993
Downloading from: https://content.legislation.vic.gov.au/sites/default/files/2025-08/93-126aa143-authorised.pdf
Saved: vic_acts_laws_pdfs\93-126aa143-authorised.pdf
URL:  https://www.legislation.vic.gov.au/in-force/acts/building-and-construction-industry-security-payment-act-2002
Downloading from: https://content.legislation.vic.gov.au/sites/default/files/2024-01/02-15aa013-authorised.pdf
Saved: vic_acts_laws_pdfs\02-15aa013-authorised.pdf
URL:  https://www.legislation.vic.gov.au/in-force/acts/bus-services-act-1995
Downloading from: https://content.legislation.vic.gov.au/sites/default/files/2025-08

In [None]:
# Start a new headless Chrome session and bind it to the global `driver`
# that find_matching_stat_rules_paginated() reads.
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

def find_matching_stat_rules_paginated():
    """Collect in-force statutory rules whose link text contains any of the ``keywords``.

    Loads ``STATUTORY_LIST_URL`` in the module-level Selenium ``driver`` and
    walks the listing page by page, clicking the "Go to next page" button
    until that button is missing or disabled.

    Returns:
        list[tuple[str, str, str]]: One ``(title, slug, href)`` tuple per
        matching link, in the order encountered. ``title`` is the stripped
        link text, ``href`` the link target and ``slug`` the last path
        segment of ``href``.
    """
    next_selector = 'button[aria-label="Go to next page"]'

    driver.get(STATUTORY_LIST_URL)
    time.sleep(2)

    results = []
    page_count = 1

    while True:
        print(f"Scraping Statutory Rules page {page_count}...")
        time.sleep(1.5)

        for link in driver.find_elements(By.CSS_SELECTOR, "table td a"):
            title = link.text.strip()
            if not any(k in title.lower() for k in keywords):
                continue
            # Only matching links pay for the extra attribute lookup.
            href = link.get_attribute("href")
            if not href:
                # An anchor without a target cannot be visited later, and
                # calling .strip() on None would abort the whole scrape.
                continue
            slug = href.strip("/").split("/")[-1]
            results.append((title, slug, href))

        try:
            next_button = driver.find_element(By.CSS_SELECTOR, next_selector)
        except NoSuchElementException:
            next_button = None

        # The last page is signalled either by the button being absent or by
        # it being disabled; a disabled button would not advance the listing,
        # so clicking it would keep this loop running forever.
        if next_button is None or not next_button.is_enabled():
            print("No more pages.")
            break

        next_button.click()
        page_count += 1

    print(f"Finished scraping Statutory Rules. Pages visited: {page_count}")
    return results

# Scrape the statutory-rules listing; always release the browser, even when
# the scrape fails part-way.
try:
    matches = find_matching_stat_rules_paginated()
finally:
    driver.quit()

# Rebuild the list from scratch so re-running this cell does not append the
# same URLs a second time.
statutory_urls = []
for title, slug, url in matches:
    print(f"{title} -> {url}")
    statutory_urls.append(url)

🔍 Scraping Statutory Rules page 1...
🔍 Scraping Statutory Rules page 2...
🔍 Scraping Statutory Rules page 3...
🔍 Scraping Statutory Rules page 4...
🔍 Scraping Statutory Rules page 5...
🔍 Scraping Statutory Rules page 6...
🔍 Scraping Statutory Rules page 7...
🔍 Scraping Statutory Rules page 8...
🔍 Scraping Statutory Rules page 9...
🔍 Scraping Statutory Rules page 10...
🔍 Scraping Statutory Rules page 11...
🔍 Scraping Statutory Rules page 12...
No more pages.
Finished scraping Statutory Rules. Pages visited: 12
Building and Construction Industry Security of Payment Regulations 2023 -> https://www.legislation.vic.gov.au/in-force/statutory-rules/building-and-construction-industry-security-payment-regulations-2023
Building Regulations 2018 -> https://www.legislation.vic.gov.au/in-force/statutory-rules/building-regulations-2018
Child Employment Regulations 2024 -> https://www.legislation.vic.gov.au/in-force/statutory-rules/child-employment-regulations-2024
Children's Court (Evidence - Audio 

In [94]:
# Sanity check: number of statutory-rule page URLs and Act page URLs collected.
(len(statutory_urls), len(keywords_url))

(57, 39)

In [95]:
def download_pdf_from_act_url(statutory_urls, target_folder="vic_Statutory_laws_pdfs"):
    """Download every PDF linked from a statutory rule's page.

    The page is opened in the module-level Selenium ``driver``. Each
    ``a.rpl-document__link`` anchor whose ``href`` ends in ``.pdf`` is fetched
    with ``requests`` and written to ``target_folder`` under the URL's
    basename.

    Note:
        Despite its plural name, ``statutory_urls`` is a single page URL (it
        is passed straight to ``driver.get``); inside this function it shadows
        the module-level list of the same name. This definition also reuses
        the name of the Acts downloader defined in an earlier cell, so it
        replaces that function once this cell has run.

    Args:
        statutory_urls: URL of the statutory-rule page to open.
        target_folder: Directory for the downloaded files; created on demand.

    Returns:
        bool: True if at least one PDF was saved. False if the page had no
        PDF link or if any step raised (the error is printed, not re-raised).
    """
    try:
        # Kept inside the try so a page-load failure is reported like any
        # other error instead of aborting the caller's loop.
        driver.get(statutory_urls)
        time.sleep(2)  # Wait for page content to load

        # Find all document links
        doc_links = driver.find_elements(By.CSS_SELECTOR, "a.rpl-document__link")

        success = False

        for link in doc_links:
            href = link.get_attribute("href")
            if not (href and href.lower().endswith(".pdf")):
                continue

            # Found a valid PDF link
            pdf_url = href
            fname = os.path.join(target_folder, os.path.basename(pdf_url))
            os.makedirs(target_folder, exist_ok=True)
            print("URL: ", statutory_urls)
            print(f"Downloading from: {pdf_url}")

            # The timeout stops a stalled server from hanging the run, and
            # the context manager releases the connection when done.
            with requests.get(pdf_url, stream=True, timeout=HTTP_TIMEOUT) as resp:
                # Fail before opening the file so an HTTP error page is never
                # written to disk under a .pdf name.
                resp.raise_for_status()
                with open(fname, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=8192):
                        f.write(chunk)

            print(f"Saved: {fname}")
            success = True

        if not success:
            print(f"[!] No PDF found on page: {statutory_urls}")
        return success

    except Exception as e:
        print(f"[!] Error at {statutory_urls}: {e}")
        return False

# Fresh headless browser for the statutory-rules download pass (the scraping
# cell already quit its own driver). `webdriver` and `Options` come from the
# import cell.
chrome_options = Options()
chrome_options.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_options)

# The browser is closed in `finally` so an interrupted or failed run does not
# leak a Chrome process.
try:
    for url in statutory_urls:
        download_pdf_from_act_url(url)
finally:
    driver.quit()

URL:  https://www.legislation.vic.gov.au/in-force/statutory-rules/building-and-construction-industry-security-payment-regulations-2023
Downloading from: https://content.legislation.vic.gov.au/sites/default/files/2023-05/23-34sra001-authorised.pdf
Saved: vic_Statutory_laws_pdfs\23-34sra001-authorised.pdf
URL:  https://www.legislation.vic.gov.au/in-force/statutory-rules/building-regulations-2018
Downloading from: https://content.legislation.vic.gov.au/sites/default/files/2024-10/18-38sra027-authorised.pdf
Saved: vic_Statutory_laws_pdfs\18-38sra027-authorised.pdf
URL:  https://www.legislation.vic.gov.au/in-force/statutory-rules/child-employment-regulations-2024
Downloading from: https://content.legislation.vic.gov.au/sites/default/files/2024-05/24-33sra001-authorised.pdf
Saved: vic_Statutory_laws_pdfs\24-33sra001-authorised.pdf
URL:  https://www.legislation.vic.gov.au/in-force/statutory-rules/childrens-court-evidence-audio-visual-and-audio-linking-rules-2018
Downloading from: https://cont