In [1]:
import requests
import csv
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin

BASE_URL = "https://web.archive.org/web/20250708180027/https://www.myfootdr.com.au"
REGIONS_URL = BASE_URL + "/our-clinics/regions/"

In [2]:
def get_soup(url):
    resp = requests.get(url)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")

In [27]:
def extract_clinic_info(clinic_url):
    soup = get_soup(clinic_url)
    # Name
    name_tag = soup.find("h1")
    name = name_tag.get_text(strip=True) if name_tag else ""
    # Address — guess: look for <i> or <div> with address class or “/” symbol
    address = ""
    # Many sites have <i class="fa-map-marker" …> or similar, or <div class="address">
    ad = soup.find(lambda tag: tag.name=="i" and "fa-map-marker" in tag.get("class", []))
    if ad:
        parent = ad.parent
        if parent:
            ad.decompose()  # remove the icon
            address = parent.get_text(" ", strip=True)
    else:
        # fallback: find “/” in text or some known pattern
        addr_div = soup.find("div", class_="address") or soup.find("p", class_="address")
        if addr_div:
            for span in addr_div.find_all("span"):
                span.decompose()
            address = addr_div.get_text(" ", strip=True)
    # Email
    email = ""
    em = soup.find("a", href=lambda h: h and ("mailto:" in h or "mailt" in h))
    if em:
        email = em.get_text(strip=True)
    # Phone
    phone = ""
    ph = soup.find("a",
        class_="smart-button rose-button rose-o desktoponly",
        href=lambda h: h and ("tel:" in h or "phone" in h.lower()))
    if ph:
        phone = ph.get_text(strip=True).replace("Call", "").strip()
    else:
        # fallback: find <p> or <span> with Phone
        ph2 = soup.find(string=lambda t: t and "Call" in t or "Phone" in t)
        if ph2:
            phone = ph2.strip()
    # Services — assuming there’s a list of services
    services = []
    serv_section = soup.find(lambda tag: tag.name in ["ul","div"] and "Services Available" in tag.get_text())
    if serv_section:
        for li in serv_section.find_all("li"):
            services.append(li.get_text(strip=True))
    else:
        # fallback: find paragraphs with known keywords
        for p in soup.find_all("p"):
            txt = p.get_text(strip=True)
            if any(k in txt.lower() for k in ["orthotics", "ingrown", "laser", "footcare"]):
                services.append(txt)
    services_str = "; ".join(services)
    return {
        "Name of Clinic": name,
        "Address": address,
        "Email": email,
        "Phone": phone,
        "Services": services_str,
    }

In [28]:
print(extract_clinic_info("https://web.archive.org/web/20250517063937/https://www.myfootdr.com.au/our-clinics/noosa/"))

{'Name of Clinic': 'Allsports Podiatry Noosa', 'Address': 'Unit 4, 17 Sunshine Beach Rd Noosa QLD 4567', 'Email': 'vnoosa@allsportspodiatry.com.au', 'Phone': '07 5447 3312', 'Services': "Ingrown toenails on a woman's foot, pain in the big toe closeup; Ingrown toenails, also known as onychocryptosis, are a common and painful complaint. A true ingrown toenail is when a spike or edge of nail pierces the skin at the nail edge. This is known as the sulcus and can cause inflammation and even lead to infection.There are a variety of factors that can cause ingrown toenails. The most common cause is due to improper cutting of your toenail and leaving a spike of nail in the sulcus. It can also be a result of a curved nail, known as an involuted nail, from external pressure.If you are experiencing pain, redness, and swelling around your toenail, it may be time to consider seeing one of our podiatrists. With over 30 years of combined experience in treating foot and ankle conditions, we have succes

In [31]:
def get_all_clinic_links(start_url):
    """
    Start from the main clinics page, visit each region,
    and return a list of clinic page URLs.
    """
    clinic_urls = []

    # Step 1: Get regions from the main page
    soup = get_soup(start_url)
    region_links = [
        a['href'] for a in soup.find_all('a', href=True)
        if "/our-clinics/" in a['href'] and "regions" in a['href']
    ]
    region_links = list(set(region_links))  # unique links

    print(f"Found {len(region_links)} regions.")

    # Step 2: Visit each region and get clinic links
    for rlink in region_links:
        region_url = urljoin(BASE_URL, rlink)
        print(f"Processing region: {region_url}")
        soup_region = get_soup(region_url)

        # Find clinic links in the region page
        clinic_anchors = [
            a for a in soup_region.find_all("a", href=True)
            if "/our-clinics/" in a['href'] and "regions" not in a['href']
        ]
        for a in clinic_anchors:
            clinic_url = urljoin(BASE_URL, a['href'])
            clinic_urls.append(clinic_url)
            print(f"  → Found clinic: {clinic_url}")

        time.sleep(1)  # polite pause per region

    print(f"Total clinics found: {len(clinic_urls)}")
    return clinic_urls


In [46]:
clinics = get_all_clinic_links(REGIONS_URL)

Found 12 regions.
Processing region: https://web.archive.org/web/20250804040701/https://www.myfootdr.com.au/our-clinics/regions/brisbane/
  → Found clinic: https://web.archive.org/web/20250829222334/https://www.myfootdr.com.au/our-clinics/
  → Found clinic: https://web.archive.org/web/20250829222334/https://www.myfootdr.com.au/our-clinics/
  → Found clinic: https://web.archive.org/web/20250829222334/https://www.myfootdr.com.au/our-clinics/
  → Found clinic: https://web.archive.org/web/20250829222334/https://www.myfootdr.com.au/our-clinics/albany-creek-allsports-podiatry/
  → Found clinic: https://web.archive.org/web/20250829222334/https://www.myfootdr.com.au/our-clinics/aspley-allsports-podiatry/
  → Found clinic: https://web.archive.org/web/20250829222334/https://www.myfootdr.com.au/our-clinics/calamvale-podiatry-centre-allsports/
  → Found clinic: https://web.archive.org/web/20250829222334/https://www.myfootdr.com.au/our-clinics/camp-hill-podiatry-centre-allsports/
  → Found clinic: 

In [47]:
# remove duplicates and the ones without proper URLs (ending with 'our-clinics/')
un_clinics = list(set([c for c in clinics if not c.endswith("our-clinics/")]))
print(f"Unique clinics to process: {len(un_clinics)}")

Unique clinics to process: 97


In [50]:
# clean the clinic URLs remove the part before https://wwww.myfootdr.com.au
# split and keep only the path part
test = [c.split("https://www.myfootdr.com.au")[-1] if "https://www.myfootdr.com.au" in c else c for c in un_clinics]
test = ["https://www.myfootdr.com.au"+c for c in test]
print(test)

['https://www.myfootdr.com.au/our-clinics/kangaroo-point-allsports-podiatry-centre/', 'https://www.myfootdr.com.au/our-clinics/charlestown/', 'https://www.myfootdr.com.au/our-clinics/aspley-allsports-podiatry/', 'https://www.myfootdr.com.au/our-clinics/bim-podiatry-balcatta/', 'https://www.myfootdr.com.au/our-clinics/back-in-motion-podiatry-bacchus-marsh/', 'https://www.myfootdr.com.au/our-clinics/christies-beach-podiatry-centre/', 'https://www.myfootdr.com.au/our-clinics/townsville-podiatry-centre/', 'https://www.myfootdr.com.au/our-clinics/modbury-podiatry-centre/', 'https://www.myfootdr.com.au/our-clinics/moorebank-podiatry-centre/', 'https://www.myfootdr.com.au/our-clinics/wembley-downs-podiatry-centre/', 'https://www.myfootdr.com.au/our-clinics/sale-the-foot-and-ankle-clinic-podiatry-clinic/', 'https://www.myfootdr.com.au/our-clinics/the-gap-podiatry-centre-allsports/', 'https://www.myfootdr.com.au/our-clinics/casula-podiatry-centre/', 'https://www.myfootdr.com.au/our-clinics/indo

In [40]:
def scrape_myfootdr_clinics(clinics):
    output_file = "myfootdr_clinics.csv"
    csv_fields = ["Name of Clinic", "Address", "Email", "Phone", "Services"]
    with open(output_file, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=csv_fields)
        writer.writeheader()

        for idx, clinic_url in enumerate(clinics, start=1):
            print(f"Processing clinic {idx}/{len(clinics)}: {clinic_url}")
            try:
                info = extract_clinic_info(clinic_url)
                writer.writerow(info)
            except Exception as e:
                print(f"Error processing {clinic_url}: {e}")
            time.sleep(1)  # polite pause per clinic

In [51]:
scrape_myfootdr_clinics(test)

Processing clinic 1/97: https://www.myfootdr.com.au/our-clinics/kangaroo-point-allsports-podiatry-centre/
Processing clinic 2/97: https://www.myfootdr.com.au/our-clinics/charlestown/
Processing clinic 3/97: https://www.myfootdr.com.au/our-clinics/aspley-allsports-podiatry/
Processing clinic 4/97: https://www.myfootdr.com.au/our-clinics/bim-podiatry-balcatta/
Processing clinic 5/97: https://www.myfootdr.com.au/our-clinics/back-in-motion-podiatry-bacchus-marsh/
Processing clinic 6/97: https://www.myfootdr.com.au/our-clinics/christies-beach-podiatry-centre/
Processing clinic 7/97: https://www.myfootdr.com.au/our-clinics/townsville-podiatry-centre/
Processing clinic 8/97: https://www.myfootdr.com.au/our-clinics/modbury-podiatry-centre/
Processing clinic 9/97: https://www.myfootdr.com.au/our-clinics/moorebank-podiatry-centre/
Processing clinic 10/97: https://www.myfootdr.com.au/our-clinics/wembley-downs-podiatry-centre/
Processing clinic 11/97: https://www.myfootdr.com.au/our-clinics/sale-t