In [223]:
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Constants
HEADLESS_MODE = True  # Headless mode for browser
START_URL = 'https://www.goodschools.com.au/compare-schools/search/in-victoria/secondary'  # Starting URL for scraping
MAX_SCHOOLS = 50  # Set the maximum number of schools to scrape
PAGE_LOAD_WAIT_TIME = 5  # Time to wait for pages to load (in seconds)
DETAIL_PAGE_WAIT_TIME = 3  # Time to wait for detail pages to load (in seconds)
CSV_FILE_NAME = 'victoria_secondary_schools.csv'  # Output CSV file name
DEFAULT_VALUE = 'Not provided'  # Default value for missing fields

# Example of a local path for chromedriver, commented out for portability
# This should be replaced or uncommented only when running on a specific machine
# service = Service('/path/to/your/file/venv/chromedriver-mac-arm64/chromedriver') 

# Set up the WebDriver
service = Service(ChromeDriverManager().install())
options = webdriver.ChromeOptions()
if HEADLESS_MODE:
    options.add_argument('--headless')  # Run in headless mode without GUI
driver = webdriver.Chrome(service=service, options=options)

# Set the starting URL
driver.get(START_URL)
time.sleep(PAGE_LOAD_WAIT_TIME)  # Wait for the page to load

# Initialize variables
schools_data = []
next_page_url = START_URL

# Start scraping
while len(schools_data) < MAX_SCHOOLS and next_page_url:
    # Parse the current page with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Find all school cards on the page
    school_cards = soup.find_all('div', class_='row mt-2 mb-3 pt-3 pb-3 bg-white border-bottom')

    for card in school_cards:
        try:
            # Extract the school name
            name_tag = card.find('h5', class_='mb-0 font-weight-bold')
            if not name_tag:
                continue  # Skip if name not found
            name = name_tag.text.strip()

            # Extract the school detail page link
            school_link_tag = name_tag.find_parent('a')
            if school_link_tag is None:
                continue  # Skip current card
            school_link = school_link_tag['href']

            # Extract sector information
            info_divs = card.find_all('div', class_='col-md-3 col-6 small')
            info_dict = {}
            for div in info_divs:
                label = div.find('b').text.strip()
                value = div.get_text(strip=True).replace(label, '').strip()
                info_dict[label] = value
            sector = info_dict.get('Sector', '').strip()

            # Visit the school's detail page
            driver.get(school_link)
            time.sleep(DETAIL_PAGE_WAIT_TIME)  # Wait for the page to load

            # Parse the detail page
            detail_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Check if the page has moved or returns an error
            error_message = detail_soup.find('h1', text=re.compile(r'Page not found', re.IGNORECASE))
            if error_message:
                print(f"School page not found for '{name}'. Skipping.")
                continue  # Skip this school

            # Extract geolocation information
            geolocation_tags = detail_soup.find_all('span', class_='map-address load-address')
            geolocations = []
            postcodes = []
            for tag in geolocation_tags:
                if 'data-address' in tag.attrs:
                    geolocation = tag['data-address'].strip()
                    geolocations.append(geolocation)

                    # Extract postcode using regex
                    postcode_match = re.search(r'(\d{4})$', geolocation)
                    if postcode_match:
                        postcode = postcode_match.group(1)
                    else:
                        postcode = DEFAULT_VALUE
                    postcodes.append(postcode)

            # Join geolocations and postcodes into comma-separated strings
            geolocations_str = ', '.join(geolocations) if geolocations else DEFAULT_VALUE
            postcodes_str = ', '.join(postcodes) if postcodes else DEFAULT_VALUE

            # Initialize academic result variables with 'Not provided'
            scores_40_plus = DEFAULT_VALUE
            median_score = DEFAULT_VALUE
            vce_completions = DEFAULT_VALUE
            vet_completions = DEFAULT_VALUE

            # Extract Academic Results and store as separate fields
            academic_results_header = detail_soup.find('h4', text='Academic Results')
            if academic_results_header:
                academic_results_div = academic_results_header.find_parent('div')
                result_items = academic_results_div.find_all('p', class_='mb-1')
                for item in result_items:
                    key_text = item.contents[0].strip().strip(':')
                    value_span = item.find('span', class_='font-weight-bold')
                    if value_span:
                        value_text = value_span.text.strip()

                        # Assign values to corresponding variables
                        if key_text == 'Scores of 40+':
                            scores_40_plus = value_text
                        elif key_text == 'Median Score':
                            median_score = value_text
                        elif key_text == 'Satisfactory completions of VCE':
                            vce_completions = value_text
                        elif key_text == 'Satisfactory completions of VET':
                            vet_completions = value_text

            # Add the extracted data to the list
            schools_data.append({
                'Name': name,
                'Sector': sector,
                'Geolocation': geolocations_str,
                'Postcode': postcodes_str,
                'Scores of 40+': scores_40_plus,
                'Median Score': median_score,
                'Satisfactory completions of VCE': vce_completions,
                'Satisfactory completions of VET': vet_completions
            })

            # Check if we've reached the maximum number of schools
            if len(schools_data) >= MAX_SCHOOLS:
                break

            # Go back to the school listings page
            driver.back()
            time.sleep(DETAIL_PAGE_WAIT_TIME)  # Wait for the page to load

        except Exception as e:
            print(f"Error processing school '{name}': {e}")
            # Go back to the listings page in case of error
            driver.back()
            time.sleep(DETAIL_PAGE_WAIT_TIME)
            continue  # Skip this school

    # Check if we've reached the maximum number of schools
    if len(schools_data) >= MAX_SCHOOLS:
        break

    # Find the 'Next' page link
    next_page_link = soup.find('a', rel='next')
    if next_page_link:
        next_page_url = next_page_link['href']
        driver.get(next_page_url)
        time.sleep(PAGE_LOAD_WAIT_TIME)  # Wait for the page to load
    else:
        # No more pages to navigate
        next_page_url = None

# Close the WebDriver
driver.quit()

# Create a DataFrame and export to CSV
df = pd.DataFrame(schools_data)
df.fillna(DEFAULT_VALUE, inplace=True)
df.to_csv(CSV_FILE_NAME, index=False)
print('Data extraction completed. CSV file has been saved.')


  error_message = detail_soup.find('h1', text=re.compile(r'Page not found', re.IGNORECASE))
  academic_results_header = detail_soup.find('h4', text='Academic Results')


School page not found for 'Homestead Senior Secondary College'. Skipping.
Data extraction completed. CSV file has been saved.
