In [13]:
import os
import pickle
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

def extract_school_details(url):
    """Scrape a search-results page and return the listed schools as a DataFrame.

    Opens ``url`` in a Chrome WebDriver session, saves the session cookies to
    ``saved_cookies/cookies.pkl``, repeatedly clicks the element with class
    ``zoeken-resultaten-lijst-meer`` (presumably the "show more results"
    button) until it can no longer be clicked, and then parses the rendered
    HTML with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the search-results page to load.

    Returns
    -------
    pandas.DataFrame
        One row per result with the columns ``name``, ``address``, ``link``,
        ``details`` and ``n_students`` (the first run of digits found in
        ``details``, as a string, or ``''`` when there is none). If any
        ``Exception`` is raised while scraping or parsing, the error is
        printed and an empty DataFrame is returned instead.
    """
    # Initialize a WebDriver instance
    driver = webdriver.Chrome()  # You can change the WebDriver as needed

    try:
        driver.get(url)

        # Persist the session cookies; the context manager guarantees the
        # file handle is closed even if pickling fails.
        cookies_dir = 'saved_cookies'
        os.makedirs(cookies_dir, exist_ok=True)
        save_location = os.path.join(cookies_dir, 'cookies.pkl')
        with open(save_location, 'wb') as cookie_file:
            pickle.dump(driver.get_cookies(), cookie_file)

        # Keep clicking the button for as long as it is clickable. Only
        # failed attempts count towards the limit, so every successful click
        # is followed by another try.
        max_attempts = 10
        retry_pause_seconds = 0.5
        attempts = 0
        while attempts < max_attempts:
            try:
                driver.find_element(By.CLASS_NAME, 'zoeken-resultaten-lijst-meer').click()
            except Exception:
                # `Exception` (not a bare except) so Ctrl-C still interrupts.
                attempts += 1
                # Pause before retrying: without it all attempts are spent
                # within milliseconds, before further results can render.
                time.sleep(retry_pause_seconds)

        # Parse the fully expanded page
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Extract links
        school_links_clean = [anchor.get('href') for anchor in soup.select('h3 a')]

        # Extract names
        school_list_dirty = soup.find_all(attrs={'class': re.compile(r'zoekresultaat-title')})
        school_list_clean = [school.text.strip() for school in school_list_dirty]

        # Extract addresses
        adress_list_dirty = soup.find_all(attrs={'class': re.compile(r'zoekresultaat-adres')})
        adress_list_clean = [address.text.strip() for address in adress_list_dirty]

        # Extract details
        details_list_dirty = soup.find_all(attrs={'class': re.compile(r'zoekresultaat-details')})
        details_list_clean = [detail.text.strip() for detail in details_list_dirty]

        # Extract number of students: first run of digits in each details
        # string, searched once per item.
        digits = re.compile(r'\d+')
        number_of_students = []
        for item in details_list_clean:
            match = digits.search(item)
            number_of_students.append(match.group() if match else '')

        # Create a DataFrame (raises ValueError, handled below, if the
        # extracted lists do not all have the same length).
        df = pd.DataFrame({
            'name': school_list_clean,
            'address': adress_list_clean,
            'link': school_links_clean,
            'details': details_list_clean,
            'n_students': number_of_students,
        })

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        df = pd.DataFrame()  # Return an empty DataFrame in case of an error

    finally:
        driver.quit()  # Close the WebDriver

    # Returned outside `finally` so that non-`Exception` errors such as
    # KeyboardInterrupt propagate instead of being silently discarded.
    return df

In [14]:
# Scrape the list-view search results for the query "Almere" and display them.
test_df = extract_school_details(
    'https://scholenopdekaart.nl/zoeken/basisscholen?zoektermen=Almere&weergave=Lijst'
)
test_df

Unnamed: 0,name,address,link,details,n_students
0,Alexander Roozendaalschool (Almere),Prokofjevstraat 3,/basisscholen/almere/23773/alexander-roozendaa...,OpenbaarSpeciaal onderwijs85 leerlingen,85
1,Shri Ganesha School,Regentesseweg 1,/basisscholen/almere/23141/shri-ganesha-school/,Confessioneel overigBasisonderwijs110 leerling...,110
2,Nautilus College - Radioweg,Radioweg 35,/basisscholen/almere/4998/nautilus-college-rad...,Algemeen bijzonderSpeciaal onderwijs,
3,Nautilus SO,J.J. Slauerhoffstraat 51,/basisscholen/almere/26221/nautilus-so/,Algemeen bijzonderSpeciaal onderwijs70 leerlingen,70
4,Islamitische Basisschool Al Iman,Harderwijkoever 3,/basisscholen/almere/9087/islamitische-basissc...,Confessioneel overigBasisonderwijs356 leerlingen,356
...,...,...,...,...,...
75,Montessori Campus,Marie Curielaan 301,/basisscholen/almere/26332/montessori-campus/,OpenbaarBasisonderwijs108 leerlingen,108
76,Olivijn,Marathonlaan 7,/basisscholen/almere/7458/olivijn/,Algemeen bijzonderSpeciaal onderwijs170 leerli...,170
77,De Bongerd,Boomgaardweg 10A,/basisscholen/almere/26244/de-bongerd/,OpenbaarSpeciaal onderwijs79 leerlingen,79
78,School van Vrede,Pieter van Damstraat 36,/basisscholen/almere/26626/school-van-vrede/,Algemeen bijzonderBasisonderwijs,


In [15]:
# Number of rows (schools) in the scraped DataFrame.
test_df.shape[0]

80