In [4]:
!pip install selenium
!pip install beautifulsoup4
!pip install pandas

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException, ElementClickInterceptedException
from bs4 import BeautifulSoup
import time
import requests
import pandas as pd

# Set up the Selenium WebDriver (make sure to download the appropriate driver for your browser)
# Define the path to the chromedriver binary
chromedriver_path = '/path/to/chromedriver'  # Update the path

# Set up Chrome options
options = webdriver.ChromeOptions()
options.binary_location = chromedriver_path

driver = webdriver.Chrome()

# Open the website
driver.get('https://bookwhen.com/ventures-studio?start=2018-08-01T00%3A00%3A00.000%2B01%3A00#focus=ev-st29-20180809190000')

def load_all_events(driver, max_retries=5):
    retries = 0
    while True:
        try:
            # Scroll to the bottom of the page to ensure the button loads
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            
            # Wait for the "Show more..." button to be present and clickable
            show_more_button = WebDriverWait(driver, 60).until(
                EC.element_to_be_clickable((By.XPATH, "//button[@data-hook='show_more']"))
            )
            show_more_button.click()
            print("Clicked 'Show more...' button.")
            time.sleep(5)  # Wait for the content to load
            retries = 0  # Reset retries after a successful click
        except (NoSuchElementException, ElementClickInterceptedException) as e:
            if retries >= max_retries:
                print(f"Max retries reached. No 'Show more...' button found: {e}")
                break
            else:
                retries += 1
                print(f"No 'Show more...' button found or click intercepted (retry {retries}/{max_retries}): {e}")
                time.sleep(10)  # Wait a bit longer before retrying
        except StaleElementReferenceException:
            print(f"Stale element reference encountered (retry {retries}/{max_retries}). Retrying...")
            time.sleep(2)  # Wait a bit before retrying
            
        except Exception as e:
            print("Page loading complete.")
            break

# Load all events
load_all_events(driver)

# Parse the page source with BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Once all the events are loaded and parsed, close the browser
driver.quit()

print("\nBrowser closed.")
print("\n\nExtracting data...")
base_url = "https://bookwhen.com/ventures-studio"

# Parse the event links from the data-event attribute
event_links = [tr['data-event'] for tr in soup.select('tr[data-hook="agenda_list_item"]')]

titles = []
date_times = []
prices = []
availabilities = []
event_statuses = []
details_list = []
instructions_list = []
addresses = []

# Loop through each event link to get details
for event_id in event_links:
    event_url = f"{base_url}/e/{event_id}"
    event_response = requests.get(event_url)
    event_soup = BeautifulSoup(event_response.content, 'html.parser')
    
    # Extract title
    title_element = event_soup.find('h1', class_='margin_bottom_none')
    title = title_element.text.strip() if title_element else 'N/A'
    
    # Extract date and time
    date_time_element = event_soup.find('div', class_='event_header_info')
    date_time = date_time_element.text.strip() if date_time_element else 'N/A'
    
    # Extract price
    price_element = event_soup.find('span', class_='price')
    price = price_element.text.strip() if price_element else 'N/A'
    
    # Extract availability
    availability_element = event_soup.find('span', class_='notice_me')
    availability = availability_element.text.strip() if availability_element else 'N/A'
    
    # Extract event status
    event_status_element = event_soup.find('div', class_='event_status notice_me')
    event_status = event_status_element.find('span').text.strip() if event_status_element else 'Confirmed'

   # Extract address (second event_header_info)
    address = 'N/A'  # Default to 'N/A'
    address_elements = event_soup.find_all('div', class_='event_header_info')
    
    # Check if there's a second 'event_header_info' element and if it contains address-related content
    if len(address_elements) > 1:
        address_element = address_elements[1]
        
        # Look for content that would indicate this element contains an address (e.g., "location-dot" or street info)
        if 'location-dot' in address_element.get_text() or any(word in address_element.get_text() for word in ['Street', 'Road', 'Avenue', 'SW8', 'London']):
            address = address_element.get_text(strip=True).replace('location-dot', '')  # Clean up the address
        else:
            address = 'N/A'
    
    # Extract details
    details_element = event_soup.find('div', class_='information_section')
    if details_element:
        details_title = details_element.find('h3', class_='title')
        details = details_element.find('div', class_='markdown').text.strip() if details_title and details_title.text == 'Details' else 'N/A'
    else:
        details = 'N/A'
    
    # Extract instructions
    instructions_element = event_soup.find('div', class_='information_section with_dividing_line_above')
    if instructions_element:
        instructions_title = instructions_element.find('h3', class_='title')
        instructions = instructions_element.find('div', class_='markdown').text.strip() if instructions_title and instructions_title.text == 'Instructions' else 'N/A'
    else:
        instructions = 'N/A'
    
    titles.append(title)
    date_times.append(date_time)
    prices.append(price)
    availabilities.append(availability)
    event_statuses.append(event_status)
    details_list.append(details)
    instructions_list.append(instructions)
    addresses.append(address)

# Create a DataFrame
data = {
    'Title': titles,
    'Date & Time': date_times,
    'Price': prices,
    'Availability': availabilities,
    'Event Status': event_statuses,
    'Address': addresses,
    'Details': details_list,
    'Instructions': instructions_list
}

df = pd.DataFrame(data)

# Extracting the date part
df['Date'] = df['Date & Time'].str.split(',').str[1].str.strip() 
df['Date'] = df['Date'].str.split(' ').str[0:3].str.join(' ')  
df['Date'] = pd.to_datetime(df['Date'], format="%d %B '%y")

# Extract day 
df['Day'] = df['Date & Time'].str.split(',').str[0]

# Extracting the start and end times
df['Start Time'] = df['Date & Time'].str.extract(r'(\d{1,2}:\d{2}[ap]m|\d{1,2}[ap]m)')[0]
df['End Time'] = df['Date & Time'].str.extract(r'–\s*(\d{1,2}:\d{2}[ap]m|\d{1,2}[ap]m)')[0]

# Extracting the time zone
df['Time Zone'] = df['Date & Time'].str.extract(r'([A-Z]{3})$')[0]  

df['Price (£)'] = df['Price'].str.replace('£', '', regex=False)  
df['Price (£)'] = df['Price (£)'].str.replace('Free', '', regex=False).replace('', '0')

df.drop(columns=['Date & Time', 'Price'],inplace=True)

# Rearrange columns
cols = df.columns.tolist()
new_order = ['Title', 'Date', 'Day', 'Start Time', 'End Time', 'Time Zone', 'Price (£)'] + [col for col in cols if col not in ['Title', 'Date', 'Day', 'Start Time', 'End Time', 'Time Zone', 'Price (£)']]
df_rearranged = df[new_order]

# Export to CSV 
csv_file = 'Urban_Kpop_event_list.csv'
df_rearranged.to_csv(csv_file, index=False, encoding='utf-8-sig')


print(f"Data exported to {csv_file}")

Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Stale element reference encountered (retry 0/5). Retrying...
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked 'Show more...' button.
Clicked '

Clicked 'Show more...' button.
No 'Show more...' button found or click intercepted (retry 1/5): Message: element click intercepted: Element <button name="button" type="button" class="load_more" data-hook="show_more">...</button> is not clickable at point (254, 631). Other element would receive the click: <form class="cookie-banner" data-turbo="true" action="/config/cookies?from_host=bookwhen.com&amp;from_path=%2Fventures-studio" accept-charset="UTF-8" method="post">...</form>
  (Session info: chrome=130.0.6723.70)
Stacktrace:
	GetHandleVerifier [0x00007FF7DBAE3AF5+28005]
	(No symbol) [0x00007FF7DBA483F0]
	(No symbol) [0x00007FF7DB8E580A]
	(No symbol) [0x00007FF7DB93D6CE]
	(No symbol) [0x00007FF7DB93B16C]
	(No symbol) [0x00007FF7DB938628]
	(No symbol) [0x00007FF7DB93785D]
	(No symbol) [0x00007FF7DB92990E]
	(No symbol) [0x00007FF7DB95BA3A]
	(No symbol) [0x00007FF7DB929246]
	(No symbol) [0x00007FF7DB95BC50]
	(No symbol) [0x00007FF7DB97B8B3]
	(No symbol) [0x00007FF7DB95B7E3]
	(No symbol) [

Clicked 'Show more...' button.
No 'Show more...' button found or click intercepted (retry 1/5): Message: element click intercepted: Element <button name="button" type="button" class="load_more" data-hook="show_more">...</button> is not clickable at point (254, 631). Other element would receive the click: <form class="cookie-banner" data-turbo="true" action="/config/cookies?from_host=bookwhen.com&amp;from_path=%2Fventures-studio" accept-charset="UTF-8" method="post">...</form>
  (Session info: chrome=130.0.6723.70)
Stacktrace:
	GetHandleVerifier [0x00007FF7DBAE3AF5+28005]
	(No symbol) [0x00007FF7DBA483F0]
	(No symbol) [0x00007FF7DB8E580A]
	(No symbol) [0x00007FF7DB93D6CE]
	(No symbol) [0x00007FF7DB93B16C]
	(No symbol) [0x00007FF7DB938628]
	(No symbol) [0x00007FF7DB93785D]
	(No symbol) [0x00007FF7DB92990E]
	(No symbol) [0x00007FF7DB95BA3A]
	(No symbol) [0x00007FF7DB929246]
	(No symbol) [0x00007FF7DB95BC50]
	(No symbol) [0x00007FF7DB97B8B3]
	(No symbol) [0x00007FF7DB95B7E3]
	(No symbol) [

KeyboardInterrupt: 

In [6]:
df_rearranged.sample(5)

Unnamed: 0,Title,Date,Day,Start Time,End Time,Time Zone,Price (£),Availability,Event Status,Address,Details,Instructions
1159,DYLAN CHOREOGRAPHY - TIMBALAND 'GIVE IT TO ME'...,2023-09-24,Sunday,5pm,7pm,BST,17.0,Fully booked,Confirmed,,Terms and Conditions:\n\nCancellation Policy -...,Please wear comfortable dance attire and suita...
1246,RED VELVET - 'CHILL KILL',2023-12-05,Tuesday,7pm,9pm,GMT,17.0,Only 3 spaces left,Confirmed,,Terms and Conditions:\n\nCancellation Policy -...,Please wear comfortable dance attire and suita...
40,Chung Ha - LOVE U (Part 1),2018-10-19,Friday,7pm,8pm,BST,7.5,Only 3 spaces left,Confirmed,,,
1180,RIIZE - 'GET A GUITAR',2023-10-10,Tuesday,7pm,9pm,BST,17.0,Fully booked,Confirmed,,Terms and Conditions:\n\nCancellation Policy -...,Please wear comfortable dance attire and suita...
102,MAMAMOO - 'GOGOBEBE' (고고베베),2019-04-10,Wednesday,7pm,9pm,BST,16.0,Fully booked,Confirmed,,Studio 202 @Sylvia Young Theatre\nCancellation...,
