In [43]:

import numpy as np
import pandas as pd
import json
import os

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from tqdm.autonotebook import tqdm

In [44]:
# CHROME_BINARY_LOCATION = "/usr/bin/chrome-linux64/chrome"
# CHROMEDRIVER_BINARY_LOCATION = "/usr/bin/chromedriver-linux64/chromedriver"
chromedriver_path = 'C:/Users/gabeb/Desktop/programming/chromedriver-win64/chromedriver.exe'

def add_driver_options(options):
    """
    Add configurable options
    """
    chrome_options = Options()
    for opt in options:
        chrome_options.add_argument(opt)
    return chrome_options

def initialize_driver():
    """
    Initialize the web driver
    """
    driver_config = {
        "options": [
            "--headless",
            "--no-sandbox",
            "--start-fullscreen",
            "--allow-insecure-localhost",
            "--disable-dev-shm-usage",
            "user-agent=Chrome/116.0.5845.96"
        ],
    }
    options = add_driver_options(driver_config["options"])
    # options.binary_location = CHROME_BINARY_LOCATION
    driver = webdriver.Chrome(
        executable_path=chromedriver_path,
        options=options)
    return driver

In [45]:
def parse_rollercoaster_page(soup):
    # Initialize variables to store information
    city = region = country = make = None
    trackStats = {}
    image_urls = []

    # Find location information
    feature = soup.find('div', {"id": "feature"})
    if feature:
        descList = feature.findAll('div')
        if descList:
            desc = descList[0]
            locationLinks = desc.findAll('a')
            if len(locationLinks) == 3: # amusement park, city, country
                city = locationLinks[1].contents[0]
                region = locationLinks[1].contents[0]  # Assuming region is the same as city
                country = locationLinks[2].contents[0]
            else:
                city = locationLinks[-3].contents[0]
                region = locationLinks[-2].contents[0]
                country = locationLinks[-1].contents[0]

            # Find make information
            if len(descList) >= 2:
                make_link = descList[1].find('a')
                if make_link:
                    make = make_link.contents[0]

    # Find track information
    trackStatsTable = soup.find('table', {'class': 'stat-tbl'})
    if trackStatsTable:
        trackStatsTag = trackStatsTable.findAll('tr')
        for tr in trackStatsTag:
            key = tr.find('th').text.strip()
            valueTag = tr.find('td')
            if valueTag:
                value = valueTag.text.strip()
                if key == "Elements": # list of links
                    value = [a.contents[0] for a in valueTag.findAll('a')]
                elif valueTag.find('span', {'class': 'float'}) is not None: # float value
                    unit = valueTag.contents[1].strip() if len(valueTag.contents) >= 2 else ""
                    key = f"{key} ({unit})"
                    value = valueTag.find('span', {'class': 'float'}).text.strip()
            else:
                value = None
            trackStats[key] = value

    # Find images
    image_tags = soup.find_all('img', {'class': 'image'})
    for img in image_tags:
        image_url = urljoin(rcdb_url, img['src'])
        image_urls.append(image_url)

    return city, region, country, make, trackStats, image_urls

def parse_rollercoaster_table_row(row):
    # Get data from the table row
    elements = row.findAll('td')
    coasterName = elements[1].find('a').text.strip()
    coasterLink = elements[1].find('a')['href']
    amusementPark = elements[2].find('a').text.strip()
    coasterType = elements[3].find('a').text.strip()
    coasterDesign = elements[4].find('a').text.strip()
    statusTag = elements[5].find('a')
    status = statusTag.contents[0] if statusTag else "Removed"
    openedTag = elements[6].find('time')
    opened = openedTag.contents[0] if openedTag else "unknown"

    # Get data from the coaster specific link
    driver.get(f"{rcdb_url}{coasterLink}")
    coaster_soup = BeautifulSoup(driver.page_source, 'html')
    city, region, country, make, trackStats, image_urls = parse_rollercoaster_page(coaster_soup)

    return {
        'Name': coasterName, 
        'Amusement Park': amusementPark, 
        'Type': coasterType, 
        'Design': coasterDesign, 
        'Status': status, 
        'Opened': opened, 
        'City': city, 
        'Region': region, 
        'Country': country, 
        'Make': make,
        **trackStats,
        'Image URLs': image_urls
    }


In [52]:
rcdb_url = "https://rcdb.com"

# Path to your ChromeDriver executable
chromedriver_path = r'C:\Users\gabeb\Desktop\programming\chromedriver-win64\chromedriver.exe'

# Initialize Chrome WebDriver
# service = Service(chromedriver_path)
# service.start()
driver = webdriver.Chrome()
# driver = initialize_driver()
driver.get(f"{rcdb_url}/r.htm?ot=2")
soup = BeautifulSoup(driver.page_source, 'html')
numPages = int(soup.find('div', {"id": "rfoot"}).findAll('a')[-2].contents[0])
print(f"Total number of pages to iterate through: {numPages}")

pbar = tqdm(range(numPages))
rollercoasters = []

for pageNum in pbar:
    print(f"Getting information for page {pageNum+1}")
    driver.get(f"{rcdb_url}/r.htm?page={pageNum+1}&ot=2")
    soup = BeautifulSoup(driver.page_source, 'html')
    table = soup.findAll('table')[1].find('tbody')
    rows = table.findAll('tr')
    
    for row in rows:
        rowData = parse_rollercoaster_table_row(row)
        pbar.set_postfix(
            coasterName=rowData['Name'], 
            amusementPark=rowData['Amusement Park'])
        rollercoasters.append(rowData)
        
    # every page, save the information to a dataframe
    df = pd.DataFrame.from_records(rollercoasters)
    df.to_csv("data.csv", index=False)

ValueError: Timeout value connect was <object object at 0x000001C1667D48E0>, but it must be an int, float or None.

In [None]:
driver.quit()

In [None]:
df.head()

Unnamed: 0,Name,Amusement Park,Type,Design,Status,Opened,City,Region,Country,Make,...,Duration,G-Force (),Vertical Angle (°),Uphill Length (ft),Downhill Length (ft),Δ Elevation (ft),Airtime Points,Crossings,Bank Angle (°),Drop
0,€uro-Coaster,Wiener Prater,Steel,Suspended,Removed,5/29/2020,Vienna,Vienna,Austria,Reverchon,...,,,,,,,,,,
1,€uro Coaster,Funland Theme Park,Steel,Sit Down,Removed,2021,Somerset,England,United Kingdom,SBF Visa Group,...,,,,,,,,,,
2,1066,Festyland,Steel,Sit Down,Operating,3/27/2005,Bretteville-Sur-Odon,Normandy,France,Soquet,...,1:00,,,,,,,,,
3,10 Inversion Roller Coaster,Chimelong Paradise,Steel,Sit Down,Operating,2/2006,Guangzhou,Guangdong,China,Intamin Amusement Rides,...,1:32,,,,,,,,,
4,1970 Galaxy Rip Tide Coaster,Swampy Jack's Wongo Adventure,Steel,Sit Down,Relocated,6/27/2014,Panama City Beach,Florida,United States,S.D.C.,...,,,,,,,,,,


In [None]:


import pandas as pd

def search_and_display_rollercoasters(df):
    # Function to filter rollercoasters based on user input
    def filter_rollercoasters(query):
        return df[df.apply(lambda row: any(query.lower() in str(cell).lower() for cell in row), axis=1)]

    # Function to display rollercoasters
    def display_rollercoasters(rollercoasters):
        print("Rollercoasters found:")
        for i, (_, rollercoaster) in enumerate(rollercoasters.iterrows(), start=1):
            print(f"{i}. {rollercoaster['Name']} - {rollercoaster['Amusement Park']}")

    # Function to display rollercoaster information
    def display_rollercoaster_info(coaster):
        print("\nRollercoaster Information:")
        for key, value in coaster.items():
            print(f"{key}: {value}")

    # Ask user for search query
    query = input("Enter search query (rollercoaster name, type, company, or location): ").strip()

    # Filter rollercoasters based on query
    results = filter_rollercoasters(query)

    if results.empty:
        print("No rollercoasters found matching the search query.")
        return

    # Display filtered rollercoasters
    display_rollercoasters(results)

    # Ask user to select a rollercoaster
    while True:
        selection = input("Enter the number of the rollercoaster you want more information about (or 'exit' to quit): ").strip()
        if selection.lower() == 'exit':
            return
        if not selection.isdigit() or not (1 <= int(selection) <= len(results)):
            print("Invalid input. Please enter a number corresponding to a rollercoaster.")
            continue
        else:
            selected_coaster = results.iloc[int(selection) - 1]
            display_rollercoaster_info(selected_coaster)
            return

# Example usage:
# Assuming df is your Pandas DataFrame containing rollercoaster information
# Call the function passing the DataFrame as argument
search_and_display_rollercoasters(df)


NameError: name 'df' is not defined