In [3]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


In [11]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# URL of the website
URL_LIST = ["https://www.redbus.in/online-booking/hrtc",                        
       "https://www.redbus.in/online-booking/kaac-transport",
       "https://www.redbus.in/online-booking/meghalaya-transport-corporation-mtc",
       "https://www.redbus.in/online-booking/sikkim-nationalised-transport-snt",
       "https://www.redbus.in/online-booking/astc",
       "https://www.redbus.in/online-booking/gsrtc",
       "https://www.redbus.in/online-booking/bihar-state-road-transport-corporation-bsrtc",
       "https://www.redbus.in/online-booking/ksrtc-kerala",
       "https://www.redbus.in/online-booking/west-bengal-transport-corporation",
       "https://www.redbus.in/online-booking/tsrtc"]

def initialize_driver():
    driver = webdriver.Chrome()
    driver.maximize_window()
    return driver

def load_page(driver, url):
    driver.get(url)
    time.sleep(5)  # Wait for the page to load

# Function to scrape bus routes
def scrape_bus_routes(driver):
    route_elements = driver.find_elements(By.CLASS_NAME, 'route')
    bus_routes_link = [route.get_attribute('href') for route in route_elements]
    bus_routes_name = [route.text.strip() for route in route_elements]
    return bus_routes_link, bus_routes_name

# Function to scrape bus details
def scrape_bus_details(driver, url, route_name):
    try:
        driver.get(url)
        time.sleep(5)  # Allow the page to load
        
        # Click the "View Buses" button if it exists
        try:
            view_buses_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "button"))
            )
            driver.execute_script("arguments[0].click();", view_buses_button)
            time.sleep(5)  # Wait for buses to load
            
            # Scroll down to load all bus items
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)  # Wait for the page to load more content

            # Find bus item details
            bus_name_elements = driver.find_elements(By.CLASS_NAME, "travels.lh-24.f-bold.d-color")
            bus_type_elements = driver.find_elements(By.CLASS_NAME, "bus-type.f-12.m-top-16.l-color.evBus")
            departing_time_elements = driver.find_elements(By.CLASS_NAME, "dp-time.f-19.d-color.f-bold")
            duration_elements = driver.find_elements(By.CLASS_NAME, "dur.l-color.lh-24")
            reaching_time_elements = driver.find_elements(By.CLASS_NAME, "bp-time.f-19.d-color.disp-Inline")
            star_rating_elements = driver.find_elements(By.XPATH, "//div[@class='rating-sec lh-24']")
            price_elements = driver.find_elements(By.CLASS_NAME, "fare.d-block")

            # Use XPath to handle both seat availability classes
            seat_availability_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'seat-left m-top-30') or contains(@class, 'seat-left m-top-16')]")

            bus_details = []
            for i in range(len(bus_name_elements)):
                bus_detail = {
                    "Route_Name": route_name,
                    "Route_Link": url,
                    "Bus_Name": bus_name_elements[i].text,
                    "Bus_Type": bus_type_elements[i].text,
                    "Departing_Time": departing_time_elements[i].text,
                    "Duration": duration_elements[i].text,
                    "Reaching_Time": reaching_time_elements[i].text,
                    "Star_Rating": star_rating_elements[i].text if i < len(star_rating_elements) else '0',
                    "Price": price_elements[i].text,
                    "Seat_Availability": seat_availability_elements[i].text if i < len(seat_availability_elements) else '0'
                }
                bus_details.append(bus_detail)
            return bus_details
        
        except Exception as e:
            print(f"Error occurred while scraping bus details for {url}: {str(e)}")
            return []

    except Exception as e:
        print(f"Error occurred while accessing {url}: {str(e)}")
        return []

# List to hold all bus details

# total pages
def find_total_pages(driver):
    try:
        # Assuming pagination buttons have a class or XPath, adjust it as per the website's structure
        pagination_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'DC_117_paginationTable')]//div[contains(@class, 'DC_117_pageTabs')]")
        if pagination_elements:
            # The last pagination element usually gives the total number of pages
            total_pages = int(pagination_elements[-1].text)  # Convert the last pagination number to an integer
            return total_pages
        else:
            return 1  # If no pagination is found, assume only one page
    except Exception as e:
        print(f"Error finding total pages: {str(e)}")
        return 1  # Default to 1 page if unable to determine

# Function to scrape all pages
def scrape_all_pages_for_url(url):
    all_bus_details = []
    driver = initialize_driver()
    load_page(driver, url)
    pages = find_total_pages(driver)
    print(f"Total pages found: {pages}")
    for page in range(1, pages+1):
        driver = initialize_driver()
        load_page(driver, url)
        if page > 1:
            pagination_tab = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, f"//div[contains(@class, 'DC_117_pageTabs')][text()='{page}']"))
                )
            driver.execute_script("arguments[0].scrollIntoView();", pagination_tab)
            driver.execute_script("arguments[0].click();", pagination_tab)
            time.sleep(5)  # Wait for the page to load
            
            all_bus_routes_link, all_bus_routes_name = scrape_bus_routes(driver)
            # Iterate over each bus route link and scrape the details
            for link, name in zip(all_bus_routes_link, all_bus_routes_name):
                bus_details = scrape_bus_details(driver, link, name)
                if bus_details:
                    all_bus_details.extend(bus_details)
    return all_bus_details
        
                    

        

def scrape_multiple_urls_and_save():
    for i, url in enumerate(URL_LIST):
        print(f"Scraping URL {i+1}/{len(URL_LIST)}: {url}")
        
        # Scrape bus details for the current URL
        all_bus_details1 = scrape_all_pages_for_url(url)
        
        # Convert the list of dictionaries to a DataFrame
        df = pd.DataFrame(all_bus_details1)
        
        # Save the DataFrame to a CSV file, with a unique name for each URL
        filename = f'bus_det_final_csvfile_{i+1}.csv'
        df.to_csv(filename, index=False)
        
        print(f"Saved bus details to {filename}")

# Start the scraping process for all URLs
scrape_multiple_urls_and_save()

# Close the driver
driver.quit()

Scraping URL 1/10: https://www.redbus.in/online-booking/hrtc
Total pages found: 4
Error occurred while scraping bus details for https://www.redbus.in/bus-tickets/chamba-himachal-pradesh-to-delhi: Message: 
Stacktrace:
0   chromedriver                        0x0000000102e8c274 cxxbridge1$str$ptr + 1907280
1   chromedriver                        0x0000000102e8475c cxxbridge1$str$ptr + 1875768
2   chromedriver                        0x0000000102a98260 cxxbridge1$string$len + 89488
3   chromedriver                        0x0000000102adc50c cxxbridge1$string$len + 368700
4   chromedriver                        0x0000000102b167d0 cxxbridge1$string$len + 606976
5   chromedriver                        0x0000000102ad112c cxxbridge1$string$len + 322652
6   chromedriver                        0x0000000102ad1d7c cxxbridge1$string$len + 325804
7   chromedriver                        0x0000000102e54504 cxxbridge1$str$ptr + 1678560
8   chromedriver                        0x0000000102e58e6c cxxbridge1

There was an error managing chromedriver (error decoding response body); using driver found in the cache
Error sending stats to Plausible: error sending request for url (https://plausible.io/api/event)


Error occurred while scraping bus details for https://www.redbus.in/bus-tickets/gopalganj-to-delhi: Message: 
Stacktrace:
0   chromedriver                        0x0000000104474274 cxxbridge1$str$ptr + 1907280
1   chromedriver                        0x000000010446c75c cxxbridge1$str$ptr + 1875768
2   chromedriver                        0x0000000104080260 cxxbridge1$string$len + 89488
3   chromedriver                        0x00000001040c450c cxxbridge1$string$len + 368700
4   chromedriver                        0x00000001040fe7d0 cxxbridge1$string$len + 606976
5   chromedriver                        0x00000001040b912c cxxbridge1$string$len + 322652
6   chromedriver                        0x00000001040b9d7c cxxbridge1$string$len + 325804
7   chromedriver                        0x000000010443c504 cxxbridge1$str$ptr + 1678560
8   chromedriver                        0x0000000104440e6c cxxbridge1$str$ptr + 1697352
9   chromedriver                        0x0000000104421618 cxxbridge1$str$ptr

There was an error managing chromedriver (error decoding response body); using driver found in the cache


ProtocolError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [9]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# List of URLs
URL_LIST = ["https://www.redbus.in/online-booking/hrtc",                        
       "https://www.redbus.in/online-booking/kaac-transport",
       "https://www.redbus.in/online-booking/meghalaya-transport-corporation-mtc",
       "https://www.redbus.in/online-booking/sikkim-nationalised-transport-snt",
       "https://www.redbus.in/online-booking/astc",
       "https://www.redbus.in/online-booking/gsrtc",
       "https://www.redbus.in/online-booking/bihar-state-road-transport-corporation-bsrtc",
       "https://www.redbus.in/online-booking/ksrtc-kerala",
       "https://www.redbus.in/online-booking/west-bengal-transport-corporation",
       "https://www.redbus.in/online-booking/tsrtc"]
def initialize_driver():
    """Initialize and return a Selenium WebDriver."""
    driver = webdriver.Chrome()
    driver.maximize_window()
    return driver

def load_page(driver, url):
    """Load a page and wait for it to fully load."""
    driver.get(url)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
    time.sleep(2)  # Small pause to ensure full load

def find_total_pages(driver):
    """Find the total number of pages for the given URL."""
    try:
        pagination_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'DC_117_paginationTable')]//div[contains(@class, 'DC_117_pageTabs')]")
        if pagination_elements:
            return int(pagination_elements[-1].text)  # Last pagination number
        return 1
    except Exception as e:
        print(f"Error finding total pages: {e}")
        return 1

def scrape_bus_routes(driver):
    """Scrape bus route links and names."""
    route_elements = driver.find_elements(By.CLASS_NAME, 'route')
    bus_routes_link = [route.get_attribute('href') for route in route_elements]
    bus_routes_name = [route.text.strip() for route in route_elements]
    return bus_routes_link, bus_routes_name

def scrape_bus_details(driver, url, route_name):
    """Scrape bus details from the given route."""
    try:
        driver.get(url)
        time.sleep(5)

        try:
            # Wait for buses to load and scroll to the bottom
            view_buses_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, "button")))
            driver.execute_script("arguments[0].click();", view_buses_button)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)

            # Scraping bus details
            bus_name_elements = driver.find_elements(By.CLASS_NAME, "travels.lh-24.f-bold.d-color")
            bus_type_elements = driver.find_elements(By.CLASS_NAME, "bus-type.f-12.m-top-16.l-color.evBus")
            departing_time_elements = driver.find_elements(By.CLASS_NAME, "dp-time.f-19.d-color.f-bold")
            duration_elements = driver.find_elements(By.CLASS_NAME, "dur.l-color.lh-24")
            reaching_time_elements = driver.find_elements(By.CLASS_NAME, "bp-time.f-19.d-color.disp-Inline")
            star_rating_elements = driver.find_elements(By.XPATH, "//div[@class='rating-sec lh-24']")
            price_elements = driver.find_elements(By.CLASS_NAME, "fare.d-block")
            seat_availability_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'seat-left m-top-30') or contains(@class, 'seat-left m-top-16')]")

            bus_details = []
            for i in range(len(bus_name_elements)):
                bus_detail = {
                    "Route_Name": route_name,
                    "Route_Link": url,
                    "Bus_Name": bus_name_elements[i].text,
                    "Bus_Type": bus_type_elements[i].text,
                    "Departing_Time": departing_time_elements[i].text,
                    "Duration": duration_elements[i].text,
                    "Reaching_Time": reaching_time_elements[i].text,
                    "Star_Rating": star_rating_elements[i].text if i < len(star_rating_elements) else '0',
                    "Price": price_elements[i].text,
                    "Seat_Availability": seat_availability_elements[i].text if i < len(seat_availability_elements) else '0'
                }
                bus_details.append(bus_detail)
            return bus_details

        except Exception as e:
            print(f"Error occurred while scraping bus details for {url}: {str(e)}")
            return []

    except Exception as e:
        print(f"Error occurred while accessing {url}: {str(e)}")
        return []

def scrape_all_pages_for_url(driver, url):
    """Scrape all pages for the given URL."""
    all_bus_details = []
    load_page(driver, url)
    total_pages = find_total_pages(driver)

    for page in range(1, total_pages + 1):
        if page > 1:
            # Navigate to the next page
            pagination_tab = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, f"//div[contains(@class, 'DC_117_pageTabs')][text()='{page}']")))
            driver.execute_script("arguments[0].scrollIntoView();", pagination_tab)
            driver.execute_script("arguments[0].click();", pagination_tab)
            time.sleep(5)

        # Scrape bus routes and details
        all_bus_routes_link, all_bus_routes_name = scrape_bus_routes(driver)
        for link, name in zip(all_bus_routes_link, all_bus_routes_name):
            bus_details = scrape_bus_details(driver, link, name)
            if bus_details:
                all_bus_details.extend(bus_details)

    return all_bus_details



def scrape_multiple_urls_and_save():
    """Scrape bus details for multiple URLs and save to CSV."""
    driver = initialize_driver()
    
    try:
        for i, url in enumerate(URL_LIST):
            print(f"Scraping URL {i+1}/{len(URL_LIST)}: {url}")
            
            # Scrape bus details for the current URL
            all_bus_details = scrape_all_pages_for_url(driver, url)
            
            # Convert the list of dictionaries to a DataFrame
            df = pd.DataFrame(all_bus_details)
            
            # Save the DataFrame to a CSV file, with a unique name for each URL
            filename = f'red_bus_details_{i+1}.csv'
            df.to_csv(filename, index=False)
            print(f"Saved bus details to {filename}")
    
    finally:
        driver.quit()

# Start scraping
scrape_multiple_urls_and_save()


Scraping URL 1/10: https://www.redbus.in/online-booking/hrtc
Error occurred while scraping bus details for https://www.redbus.in/bus-tickets/delhi-to-chamba-himachal-pradesh: Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=129.0.6668.60)
Stacktrace:
0   chromedriver                        0x000000010283c274 cxxbridge1$str$ptr + 1907280
1   chromedriver                        0x000000010283475c cxxbridge1$str$ptr + 1875768
2   chromedriver                        0x0000000102448260 cxxbridge1$string$len + 89488
3   chromedriver                        0x00000001024320fc core::str::slice_error_fail::hbaf5d05fe3921cd2 + 63636
4   chromedriver                        0x000000010243203c core::str::slice_error_fail::hbaf5d05fe3921cd2 + 63444
5   chromedriver                        0x00000001024c6234 cxxbridge1$string$len + 605540
6   chromedriver                        0x000000010248112c c

WebDriverException: Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=129.0.6668.60)
Stacktrace:
0   chromedriver                        0x000000010283c274 cxxbridge1$str$ptr + 1907280
1   chromedriver                        0x000000010283475c cxxbridge1$str$ptr + 1875768
2   chromedriver                        0x0000000102448260 cxxbridge1$string$len + 89488
3   chromedriver                        0x00000001024320fc core::str::slice_error_fail::hbaf5d05fe3921cd2 + 63636
4   chromedriver                        0x000000010243203c core::str::slice_error_fail::hbaf5d05fe3921cd2 + 63444
5   chromedriver                        0x00000001024c6234 cxxbridge1$string$len + 605540
6   chromedriver                        0x000000010248112c cxxbridge1$string$len + 322652
7   chromedriver                        0x0000000102481d7c cxxbridge1$string$len + 325804
8   chromedriver                        0x0000000102804504 cxxbridge1$str$ptr + 1678560
9   chromedriver                        0x0000000102808e6c cxxbridge1$str$ptr + 1697352
10  chromedriver                        0x00000001027e9618 cxxbridge1$str$ptr + 1568244
11  chromedriver                        0x000000010280973c cxxbridge1$str$ptr + 1699608
12  chromedriver                        0x00000001027dabbc cxxbridge1$str$ptr + 1508248
13  chromedriver                        0x0000000102825854 cxxbridge1$str$ptr + 1814576
14  chromedriver                        0x00000001028259ac cxxbridge1$str$ptr + 1814920
15  chromedriver                        0x00000001028343fc cxxbridge1$str$ptr + 1874904
16  libsystem_pthread.dylib             0x00000001a1d8826c _pthread_start + 148
17  libsystem_pthread.dylib             0x00000001a1d8308c thread_start + 8


In [21]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException
import time
import pandas as pd

# List of URLs to scrape
URL_LIST = [
    "https://www.redbus.in/online-booking/hrtc",
    "https://www.redbus.in/online-booking/kaac-transport",
    "https://www.redbus.in/online-booking/meghalaya-transport-corporation-mtc",
    "https://www.redbus.in/online-booking/sikkim-nationalised-transport-snt",
    "https://www.redbus.in/online-booking/astc",
    "https://www.redbus.in/online-booking/gsrtc",
    "https://www.redbus.in/online-booking/bihar-state-road-transport-corporation-bsrtc",
    "https://www.redbus.in/online-booking/ksrtc-kerala",
    "https://www.redbus.in/online-booking/west-bengal-transport-corporation",
    "https://www.redbus.in/online-booking/tsrtc"
]

# Function to initialize the WebDriver
def initialize_driver():
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Use headless mode to avoid opening a browser window
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(options=options)
    driver.maximize_window()
    return driver

# Function to load a page
def load_page(driver, url):
    driver.get(url)
    time.sleep(5)  # Wait for the page to load

# Function to scrape bus routes
def scrape_bus_routes(driver):
    route_elements = driver.find_elements(By.CLASS_NAME, 'route')
    bus_routes_link = [route.get_attribute('href') for route in route_elements]
    bus_routes_name = [route.text.strip() for route in route_elements]
    return bus_routes_link, bus_routes_name

# Function to scrape bus details
def scrape_bus_details(driver, url, route_name):
    try:
        driver.get(url)
        time.sleep(5)  # Allow the page to load
        
        # Click the "View Buses" button if it exists
        try:
            view_buses_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "button"))
            )
            driver.execute_script("arguments[0].click();", view_buses_button)
            time.sleep(5)  # Wait for buses to load
            
            # Scroll down to load all bus items
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(5)  # Wait for the page to load more content

            # Find bus item details
            bus_name_elements = driver.find_elements(By.CLASS_NAME, "travels.lh-24.f-bold.d-color")
            bus_type_elements = driver.find_elements(By.CLASS_NAME, "bus-type.f-12.m-top-16.l-color.evBus")
            departing_time_elements = driver.find_elements(By.CLASS_NAME, "dp-time.f-19.d-color.f-bold")
            duration_elements = driver.find_elements(By.CLASS_NAME, "dur.l-color.lh-24")
            reaching_time_elements = driver.find_elements(By.CLASS_NAME, "bp-time.f-19.d-color.disp-Inline")
            star_rating_elements = driver.find_elements(By.XPATH, "//div[@class='rating-sec lh-24']")
            price_elements = driver.find_elements(By.CLASS_NAME, "fare.d-block")

            # Use XPath to handle both seat availability classes
            seat_availability_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'seat-left m-top-30') or contains(@class, 'seat-left m-top-16')]")

            bus_details = []
            for i in range(len(bus_name_elements)):
                bus_detail = {
                    "Route_Name": route_name,
                    "Route_Link": url,
                    "Bus_Name": bus_name_elements[i].text,
                    "Bus_Type": bus_type_elements[i].text,
                    "Departing_Time": departing_time_elements[i].text,
                    "Duration": duration_elements[i].text,
                    "Reaching_Time": reaching_time_elements[i].text,
                    "Star_Rating": star_rating_elements[i].text if i < len(star_rating_elements) else '0',
                    "Price": price_elements[i].text,
                    "Seat_Availability": seat_availability_elements[i].text if i < len(seat_availability_elements) else '0'
                }
                bus_details.append(bus_detail)
            return bus_details
        
        except Exception as e:
            print(f"Error occurred while scraping bus details for {url}: {str(e)}")
            return []

    except Exception as e:
        print(f"Error occurred while accessing {url}: {str(e)}")
        return []



# Function to scrape multiple pages if the bus routes are paginated
def scrape_all_pages_for_url(url):
    driver = initialize_driver()
    load_page(driver, url)

    # Scrape first page
    all_bus_routes_link, all_bus_routes_name = scrape_bus_routes(driver)
    all_bus_details = []

    # Scrape bus details for each route
    for link, name in zip(all_bus_routes_link, all_bus_routes_name):
        bus_details = scrape_bus_details(driver, link, name)
        if bus_details:
            all_bus_details.extend(bus_details)
    
    driver.quit()
    return all_bus_details

# Function to scrape from multiple URLs and save the results
def scrape_multiple_urls_and_save():
    for i, url in enumerate(URL_LIST):
        print(f"Scraping URL {i+1}/{len(URL_LIST)}: {url}")
        
        # Scrape bus details for the current URL
        all_bus_details = scrape_all_pages_for_url(url)
        
        # Convert the list of dictionaries to a DataFrame
        df = pd.DataFrame(all_bus_details)
        
        # Save the DataFrame to a CSV file, with a unique name for each URL
        filename = f'bus_det_final_csvfile_{i+1}.csv'
        df.to_csv(filename, index=False)
        
        print(f"Saved bus details to {filename}")

# Start the scraping process for all URLs
scrape_multiple_urls_and_save()


Scraping URL 1/10: https://www.redbus.in/online-booking/hrtc
Saved bus details to bus_det_final_csvfile_1.csv
Scraping URL 2/10: https://www.redbus.in/online-booking/kaac-transport
Saved bus details to bus_det_final_csvfile_2.csv
Scraping URL 3/10: https://www.redbus.in/online-booking/meghalaya-transport-corporation-mtc
Saved bus details to bus_det_final_csvfile_3.csv
Scraping URL 4/10: https://www.redbus.in/online-booking/sikkim-nationalised-transport-snt
Saved bus details to bus_det_final_csvfile_4.csv
Scraping URL 5/10: https://www.redbus.in/online-booking/astc
Saved bus details to bus_det_final_csvfile_5.csv
Scraping URL 6/10: https://www.redbus.in/online-booking/gsrtc
Saved bus details to bus_det_final_csvfile_6.csv
Scraping URL 7/10: https://www.redbus.in/online-booking/bihar-state-road-transport-corporation-bsrtc
Saved bus details to bus_det_final_csvfile_7.csv
Scraping URL 8/10: https://www.redbus.in/online-booking/ksrtc-kerala
Saved bus details to bus_det_final_csvfile_8.csv
S