### Trial 1: Automating only one bus operator and one route to obtain all data.

In [49]:
### TRIAL 3 - Scraping one route of a single transport operator (APSRTC): Vijayawada to Hyderabad.


from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.common.keys import Keys
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Start a Chrome session, load the redBus home page and maximise the window
driver = webdriver.Chrome()
driver.get('https://www.redbus.in/')
driver.maximize_window()

# Fixed pause while the home page loads
time.sleep(5)

# Send one PAGE_DOWN to the page body before looking for the tile below
# (presumably to bring it into view), then pause again
driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN)
time.sleep(2)

# Click the first div whose class attribute is exactly 'rtcName'
rtc_tile = driver.find_element(By.XPATH, "//div[@class='rtcName']")
rtc_tile.click()

# Fixed pause while the page opened by the click loads
time.sleep(5)

def scrape_hrefs():
    """Collect the route links on the page currently loaded in ``driver``.

    Finds every ``<a>`` nested inside a ``div`` whose class contains
    ``route_details``, prints how many were found, then prints each
    href/text pair.

    Returns:
        A ``(hrefs, route_names)`` tuple of two equally long lists holding
        the ``href`` attribute and the text of each link, in the order
        ``find_elements`` returned them.
    """
    anchors = driver.find_elements(By.XPATH, "//div[contains(@class, 'route_details')]//a")
    print(f"Found {len(anchors)} links")

    hrefs = []
    for anchor in anchors:
        hrefs.append(anchor.get_attribute('href'))

    route_names = []
    for anchor in anchors:
        route_names.append(anchor.text)

    for index, href in enumerate(hrefs):
        print(f"Scraped href: {href}, Route: {route_names[index]}")

    return hrefs, route_names

# Accumulators for the links and route names gathered from every results page
all_hrefs, all_route_names = [], []

# Page down twice on the first results page before reading it
for _ in range(2):
    driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN)
    time.sleep(2)  # pause after each scroll

# Results page 1
hrefs, route_names = scrape_hrefs()
all_hrefs += hrefs
all_route_names += route_names

# Results pages 2 through 5: click the numbered pagination tab, wait, scrape
for page_num in range(2, 6):
    driver.find_element(
        By.XPATH,
        f"//div[contains(@class,'DC_117_pageTabs') and text()='{page_num}']",
    ).click()

    # Fixed pause while the selected page loads
    time.sleep(5)

    hrefs, route_names = scrape_hrefs()
    all_hrefs += hrefs
    all_route_names += route_names

# Only the first scraped route is processed in this trial run
first_href = all_hrefs[0]
first_route = all_route_names[0]

driver.get(first_href)

# Fixed pause while the route page loads
time.sleep(8)

# Click every div with class 'button' found on the page. Each click waits for
# its own element to become clickable, and a failed click is reported instead
# of aborting the whole run (the all-routes cells of this notebook guard the
# click the same way).
click_buttons = driver.find_elements(By.XPATH, "//div[@class='button']")
for button in click_buttons:
    try:
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(button)).click()
    except Exception as e:
        print(f"Could not click button: {e}")

time.sleep(2)

# Page down until the page source has stayed unchanged for
# max_scroll_attempts consecutive scrolls; any change resets the count.
max_scroll_attempts = 20
scroll_attempts = 0

while scroll_attempts < max_scroll_attempts:
    old_page_source = driver.page_source

    body = driver.find_element(By.TAG_NAME, "body")
    body.send_keys(Keys.PAGE_DOWN)

    # Fixed pause so dynamically loaded content can arrive
    time.sleep(3)

    if driver.page_source == old_page_source:
        scroll_attempts += 1
    else:
        scroll_attempts = 0  # new content appeared, start counting again

def extract_text(elements):
    """Return the ``text`` attribute of every item in *elements*, in order."""
    texts = []
    for element in elements:
        texts.append(element.text)
    return texts

# Collecting data: each column is the text of every element that matches a
# fixed XPath on the loaded route page.
def _texts_at(xpath):
    """Return the text of every element currently matching *xpath*."""
    return extract_text(driver.find_elements(By.XPATH, xpath))


bus_names = _texts_at("//div[@class='travels lh-24 f-bold d-color']")
bus_types = _texts_at("//div[@class='bus-type f-12 m-top-16 l-color evBus']")
departure_times = _texts_at("//div[@class='dp-time f-19 d-color f-bold']")
durations = _texts_at("//div[@class='dur l-color lh-24']")
reaching_times = _texts_at("//div[@class='bp-time f-19 d-color disp-Inline']")
prices = _texts_at("//div[@class='fare d-block']")

# Ratings: an element with empty text is recorded as 'new bus'
ratings = []
for element in driver.find_elements(By.XPATH, "//div[@class='rating-sec lh-24']"):
    ratings.append('new bus' if element.text == '' else element.text)

# When fewer rating elements than bus names were found, top the list up with
# 'new bus' so it is at least as long as bus_names
missing_ratings = len(bus_names) - len(ratings)
if missing_ratings > 0:
    ratings += ['new bus'] * missing_ratings

seat_availability = _texts_at("//div[@class='seat-left m-top-16' or @class='seat-left m-top-30']")

# Length of the longest column; the others are padded up to it further down
max_length = max(map(len, (
    bus_names, bus_types, departure_times, durations,
    reaching_times, ratings, prices, seat_availability,
)))

def pad_list(lst, max_length):
    """Return a new list: *lst* followed by ``''`` entries up to *max_length*.

    When *lst* already has *max_length* or more items, an unpadded copy is
    returned.
    """
    shortfall = max_length - len(lst)
    filler = [''] * shortfall  # a non-positive shortfall yields an empty filler
    return lst + filler

# Bring every column up to max_length so they can share one DataFrame
(bus_names, bus_types, departure_times, durations,
 reaching_times, ratings, prices, seat_availability) = [
    pad_list(column, max_length)
    for column in (bus_names, bus_types, departure_times, durations,
                   reaching_times, ratings, prices, seat_availability)
]

# One row per scraped bus; the route name and link repeat on every row
route_columns = {
    'Route Name': [first_route] * max_length,
    'Route Link': [first_href] * max_length,
    'Bus Name': bus_names,
    'Bus Type': bus_types,
    'Departure Time': departure_times,
    'Duration': durations,
    'Reaching Time': reaching_times,
    'Bus Rating': ratings,
    'Price': prices,
    'Seat Availability': seat_availability,
}
vij_hyd_route_df = pd.DataFrame(route_columns)

# Print the result for a visual check
print(vij_hyd_route_df)

# End the browser session
driver.quit()


Found 10 links
Scraped href: https://www.redbus.in/bus-tickets/vijayawada-to-hyderabad, Route: Vijayawada to Hyderabad
Scraped href: https://www.redbus.in/bus-tickets/hyderabad-to-vijayawada, Route: Hyderabad to Vijayawada
Scraped href: https://www.redbus.in/bus-tickets/kakinada-to-visakhapatnam, Route: Kakinada to Visakhapatnam
Scraped href: https://www.redbus.in/bus-tickets/visakhapatnam-to-kakinada, Route: Visakhapatnam to Kakinada
Scraped href: https://www.redbus.in/bus-tickets/chittoor-andhra-pradesh-to-bangalore, Route: Chittoor (Andhra Pradesh) to Bangalore
Scraped href: https://www.redbus.in/bus-tickets/kadapa-to-bangalore, Route: Kadapa to Bangalore
Scraped href: https://www.redbus.in/bus-tickets/ananthapur-to-bangalore, Route: Anantapur (andhra pradesh) to Bangalore
Scraped href: https://www.redbus.in/bus-tickets/tirupathi-to-bangalore, Route: Tirupati to Bangalore
Scraped href: https://www.redbus.in/bus-tickets/visakhapatnam-to-vijayawada, Route: Visakhapatnam to Vijayawada


In [50]:
vij_hyd_route_df

Unnamed: 0,Route Name,Route Link,Bus Name,Bus Type,Departure Time,Duration,Reaching Time,Bus Rating,Price,Seat Availability
0,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...,TSRTC - 9887,RAJDHANI (A.C. Semi Sleeper),00:10,06h 10m,06:20,2.8,INR 480,37 Seats available
1,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...,TSRTC - 31362,RAJDHANI (A.C. Semi Sleeper),00:30,05h 30m,06:00,3.2,INR 480,34 Seats available
2,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...,TSRTC - 42162,"SUPER LUXURY (NON-AC, 2 + 2 PUSH BACK)",01:10,06h 10m,07:20,4.1,INR 396,32 Seats available
3,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...,TSRTC - 30710,RAJDHANI (A.C. Semi Sleeper),01:20,05h 40m,07:00,3.7,INR 480,37 Seats available
4,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...,TSRTC - 41510,"SUPER LUXURY (NON-AC, 2 + 2 PUSH BACK)",01:30,05h 40m,07:10,4.0,INR 396,33 Seats available
...,...,...,...,...,...,...,...,...,...,...
355,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...,Apple Travels,Non A/C Seater / Sleeper (2+1),22:45,07h 20m,06:05,new bus,INR 6000,46 Seats available
356,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...,SVR Tours and Travels,A/C Seater / Sleeper (2+1),02:00,04h 50m,06:50,new bus,INR 590,10 Seats available
357,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...,Surya Dream Liner,NON A/C Sleeper (2+1),23:40,06h 20m,06:00,new bus,INR 3000,30 Seats available
358,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...,Elegance Tours And Travels Pvt Ltd,Bharat Benz A/C Sleeper (2+1),04:00,05h 40m,09:40,new bus,INR 2999,29 Seats available


## TRIAL 1 - FOR ALL THE HREFS (ROUTE LINKS) IN APSRTC

In [4]:
# TRIAL 1 : Clicking issue on the Ongole to Hyderabad page
# TRAIL 2 SUCCESS

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Start a Chrome session, load the redBus home page and maximise the window
driver = webdriver.Chrome()
driver.get('https://www.redbus.in/')
driver.maximize_window()

# Fixed pause while the home page loads
time.sleep(5)

# Send a single PAGE_DOWN to the page body, then pause
page_body = driver.find_element(By.TAG_NAME, "body")
page_body.send_keys(Keys.PAGE_DOWN)
time.sleep(2)

# Click the first div whose class attribute is exactly 'rtcName'
button_xpath = "//div[@class='rtcName']"
driver.find_element(By.XPATH, button_xpath).click()

# Fixed pause while the page opened by the click loads
time.sleep(5)

def scrape_hrefs():
    """Read the route links from the page currently shown by ``driver``.

    Every ``<a>`` below a ``div`` whose class contains ``route_details`` is
    collected. The number of links and each href/text pair are printed.

    Returns:
        ``(hrefs, route_names)``: two lists of equal length with the
        ``href`` attribute and the text of each link.
    """
    links = driver.find_elements(By.XPATH, "//div[contains(@class, 'route_details')]//a")
    link_count = len(links)
    print(f"Found {link_count} links")

    hrefs = list(map(lambda link: link.get_attribute('href'), links))
    route_names = list(map(lambda link: link.text, links))

    for pair in zip(hrefs, route_names):
        href, route = pair
        print(f"Scraped href: {href}, Route: {route}")

    return hrefs, route_names

# Accumulators for the links and route names gathered from every results page
all_hrefs, all_route_names = [], []

# Page down twice on the first results page before reading it
for _ in range(2):
    driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN)
    time.sleep(2)  # pause after each scroll

# Results page 1
hrefs, route_names = scrape_hrefs()
all_hrefs += hrefs
all_route_names += route_names

# Results pages 2 through 5: click the numbered pagination tab, wait, scrape
for page_num in (2, 3, 4, 5):
    tab_xpath = f"//div[contains(@class,'DC_117_pageTabs') and text()='{page_num}']"
    driver.find_element(By.XPATH, tab_xpath).click()

    # Fixed pause while the selected page loads
    time.sleep(5)

    hrefs, route_names = scrape_hrefs()
    all_hrefs += hrefs
    all_route_names += route_names

# Empty frame that the per-route results are concatenated onto
APSRTC = pd.DataFrame()

def extract_text_with_default(elements, default_value='new bus'):
    """Return the text of each element, substituting a default for empty text.

    Args:
        elements: Iterable of objects exposing a ``text`` attribute (the
            callers in this cell pass the result of ``driver.find_elements``).
        default_value: Value used in place of an empty ``text``.

    Returns:
        A list with one entry per element, in iteration order.
    """
    texts = []
    for element in elements:
        # Read .text once per element: for a Selenium WebElement every
        # access is a separate request to the browser.
        text = element.text
        texts.append(text if text else default_value)
    return texts

def pad_list(lst, max_length):
    """Return a copy of *lst* extended with ``''`` until it has *max_length* items.

    A list that is already long enough is returned as an unpadded copy.
    """
    return lst + ['' for _ in range(max_length - len(lst))]

# XPath of each scraped field, keyed by its output column name and listed in
# the column order of the resulting DataFrame.
FIELD_XPATHS = {
    'Bus Name': "//div[@class='travels lh-24 f-bold d-color']",
    'Bus Type': "//div[@class='bus-type f-12 m-top-16 l-color evBus']",
    'Departure Time': "//div[@class='dp-time f-19 d-color f-bold']",
    'Duration': "//div[@class='dur l-color lh-24']",
    'Reaching Time': "//div[@class='bp-time f-19 d-color disp-Inline']",
    'Bus Rating': "//div[@class='rating-sec lh-24']",
    'Price': "//div[@class='fare d-block']",
    'Seat Availability': "//div[@class='seat-left m-top-16' or @class='seat-left m-top-30']",
}

# Stop scrolling after this many consecutive page-downs that leave the page
# source unchanged
max_scroll_attempts = 5

# One DataFrame per route, concatenated once after the loop. The list stays
# available for inspection if a route fails part-way through.
route_frames = []

try:
    # Visit every collected route link and scrape its bus listing
    for href, route in zip(all_hrefs, all_route_names):
        driver.get(href)

        # Fixed pause while the route page loads
        time.sleep(8)

        # Click every div with class 'button'. The wait targets the element
        # about to be clicked (waiting on the locator would only check the
        # first match); a failed click is reported and skipped.
        for button in driver.find_elements(By.XPATH, "//div[@class='button']"):
            try:
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(button)).click()
            except Exception as e:
                print(f"Could not click button: {e}")

        time.sleep(2)

        # Page down until the page source has stayed unchanged for
        # max_scroll_attempts consecutive scrolls; any change resets the count.
        scroll_attempts = 0
        while scroll_attempts < max_scroll_attempts:
            old_page_source = driver.page_source
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(3)  # fixed pause so dynamic content can arrive
            if driver.page_source == old_page_source:
                scroll_attempts += 1
            else:
                scroll_attempts = 0

        # Scrape every field as its own page-wide list of texts
        columns = {
            label: extract_text_with_default(driver.find_elements(By.XPATH, xpath))
            for label, xpath in FIELD_XPATHS.items()
        }

        # When fewer rating elements than bus names were found, top the
        # ratings up with 'new bus'
        ratings = columns['Bus Rating']
        missing_ratings = len(columns['Bus Name']) - len(ratings)
        if missing_ratings > 0:
            ratings.extend(['new bus'] * missing_ratings)

        # NOTE: the fields are scraped independently, so padding only makes
        # the lengths equal; it cannot tell which bus a missing value
        # belonged to.
        max_length = max(len(values) for values in columns.values())

        route_frames.append(pd.DataFrame({
            'Route Name': [route] * max_length,
            'Route Link': [href] * max_length,
            **{label: pad_list(values, max_length) for label, values in columns.items()},
        }))
finally:
    # Always end the browser session, even when a route raised
    driver.quit()

# Append all per-route frames to the main DataFrame in a single concat
APSRTC = pd.concat([APSRTC, *route_frames], ignore_index=True)

# Print the result for a visual check
print(APSRTC)

# Save the DataFrame to a CSV file
APSRTC.to_csv('APSRTC.csv', index=False)


Found 10 links
Scraped href: https://www.redbus.in/bus-tickets/vijayawada-to-hyderabad, Route: Vijayawada to Hyderabad
Scraped href: https://www.redbus.in/bus-tickets/hyderabad-to-vijayawada, Route: Hyderabad to Vijayawada
Scraped href: https://www.redbus.in/bus-tickets/kakinada-to-visakhapatnam, Route: Kakinada to Visakhapatnam
Scraped href: https://www.redbus.in/bus-tickets/visakhapatnam-to-kakinada, Route: Visakhapatnam to Kakinada
Scraped href: https://www.redbus.in/bus-tickets/chittoor-andhra-pradesh-to-bangalore, Route: Chittoor (Andhra Pradesh) to Bangalore
Scraped href: https://www.redbus.in/bus-tickets/kadapa-to-bangalore, Route: Kadapa to Bangalore
Scraped href: https://www.redbus.in/bus-tickets/ananthapur-to-bangalore, Route: Anantapur (andhra pradesh) to Bangalore
Scraped href: https://www.redbus.in/bus-tickets/tirupathi-to-bangalore, Route: Tirupati to Bangalore
Scraped href: https://www.redbus.in/bus-tickets/visakhapatnam-to-vijayawada, Route: Visakhapatnam to Vijayawada


In [27]:
# Trial 2 : Kerala 
# SUCCESS
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Listing page this cell starts from
start_url = 'https://www.redbus.in/online-booking/ksrtc-kerala/?utm_source=rtchometile'

# Start a Chrome session, load the listing page and maximise the window
driver = webdriver.Chrome()
driver.get(start_url)
driver.maximize_window()

# Fixed pause while the page loads
time.sleep(5)

def scrape_hrefs():
    """Gather the route links from the page ``driver`` currently displays.

    Targets every ``<a>`` inside a ``div`` whose class contains
    ``route_details``; prints the link count and one line per link.

    Returns:
        A pair ``(hrefs, route_names)`` of same-length lists: each link's
        ``href`` attribute and each link's text.
    """
    route_links = driver.find_elements(By.XPATH, "//div[contains(@class, 'route_details')]//a")
    print(f"Found {len(route_links)} links")

    hrefs, route_names = [], []
    for route_link in route_links:
        hrefs.append(route_link.get_attribute('href'))
    for route_link in route_links:
        route_names.append(route_link.text)

    position = 0
    while position < len(hrefs):
        print(f"Scraped href: {hrefs[position]}, Route: {route_names[position]}")
        position += 1

    return hrefs, route_names

# Accumulators for the links and route names gathered from both results pages
all_hrefs = []
all_route_names = []

# Page down twice on the first results page before reading it
for _ in range(2):
    body = driver.find_element(By.TAG_NAME, "body")
    body.send_keys(Keys.PAGE_DOWN)
    time.sleep(2)  # pause after each scroll

# Results page 1
hrefs, route_names = scrape_hrefs()
all_hrefs.extend(hrefs)
all_route_names.extend(route_names)

# Click the pagination tab labelled '2' to open the second results page
page_button_xpath = "//div[contains(@class,'DC_117_pageTabs') and text()='2']"
page_button = driver.find_element(By.XPATH, page_button_xpath)
page_button.click()

# Fixed pause while the second page loads
time.sleep(5)

# Results page 2
hrefs, route_names = scrape_hrefs()
all_hrefs.extend(hrefs)
all_route_names.extend(route_names)

# The rest of this cell builds, prints and saves its result under the name
# KERALA_RTC, so that is the name initialised here.
KERALA_RTC = pd.DataFrame()

def extract_text_with_default(elements, default_value='new bus'):
    """Return each element's text, with *default_value* replacing empty text.

    Args:
        elements: Iterable of objects exposing a ``text`` attribute (the
            callers in this cell pass the result of ``driver.find_elements``).
        default_value: Replacement used when an element's ``text`` is empty.

    Returns:
        A list with one entry per element, in iteration order.
    """
    # `or` evaluates element.text exactly once; for a Selenium WebElement
    # each access is a separate request to the browser.
    return [element.text or default_value for element in elements]

def pad_list(lst, max_length):
    """Return a new list made of *lst* plus ``''`` fillers up to *max_length*.

    No filler is added when *lst* already has at least *max_length* items.
    """
    fillers_needed = max(max_length - len(lst), 0)
    return lst + [''] * fillers_needed

# XPath of each scraped field, keyed by its output column name and listed in
# the column order of the resulting DataFrame.
FIELD_XPATHS = {
    'Bus Name': "//div[@class='travels lh-24 f-bold d-color']",
    'Bus Type': "//div[@class='bus-type f-12 m-top-16 l-color evBus']",
    'Departure Time': "//div[@class='dp-time f-19 d-color f-bold']",
    'Duration': "//div[@class='dur l-color lh-24']",
    'Reaching Time': "//div[@class='bp-time f-19 d-color disp-Inline']",
    'Bus Rating': "//div[@class='rating-sec lh-24']",
    'Price': "//div[@class='fare d-block']",
    'Seat Availability': "//div[@class='seat-left m-top-16' or @class='seat-left m-top-30']",
}

# Stop scrolling after this many consecutive page-downs that leave the page
# source unchanged
max_scroll_attempts = 5

# One DataFrame per route, concatenated once after the loop. The list stays
# available for inspection if a route fails part-way through.
route_frames = []

try:
    # Visit every collected route link and scrape its bus listing
    for href, route in zip(all_hrefs, all_route_names):
        driver.get(href)

        # Fixed pause while the route page loads
        time.sleep(8)

        # Click every div with class 'button' once it is clickable; a failed
        # click is reported and skipped.
        for button in driver.find_elements(By.XPATH, "//div[@class='button']"):
            try:
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(button)).click()
            except Exception as e:
                print(f"Could not click button: {e}")

        time.sleep(2)

        # Page down until the page source has stayed unchanged for
        # max_scroll_attempts consecutive scrolls; any change resets the count.
        scroll_attempts = 0
        while scroll_attempts < max_scroll_attempts:
            old_page_source = driver.page_source
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(3)  # fixed pause so dynamic content can arrive
            if driver.page_source == old_page_source:
                scroll_attempts += 1
            else:
                scroll_attempts = 0

        # Scrape every field as its own page-wide list of texts
        columns = {
            label: extract_text_with_default(driver.find_elements(By.XPATH, xpath))
            for label, xpath in FIELD_XPATHS.items()
        }

        # When fewer rating elements than bus names were found, top the
        # ratings up with 'new bus'
        ratings = columns['Bus Rating']
        missing_ratings = len(columns['Bus Name']) - len(ratings)
        if missing_ratings > 0:
            ratings.extend(['new bus'] * missing_ratings)

        # NOTE: the fields are scraped independently, so padding only makes
        # the lengths equal; it cannot tell which bus a missing value
        # belonged to.
        max_length = max(len(values) for values in columns.values())

        route_frames.append(pd.DataFrame({
            'Route Name': [route] * max_length,
            'Route Link': [href] * max_length,
            **{label: pad_list(values, max_length) for label, values in columns.items()},
        }))
finally:
    # Always end the browser session, even when a route raised
    driver.quit()

# Built from the per-route frames alone, so this does not rely on KERALA_RTC
# having been defined before the loop.
KERALA_RTC = pd.concat(route_frames, ignore_index=True) if route_frames else pd.DataFrame()

# Print the result for a visual check
print(KERALA_RTC)

# Save the DataFrame to a CSV file
KERALA_RTC.to_csv('KERALA_RTC.csv', index=False)


Found 10 links
Scraped href: https://www.redbus.in/bus-tickets/bangalore-to-kozhikode, Route: Bangalore to Kozhikode
Scraped href: https://www.redbus.in/bus-tickets/kozhikode-to-ernakulam, Route: Kozhikode to Ernakulam
Scraped href: https://www.redbus.in/bus-tickets/kozhikode-to-bangalore, Route: Kozhikode to Bangalore
Scraped href: https://www.redbus.in/bus-tickets/ernakulam-to-kozhikode, Route: Ernakulam to Kozhikode
Scraped href: https://www.redbus.in/bus-tickets/kozhikode-to-mysore, Route: Kozhikode to Mysore
Scraped href: https://www.redbus.in/bus-tickets/kozhikode-to-thiruvananthapuram, Route: Kozhikode to Thiruvananthapuram
Scraped href: https://www.redbus.in/bus-tickets/bangalore-to-kalpetta, Route: Bangalore to Kalpetta (kerala)
Scraped href: https://www.redbus.in/bus-tickets/mysore-to-kozhikode, Route: Mysore to Kozhikode
Scraped href: https://www.redbus.in/bus-tickets/kalpetta-to-bangalore, Route: Kalpetta (kerala) to Bangalore
Scraped href: https://www.redbus.in/bus-tickets

In [7]:
# Trial 2 : GOA KADAMBA
# SUCCESS
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Listing page this cell starts from
start_url = 'https://www.redbus.in/online-booking/ktcl/?utm_source=rtchometile'

# Start a Chrome session, load the listing page and maximise the window
driver = webdriver.Chrome()
driver.get(start_url)
driver.maximize_window()

def scrape_hrefs():
    """Collect the route links on the page currently loaded in ``driver``.

    Finds every ``<a>`` nested inside a ``div`` whose class contains
    ``route_details``, prints how many were found, then prints each
    href/text pair.

    Returns:
        A ``(hrefs, route_names)`` tuple of two equally long lists holding
        the ``href`` attribute and the text of each link.
    """
    anchors = driver.find_elements(By.XPATH, "//div[contains(@class, 'route_details')]//a")
    print(f"Found {len(anchors)} links")

    hrefs = []
    for anchor in anchors:
        hrefs.append(anchor.get_attribute('href'))

    route_names = []
    for anchor in anchors:
        route_names.append(anchor.text)

    for index, href in enumerate(hrefs):
        print(f"Scraped href: {href}, Route: {route_names[index]}")

    return hrefs, route_names

# Accumulators for the links and route names gathered from every results page
all_hrefs, all_route_names = [], []

# Page down three times on the first results page before reading it
for _ in range(3):
    driver.find_element(By.TAG_NAME, "body").send_keys(Keys.PAGE_DOWN)
    time.sleep(3)  # pause after each scroll

# Results page 1
hrefs, route_names = scrape_hrefs()
all_hrefs += hrefs
all_route_names += route_names

# Results pages 2 through 4: click the numbered pagination tab, wait, scrape
for page_num in (2, 3, 4):
    tab_xpath = f"//div[contains(@class,'DC_117_pageTabs') and text()='{page_num}']"
    driver.find_element(By.XPATH, tab_xpath).click()

    # Fixed pause while the selected page loads
    time.sleep(5)

    hrefs, route_names = scrape_hrefs()
    all_hrefs += hrefs
    all_route_names += route_names

# Empty frame that the per-route results are concatenated onto
Kadamba = pd.DataFrame()

def extract_text_with_default(elements, default_value='new bus'):
    """Return the text of each element, substituting a default for empty text.

    Args:
        elements: Iterable of objects exposing a ``text`` attribute (the
            callers in this cell pass the result of ``driver.find_elements``).
        default_value: Value used in place of an empty ``text``.

    Returns:
        A list with one entry per element, in iteration order.
    """
    texts = []
    for element in elements:
        # Read .text once per element: for a Selenium WebElement every
        # access is a separate request to the browser.
        text = element.text
        texts.append(text if text else default_value)
    return texts

def pad_list(lst, max_length):
    """Return a new list: *lst* followed by ``''`` entries up to *max_length*.

    When *lst* already has *max_length* or more items, an unpadded copy is
    returned.
    """
    shortfall = max_length - len(lst)
    filler = [''] * shortfall  # a non-positive shortfall yields an empty filler
    return lst + filler

# XPath of each scraped field, keyed by its output column name and listed in
# the column order of the resulting DataFrame.
FIELD_XPATHS = {
    'Bus Name': "//div[@class='travels lh-24 f-bold d-color']",
    'Bus Type': "//div[@class='bus-type f-12 m-top-16 l-color evBus']",
    'Departure Time': "//div[@class='dp-time f-19 d-color f-bold']",
    'Duration': "//div[@class='dur l-color lh-24']",
    'Reaching Time': "//div[@class='bp-time f-19 d-color disp-Inline']",
    'Bus Rating': "//div[@class='rating-sec lh-24']",
    'Price': "//div[@class='fare d-block']",
    'Seat Availability': "//div[@class='seat-left m-top-16' or @class='seat-left m-top-30']",
}

# Stop scrolling after this many consecutive page-downs that leave the page
# source unchanged
max_scroll_attempts = 5

# One DataFrame per route, concatenated once after the loop. The list stays
# available for inspection if a route fails part-way through.
route_frames = []

try:
    # Visit every collected route link and scrape its bus listing
    for href, route in zip(all_hrefs, all_route_names):
        driver.get(href)

        # Fixed pause while the route page loads
        time.sleep(8)

        # Click every div with class 'button' once it is clickable; a failed
        # click is reported and skipped.
        for button in driver.find_elements(By.XPATH, "//div[@class='button']"):
            try:
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(button)).click()
            except Exception as e:
                print(f"Could not click button: {e}")

        time.sleep(2)

        # Page down until the page source has stayed unchanged for
        # max_scroll_attempts consecutive scrolls; any change resets the count.
        scroll_attempts = 0
        while scroll_attempts < max_scroll_attempts:
            old_page_source = driver.page_source
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(3)  # fixed pause so dynamic content can arrive
            if driver.page_source == old_page_source:
                scroll_attempts += 1
            else:
                scroll_attempts = 0

        # Scrape every field as its own page-wide list of texts
        columns = {
            label: extract_text_with_default(driver.find_elements(By.XPATH, xpath))
            for label, xpath in FIELD_XPATHS.items()
        }

        # When fewer rating elements than bus names were found, top the
        # ratings up with 'new bus'
        ratings = columns['Bus Rating']
        missing_ratings = len(columns['Bus Name']) - len(ratings)
        if missing_ratings > 0:
            ratings.extend(['new bus'] * missing_ratings)

        # NOTE: the fields are scraped independently, so padding only makes
        # the lengths equal; it cannot tell which bus a missing value
        # belonged to.
        max_length = max(len(values) for values in columns.values())

        route_frames.append(pd.DataFrame({
            'Route Name': [route] * max_length,
            'Route Link': [href] * max_length,
            **{label: pad_list(values, max_length) for label, values in columns.items()},
        }))
finally:
    # Always end the browser session, even when a route raised
    driver.quit()

# Append all per-route frames to the main DataFrame in a single concat
Kadamba = pd.concat([Kadamba, *route_frames], ignore_index=True)

# Print the result for a visual check
print(Kadamba)

# Save the DataFrame to a CSV file
Kadamba.to_csv('KTCL.csv', index=False)


Found 10 links
Scraped href: https://www.redbus.in/bus-tickets/pune-to-goa, Route: Pune to Goa
Scraped href: https://www.redbus.in/bus-tickets/goa-to-pune, Route: Goa to Pune
Scraped href: https://www.redbus.in/bus-tickets/mumbai-to-goa, Route: Mumbai to Goa
Scraped href: https://www.redbus.in/bus-tickets/goa-to-mumbai, Route: Goa to Mumbai
Scraped href: https://www.redbus.in/bus-tickets/pandharpur-to-goa, Route: Pandharpur to Goa
Scraped href: https://www.redbus.in/bus-tickets/bangalore-to-goa, Route: Bangalore to Goa
Scraped href: https://www.redbus.in/bus-tickets/goa-to-pandharpur, Route: Goa to Pandharpur
Scraped href: https://www.redbus.in/bus-tickets/belagavi-to-goa, Route: Belagavi to Goa
Scraped href: https://www.redbus.in/bus-tickets/goa-to-bangalore, Route: Goa to Bangalore
Scraped href: https://www.redbus.in/bus-tickets/solapur-to-goa, Route: Solapur to Goa
Found 10 links
Scraped href: https://www.redbus.in/bus-tickets/goa-to-kolhapur-maharashtra, Route: Goa to Kolhapur(Maha

In [9]:
# Trial 1 : RSRTC
# SUCCESS
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Start a Chrome browser session; `driver` is the shared handle that every
# later step in this cell uses
driver = webdriver.Chrome()

# Load the redBus RSRTC operator page, which the route links are scraped from
driver.get('https://www.redbus.in/online-booking/rsrtc/?utm_source=rtchometile')

# Maximize the browser window
driver.maximize_window()

# Function to scrape hrefs on the current page
def scrape_hrefs():
    """Collect the route links on the page currently loaded in ``driver``.

    Prints how many anchors were found plus one line per link, and returns
    ``(hrefs, route_names)`` as two parallel lists in document order.
    """
    anchors = driver.find_elements(
        By.XPATH, "//div[contains(@class, 'route_details')]//a"
    )
    link_count = len(anchors)
    print(f"Found {link_count} links")

    # Read every href first, then every visible label
    hrefs = []
    for anchor in anchors:
        hrefs.append(anchor.get_attribute('href'))
    route_names = []
    for anchor in anchors:
        route_names.append(anchor.text)

    for position, href in enumerate(hrefs):
        route = route_names[position]
        print(f"Scraped href: {href}, Route: {route}")
    return hrefs, route_names

# Accumulators for every route link and route name across all result pages
all_hrefs = []
all_route_names = []

# Page down three times on the first page only (presumably to bring the
# route list and pagination tabs into view -- confirm against the page)
for _ in range(3):
    body = driver.find_element(By.TAG_NAME, "body")
    body.send_keys(Keys.PAGE_DOWN)
    time.sleep(3)  # Wait for the page to load new content

# Scrape hrefs on the first page
hrefs, route_names = scrape_hrefs()
all_hrefs.extend(hrefs)
all_route_names.extend(route_names)

# Visit result pages 2 and 3 through their numbered pagination tabs
for page_num in range(2, 4):
    page_button_xpath = f"//div[contains(@class,'DC_117_pageTabs') and text()='{page_num}']"
    # find_elements returns an empty list instead of raising, so an operator
    # with fewer pages ends pagination cleanly rather than aborting the run
    page_buttons = driver.find_elements(By.XPATH, page_button_xpath)
    if not page_buttons:
        print(f"No tab found for page {page_num}; stopping pagination")
        break
    page_buttons[0].click()

    # Fixed pause for the new page to load
    time.sleep(5)

    # Scrape hrefs on the current page
    hrefs, route_names = scrape_hrefs()
    all_hrefs.extend(hrefs)
    all_route_names.extend(route_names)

# Initialize an empty DataFrame for all routes
RSRTC = pd.DataFrame()

# Function to extract text and handle missing ratings
def extract_text_with_default(elements, default_value='new bus'):
    """Return the text of each element, with blanks replaced by a default.

    Args:
        elements: iterable of objects exposing a ``text`` attribute.
        default_value: value used when an element's text is empty.

    Returns:
        A list with one entry per element, in the same order.
    """
    # Read .text exactly once per element: on a Selenium WebElement each
    # access is a separate WebDriver call, so reading it twice doubles the
    # round trips and lets the tested and returned values differ.
    return [element.text or default_value for element in elements]

# Function to pad lists to the maximum length
def pad_list(lst, max_length):
    """Return a new list: ``lst`` followed by '' entries up to ``max_length``.

    A list already at or beyond ``max_length`` comes back as an unpadded copy.
    """
    missing = max_length - len(lst)
    filler = [''] * missing  # a non-positive count produces no filler
    return lst + filler

# Loop through each href and extract data.
# Per-route frames are gathered in a list and concatenated once after the
# loop, instead of re-copying the growing DataFrame on every iteration.
route_frames = []
try:
    for href, route in zip(all_hrefs, all_route_names):
        driver.get(href)

        # Fixed pause for the route page to load
        time.sleep(8)

        # Click every div whose class is exactly 'button', best-effort: a
        # failed click is reported and the scrape carries on
        click_buttons = driver.find_elements(By.XPATH, "//div[@class='button']")
        for button in click_buttons:
            try:
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(button)).click()
            except Exception as e:
                print(f"Could not click button: {e}")

        time.sleep(2)

        # Keep paging down until the page source has been identical for
        # max_scroll_attempts consecutive scrolls; any change resets the count
        max_scroll_attempts = 5
        scroll_attempts = 0
        while scroll_attempts < max_scroll_attempts:
            old_page_source = driver.page_source

            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()

            # Wait for the page to load new content
            time.sleep(3)

            if driver.page_source == old_page_source:
                scroll_attempts += 1
            else:
                scroll_attempts = 0

        # Collect data: one page-wide element list per column
        bus_names = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']"))
        bus_types = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']"))
        departure_times = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='dp-time f-19 d-color f-bold']"))
        durations = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='dur l-color lh-24']"))
        reaching_times = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='bp-time f-19 d-color disp-Inline']"))
        prices = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='fare d-block']"))

        ratings = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='rating-sec lh-24']"))
        # NOTE(review): columns are scraped independently, so a bus with no
        # rating element shifts later ratings up a row; the fill below only
        # evens out the count at the end of the list -- confirm alignment.
        if len(ratings) < len(bus_names):
            ratings.extend(['new bus'] * (len(bus_names) - len(ratings)))

        seat_availability = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='seat-left m-top-16' or @class='seat-left m-top-30']"))

        # Scraped columns in output order; each is padded with '' to the
        # longest one so the DataFrame constructor gets equal lengths
        scraped_columns = {
            'Bus Name': bus_names,
            'Bus Type': bus_types,
            'Departure Time': departure_times,
            'Duration': durations,
            'Reaching Time': reaching_times,
            'Bus Rating': ratings,
            'Price': prices,
            'Seat Availability': seat_availability,
        }
        max_length = max(len(values) for values in scraped_columns.values())

        # Create a DataFrame for the current route
        route_df = pd.DataFrame({
            'Route Name': [route] * max_length,
            'Route Link': [href] * max_length,
            **{label: pad_list(values, max_length) for label, values in scraped_columns.items()},
        })
        route_frames.append(route_df)
finally:
    # Close the browser even when a route page fails, so no Chrome process
    # is left running
    driver.quit()

# Concatenate every route DataFrame with the main DataFrame in one pass
RSRTC = pd.concat([RSRTC, *route_frames], ignore_index=True)

# Output the DataFrame to verify
print(RSRTC)

# Save the DataFrame to a CSV file
RSRTC.to_csv('RSRTC.csv', index=False)

Found 10 links
Scraped href: https://www.redbus.in/bus-tickets/jodhpur-to-ajmer, Route: Jodhpur to Ajmer
Scraped href: https://www.redbus.in/bus-tickets/beawer-to-jaipur, Route: Beawar (Rajasthan) to Jaipur (Rajasthan)
Scraped href: https://www.redbus.in/bus-tickets/udaipur-to-jodhpur, Route: Udaipur to Jodhpur
Scraped href: https://www.redbus.in/bus-tickets/jaipur-to-jodhpur, Route: Jaipur (Rajasthan) to Jodhpur
Scraped href: https://www.redbus.in/bus-tickets/sikar-to-jaipur, Route: Sikar to Jaipur (Rajasthan)
Scraped href: https://www.redbus.in/bus-tickets/kishangarh-to-jaipur, Route: Kishangarh to Jaipur (Rajasthan)
Scraped href: https://www.redbus.in/bus-tickets/aligarh-uttar-pradesh-to-jaipur, Route: Aligarh (uttar pradesh) to Jaipur (Rajasthan)
Scraped href: https://www.redbus.in/bus-tickets/jodhpur-to-beawer, Route: Jodhpur to Beawar (Rajasthan)
Scraped href: https://www.redbus.in/bus-tickets/kota-rajasthan-to-jaipur, Route: Kota(Rajasthan) to Jaipur (Rajasthan)
Scraped href: ht

In [15]:
# TRIAL 1: SBSTC
# SUCCESS

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Start a Chrome browser session; `driver` is the shared handle that every
# later step in this cell uses
driver = webdriver.Chrome()

# Load the redBus SBSTC operator page, which the route links are scraped from
driver.get('https://www.redbus.in/online-booking/south-bengal-state-transport-corporation-sbstc/?utm_source=rtchometile')

# Maximize the browser window
driver.maximize_window()

# Function to scrape hrefs on the current page
def scrape_hrefs():
    """Collect the route links on the page currently loaded in ``driver``.

    Prints how many anchors were found plus one line per link, and returns
    ``(hrefs, route_names)`` as two parallel lists in document order.
    """
    anchors = driver.find_elements(
        By.XPATH, "//div[contains(@class, 'route_details')]//a"
    )
    link_count = len(anchors)
    print(f"Found {link_count} links")

    # Read every href first, then every visible label
    hrefs = []
    for anchor in anchors:
        hrefs.append(anchor.get_attribute('href'))
    route_names = []
    for anchor in anchors:
        route_names.append(anchor.text)

    for position, href in enumerate(hrefs):
        route = route_names[position]
        print(f"Scraped href: {href}, Route: {route}")
    return hrefs, route_names

# Accumulators for every route link and route name across all result pages
all_hrefs = []
all_route_names = []

# Page down three times on the first page only (presumably to bring the
# route list and pagination tabs into view -- confirm against the page)
for _ in range(3):
    ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
    time.sleep(3)  # Wait for the page to load new content

# Scrape hrefs on the first page
hrefs, route_names = scrape_hrefs()
all_hrefs.extend(hrefs)
all_route_names.extend(route_names)

# Visit result pages 2 to 5 through their numbered pagination tabs
for page_num in range(2, 6):
    page_button_xpath = f"//div[contains(@class,'DC_117_pageTabs') and text()='{page_num}']"
    # find_elements returns an empty list instead of raising, so an operator
    # with fewer pages ends pagination cleanly rather than aborting the run
    page_buttons = driver.find_elements(By.XPATH, page_button_xpath)
    if not page_buttons:
        print(f"No tab found for page {page_num}; stopping pagination")
        break
    page_buttons[0].click()

    # Fixed pause for the new page to load
    time.sleep(5)

    # Scrape hrefs on the current page
    hrefs, route_names = scrape_hrefs()
    all_hrefs.extend(hrefs)
    all_route_names.extend(route_names)

# Initialize an empty DataFrame for all routes
SBSTC = pd.DataFrame()

# Function to extract text and handle missing ratings
def extract_text_with_default(elements, default_value='new bus'):
    """Return the text of each element, with blanks replaced by a default.

    Args:
        elements: iterable of objects exposing a ``text`` attribute.
        default_value: value used when an element's text is empty.

    Returns:
        A list with one entry per element, in the same order.
    """
    # Read .text exactly once per element: on a Selenium WebElement each
    # access is a separate WebDriver call, so reading it twice doubles the
    # round trips and lets the tested and returned values differ.
    return [element.text or default_value for element in elements]

# Function to pad lists to the maximum length
def pad_list(lst, max_length):
    """Return a new list: ``lst`` followed by '' entries up to ``max_length``.

    A list already at or beyond ``max_length`` comes back as an unpadded copy.
    """
    missing = max_length - len(lst)
    filler = [''] * missing  # a non-positive count produces no filler
    return lst + filler

# Loop through each href and extract data.
# Per-route frames are gathered in a list and concatenated once after the
# loop, instead of re-copying the growing DataFrame on every iteration.
route_frames = []
try:
    for href, route in zip(all_hrefs, all_route_names):
        driver.get(href)

        # Fixed pause for the route page to load
        time.sleep(8)

        # Click every div whose class is exactly 'button', best-effort: a
        # failed click is reported and the scrape carries on
        click_buttons = driver.find_elements(By.XPATH, "//div[@class='button']")
        for button in click_buttons:
            try:
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(button)).click()
            except Exception as e:
                print(f"Could not click button: {e}")

        time.sleep(2)

        # Keep paging down until the page source has been identical for
        # max_scroll_attempts consecutive scrolls; any change resets the count
        max_scroll_attempts = 5
        scroll_attempts = 0
        while scroll_attempts < max_scroll_attempts:
            old_page_source = driver.page_source

            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()

            # Wait for the page to load new content
            time.sleep(3)

            if driver.page_source == old_page_source:
                scroll_attempts += 1
            else:
                scroll_attempts = 0

        # Collect data: one page-wide element list per column
        bus_names = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']"))
        bus_types = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']"))
        departure_times = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='dp-time f-19 d-color f-bold']"))
        durations = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='dur l-color lh-24']"))
        reaching_times = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='bp-time f-19 d-color disp-Inline']"))
        prices = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='fare d-block']"))

        ratings = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='rating-sec lh-24']"))
        # NOTE(review): columns are scraped independently, so a bus with no
        # rating element shifts later ratings up a row; the fill below only
        # evens out the count at the end of the list -- confirm alignment.
        if len(ratings) < len(bus_names):
            ratings.extend(['new bus'] * (len(bus_names) - len(ratings)))

        seat_availability = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='seat-left m-top-16' or @class='seat-left m-top-30']"))

        # Scraped columns in output order; each is padded with '' to the
        # longest one so the DataFrame constructor gets equal lengths
        scraped_columns = {
            'Bus Name': bus_names,
            'Bus Type': bus_types,
            'Departure Time': departure_times,
            'Duration': durations,
            'Reaching Time': reaching_times,
            'Bus Rating': ratings,
            'Price': prices,
            'Seat Availability': seat_availability,
        }
        max_length = max(len(values) for values in scraped_columns.values())

        # Create a DataFrame for the current route
        route_df = pd.DataFrame({
            'Route Name': [route] * max_length,
            'Route Link': [href] * max_length,
            **{label: pad_list(values, max_length) for label, values in scraped_columns.items()},
        })
        route_frames.append(route_df)
finally:
    # Close the browser even when a route page fails, so no Chrome process
    # is left running
    driver.quit()

# Concatenate every route DataFrame with the main DataFrame in one pass
SBSTC = pd.concat([SBSTC, *route_frames], ignore_index=True)

# Output the DataFrame to verify
print(SBSTC)

# Save the DataFrame to a CSV file
SBSTC.to_csv('SBSTC.csv', index=False)

Found 10 links
Scraped href: https://www.redbus.in/bus-tickets/burdwan-to-kolkata, Route: Burdwan to Kolkata
Scraped href: https://www.redbus.in/bus-tickets/kolkata-to-burdwan, Route: Kolkata to Burdwan
Scraped href: https://www.redbus.in/bus-tickets/durgapur-to-kolkata, Route: Durgapur (West Bengal) to Kolkata
Scraped href: https://www.redbus.in/bus-tickets/kolkata-to-haldia, Route: Kolkata to Haldia
Scraped href: https://www.redbus.in/bus-tickets/haldia-to-kolkata, Route: Haldia to Kolkata
Scraped href: https://www.redbus.in/bus-tickets/kolkata-to-durgapur, Route: Kolkata to Durgapur (West Bengal)
Scraped href: https://www.redbus.in/bus-tickets/kolkata-to-arambagh-west-bengal, Route: Kolkata to Arambagh (West Bengal)
Scraped href: https://www.redbus.in/bus-tickets/digha-to-kolkata, Route: Digha to Kolkata
Scraped href: https://www.redbus.in/bus-tickets/kolkata-to-digha, Route: Kolkata to Digha
Scraped href: https://www.redbus.in/bus-tickets/kolkata-to-bankura, Route: Kolkata to Banku

In [16]:
# TRIAL 1: HRTC
# SUCCESS

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Start a Chrome browser session; `driver` is the shared handle that every
# later step in this cell uses
driver = webdriver.Chrome()

# Load the redBus HRTC operator page, which the route links are scraped from
driver.get('https://www.redbus.in/online-booking/hrtc/?utm_source=rtchometile')

# Maximize the browser window
driver.maximize_window()

# Function to scrape hrefs on the current page
def scrape_hrefs():
    """Collect the route links on the page currently loaded in ``driver``.

    Prints how many anchors were found plus one line per link, and returns
    ``(hrefs, route_names)`` as two parallel lists in document order.
    """
    anchors = driver.find_elements(
        By.XPATH, "//div[contains(@class, 'route_details')]//a"
    )
    link_count = len(anchors)
    print(f"Found {link_count} links")

    # Read every href first, then every visible label
    hrefs = []
    for anchor in anchors:
        hrefs.append(anchor.get_attribute('href'))
    route_names = []
    for anchor in anchors:
        route_names.append(anchor.text)

    for position, href in enumerate(hrefs):
        route = route_names[position]
        print(f"Scraped href: {href}, Route: {route}")
    return hrefs, route_names

# Accumulators for every route link and route name across all result pages
all_hrefs = []
all_route_names = []

# Page down three times on the first page only (presumably to bring the
# route list and pagination tabs into view -- confirm against the page)
for _ in range(3):
    ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
    time.sleep(3)  # Wait for the page to load new content

# Scrape hrefs on the first page
hrefs, route_names = scrape_hrefs()
all_hrefs.extend(hrefs)
all_route_names.extend(route_names)

# Visit result pages 2 to 5 through their numbered pagination tabs
for page_num in range(2, 6):
    page_button_xpath = f"//div[contains(@class,'DC_117_pageTabs') and text()='{page_num}']"
    # find_elements returns an empty list instead of raising, so an operator
    # with fewer pages ends pagination cleanly rather than aborting the run
    page_buttons = driver.find_elements(By.XPATH, page_button_xpath)
    if not page_buttons:
        print(f"No tab found for page {page_num}; stopping pagination")
        break
    page_buttons[0].click()

    # Fixed pause for the new page to load
    time.sleep(5)

    # Scrape hrefs on the current page
    hrefs, route_names = scrape_hrefs()
    all_hrefs.extend(hrefs)
    all_route_names.extend(route_names)

# Initialize an empty DataFrame for all routes
HRTC = pd.DataFrame()

# Function to extract text and handle missing ratings
def extract_text_with_default(elements, default_value='new bus'):
    """Return the text of each element, with blanks replaced by a default.

    Args:
        elements: iterable of objects exposing a ``text`` attribute.
        default_value: value used when an element's text is empty.

    Returns:
        A list with one entry per element, in the same order.
    """
    # Read .text exactly once per element: on a Selenium WebElement each
    # access is a separate WebDriver call, so reading it twice doubles the
    # round trips and lets the tested and returned values differ.
    return [element.text or default_value for element in elements]

# Function to pad lists to the maximum length
def pad_list(lst, max_length):
    """Return a new list: ``lst`` followed by '' entries up to ``max_length``.

    A list already at or beyond ``max_length`` comes back as an unpadded copy.
    """
    missing = max_length - len(lst)
    filler = [''] * missing  # a non-positive count produces no filler
    return lst + filler

# Loop through each href and extract data.
# Per-route frames are gathered in a list and concatenated once after the
# loop, instead of re-copying the growing DataFrame on every iteration.
route_frames = []
try:
    for href, route in zip(all_hrefs, all_route_names):
        driver.get(href)

        # Fixed pause for the route page to load
        time.sleep(8)

        # Click every div whose class is exactly 'button', best-effort: a
        # failed click is reported and the scrape carries on
        click_buttons = driver.find_elements(By.XPATH, "//div[@class='button']")
        for button in click_buttons:
            try:
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(button)).click()
            except Exception as e:
                print(f"Could not click button: {e}")

        time.sleep(2)

        # Keep paging down until the page source has been identical for
        # max_scroll_attempts consecutive scrolls; any change resets the count
        max_scroll_attempts = 5
        scroll_attempts = 0
        while scroll_attempts < max_scroll_attempts:
            old_page_source = driver.page_source

            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()

            # Wait for the page to load new content
            time.sleep(3)

            if driver.page_source == old_page_source:
                scroll_attempts += 1
            else:
                scroll_attempts = 0

        # Collect data: one page-wide element list per column
        bus_names = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']"))
        bus_types = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']"))
        departure_times = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='dp-time f-19 d-color f-bold']"))
        durations = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='dur l-color lh-24']"))
        reaching_times = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='bp-time f-19 d-color disp-Inline']"))
        prices = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='fare d-block']"))

        ratings = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='rating-sec lh-24']"))
        # NOTE(review): columns are scraped independently, so a bus with no
        # rating element shifts later ratings up a row; the fill below only
        # evens out the count at the end of the list -- confirm alignment.
        if len(ratings) < len(bus_names):
            ratings.extend(['new bus'] * (len(bus_names) - len(ratings)))

        seat_availability = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='seat-left m-top-16' or @class='seat-left m-top-30']"))

        # Scraped columns in output order; each is padded with '' to the
        # longest one so the DataFrame constructor gets equal lengths
        scraped_columns = {
            'Bus Name': bus_names,
            'Bus Type': bus_types,
            'Departure Time': departure_times,
            'Duration': durations,
            'Reaching Time': reaching_times,
            'Bus Rating': ratings,
            'Price': prices,
            'Seat Availability': seat_availability,
        }
        max_length = max(len(values) for values in scraped_columns.values())

        # Create a DataFrame for the current route
        route_df = pd.DataFrame({
            'Route Name': [route] * max_length,
            'Route Link': [href] * max_length,
            **{label: pad_list(values, max_length) for label, values in scraped_columns.items()},
        })
        route_frames.append(route_df)
finally:
    # Close the browser even when a route page fails, so no Chrome process
    # is left running
    driver.quit()

# Concatenate every route DataFrame with the main DataFrame in one pass
HRTC = pd.concat([HRTC, *route_frames], ignore_index=True)

# Output the DataFrame to verify
print(HRTC)

# Save the DataFrame to a CSV file
HRTC.to_csv('HRTC.csv', index=False)

Found 10 links
Scraped href: https://www.redbus.in/bus-tickets/delhi-to-shimla, Route: Delhi to Shimla
Scraped href: https://www.redbus.in/bus-tickets/shimla-to-delhi, Route: Shimla to Delhi
Scraped href: https://www.redbus.in/bus-tickets/manali-to-chandigarh, Route: Manali to Chandigarh
Scraped href: https://www.redbus.in/bus-tickets/chandigarh-to-manali, Route: Chandigarh to Manali
Scraped href: https://www.redbus.in/bus-tickets/delhi-to-manali, Route: Delhi to Manali
Scraped href: https://www.redbus.in/bus-tickets/hamirpur-himachal-pradesh-to-chandigarh, Route: Hamirpur (Himachal Pradesh) to Chandigarh
Scraped href: https://www.redbus.in/bus-tickets/delhi-to-hamirpur-himachal-pradesh, Route: Delhi to Hamirpur (Himachal Pradesh)
Scraped href: https://www.redbus.in/bus-tickets/delhi-to-chandigarh, Route: Delhi to Chandigarh
Scraped href: https://www.redbus.in/bus-tickets/manali-to-delhi, Route: Manali to Delhi
Scraped href: https://www.redbus.in/bus-tickets/hamirpur-himachal-pradesh-t

In [17]:
# TRIAL 1: ASTC
# SUCCESS

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Start a Chrome browser session; `driver` is the shared handle that every
# later step in this cell uses
driver = webdriver.Chrome()

# Load the redBus ASTC operator page, which the route links are scraped from
driver.get('https://www.redbus.in/online-booking/astc/?utm_source=rtchometile')

# Maximize the browser window
driver.maximize_window()

# Function to scrape hrefs on the current page
def scrape_hrefs():
    """Collect the route links on the page currently loaded in ``driver``.

    Prints how many anchors were found plus one line per link, and returns
    ``(hrefs, route_names)`` as two parallel lists in document order.
    """
    anchors = driver.find_elements(
        By.XPATH, "//div[contains(@class, 'route_details')]//a"
    )
    link_count = len(anchors)
    print(f"Found {link_count} links")

    # Read every href first, then every visible label
    hrefs = []
    for anchor in anchors:
        hrefs.append(anchor.get_attribute('href'))
    route_names = []
    for anchor in anchors:
        route_names.append(anchor.text)

    for position, href in enumerate(hrefs):
        route = route_names[position]
        print(f"Scraped href: {href}, Route: {route}")
    return hrefs, route_names

# Accumulators for every route link and route name across all result pages
all_hrefs = []
all_route_names = []

# Page down three times on the first page only (presumably to bring the
# route list and pagination tabs into view -- confirm against the page)
for _ in range(3):
    ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
    time.sleep(3)  # Wait for the page to load new content

# Scrape hrefs on the first page
hrefs, route_names = scrape_hrefs()
all_hrefs.extend(hrefs)
all_route_names.extend(route_names)

# Visit result pages 2 to 5 through their numbered pagination tabs
for page_num in range(2, 6):
    page_button_xpath = f"//div[contains(@class,'DC_117_pageTabs') and text()='{page_num}']"
    # find_elements returns an empty list instead of raising, so an operator
    # with fewer pages ends pagination cleanly rather than aborting the run
    page_buttons = driver.find_elements(By.XPATH, page_button_xpath)
    if not page_buttons:
        print(f"No tab found for page {page_num}; stopping pagination")
        break
    page_buttons[0].click()

    # Fixed pause for the new page to load
    time.sleep(5)

    # Scrape hrefs on the current page
    hrefs, route_names = scrape_hrefs()
    all_hrefs.extend(hrefs)
    all_route_names.extend(route_names)

# Initialize an empty DataFrame for all routes
ASTC = pd.DataFrame()

# Function to extract text and handle missing ratings
def extract_text_with_default(elements, default_value='new bus'):
    """Return the text of each element, with blanks replaced by a default.

    Args:
        elements: iterable of objects exposing a ``text`` attribute.
        default_value: value used when an element's text is empty.

    Returns:
        A list with one entry per element, in the same order.
    """
    # Read .text exactly once per element: on a Selenium WebElement each
    # access is a separate WebDriver call, so reading it twice doubles the
    # round trips and lets the tested and returned values differ.
    return [element.text or default_value for element in elements]

# Function to pad lists to the maximum length
def pad_list(lst, max_length):
    """Return a new list: ``lst`` followed by '' entries up to ``max_length``.

    A list already at or beyond ``max_length`` comes back as an unpadded copy.
    """
    missing = max_length - len(lst)
    filler = [''] * missing  # a non-positive count produces no filler
    return lst + filler

# Loop through each href and extract data.
# Per-route frames are gathered in a list and concatenated once after the
# loop, instead of re-copying the growing DataFrame on every iteration.
route_frames = []
try:
    for href, route in zip(all_hrefs, all_route_names):
        driver.get(href)

        # Fixed pause for the route page to load
        time.sleep(8)

        # Click every div whose class is exactly 'button', best-effort: a
        # failed click is reported and the scrape carries on
        click_buttons = driver.find_elements(By.XPATH, "//div[@class='button']")
        for button in click_buttons:
            try:
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(button)).click()
            except Exception as e:
                print(f"Could not click button: {e}")

        time.sleep(2)

        # Keep paging down until the page source has been identical for
        # max_scroll_attempts consecutive scrolls; any change resets the count
        max_scroll_attempts = 5
        scroll_attempts = 0
        while scroll_attempts < max_scroll_attempts:
            old_page_source = driver.page_source

            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()

            # Wait for the page to load new content
            time.sleep(3)

            if driver.page_source == old_page_source:
                scroll_attempts += 1
            else:
                scroll_attempts = 0

        # Collect data: one page-wide element list per column
        bus_names = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']"))
        bus_types = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']"))
        departure_times = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='dp-time f-19 d-color f-bold']"))
        durations = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='dur l-color lh-24']"))
        reaching_times = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='bp-time f-19 d-color disp-Inline']"))
        prices = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='fare d-block']"))

        ratings = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='rating-sec lh-24']"))
        # NOTE(review): columns are scraped independently, so a bus with no
        # rating element shifts later ratings up a row; the fill below only
        # evens out the count at the end of the list -- confirm alignment.
        if len(ratings) < len(bus_names):
            ratings.extend(['new bus'] * (len(bus_names) - len(ratings)))

        seat_availability = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='seat-left m-top-16' or @class='seat-left m-top-30']"))

        # Scraped columns in output order; each is padded with '' to the
        # longest one so the DataFrame constructor gets equal lengths
        scraped_columns = {
            'Bus Name': bus_names,
            'Bus Type': bus_types,
            'Departure Time': departure_times,
            'Duration': durations,
            'Reaching Time': reaching_times,
            'Bus Rating': ratings,
            'Price': prices,
            'Seat Availability': seat_availability,
        }
        max_length = max(len(values) for values in scraped_columns.values())

        # Create a DataFrame for the current route
        route_df = pd.DataFrame({
            'Route Name': [route] * max_length,
            'Route Link': [href] * max_length,
            **{label: pad_list(values, max_length) for label, values in scraped_columns.items()},
        })
        route_frames.append(route_df)
finally:
    # Close the browser even when a route page fails, so no Chrome process
    # is left running
    driver.quit()

# Concatenate every route DataFrame with the main DataFrame in one pass
ASTC = pd.concat([ASTC, *route_frames], ignore_index=True)

# Output the DataFrame to verify
print(ASTC)

# Save the DataFrame to a CSV file
ASTC.to_csv('ASTC.csv', index=False)

Found 10 links
Scraped href: https://www.redbus.in/bus-tickets/tezpur-to-guwahati, Route: Tezpur to Guwahati
Scraped href: https://www.redbus.in/bus-tickets/guwahati-to-tezpur, Route: Guwahati to Tezpur
Scraped href: https://www.redbus.in/bus-tickets/nagaon-to-guwahati, Route: Nagaon (Assam) to Guwahati
Scraped href: https://www.redbus.in/bus-tickets/guwahati-to-nagaon, Route: Guwahati to Nagaon (Assam)
Scraped href: https://www.redbus.in/bus-tickets/goalpara-to-guwahati, Route: Goalpara to Guwahati
Scraped href: https://www.redbus.in/bus-tickets/jorhat-to-north-lakhimpur, Route: Jorhat to North Lakhimpur
Scraped href: https://www.redbus.in/bus-tickets/dhubri-to-guwahati, Route: Dhubri to Guwahati
Scraped href: https://www.redbus.in/bus-tickets/guwahati-to-dhubri, Route: Guwahati to Dhubri
Scraped href: https://www.redbus.in/bus-tickets/north-lakhimpur-to-sibsagar, Route: North Lakhimpur to Sibsagar
Scraped href: https://www.redbus.in/bus-tickets/north-lakhimpur-to-jorhat, Route: North

In [18]:
# TRIAL 1: UPSRTC
# SUCCESS

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Start a Chrome browser session; `driver` is the shared handle that every
# later step in this cell uses
driver = webdriver.Chrome()

# Load the redBus UPSRTC operator page, which the route links are scraped from
driver.get('https://www.redbus.in/online-booking/uttar-pradesh-state-road-transport-corporation-upsrtc/?utm_source=rtchometile')

# Maximize the browser window
driver.maximize_window()

# Function to scrape hrefs on the current page
def scrape_hrefs():
    """Collect the route links on the page currently loaded in ``driver``.

    Prints how many anchors were found plus one line per link, and returns
    ``(hrefs, route_names)`` as two parallel lists in document order.
    """
    anchors = driver.find_elements(
        By.XPATH, "//div[contains(@class, 'route_details')]//a"
    )
    link_count = len(anchors)
    print(f"Found {link_count} links")

    # Read every href first, then every visible label
    hrefs = []
    for anchor in anchors:
        hrefs.append(anchor.get_attribute('href'))
    route_names = []
    for anchor in anchors:
        route_names.append(anchor.text)

    for position, href in enumerate(hrefs):
        route = route_names[position]
        print(f"Scraped href: {href}, Route: {route}")
    return hrefs, route_names

# Accumulators for every route link and route name across all result pages
all_hrefs = []
all_route_names = []

# Page down three times on the first page only (presumably to bring the
# route list and pagination tabs into view -- confirm against the page)
for _ in range(3):
    ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
    time.sleep(3)  # Wait for the page to load new content

# Scrape hrefs on the first page
hrefs, route_names = scrape_hrefs()
all_hrefs.extend(hrefs)
all_route_names.extend(route_names)

# Visit result pages 2 to 5 through their numbered pagination tabs
for page_num in range(2, 6):
    page_button_xpath = f"//div[contains(@class,'DC_117_pageTabs') and text()='{page_num}']"
    # find_elements returns an empty list instead of raising, so an operator
    # with fewer pages ends pagination cleanly rather than aborting the run
    page_buttons = driver.find_elements(By.XPATH, page_button_xpath)
    if not page_buttons:
        print(f"No tab found for page {page_num}; stopping pagination")
        break
    page_buttons[0].click()

    # Fixed pause for the new page to load
    time.sleep(5)

    # Scrape hrefs on the current page
    hrefs, route_names = scrape_hrefs()
    all_hrefs.extend(hrefs)
    all_route_names.extend(route_names)

# Initialize an empty DataFrame for all routes
UPSRTC = pd.DataFrame()

# Function to extract text and handle missing ratings
def extract_text_with_default(elements, default_value='new bus'):
    """Return the text of each element, with blanks replaced by a default.

    Args:
        elements: iterable of objects exposing a ``text`` attribute.
        default_value: value used when an element's text is empty.

    Returns:
        A list with one entry per element, in the same order.
    """
    # Read .text exactly once per element: on a Selenium WebElement each
    # access is a separate WebDriver call, so reading it twice doubles the
    # round trips and lets the tested and returned values differ.
    return [element.text or default_value for element in elements]

# Function to pad lists to the maximum length
def pad_list(lst, max_length):
    """Return a new list: ``lst`` followed by '' entries up to ``max_length``.

    When ``lst`` already has ``max_length`` items or more, the result is an
    unpadded copy of ``lst``.
    """
    missing = max_length - len(lst)
    filler = [''] * missing  # a non-positive count yields an empty list
    return lst + filler

# Loop through each href and extract data.
# Each route's rows go into their own DataFrame; the frames are concatenated
# once after the loop instead of re-copying a growing frame every iteration.
route_frames = []

try:
    for href, route in zip(all_hrefs, all_route_names):
        driver.get(href)

        # Wait for the page to load
        time.sleep(8)

        # Click all the available buttons (if necessary)
        click_buttons = driver.find_elements(By.XPATH, "//div[@class='button']")
        for button in click_buttons:
            try:
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(button)).click()
            except Exception as e:
                # Best effort: a button that cannot be clicked is reported and skipped
                print(f"Could not click button: {e}")

        time.sleep(2)

        # Keep paging down until the page source stays unchanged for
        # max_scroll_attempts consecutive scrolls.
        max_scroll_attempts = 5
        scroll_attempts = 0

        while scroll_attempts < max_scroll_attempts:
            old_page_source = driver.page_source

            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()

            # Wait for the page to load new content
            time.sleep(3)

            if driver.page_source == old_page_source:
                scroll_attempts += 1
            else:
                scroll_attempts = 0  # New content appeared; start counting again

        # Collect data.
        # NOTE(review): every column is scraped independently and padded at the
        # end, so a bus that lacks one field can shift that column's later
        # values onto the wrong rows - confirm against the page markup.
        bus_names = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']"))
        bus_types = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']"))
        departure_times = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='dp-time f-19 d-color f-bold']"))
        durations = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='dur l-color lh-24']"))
        reaching_times = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='bp-time f-19 d-color disp-Inline']"))
        prices = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='fare d-block']"))

        ratings = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='rating-sec lh-24']"))
        if len(ratings) < len(bus_names):
            # Buses without a rating element are labelled 'new bus'
            ratings.extend(['new bus'] * (len(bus_names) - len(ratings)))

        seat_availability = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='seat-left m-top-16' or @class='seat-left m-top-30']"))

        # Find the maximum length among the lists
        max_length = max(len(bus_names), len(bus_types), len(departure_times), len(durations), len(reaching_times), len(ratings), len(prices), len(seat_availability))

        # Pad all lists to the same length so they can form one DataFrame
        bus_names = pad_list(bus_names, max_length)
        bus_types = pad_list(bus_types, max_length)
        departure_times = pad_list(departure_times, max_length)
        durations = pad_list(durations, max_length)
        reaching_times = pad_list(reaching_times, max_length)
        ratings = pad_list(ratings, max_length)
        prices = pad_list(prices, max_length)
        seat_availability = pad_list(seat_availability, max_length)

        # Create a DataFrame for the current route and keep it for the final concat
        route_frames.append(pd.DataFrame({
            'Route Name': [route] * max_length,
            'Route Link': [href] * max_length,
            'Bus Name': bus_names,
            'Bus Type': bus_types,
            'Departure Time': departure_times,
            'Duration': durations,
            'Reaching Time': reaching_times,
            'Bus Rating': ratings,
            'Price': prices,
            'Seat Availability': seat_availability
        }))
finally:
    # Close the browser even when scraping a route raised
    driver.quit()

# Combine all routes; pd.concat rejects an empty list, so fall back to an empty frame
UPSRTC = pd.concat(route_frames, ignore_index=True) if route_frames else pd.DataFrame()

# Output the DataFrame to verify
print(UPSRTC)

# Save the DataFrame to a CSV file
UPSRTC.to_csv('UPSRTC.csv', index=False)

Found 10 links
Scraped href: https://www.redbus.in/bus-tickets/bareilly-to-delhi, Route: Bareilly to Delhi
Scraped href: https://www.redbus.in/bus-tickets/aligarh-uttar-pradesh-to-delhi, Route: Aligarh (uttar pradesh) to Delhi
Scraped href: https://www.redbus.in/bus-tickets/delhi-to-bareilly, Route: Delhi to Bareilly
Scraped href: https://www.redbus.in/bus-tickets/delhi-to-aligarh-uttar-pradesh, Route: Delhi to Aligarh (uttar pradesh)
Scraped href: https://www.redbus.in/bus-tickets/farrukhabad-up-to-delhi, Route: Farrukhabad (Uttar Pradesh) to Delhi
Scraped href: https://www.redbus.in/bus-tickets/badaun-to-delhi, Route: Badaun to Delhi
Scraped href: https://www.redbus.in/bus-tickets/lucknow-to-allahabad, Route: Lucknow to Allahabad
Scraped href: https://www.redbus.in/bus-tickets/lucknow-to-varanasi, Route: Lucknow to Varanasi
Scraped href: https://www.redbus.in/bus-tickets/delhi-to-badaun, Route: Delhi to Badaun
Scraped href: https://www.redbus.in/bus-tickets/agra-to-bareilly, Route: A

In [19]:
# TRIAL 1: WBTC
# SUCCESS

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Launch a fresh Chrome session
driver = webdriver.Chrome()

# Navigate to the WBTC operator page on redBus
wbtc_url = 'https://www.redbus.in/online-booking/wbtc-ctc/?utm_source=rtchometile'
driver.get(wbtc_url)

# Make the browser window full size
driver.maximize_window()

# Function to scrape hrefs on the current page
def scrape_hrefs():
    """Gather route hrefs and link texts from the current page of ``driver``.

    Every anchor under a div whose class contains 'route_details' is
    inspected. The number of anchors and each href/text pair are printed.

    Returns:
        A ``(hrefs, route_names)`` tuple of equally ordered lists.
    """
    route_anchors = driver.find_elements(By.XPATH, "//div[contains(@class, 'route_details')]//a")
    print(f"Found {len(route_anchors)} links")

    # All hrefs are read before any text is read
    hrefs = list(map(lambda anchor: anchor.get_attribute('href'), route_anchors))
    route_names = list(map(lambda anchor: anchor.text, route_anchors))

    for position in range(min(len(hrefs), len(route_names))):
        print(f"Scraped href: {hrefs[position]}, Route: {route_names[position]}")

    return hrefs, route_names

# Lists holding every route link and its display name, across all pages
all_hrefs = []
all_route_names = []

# Page down three times on the first page only, pausing after each scroll
# (presumably so the route list and the page tabs get loaded - confirm).
for _ in range(3):
    ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
    time.sleep(3)  # Wait for the page to load new content


# Scrape hrefs on the first page
hrefs, route_names = scrape_hrefs()
all_hrefs.extend(hrefs)
all_route_names.extend(route_names)

# Visit pages 2-5 when they exist. find_elements returns an empty list
# instead of raising, so an operator with fewer pages ends the loop cleanly
# rather than aborting the whole run on a missing page tab.
for page_num in range(2, 6):
    page_button_xpath = f"//div[contains(@class,'DC_117_pageTabs') and text()='{page_num}']"
    page_buttons = driver.find_elements(By.XPATH, page_button_xpath)
    if not page_buttons:
        print(f"No page tab found for page {page_num}; stopping pagination")
        break
    page_buttons[0].click()

    # Wait for the new page to load
    time.sleep(5)

    # Scrape hrefs on the current page
    hrefs, route_names = scrape_hrefs()
    all_hrefs.extend(hrefs)
    all_route_names.extend(route_names)

# Initialize an empty DataFrame for all routes
WBTC = pd.DataFrame()

# Function to extract text and handle missing ratings
def extract_text_with_default(elements, default_value='new bus'):
    """Return the text of each element, substituting a default for empty text.

    Args:
        elements: Iterable of objects exposing a ``.text`` attribute
            (the callers pass Selenium WebElements).
        default_value: Value used when an element's text is empty.

    Returns:
        A list with one entry per element, in the same order.
    """
    # Read .text exactly once per element: on a WebElement every access is a
    # separate query to the browser, and two reads could disagree if the page
    # changes in between.
    return [element.text or default_value for element in elements]

# Function to pad lists to the maximum length
def pad_list(lst, max_length):
    """Extend a copy of ``lst`` with empty strings until it has ``max_length`` items.

    A list that is already long enough comes back as an unpadded copy.
    """
    # range() over a non-positive count is empty, so no padding is added then
    blanks = ['' for _ in range(max_length - len(lst))]
    return lst + blanks

# Loop through each href and extract data.
# Each route's rows go into their own DataFrame; the frames are concatenated
# once after the loop instead of re-copying a growing frame every iteration.
route_frames = []

try:
    for href, route in zip(all_hrefs, all_route_names):
        driver.get(href)

        # Wait for the page to load
        time.sleep(8)

        # Click all the available buttons (if necessary)
        click_buttons = driver.find_elements(By.XPATH, "//div[@class='button']")
        for button in click_buttons:
            try:
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(button)).click()
            except Exception as e:
                # Best effort: a button that cannot be clicked is reported and skipped
                print(f"Could not click button: {e}")

        time.sleep(2)

        # Keep paging down until the page source stays unchanged for
        # max_scroll_attempts consecutive scrolls.
        max_scroll_attempts = 5
        scroll_attempts = 0

        while scroll_attempts < max_scroll_attempts:
            old_page_source = driver.page_source

            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()

            # Wait for the page to load new content
            time.sleep(3)

            if driver.page_source == old_page_source:
                scroll_attempts += 1
            else:
                scroll_attempts = 0  # New content appeared; start counting again

        # Collect data.
        # NOTE(review): every column is scraped independently and padded at the
        # end, so a bus that lacks one field can shift that column's later
        # values onto the wrong rows - confirm against the page markup.
        bus_names = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']"))
        bus_types = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']"))
        departure_times = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='dp-time f-19 d-color f-bold']"))
        durations = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='dur l-color lh-24']"))
        reaching_times = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='bp-time f-19 d-color disp-Inline']"))
        prices = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='fare d-block']"))

        ratings = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='rating-sec lh-24']"))
        if len(ratings) < len(bus_names):
            # Buses without a rating element are labelled 'new bus'
            ratings.extend(['new bus'] * (len(bus_names) - len(ratings)))

        seat_availability = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='seat-left m-top-16' or @class='seat-left m-top-30']"))

        # Find the maximum length among the lists
        max_length = max(len(bus_names), len(bus_types), len(departure_times), len(durations), len(reaching_times), len(ratings), len(prices), len(seat_availability))

        # Pad all lists to the same length so they can form one DataFrame
        bus_names = pad_list(bus_names, max_length)
        bus_types = pad_list(bus_types, max_length)
        departure_times = pad_list(departure_times, max_length)
        durations = pad_list(durations, max_length)
        reaching_times = pad_list(reaching_times, max_length)
        ratings = pad_list(ratings, max_length)
        prices = pad_list(prices, max_length)
        seat_availability = pad_list(seat_availability, max_length)

        # Create a DataFrame for the current route and keep it for the final concat
        route_frames.append(pd.DataFrame({
            'Route Name': [route] * max_length,
            'Route Link': [href] * max_length,
            'Bus Name': bus_names,
            'Bus Type': bus_types,
            'Departure Time': departure_times,
            'Duration': durations,
            'Reaching Time': reaching_times,
            'Bus Rating': ratings,
            'Price': prices,
            'Seat Availability': seat_availability
        }))
finally:
    # Close the browser even when scraping a route raised
    driver.quit()

# Combine all routes; pd.concat rejects an empty list, so fall back to an empty frame
WBTC = pd.concat(route_frames, ignore_index=True) if route_frames else pd.DataFrame()

# Output the DataFrame to verify
print(WBTC)

# Save the DataFrame to a CSV file
WBTC.to_csv('WBTC.csv', index=False)

Found 10 links
Scraped href: https://www.redbus.in/bus-tickets/durgapur-to-kolkata, Route: Durgapur (West Bengal) to Kolkata
Scraped href: https://www.redbus.in/bus-tickets/digha-to-barasat-west-bengal, Route: Digha to Barasat (West Bengal)
Scraped href: https://www.redbus.in/bus-tickets/barasat-west-bengal-to-digha, Route: Barasat (West Bengal) to Digha
Scraped href: https://www.redbus.in/bus-tickets/suri-to-kolkata, Route: Suri to Kolkata
Scraped href: https://www.redbus.in/bus-tickets/kolkata-to-digha, Route: Kolkata to Digha
Scraped href: https://www.redbus.in/bus-tickets/bolpur-west-bengal-to-kolkata, Route: Bolpur (West Bengal) to Kolkata
Scraped href: https://www.redbus.in/bus-tickets/kolkata-to-suri, Route: Kolkata to Suri
Scraped href: https://www.redbus.in/bus-tickets/digha-to-kolkata, Route: Digha to Kolkata
Scraped href: https://www.redbus.in/bus-tickets/kolkata-to-durgapur, Route: Kolkata to Durgapur (West Bengal)
Scraped href: https://www.redbus.in/bus-tickets/habra-to-di

In [20]:
# TRIAL 1: CTU
# SUCCESS

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Launch a fresh Chrome session
driver = webdriver.Chrome()

# Navigate to the CTU operator page on redBus
ctu_url = 'https://www.redbus.in/online-booking/chandigarh-transport-undertaking-ctu'
driver.get(ctu_url)

# Make the browser window full size
driver.maximize_window()

# Function to scrape hrefs on the current page
def scrape_hrefs():
    """Read the route links shown on the page ``driver`` currently displays.

    Anchors are located under any div whose class contains 'route_details'.
    The anchor count and every href/text pair are printed as they are
    reported.

    Returns:
        Two lists in matching order: the href of each anchor and its text.
    """
    link_elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'route_details')]//a")
    link_count = len(link_elements)
    print(f"Found {link_count} links")

    # hrefs are fetched for all anchors before the texts are fetched
    hrefs = [element.get_attribute('href') for element in link_elements]
    route_names = [element.text for element in link_elements]

    for index, href in enumerate(hrefs):
        route = route_names[index]
        print(f"Scraped href: {href}, Route: {route}")

    return hrefs, route_names

# Lists holding every route link and its display name, across all pages
all_hrefs = []
all_route_names = []

# Page down three times on the first page only, pausing after each scroll
# (presumably so the route list and the page tabs get loaded - confirm).
for _ in range(3):
    ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
    time.sleep(3)  # Wait for the page to load new content


# Scrape hrefs on the first page
hrefs, route_names = scrape_hrefs()
all_hrefs.extend(hrefs)
all_route_names.extend(route_names)

# Visit pages 2-5 when they exist. find_elements returns an empty list
# instead of raising, so an operator with fewer pages ends the loop cleanly
# rather than aborting the whole run on a missing page tab.
for page_num in range(2, 6):
    page_button_xpath = f"//div[contains(@class,'DC_117_pageTabs') and text()='{page_num}']"
    page_buttons = driver.find_elements(By.XPATH, page_button_xpath)
    if not page_buttons:
        print(f"No page tab found for page {page_num}; stopping pagination")
        break
    page_buttons[0].click()

    # Wait for the new page to load
    time.sleep(5)

    # Scrape hrefs on the current page
    hrefs, route_names = scrape_hrefs()
    all_hrefs.extend(hrefs)
    all_route_names.extend(route_names)

# Initialize an empty DataFrame for all routes
CTU = pd.DataFrame()

# Function to extract text and handle missing ratings
def extract_text_with_default(elements, default_value='new bus'):
    """Return the text of each element, substituting a default for empty text.

    Args:
        elements: Iterable of objects exposing a ``.text`` attribute
            (the callers pass Selenium WebElements).
        default_value: Value used when an element's text is empty.

    Returns:
        A list with one entry per element, in the same order.
    """
    # Read .text exactly once per element: on a WebElement every access is a
    # separate query to the browser, and two reads could disagree if the page
    # changes in between.
    return [element.text or default_value for element in elements]

# Function to pad lists to the maximum length
def pad_list(lst, max_length):
    """Return ``lst`` plus trailing '' entries so the result has ``max_length`` items.

    The input list is not modified; a list that needs no padding is
    returned as a copy.
    """
    shortfall = max(max_length - len(lst), 0)
    return lst + shortfall * ['']

# Loop through each href and extract data.
# Each route's rows go into their own DataFrame; the frames are concatenated
# once after the loop instead of re-copying a growing frame every iteration.
route_frames = []

try:
    for href, route in zip(all_hrefs, all_route_names):
        driver.get(href)

        # Wait for the page to load
        time.sleep(8)

        # Click all the available buttons (if necessary)
        click_buttons = driver.find_elements(By.XPATH, "//div[@class='button']")
        for button in click_buttons:
            try:
                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(button)).click()
            except Exception as e:
                # Best effort: a button that cannot be clicked is reported and skipped
                print(f"Could not click button: {e}")

        time.sleep(2)

        # Keep paging down until the page source stays unchanged for
        # max_scroll_attempts consecutive scrolls.
        max_scroll_attempts = 5
        scroll_attempts = 0

        while scroll_attempts < max_scroll_attempts:
            old_page_source = driver.page_source

            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()

            # Wait for the page to load new content
            time.sleep(3)

            if driver.page_source == old_page_source:
                scroll_attempts += 1
            else:
                scroll_attempts = 0  # New content appeared; start counting again

        # Collect data.
        # NOTE(review): every column is scraped independently and padded at the
        # end, so a bus that lacks one field can shift that column's later
        # values onto the wrong rows - confirm against the page markup.
        bus_names = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']"))
        bus_types = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']"))
        departure_times = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='dp-time f-19 d-color f-bold']"))
        durations = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='dur l-color lh-24']"))
        reaching_times = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='bp-time f-19 d-color disp-Inline']"))
        prices = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='fare d-block']"))

        ratings = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='rating-sec lh-24']"))
        if len(ratings) < len(bus_names):
            # Buses without a rating element are labelled 'new bus'
            ratings.extend(['new bus'] * (len(bus_names) - len(ratings)))

        seat_availability = extract_text_with_default(driver.find_elements(By.XPATH, "//div[@class='seat-left m-top-16' or @class='seat-left m-top-30']"))

        # Find the maximum length among the lists
        max_length = max(len(bus_names), len(bus_types), len(departure_times), len(durations), len(reaching_times), len(ratings), len(prices), len(seat_availability))

        # Pad all lists to the same length so they can form one DataFrame
        bus_names = pad_list(bus_names, max_length)
        bus_types = pad_list(bus_types, max_length)
        departure_times = pad_list(departure_times, max_length)
        durations = pad_list(durations, max_length)
        reaching_times = pad_list(reaching_times, max_length)
        ratings = pad_list(ratings, max_length)
        prices = pad_list(prices, max_length)
        seat_availability = pad_list(seat_availability, max_length)

        # Create a DataFrame for the current route and keep it for the final concat
        route_frames.append(pd.DataFrame({
            'Route Name': [route] * max_length,
            'Route Link': [href] * max_length,
            'Bus Name': bus_names,
            'Bus Type': bus_types,
            'Departure Time': departure_times,
            'Duration': durations,
            'Reaching Time': reaching_times,
            'Bus Rating': ratings,
            'Price': prices,
            'Seat Availability': seat_availability
        }))
finally:
    # Close the browser even when scraping a route raised
    driver.quit()

# Combine all routes; pd.concat rejects an empty list, so fall back to an empty frame
CTU = pd.concat(route_frames, ignore_index=True) if route_frames else pd.DataFrame()

# Output the DataFrame to verify
print(CTU)

# Save the DataFrame to a CSV file
CTU.to_csv('CTU.csv', index=False)

Found 10 links
Scraped href: https://www.redbus.in/bus-tickets/chandigarh-to-delhi, Route: Chandigarh to Delhi
Scraped href: https://www.redbus.in/bus-tickets/delhi-to-chandigarh, Route: Delhi to Chandigarh
Scraped href: https://www.redbus.in/bus-tickets/yamuna-nagar-to-chandigarh, Route: Yamuna Nagar to Chandigarh
Scraped href: https://www.redbus.in/bus-tickets/chandigarh-to-shimla, Route: Chandigarh to Shimla
Scraped href: https://www.redbus.in/bus-tickets/chandigarh-to-vrindavan, Route: Chandigarh to Vrindavan
Scraped href: https://www.redbus.in/bus-tickets/chandigarh-to-yamuna-nagar, Route: Chandigarh to Yamuna Nagar
Scraped href: https://www.redbus.in/bus-tickets/chandigarh-to-sujanpur, Route: Chandigarh to Sujanpur (himachal pradesh)
Scraped href: https://www.redbus.in/bus-tickets/ludhiana-to-chandigarh, Route: Ludhiana to Chandigarh
Scraped href: https://www.redbus.in/bus-tickets/hamirpur-himachal-pradesh-to-chandigarh, Route: Hamirpur (Himachal Pradesh) to Chandigarh
Scraped hr