In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.keys import Keys

import time
import numpy as np

In [3]:
def get_routes_and_links(driver):
    """Collect the text and href of every route anchor on the current page.

    Waits up to 30 seconds for elements matching ``//a[@class='route']`` to
    be present, then reads each anchor once.

    Args:
        driver: Selenium WebDriver already pointed at the listing page.

    Returns:
        tuple[list, list]: ``(routes, links)`` where ``routes[i]`` is the
        anchor text and ``links[i]`` is its ``href`` attribute.
    """
    anchors = WebDriverWait(driver, 30).until(
        EC.presence_of_all_elements_located((By.XPATH, "//a[@class='route']"))
    )

    # Read text and href together so each anchor is visited exactly once,
    # in document order.
    pairs = [(anchor.text, anchor.get_attribute('href')) for anchor in anchors]

    routes = [name for name, _ in pairs]
    links = [href for _, href in pairs]
    return routes, links

def navigate_to_next_page(driver, page_number):
    """Click the pagination tab for the page after ``page_number``.

    Args:
        driver: Selenium WebDriver showing the paginated route listing.
        page_number: Number of the page currently displayed; the tab whose
            text is ``page_number + 1`` is clicked.

    Returns:
        bool: True when the click went through and the active pagination tab
        contains the new page number; False when any step failed. Failures
        are printed rather than raised, so callers should check the result
        before scraping again.
    """
    target_page = page_number + 1
    wait = WebDriverWait(driver, 30)

    try:
        # Locate the pagination container
        pagination_container = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="root"]/div/div[4]/div[12]')
        ))

        # Locate the next page button inside that container
        next_page_button = pagination_container.find_element(
            By.XPATH, f'.//div[contains(@class, "DC_117_pageTabs") and text()="{target_page}"]'
        )

        # Scroll to the next page button
        ActionChains(driver).move_to_element(next_page_button).perform()
        time.sleep(1)  # Allow time for scroll

        print(f"Clicking on page {target_page}")
        next_page_button.click()

        # Wait for the active tab to show the new page number
        wait.until(EC.text_to_be_present_in_element(
            (By.XPATH, '//div[contains(@class, "DC_117_pageTabs DC_117_pageActive")]'), str(target_page)
        ))

        print(f"Successfully navigated to page {target_page}")
        time.sleep(3)  # Wait for the new page to load
        return True

    except Exception as e:
        # Best-effort: report the problem and signal failure to the caller
        # instead of raising.
        print(f"An error occurred while navigating to page {target_page}: {e}")
        return False

def main(url, num_pages, output_path='HR1.csv'):
    """Scrape route names and links from a paginated listing into a CSV file.

    Args:
        url: Address of the first listing page.
        num_pages: How many consecutive pages to scrape, starting at page 1.
        output_path: Destination CSV file. Defaults to ``'HR1.csv'``.

    The CSV has two columns, ``Route_Name`` and ``Link1``, and no index
    column. The browser is always closed, even when scraping raises; in that
    case the exception propagates and no CSV is written.
    """
    driver = webdriver.Chrome()  # Ensure the correct WebDriver is installed
    all_routes, all_links = [], []

    try:
        driver.get(url)

        for page_number in range(1, num_pages + 1):
            routes, links = get_routes_and_links(driver)
            all_routes.extend(routes)
            all_links.extend(links)

            if page_number < num_pages:  # Skip navigation on the last page
                # Only an explicit False counts as failure; a None return is
                # treated as success. Stopping here avoids scraping the same
                # page again and writing duplicate rows.
                if navigate_to_next_page(driver, page_number) is False:
                    print(f"Stopping after page {page_number}: could not open page {page_number + 1}")
                    break
    finally:
        driver.quit()  # Close the browser whether or not scraping succeeded

    # Save the results to a CSV file
    df = pd.DataFrame({'Route_Name': all_routes, 'Link1': all_links})
    df.to_csv(output_path, index=False)


# Entry point: scrape the route listing and write it out via main().
if __name__ == "__main__":
    url = "https://www.redbus.in/online-booking/hrtc/?utm_source=rtchometile"  # Listing page handed to main()
    main(url, 4)  # Scrape data from the first 4 pages


Clicking on page 2
Successfully navigated to page 2
Clicking on page 3
Successfully navigated to page 3
Clicking on page 4
Successfully navigated to page 4


In [4]:
# Load the route names and links from HR1.csv (the file main() writes), then display them.
df1_HR=pd.read_csv("HR1.csv")
df1_HR


Unnamed: 0,Route_Name,Link1
0,Delhi to Shimla,https://www.redbus.in/bus-tickets/delhi-to-shimla
1,Shimla to Delhi,https://www.redbus.in/bus-tickets/shimla-to-delhi
2,Chandigarh to Hamirpur (Himachal Pradesh),https://www.redbus.in/bus-tickets/chandigarh-t...
3,Hamirpur (Himachal Pradesh) to Chandigarh,https://www.redbus.in/bus-tickets/hamirpur-him...
4,Delhi to Chandigarh,https://www.redbus.in/bus-tickets/delhi-to-cha...
5,Manali to Chandigarh,https://www.redbus.in/bus-tickets/manali-to-ch...
6,Shimla to Chandigarh,https://www.redbus.in/bus-tickets/shimla-to-ch...
7,Chandigarh to Manali,https://www.redbus.in/bus-tickets/chandigarh-t...
8,Shimla to Manali,https://www.redbus.in/bus-tickets/shimla-to-ma...
9,Dharamshala (Himachal Pradesh) to Chandigarh,https://www.redbus.in/bus-tickets/dharamshala-...


In [5]:
# Retrieve the bus details for every route in df1_HR.
#
# For each route the cell opens the route link, clicks through to reveal the
# bus list, pages down until the page source stops changing, and then reads
# the text of every element matching each field's XPath.
driver = webdriver.Chrome()
Bus_names_Hr = []
Bus_types_Hr = []
Start_Time_Hr = []
End_Time_Hr = []
Ratings_Hr = []
Total_Duration_Hr = []
Prices_Hr = []
Seats_Available_Hr = []
Route_names = []
Route_links = []

# Each output list paired with the XPath whose matches feed it.
field_xpaths = [
    (Bus_names_Hr, "//div[@class='travels lh-24 f-bold d-color']"),
    (Bus_types_Hr, "//div[@class='bus-type f-12 m-top-16 l-color evBus']"),
    (Start_Time_Hr, "//*[@class='dp-time f-19 d-color f-bold']"),
    (End_Time_Hr, "//*[@class='bp-time f-19 d-color disp-Inline']"),
    (Total_Duration_Hr, "//*[@class='dur l-color lh-24']"),
    (Ratings_Hr, "//div[@class='rating-sec lh-24']"),
    (Prices_Hr, '//*[@class="fare d-block"]'),
    (Seats_Available_Hr, "//div[contains(@class, 'seat-left')]"),
]

try:
    # Loop through each route link
    for link, routes in zip(df1_HR["Link1"], df1_HR["Route_Name"]):
        driver.get(link)
        time.sleep(2)

        # Click on elements to reveal bus details
        elements = driver.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]")
        for element in elements:
            element.click()
            time.sleep(2)
        try:
            clicks = driver.find_element(By.XPATH, "//div[@class='button']")
            clicks.click()
        except WebDriverException as exc:
            # Catch only Selenium errors so a kernel interrupt still stops the
            # run, and report the skipped route instead of dropping it silently.
            print(f"Skipping {routes}: could not click the bus-list button ({type(exc).__name__})")
            continue
        time.sleep(2)

        # Keep paging down until the page source stops changing, i.e. no more
        # content is being loaded.
        while True:
            old_page_source = driver.page_source
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(5)
            if driver.page_source == old_page_source:
                break

        # Extract bus details. find_elements returns an empty list when
        # nothing matches, so no per-field try/except is needed.
        scraped = [
            (target, [elem.text for elem in driver.find_elements(By.XPATH, xpath)])
            for target, xpath in field_xpaths
        ]

        # Pad every field to the same length for this route. Without this, a
        # route with unequal element counts would shift every later route's
        # values onto the wrong rows.
        # NOTE(review): values inside one route are still matched by position
        # only; a bus missing one field can misalign the rest of that route.
        row_count = max(len(texts) for _, texts in scraped)
        for target, texts in scraped:
            target.extend(texts)
            target.extend([np.nan] * (row_count - len(texts)))
        Route_links.extend([link] * row_count)
        Route_names.extend([routes] * row_count)
finally:
    # Always close the browser, even if a route page raised.
    # NOTE(review): assumes no later cell reuses `driver` - confirm.
    driver.quit()

print("Successfully Completed")



Successfully Completed


In [6]:

import numpy as np

# Scraped columns keyed by their output column name.
scraped_columns_hr = {
    'Bus_name': Bus_names_Hr,
    'Bus_type': Bus_types_Hr,
    'Start_time': Start_Time_Hr,
    'End_time': End_Time_Hr,
    'Total_duration': Total_Duration_Hr,
    'Price': Prices_Hr,
    "Seats_Available": Seats_Available_Hr,
    "Rating": Ratings_Hr,
    'Route_link': Route_links,
    'Route_name': Route_names
}

# Determine the maximum length
max_len = max(len(values) for values in scraped_columns_hr.values())

# Normalize the length of each column by padding with NaN. Copies are padded
# so the scraped lists themselves are left untouched and the cell can be
# re-run safely.
data_hr = {
    key: list(values) + [np.nan] * (max_len - len(values))
    for key, values in scraped_columns_hr.items()
}

# Create the DataFrame
df_buses_1 = pd.DataFrame(data_hr)

# Save to CSV
df_buses_1.to_csv('HR2.csv', index=False)
print("Data saved to HR2.csv successfully!")


Data saved to HR2.csv successfully!


In [7]:
# Reload the bus details from HR2.csv and display them.
df2_HR=pd.read_csv("HR2.csv")
df2_HR

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Rating,Route_link,Route_name
0,Bedi Travels,Volvo A/C Semi Sleeper (2+2),23:00,08:00,09h 00m,710,16 Seats available,4.7,https://www.redbus.in/bus-tickets/delhi-to-shimla,Delhi to Shimla
1,Laxmi holidays,Scania Multi-Axle AC Semi Sleeper (2+2),23:30,08:00,08h 30m,INR 499,41 Seats available,4.7,https://www.redbus.in/bus-tickets/delhi-to-shimla,Delhi to Shimla
2,Zingbus Plus,Bharat Benz A/C Semi Sleeper (2+2),22:35,08:10,09h 35m,512,24 Seats available,4.7,https://www.redbus.in/bus-tickets/delhi-to-shimla,Delhi to Shimla
3,AdRam Dalal Holidays,,21:50,07:10,09h 20m,798,40 Seats available,3.9,https://www.redbus.in/bus-tickets/delhi-to-shimla,Delhi to Shimla
4,,,,,,,,,https://www.redbus.in/bus-tickets/delhi-to-shimla,Delhi to Shimla
...,...,...,...,...,...,...,...,...,...,...
212,HRTC - 291,Himmani Deluxe 2+2 Non AC Seater,20:20,03:00,06h 40m,INR 545,20 Seats available,3.9,https://www.redbus.in/bus-tickets/shimla-to-kullu,Shimla to Kullu
213,HRTC - 154,,21:00,04:50,07h 50m,INR 818,11 Seats available,4.5,https://www.redbus.in/bus-tickets/shimla-to-kullu,Shimla to Kullu
214,,,,,,,,,https://www.redbus.in/bus-tickets/shimla-to-kullu,Shimla to Kullu
215,,,,,,,,,https://www.redbus.in/bus-tickets/shimla-to-kullu,Shimla to Kullu
