In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.keys import Keys
import numpy as np
import time

In [2]:
def get_routes_and_links(driver):
    """Collect the text and href of every route anchor on the current page.

    Waits up to 30 seconds for ``<a class="route">`` elements to be present,
    then reads each anchor's visible text and its ``href`` attribute.

    Args:
        driver: Selenium WebDriver positioned on the page to read.

    Returns:
        A ``(routes, links)`` tuple of two parallel lists: the anchor texts
        and the matching ``href`` values, in the order Selenium returned the
        elements.

    Raises:
        selenium.common.exceptions.TimeoutException: if no matching anchor
            appears within the wait period.
    """
    route_anchors = WebDriverWait(driver, 30).until(
        EC.presence_of_all_elements_located((By.XPATH, "//a[@class='route']"))
    )

    # Read text and href together, one anchor at a time, so both values for
    # a route come from the same element lookup pass.
    pairs = [(anchor.text, anchor.get_attribute('href')) for anchor in route_anchors]

    routes = [name for name, _ in pairs]
    links = [href for _, href in pairs]
    return routes, links

def navigate_to_next_page(driver, page_number):
    """Click the pagination tab for page ``page_number + 1`` and wait for it to load.

    The function is best-effort: any failure is printed rather than raised,
    and the outcome is reported through the return value so a caller can
    tell whether the browser actually moved to the next page.

    Args:
        driver: Selenium WebDriver showing a paginated listing.
        page_number: Number of the page currently displayed; the tab whose
            text equals ``page_number + 1`` is clicked.

    Returns:
        True if the active pagination tab shows the new page number,
        False if any step failed (the error is printed).
    """
    wait = WebDriverWait(driver, 30)
    target_page = page_number + 1

    try:
        # Locate the pagination container
        pagination_container = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="root"]/div/div[4]/div[12]')
        ))

        # Locate the tab whose label is the target page number
        next_page_button = pagination_container.find_element(
            By.XPATH, f'.//div[contains(@class, "DC_117_pageTabs") and text()="{target_page}"]'
        )

        # Move to the tab so it is in view before clicking
        actions = ActionChains(driver)
        actions.move_to_element(next_page_button).perform()
        time.sleep(1)  # Allow time for scroll

        print(f"Clicking on page {target_page}")
        next_page_button.click()

        # Wait until the tab marked active carries the target page number
        wait.until(EC.text_to_be_present_in_element(
            (By.XPATH, '//div[contains(@class, "DC_117_pageTabs DC_117_pageActive")]'), str(target_page)
        ))

        print(f"Successfully navigated to page {target_page}")
        time.sleep(3)  # Wait for the new page to load
        return True

    except Exception as e:
        # Deliberately broad: pagination failures must not abort the scrape,
        # but the caller is told about them through the False return value.
        print(f"An error occurred while navigating to page {target_page}: {e}")
        return False

def main(url, num_pages):
    """Scrape route names and links from ``num_pages`` pages and save them to up1.csv.

    Opens ``url`` in a new Chrome session, collects the routes on each page
    with ``get_routes_and_links``, and moves between pages with
    ``navigate_to_next_page``. The browser is always closed, even when
    scraping raises, so no Chrome process is left behind.

    Args:
        url: Address of the first listing page.
        num_pages: Number of pages to read, starting from the first.

    Side effects:
        Writes ``up1.csv`` (columns ``Route_Name`` and ``Link1``) to the
        current working directory.
    """
    driver = webdriver.Chrome()  # Ensure the correct WebDriver is installed
    all_routes, all_links = [], []

    try:
        driver.get(url)

        for page_number in range(1, num_pages + 1):
            routes, links = get_routes_and_links(driver)
            all_routes.extend(routes)
            all_links.extend(links)

            if page_number < num_pages:  # Skip navigation on the last page
                navigate_to_next_page(driver, page_number)
    finally:
        # Release the browser whether or not scraping succeeded.
        driver.quit()

    # Build the result table and save it to a CSV file
    df = pd.DataFrame({'Route_Name': all_routes, 'Link1': all_links})
    df.to_csv('up1.csv', index=False)


if __name__ == "__main__":
    # Listing page whose routes are scraped.
    url = "https://www.redbus.in/online-booking/uttar-pradesh-state-road-transport-corporation-upsrtc/?utm_source=rtchometile"
    # Scrape the first 5 pages of that listing.
    main(url=url, num_pages=5)


Clicking on page 2
Successfully navigated to page 2
Clicking on page 3
Successfully navigated to page 3
Clicking on page 4
Successfully navigated to page 4
Clicking on page 5
Successfully navigated to page 5


In [3]:
# Load the route names and links that main() saved to up1.csv.
df1_UP = pd.read_csv("up1.csv")
df1_UP

Unnamed: 0,Route_Name,Link1
0,Bareilly (Uttar Pradesh) to Delhi,https://www.redbus.in/bus-tickets/bareilly-to-...
1,Lucknow to Prayagraj(Uttar Pradesh),https://www.redbus.in/bus-tickets/lucknow-to-a...
2,Prayagraj(Uttar Pradesh) to Lucknow,https://www.redbus.in/bus-tickets/allahabad-to...
3,Delhi to Bareilly (Uttar Pradesh),https://www.redbus.in/bus-tickets/delhi-to-bar...
4,Aligarh (uttar pradesh) to Delhi,https://www.redbus.in/bus-tickets/aligarh-utta...
5,Lucknow to Bareilly (Uttar Pradesh),https://www.redbus.in/bus-tickets/lucknow-to-b...
6,Lucknow to Agra,https://www.redbus.in/bus-tickets/lucknow-to-agra
7,Ayodhya to Allahabad,https://www.redbus.in/bus-tickets/ayodhya-to-a...
8,Prayagraj(Uttar Pradesh) to Ayodhya,https://www.redbus.in/bus-tickets/allahabad-to...
9,Bareilly (Uttar Pradesh) to Lucknow,https://www.redbus.in/bus-tickets/bareilly-to-...


In [5]:
# Retrieve the bus details for every route listed in df1_UP.
#
# Each field is collected into its own module-level list; the lists are
# combined into a DataFrame in a later cell. Rows are padded per route so a
# field missing on one route cannot shift values into another route's rows.
driver = webdriver.Chrome()
Bus_names_ut = []
Bus_types_ut = []
Start_Time_ut = []
End_Time_ut = []
Ratings_ut = []
Total_Duration_ut = []
Prices_ut = []
Seats_Available_ut = []
Route_names = []
Route_links = []

try:
    for i, r in df1_UP.iterrows():
        link = r["Link1"]
        routes = r["Route_Name"]

        # Open this route's listing page
        driver.get(link)
        time.sleep(2)

        # Click on elements to reveal bus details
        elements = driver.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]")
        for element in elements:
            element.click()
            time.sleep(2)
        try:
            clicks = driver.find_element(By.XPATH, "//div[@class='button']")
            clicks.click()
        except WebDriverException:
            # Only Selenium failures (element missing, click blocked, ...) mean
            # "no button"; KeyboardInterrupt and similar must still propagate.
            print("no view button")
        time.sleep(2)

        # Keep paging down until the page source stops changing, i.e. no
        # further content was loaded by the last scroll.
        scrolling = True
        while scrolling:
            old_page_source = driver.page_source
            # Use ActionChains to perform a PAGE_DOWN
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(5)
            new_page_source = driver.page_source
            if new_page_source == old_page_source:
                scrolling = False

        # Extract bus details
        bus_name = driver.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']")
        bus_type = driver.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']")
        start_time = driver.find_elements(By.XPATH, "//*[@class='dp-time f-19 d-color f-bold']")
        end_time = driver.find_elements(By.XPATH, "//*[@class='bp-time f-19 d-color disp-Inline']")
        total_duration = driver.find_elements(By.XPATH, "//*[@class='dur l-color lh-24']")
        rating = driver.find_elements(By.XPATH, "//div[@class='rating-sec lh-24']")
        price = driver.find_elements(By.XPATH, '//*[@class="fare d-block"]')
        seats = driver.find_elements(By.XPATH, "//div[contains(@class, 'seat-left')]")

        # Pair every destination list with the elements found for it on this page.
        page_columns = [
            (Bus_names_ut, bus_name),
            (Bus_types_ut, bus_type),
            (Start_Time_ut, start_time),
            (End_Time_ut, end_time),
            (Total_Duration_ut, total_duration),
            (Ratings_ut, rating),
            (Prices_ut, price),
            (Seats_Available_ut, seats),
        ]

        # Every list receives the same number of entries for this route: the
        # found texts followed by NaN up to the longest field. Without this,
        # a route with fewer ratings than buses would pull the next route's
        # ratings into its own rows.
        # NOTE(review): values can still be misaligned *within* a route when a
        # field is missing for a bus in the middle of the list; fixing that
        # needs per-bus container lookups, which this page structure does not
        # expose in the XPaths used here.
        row_count = max(len(found) for _, found in page_columns)
        for target, found in page_columns:
            target.extend(elem.text for elem in found)
            target.extend([np.nan] * (row_count - len(found)))

        Route_links.extend([link] * row_count)
        Route_names.extend([routes] * row_count)
finally:
    # Always close the browser, including when a page fails mid-scrape.
    driver.quit()

print("Successfully Completed")



no view button
no view button
no view button
no view button
no view button
Successfully Completed


In [7]:
# Gather the scraped columns under the headers used in the output CSV.
data_UP = {
    'Bus_name': Bus_names_ut,
    'Bus_type': Bus_types_ut,
    'Start_time': Start_Time_ut,
    'End_time': End_Time_ut,
    'Total_duration': Total_Duration_ut,
    'Price': Prices_ut,
    "Seats_Available": Seats_Available_ut,
    "Rating": Ratings_ut,
    'Route_link': Route_links,
    'Route_name': Route_names,
}

# Length of the longest column; every other column is padded up to it.
max_len = max(map(len, data_UP.values()))

# pd.DataFrame needs equal-length columns, so extend the shorter lists with
# NaN. The lists are extended in place, i.e. the *_ut lists themselves grow.
for key in data_UP:
    shortfall = max_len - len(data_UP[key])
    data_UP[key].extend([np.nan] * shortfall)

# Build the DataFrame from the equal-length columns.
df_buses_1 = pd.DataFrame(data_UP)

# Write the table to disk without the index column.
df_buses_1.to_csv('up2.csv', index=False)
print("Data saved to up2.csv successfully!")

Data saved to up2.csv successfully!


In [8]:
# Load the bus details that were saved to up2.csv.
df2_UP = pd.read_csv("up2.csv")
df2_UP

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Rating,Route_link,Route_name
0,UPSRTC - NDD0516,Ordinary Non AC Seater 2+3,13:57,20:29,06h 32m,INR 424,52 Seats available,3.3,https://www.redbus.in/bus-tickets/bareilly-to-...,Bareilly (Uttar Pradesh) to Delhi
1,UPSRTC - GRH0161,Ordinary Non AC Seater 2+3,14:00,20:30,06h 30m,INR 418,52 Seats available,3.3,https://www.redbus.in/bus-tickets/bareilly-to-...,Bareilly (Uttar Pradesh) to Delhi
2,UPSRTC - BLY0103,Shatabdi AC Seater 2+2,14:01,19:00,04h 59m,INR 598,33 Seats available,3.7,https://www.redbus.in/bus-tickets/bareilly-to-...,Bareilly (Uttar Pradesh) to Delhi
3,UPSRTC - BRT0064,Ordinary Non AC Seater 2+3,14:25,21:00,06h 35m,INR 448,51 Seats available,3.3,https://www.redbus.in/bus-tickets/bareilly-to-...,Bareilly (Uttar Pradesh) to Delhi
4,UPSRTC - PLB0001,Shatabdi AC Seater 2+2,14:30,20:00,05h 30m,INR 598,35 Seats available,2.2,https://www.redbus.in/bus-tickets/bareilly-to-...,Bareilly (Uttar Pradesh) to Delhi
...,...,...,...,...,...,...,...,...,...,...
400,Gola Bus Service,Bharat Benz A/C Seater /Sleeper (2+1),22:30,06:30,08h 00m,INR 399,28 Seats available,,https://www.redbus.in/bus-tickets/delhi-to-luc...,Delhi to Lucknow
401,PTC-SKYBUS,VE A/C Seater / Sleeper (2+1),23:50,07:30,07h 40m,INR 399,29 Seats available,,https://www.redbus.in/bus-tickets/delhi-to-luc...,Delhi to Lucknow
402,PTC-SKYBUS,VE A/C Seater / Sleeper (2+1),23:45,07:45,08h 00m,INR 399,38 Seats available,,https://www.redbus.in/bus-tickets/delhi-to-luc...,Delhi to Lucknow
403,,,,,,,,,https://www.redbus.in/bus-tickets/delhi-to-luc...,Delhi to Lucknow
