In [9]:
pip install selenium




In [11]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.keys import Keys

import time

In [12]:
def get_routes_and_links(driver):
    """Collect the text and href of every route anchor on the current page.

    Waits (up to 30 seconds) for the ``<a class="route">`` elements to be
    present, then returns two parallel lists: the anchors' text and their
    ``href`` attribute values.
    """
    route_anchor_locator = (By.XPATH, "//a[@class='route']")
    anchors = WebDriverWait(driver, 30).until(
        EC.presence_of_all_elements_located(route_anchor_locator)
    )

    # Read text then href for each anchor in turn, and split the pairs afterwards.
    pairs = [(anchor.text, anchor.get_attribute('href')) for anchor in anchors]
    routes = [name for name, _ in pairs]
    links = [href for _, href in pairs]
    return routes, links

def navigate_to_next_page(driver, page_number):
    """Click the pagination tab for ``page_number + 1`` and wait for it to load.

    Args:
        driver: Selenium WebDriver showing the listing page.
        page_number: Number of the page currently displayed; the tab labelled
            ``page_number + 1`` is the one that gets clicked.

    Returns:
        bool: True when the tab was clicked and the active tab shows the new
        page number; False when any step failed. Failures are printed rather
        than raised, so callers must check the return value to know whether
        the page actually changed.
    """
    wait = WebDriverWait(driver, 30)
    target_page = page_number + 1

    try:
        # Locate the pagination container
        pagination_container = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="root"]/div/div[4]/div[12]')
        ))

        # Locate the tab whose label is the next page number
        next_page_button = pagination_container.find_element(
            By.XPATH, f'.//div[contains(@class, "DC_117_pageTabs") and text()="{target_page}"]'
        )

        # Bring the tab into view before clicking it
        ActionChains(driver).move_to_element(next_page_button).perform()
        time.sleep(1)  # Allow time for scroll

        print(f"Clicking on page {target_page}")
        next_page_button.click()

        # Wait until the active tab shows the new page number
        wait.until(EC.text_to_be_present_in_element(
            (By.XPATH, '//div[contains(@class, "DC_117_pageTabs DC_117_pageActive")]'), str(target_page)
        ))

        print(f"Successfully navigated to page {target_page}")
        time.sleep(3)  # Wait for the new page to load
        return True

    except Exception as e:
        # Best effort: report the problem and let the caller decide what to do.
        print(f"An error occurred while navigating to page {target_page}: {e}")
        return False

def main(url, num_pages, output_path='ap1.csv'):
    """Scrape route names and links from up to ``num_pages`` pages into a CSV.

    Args:
        url: Address of the first listing page.
        num_pages: Maximum number of pages to scrape, starting at page 1.
        output_path: CSV file to write; defaults to ``'ap1.csv'``.

    The CSV has the columns ``Route_Name`` and ``Link1``. The browser is
    closed even when scraping raises; in that case the exception propagates
    and no CSV is written.
    """
    driver = webdriver.Chrome()  # Ensure the correct WebDriver is installed
    all_routes, all_links = [], []

    try:
        driver.get(url)

        for page_number in range(1, num_pages + 1):
            routes, links = get_routes_and_links(driver)
            all_routes.extend(routes)
            all_links.extend(links)

            if page_number < num_pages:  # Skip navigation on the last page
                # An explicit False means the page did not change; scraping
                # again would only duplicate the rows already collected.
                # Any other return value (including None) continues the crawl.
                if navigate_to_next_page(driver, page_number) is False:
                    print(f"Stopping after page {page_number}: could not open the next page")
                    break
    finally:
        driver.quit()  # Always release the browser, even on errors

    # Save the results to a CSV file
    df = pd.DataFrame({'Route_Name': all_routes, 'Link1': all_links})
    df.to_csv(output_path, index=False)

if __name__ == "__main__":
    url = "https://www.redbus.in/online-booking/apsrtc/?utm_source=rtchometile"  # Your target URL
    main(url, 5)  # Scrape data from the first 5 pages

Clicking on page 2
Successfully navigated to page 2
Clicking on page 3
Successfully navigated to page 3
Clicking on page 4
Successfully navigated to page 4
Clicking on page 5
Successfully navigated to page 5


In [13]:
# Load the route names and links from ap1.csv and display them
df1_A = pd.read_csv("ap1.csv")
df1_A


Unnamed: 0,Route_Name,Link1
0,Hyderabad to Vijayawada,https://www.redbus.in/bus-tickets/hyderabad-to...
1,Vijayawada to Hyderabad,https://www.redbus.in/bus-tickets/vijayawada-t...
2,Bangalore to Tirupati,https://www.redbus.in/bus-tickets/bangalore-to...
3,Bangalore to Kadapa,https://www.redbus.in/bus-tickets/bangalore-to...
4,Kakinada to Visakhapatnam,https://www.redbus.in/bus-tickets/kakinada-to-...
5,Hyderabad to Ongole,https://www.redbus.in/bus-tickets/hyderabad-to...
6,Ongole to Hyderabad,https://www.redbus.in/bus-tickets/ongole-to-hy...
7,Bangalore to Anantapur (andhra pradesh),https://www.redbus.in/bus-tickets/bangalore-to...
8,Bangalore to Chittoor (Andhra Pradesh),https://www.redbus.in/bus-tickets/bangalore-to...
9,Chittoor (Andhra Pradesh) to Bangalore,https://www.redbus.in/bus-tickets/chittoor-and...


In [14]:
# Retrieve the bus details for every route listed in df1_A.
#
# Each output list below receives one entry per bus. For every route all lists
# are padded to the same length, so a field that is missing on one route can
# no longer shift the values of later routes onto the wrong rows.
driver = webdriver.Chrome()
Bus_names_A = []
Bus_type_A = []
Start_Time_A = []
End_Time_A = []
Ratings_A = []
Total_Duration_A = []
Prices_A = []
Seats_Available_A = []
Route_names = []
Route_links = []

# (output list, XPath of the elements whose text fills that list)
_FIELD_XPATHS_A = [
    (Bus_names_A, "//div[@class='travels lh-24 f-bold d-color']"),
    (Bus_type_A, "//div[@class='bus-type f-12 m-top-16 l-color evBus']"),
    (Start_Time_A, "//*[@class='dp-time f-19 d-color f-bold']"),
    (End_Time_A, "//*[@class='bp-time f-19 d-color disp-Inline']"),
    (Total_Duration_A, "//*[@class='dur l-color lh-24']"),
    (Ratings_A, "//div[@class='rating-sec lh-24']"),
    (Prices_A, '//*[@class="fare d-block"]'),
    (Seats_Available_A, "//div[contains(@class, 'seat-left')]"),
]

try:
    for i, r in df1_A.iterrows():
        link = r["Link1"]
        routes = r["Route_Name"]

        # Open the route page
        driver.get(link)
        time.sleep(2)

        # Click on elements to reveal bus details
        elements = driver.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]")
        for element in elements:
            element.click()
            time.sleep(2)
        try:
            clicks = driver.find_element(By.XPATH, "//div[@class='button']")
            clicks.click()
        except WebDriverException as exc:
            # Routes without a clickable button are skipped, but the skip is
            # reported instead of being silently swallowed.
            print(f"Skipping {routes}: could not click the button ({type(exc).__name__})")
            continue
        time.sleep(2)

        # Keep paging down until the page source stops changing
        while True:
            old_page_source = driver.page_source
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(5)
            if driver.page_source == old_page_source:
                break

        # Extract the text of every field for this route
        scraped = [
            (target, [elem.text for elem in driver.find_elements(By.XPATH, xpath)])
            for target, xpath in _FIELD_XPATHS_A
        ]

        # Pad every field to the longest one found on this route.
        # NOTE: values are still paired by position within a route, so a bus
        # lacking one field can misalign the remaining rows of that route.
        row_count = max(len(texts) for _, texts in scraped)
        for target, texts in scraped:
            target.extend(texts)
            target.extend([None] * (row_count - len(texts)))
        Route_links.extend([link] * row_count)
        Route_names.extend([routes] * row_count)
finally:
    driver.quit()  # Release the browser even if scraping fails midway

print("Successfully Completed")



Successfully Completed


In [None]:


import numpy as np

# Gather the scraped columns; each list is expected to hold one entry per bus
scraped_columns_A = {
    'Bus_name': Bus_names_A,
    'Bus_type': Bus_type_A,
    'Start_time': Start_Time_A,
    'End_time': End_Time_A,
    'Total_duration': Total_Duration_A,
    'Price': Prices_A,
    "Seats_Available": Seats_Available_A,
    "Rating": Ratings_A,
    'Route_link': Route_links,
    'Route_name': Route_names
}

# Determine the maximum length
column_lengths = {key: len(values) for key, values in scraped_columns_A.items()}
max_len = max(column_lengths.values())

# Padding appends NaN at the END of a short column, so rows of that column no
# longer line up with the others. Make that visible instead of hiding it.
if len(set(column_lengths.values())) > 1:
    print(f"Warning: column lengths differ and will be padded with NaN: {column_lengths}")

# Normalize the length of each column by padding with NaN. Padded copies are
# built so the scraped lists themselves are left unmodified.
data_A = {
    key: list(values) + [np.nan] * (max_len - len(values))
    for key, values in scraped_columns_A.items()
}

# Create DataFrame
df_buses_1 = pd.DataFrame(data_A)

# Save to CSV
df_buses_1.to_csv('ap2.csv', index=False)
print("Data saved to ap2.csv successfully!")


Data saved to ap2.csv successfully!


In [16]:
# Load the bus details from ap2.csv and display them
df2_A = pd.read_csv("ap2.csv")
df2_A

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Rating,Route_link,Route_name
0,APSRTC - 35188,VENNELA (A.C. SLEEPER),00:40,06:45,06h 05m,INR 737,15 Seats available,3.9,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
1,APSRTC - 3563,"SUPER LUXURY (NON-AC, 2 + 2 PUSH BACK)",01:00,07:15,06h 15m,INR 412,31 Seats available,4.0,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
2,APSRTC - 35266,VENNELA (A.C. SLEEPER),01:25,07:30,06h 05m,INR 737,23 Seats available,3.8,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
3,,,,,,,,,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
4,,,,,,,,,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
...,...,...,...,...,...,...,...,...,...,...
447,V Kaveri Travels,A/C Sleeper (2+1),21:30,07:00,09h 30m,INR 900,18 Seats available,4.2,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Kanigiri
448,Kaveri Travels,A/C Sleeper (2+1),21:30,07:00,09h 30m,INR 819,18 Seats available,4.1,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Kanigiri
449,Raja Buses,NON AC Seater / Sleeper 2+1,22:00,06:55,08h 55m,INR 500,28 Seats available,4.0,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Kanigiri
450,,,,,,,,,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Kanigiri
