In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.keys import Keys

import time

In [3]:
def get_routes_and_links(driver):
    """Collect the text and href of every route anchor on the current page.

    Waits up to 30 seconds for elements matching ``//a[@class='route']`` to be
    present, then reads each element's visible text and its ``href`` attribute.

    Args:
        driver: Selenium WebDriver already positioned on the listing page.

    Returns:
        A ``(routes, links)`` tuple of two parallel lists: the anchor texts and
        the corresponding ``href`` values, in the order the elements were found.
    """
    route_locator = (By.XPATH, "//a[@class='route']")
    anchors = WebDriverWait(driver, 30).until(
        EC.presence_of_all_elements_located(route_locator)
    )

    # Read text and href together per element, then split into two lists.
    pairs = [(anchor.text, anchor.get_attribute('href')) for anchor in anchors]
    routes = [name for name, _ in pairs]
    links = [href for _, href in pairs]
    return routes, links

def navigate_to_next_page(driver, page_number):
    """Click the pagination tab for ``page_number + 1`` and wait for it to become active.

    Args:
        driver: Selenium WebDriver positioned on a paginated listing page.
        page_number: The 1-based number of the page currently displayed.

    Returns:
        ``True`` if the next page's tab was clicked and reported as active,
        ``False`` if any step failed (the error is printed, not raised), so
        callers can tell whether the page actually changed.
    """
    wait = WebDriverWait(driver, 30)
    target_page = page_number + 1

    try:
        # Locate the pagination container
        pagination_container = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="root"]/div/div[4]/div[12]')
        ))

        # Locate the tab whose text is exactly the target page number
        next_page_button = pagination_container.find_element(
            By.XPATH, f'.//div[contains(@class, "DC_117_pageTabs") and text()="{target_page}"]'
        )

        # Move the pointer to the tab so it is brought into view before clicking
        ActionChains(driver).move_to_element(next_page_button).perform()
        time.sleep(1)  # Allow time for scroll

        print(f"Clicking on page {target_page}")
        next_page_button.click()

        # Wait until the active tab shows the target page number
        wait.until(EC.text_to_be_present_in_element(
            (By.XPATH, '//div[contains(@class, "DC_117_pageTabs DC_117_pageActive")]'), str(target_page)
        ))

        print(f"Successfully navigated to page {target_page}")
        time.sleep(3)  # Wait for the new page to load
        return True

    except Exception as e:
        # Best-effort: report the failure and let the caller decide what to do.
        print(f"An error occurred while navigating to page {target_page}: {e}")
        return False

def main(url, num_pages):
    """Scrape route names and links from ``num_pages`` pages and save them to ``as1.csv``.

    Args:
        url: Address of the first listing page to open.
        num_pages: Number of consecutive pages to scrape, starting at page 1.

    The browser is always closed, even when scraping raises part-way through;
    in that case the exception propagates and no CSV is written.
    """
    driver = webdriver.Chrome()  # Ensure the correct WebDriver is installed
    try:
        driver.get(url)

        all_routes, all_links = [], []

        for page_number in range(1, num_pages + 1):
            routes, links = get_routes_and_links(driver)
            all_routes.extend(routes)
            all_links.extend(links)

            if page_number < num_pages:  # Skip navigation on the last page
                navigate_to_next_page(driver, page_number)
    finally:
        # Release the Chrome session regardless of how the loop ended.
        driver.quit()

    # Save the results to a CSV file
    df = pd.DataFrame({'Route_Name': all_routes, 'Link1': all_links})
    df.to_csv('as1.csv', index=False)


if __name__ == "__main__":
    # Listing page to start scraping from.
    url = "https://www.redbus.in/online-booking/astc/?utm_source=rtchometile"
    # Scrape data from the first 4 pages.
    main(url=url, num_pages=4)


Clicking on page 2
Successfully navigated to page 2
Clicking on page 3
Successfully navigated to page 3
Clicking on page 4
Successfully navigated to page 4


In [4]:
# Load the route names and links written by main() into a DataFrame.
df1_As = pd.read_csv("as1.csv")
# Bare last expression so the notebook renders the frame.
df1_As

Unnamed: 0,Route_Name,Link1
0,Guwahati to Tezpur,https://www.redbus.in/bus-tickets/guwahati-to-...
1,Guwahati to Nagaon (Assam),https://www.redbus.in/bus-tickets/guwahati-to-...
2,Guwahati to Dhubri,https://www.redbus.in/bus-tickets/guwahati-to-...
3,Jorhat to North Lakhimpur,https://www.redbus.in/bus-tickets/jorhat-to-no...
4,North Lakhimpur to Sibsagar (Assam),https://www.redbus.in/bus-tickets/north-lakhim...
5,Dhubri to Guwahati,https://www.redbus.in/bus-tickets/dhubri-to-gu...
6,North Lakhimpur to Jorhat,https://www.redbus.in/bus-tickets/north-lakhim...
7,Goalpara to Guwahati,https://www.redbus.in/bus-tickets/goalpara-to-...
8,Sibsagar (Assam) to North Lakhimpur,https://www.redbus.in/bus-tickets/sibsagar-to-...
9,Jorhat to Dibrugarh,https://www.redbus.in/bus-tickets/jorhat-to-di...


In [5]:
# Retrieve the bus details for every route listed in df1_As.
driver = webdriver.Chrome()

# Parallel per-bus columns: every list receives exactly one entry per bus, so
# index k in each list always describes the same bus (None marks a missing field).
Bus_names_As = []
Bus_types_As = []
Start_Time_As = []
End_Time_As = []
Ratings_As = []
Total_Duration_As = []
Prices_As = []
Seats_Available_As = []
Route_names = []
Route_links = []

# Locates the name element of every bus on the page.
_BUS_NAME_XPATH = "//div[@class='travels lh-24 f-bold d-color']"

# From a bus-name element, select the nearest ancestor that also holds a fare,
# but only if it holds exactly one bus name (i.e. it belongs to a single bus).
# NOTE(review): assumes all fields of one bus share such an ancestor - confirm
# against the live page markup.
_BUS_CARD_XPATH = (
    "./ancestor::*[.//*[@class='fare d-block']][1]"
    "[count(.//div[@class='travels lh-24 f-bold d-color'])=1]"
)

# (destination list, XPath relative to the bus card) for each remaining field.
_BUS_FIELD_XPATHS = (
    (Bus_types_As, ".//div[@class='bus-type f-12 m-top-16 l-color evBus']"),
    (Start_Time_As, ".//*[@class='dp-time f-19 d-color f-bold']"),
    (End_Time_As, ".//*[@class='bp-time f-19 d-color disp-Inline']"),
    (Total_Duration_As, ".//*[@class='dur l-color lh-24']"),
    (Ratings_As, ".//div[@class='rating-sec lh-24']"),
    (Prices_As, './/*[@class="fare d-block"]'),
    (Seats_Available_As, ".//div[contains(@class, 'seat-left')]"),
)


def _first_text(parent, xpath):
    """Return the text of the first element matching `xpath` under `parent`, or None."""
    if parent is None:
        return None
    matches = parent.find_elements(By.XPATH, xpath)
    return matches[0].text if matches else None


try:
    for _, route_row in df1_As.iterrows():
        link = route_row["Link1"]
        routes = route_row["Route_Name"]

        driver.get(link)
        time.sleep(2)

        # Click on elements to reveal bus details
        for element in driver.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]"):
            element.click()
            time.sleep(2)
        try:
            driver.find_element(By.XPATH, "//div[@class='button']").click()
        except WebDriverException:
            # Only browser-side failures (e.g. element not found) are expected
            # here; anything else, including Ctrl-C, should not be swallowed.
            print("No View Buses")
        time.sleep(2)

        # Keep paging down until the page source stops changing.
        while True:
            old_page_source = driver.page_source
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(5)
            if driver.page_source == old_page_source:
                break

        # Extract the fields bus by bus, immediately after scrolling, so that a
        # field missing for one bus cannot shift the values of later buses.
        for name_elem in driver.find_elements(By.XPATH, _BUS_NAME_XPATH):
            cards = name_elem.find_elements(By.XPATH, _BUS_CARD_XPATH)
            card = cards[0] if cards else None

            Bus_names_As.append(name_elem.text)
            Route_links.append(link)
            Route_names.append(routes)
            for target_list, field_xpath in _BUS_FIELD_XPATHS:
                target_list.append(_first_text(card, field_xpath))
finally:
    # Always release the Chrome session, even if a route fails part-way.
    driver.quit()

print("Successfully Completed")



No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
No View Buses
Successfully Completed


In [8]:
import numpy as np

# Gather the scraped per-bus columns under their output column names.
data_As = {
    'Bus_name': Bus_names_As,
    'Bus_type': Bus_types_As,
    'Start_time': Start_Time_As,
    'End_time': End_Time_As,
    'Total_duration': Total_Duration_As,
    'Price': Prices_As,
    "Seats_Available": Seats_Available_As,
    "Rating": Ratings_As,
    'Route_link': Route_links,
    'Route_name': Route_names
}

# Determine the maximum length
max_len = max(len(values) for values in data_As.values())

# Columns of unequal length mean some buses lacked a field, so values in the
# shorter columns may not line up with the right rows. Say so instead of
# hiding it behind the padding below.
short_columns = {
    key: len(values) for key, values in data_As.items() if len(values) < max_len
}
if short_columns:
    print(f"Warning: padding columns shorter than {max_len} rows with NaN: {short_columns}")

# Normalize the length of each column by padding with NaN. Padded copies are
# built so the scraped lists are not mutated and re-running this cell is safe.
data_As = {
    key: list(values) + [np.nan] * (max_len - len(values))
    for key, values in data_As.items()
}

# Create DataFrame
df_buses_1 = pd.DataFrame(data_As)

# Save to CSV
df_buses_1.to_csv('as2.csv', index=False)
print("Data saved to as2.csv successfully!")

Data saved to as2.csv successfully!


In [9]:
# Reload the saved bus details from as2.csv.
df2_As = pd.read_csv("as2.csv")
# Bare last expression so the notebook renders the frame.
df2_As

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Rating,Route_link,Route_name
0,Assam State Transport Corporation (ASTC) - 79324,Volvo AC Seater 2+2,13:45,18:30,04h 45m,INR 298,27 Seats available,3.9,https://www.redbus.in/bus-tickets/guwahati-to-...,Guwahati to Tezpur
1,Mahi Travels(Under ASTC),Bharat Benz A/C Seater (2+2),10:50,14:55,04h 05m,356,26 Seats available,3.4,https://www.redbus.in/bus-tickets/guwahati-to-...,Guwahati to Tezpur
2,Mahi Travels(Under ASTC),A/C Seater (2+2),10:50,14:45,03h 55m,356,23 Seats available,3.8,https://www.redbus.in/bus-tickets/guwahati-to-...,Guwahati to Tezpur
3,Orient Transline,A/C Seater (2+1),11:20,15:30,04h 10m,INR 380.95,17 Seats available,4.2,https://www.redbus.in/bus-tickets/guwahati-to-...,Guwahati to Tezpur
4,Orient Transline,A/C Seater (2+1),12:10,16:40,04h 30m,INR 380.95,11 Seats available,4.5,https://www.redbus.in/bus-tickets/guwahati-to-...,Guwahati to Tezpur
...,...,...,...,...,...,...,...,...,...,...
176,Assam State Transport Corporation (ASTC) - 143958,Bharat Benz A/C Seater (2+2),15:15,20:30,05h 15m,INR 316,22 Seats available,,https://www.redbus.in/bus-tickets/tumuki-tezpu...,Tumuki (Tezpur Medical) to Guwahati
177,Maa Laxmi,NON A/C Seater Push Back (2+1),17:00,22:30,05h 30m,394,36 Seats available,,https://www.redbus.in/bus-tickets/dibrugarh-to...,Dibrugarh to Bokakhat
178,Baikuntha Transport Service,Bharat Benz A/C Seater /Sleeper (2+1),19:30,01:00,05h 30m,INR 600,35 Seats available,,https://www.redbus.in/bus-tickets/dibrugarh-to...,Dibrugarh to Bokakhat
179,Purple Wings Coaches,A/C Seater / Sleeper (2+1),20:30,01:20,04h 50m,585,44 Seats available,,https://www.redbus.in/bus-tickets/dibrugarh-to...,Dibrugarh to Bokakhat
