In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.keys import Keys
import numpy as np
import time

In [3]:
def get_routes_and_links(driver):
    """Collect the text and href of every route anchor on the current page.

    Blocks for up to 30 seconds until anchors matching ``//a[@class='route']``
    are present in the DOM.

    Args:
        driver: Selenium WebDriver positioned on a route listing page.

    Returns:
        A ``(routes, links)`` tuple of two parallel lists: the visible text of
        each anchor and its ``href`` attribute, in page order.
    """
    anchors = WebDriverWait(driver, 30).until(
        EC.presence_of_all_elements_located((By.XPATH, "//a[@class='route']"))
    )

    # Read both attributes of an anchor together, then split into two lists.
    pairs = [(anchor.text, anchor.get_attribute('href')) for anchor in anchors]
    routes = [name for name, _ in pairs]
    links = [href for _, href in pairs]

    return routes, links

def navigate_to_next_page(driver, page_number):
    """Click the pagination tab for ``page_number + 1`` and wait for it to load.

    Any error raised while locating, clicking, or waiting on the tab is caught
    and printed, so this function never raises on a failed navigation. The
    outcome is reported through the return value instead, which lets a caller
    avoid scraping the same page twice when the click did not go through.

    Args:
        driver: Selenium WebDriver positioned on a paginated listing page.
        page_number: 1-based number of the page currently displayed.

    Returns:
        True if the active pagination tab shows the target page number,
        False if any step failed.
    """
    wait = WebDriverWait(driver, 30)
    target_page = page_number + 1

    try:
        # Locate the pagination container
        pagination_container = wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="root"]/div/div[4]/div[12]')
        ))

        # Locate the tab whose text is exactly the target page number
        next_page_button = pagination_container.find_element(
            By.XPATH, f'.//div[contains(@class, "DC_117_pageTabs") and text()="{target_page}"]'
        )

        # Scroll the tab into view before clicking it
        ActionChains(driver).move_to_element(next_page_button).perform()
        time.sleep(1)  # Allow time for scroll

        print(f"Clicking on page {target_page}")
        next_page_button.click()

        # Wait for the active tab to show the target page number
        wait.until(EC.text_to_be_present_in_element(
            (By.XPATH, '//div[contains(@class, "DC_117_pageTabs DC_117_pageActive")]'), str(target_page)
        ))

        print(f"Successfully navigated to page {target_page}")
        time.sleep(3)  # Wait for the new page to load
        return True

    except Exception as e:
        # Best-effort: report the problem and let the caller decide what to do.
        print(f"An error occurred while navigating to page {target_page}: {e}")
        return False

def main(url, num_pages):
    """Scrape route names and links from the first ``num_pages`` pages of ``url``.

    Opens a Chrome session, collects the routes on each page, and writes the
    combined result to ``wb1.csv`` with columns ``Route_Name`` and ``Link1``.
    The browser is always closed, even when scraping raises.

    Args:
        url: Address of the first listing page.
        num_pages: Number of pagination pages to visit, starting at page 1.
    """
    driver = webdriver.Chrome()  # Ensure the correct WebDriver is installed
    all_routes, all_links = [], []

    try:
        driver.get(url)

        for page_number in range(1, num_pages + 1):
            routes, links = get_routes_and_links(driver)
            all_routes.extend(routes)
            all_links.extend(links)

            if page_number < num_pages:  # Skip navigation on the last page
                # An explicit False means the page did not change; scraping
                # again would only append duplicates of the current page.
                # Any other return value (including None) counts as success.
                if navigate_to_next_page(driver, page_number) is False:
                    print(f"Stopping after page {page_number}: navigation failed")
                    break
    finally:
        driver.quit()  # Close the browser even if scraping raised

    # Save the results to a CSV file
    df = pd.DataFrame({'Route_Name': all_routes, 'Link1': all_links})
    df.to_csv('wb1.csv', index=False)


if __name__ == "__main__":
    # Listing page whose routes are scraped.
    url = "https://www.redbus.in/online-booking/wbtc-ctc/?utm_source=rtchometile"
    # Collect routes from the first 4 pagination pages.
    main(url=url, num_pages=4)


Clicking on page 2
Successfully navigated to page 2
Clicking on page 3
Successfully navigated to page 3
Clicking on page 4
Successfully navigated to page 4


In [4]:
# Reload the route list saved in wb1.csv and display it.
df1_wb = pd.read_csv("wb1.csv")
df1_wb

Unnamed: 0,Route_Name,Link1
0,Kolkata to Digha,https://www.redbus.in/bus-tickets/kolkata-to-d...
1,Digha to Barasat (West Bengal),https://www.redbus.in/bus-tickets/digha-to-bar...
2,Digha to Kolkata,https://www.redbus.in/bus-tickets/digha-to-kol...
3,Kolkata to Bolpur (West Bengal),https://www.redbus.in/bus-tickets/kolkata-to-b...
4,Midnapore to Kolkata,https://www.redbus.in/bus-tickets/midnapore-to...
5,Barasat (West Bengal) to Midnapore,https://www.redbus.in/bus-tickets/barasat-west...
6,Durgapur (West Bengal) to Kolkata,https://www.redbus.in/bus-tickets/durgapur-to-...
7,Barasat (West Bengal) to Digha,https://www.redbus.in/bus-tickets/barasat-west...
8,Midnapore to Barasat (West Bengal),https://www.redbus.in/bus-tickets/midnapore-to...
9,Barasat (West Bengal) to Nandakumar (west bengal),https://www.redbus.in/bus-tickets/barasat-west...


In [5]:
# Retrieve the bus details for every route listed in df1_wb.
#
# Each bus is read as one unit: its fields are looked up inside the bus's own
# card, and a missing field is stored as NaN. Collecting every field page-wide
# into independent lists would shift all later values onto the wrong bus as
# soon as one bus lacks a field (for example an unrated bus).

MAX_SCROLLS = 100  # upper bound on PAGE_DOWN presses per route page

# Element carrying a bus's name; used both to enumerate buses and to find cards.
BUS_NAME_XPATH = "//div[@class='travels lh-24 f-bold d-color']"

driver = webdriver.Chrome()
Bus_names_wb = []
Bus_types_wb = []
Start_Time_wb = []
End_Time_wb = []
Ratings_wb = []
Total_Duration_wb = []
Prices_wb = []
Seats_Available_wb = []
Route_names = []
Route_links = []

# (destination list, XPath relative to the bus card) for every per-bus field.
card_fields = [
    (Bus_types_wb, ".//div[@class='bus-type f-12 m-top-16 l-color evBus']"),
    (Start_Time_wb, ".//*[@class='dp-time f-19 d-color f-bold']"),
    (End_Time_wb, ".//*[@class='bp-time f-19 d-color disp-Inline']"),
    (Total_Duration_wb, ".//*[@class='dur l-color lh-24']"),
    (Ratings_wb, ".//div[@class='rating-sec lh-24']"),
    (Prices_wb, './/*[@class="fare d-block"]'),
    (Seats_Available_wb, ".//div[contains(@class, 'seat-left')]"),
]

try:
    for link, route_name in zip(df1_wb["Link1"], df1_wb["Route_Name"]):
        driver.get(link)
        time.sleep(2)

        # Click on elements to reveal bus details
        for element in driver.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]"):
            try:
                element.click()
            except WebDriverException:
                # A stale or unclickable anchor must not abort the whole run.
                continue
            time.sleep(2)

        try:
            driver.find_element(By.XPATH, "//div[@class='button']").click()
        except WebDriverException:
            print("no view bus button")
        time.sleep(2)

        # Page down until the page source stops changing, so lazily loaded
        # buses are rendered. The cap guards against pages that never settle.
        for _ in range(MAX_SCROLLS):
            old_page_source = driver.page_source
            ActionChains(driver).send_keys(Keys.PAGE_DOWN).perform()
            time.sleep(5)
            if driver.page_source == old_page_source:
                break

        # Extract bus details, one bus at a time
        for name_elem in driver.find_elements(By.XPATH, BUS_NAME_XPATH):
            # The card is taken to be the outermost ancestor that still holds
            # exactly one bus name; anything found inside it belongs to this
            # bus. NOTE(review): confirm against the live page markup.
            cards = name_elem.find_elements(
                By.XPATH, f"./ancestor::*[count(.{BUS_NAME_XPATH}) = 1][last()]"
            )
            card = cards[0] if cards else None

            Bus_names_wb.append(name_elem.text)
            Route_links.append(link)
            Route_names.append(route_name)

            for target_list, relative_xpath in card_fields:
                matches = card.find_elements(By.XPATH, relative_xpath) if card is not None else []
                target_list.append(matches[0].text if matches else np.nan)
finally:
    driver.quit()  # Always release the browser, even if a route failed

print("Successfully Completed")


no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
no view bus button
Successfully Completed


In [6]:


# Assemble the scraped columns into one table and save it to wb2.csv.
data_wb = {
    'Bus_name': Bus_names_wb,
    'Bus_type': Bus_types_wb,
    'Start_time': Start_Time_wb,
    'End_time': End_Time_wb,
    'Total_duration': Total_Duration_wb,
    'Price': Prices_wb,
    "Seats_Available": Seats_Available_wb,
    "Rating": Ratings_wb,
    'Route_link': Route_links,
    'Route_name': Route_names
}

# Determine the maximum length and report any column that falls short of it.
# Padding only makes the table rectangular: the NaNs go at the tail of the
# column, so they do not show which bus was actually missing the value.
column_lengths = {key: len(values) for key, values in data_wb.items()}
max_len = max(column_lengths.values())
short_columns = {key: length for key, length in column_lengths.items() if length < max_len}
if short_columns:
    print(f"Warning: columns shorter than {max_len} rows were padded with NaN: {short_columns}")

# Create the DataFrame. Wrapping each list in a Series lets pandas pad the
# short columns with NaN itself, leaving the scraped lists unmodified so this
# cell can be re-run safely.
df_buses_1 = pd.DataFrame(
    {key: pd.Series(values, dtype=object) for key, values in data_wb.items()}
)

# Save to CSV
df_buses_1.to_csv('wb2.csv', index=False)
print("Data saved to wb2.csv successfully!")

Data saved to wb2.csv successfully!


In [7]:
# Reload the bus details saved in wb2.csv and display them.
df2_wb = pd.read_csv("wb2.csv")
df2_wb

Unnamed: 0,Bus_name,Bus_type,Start_time,End_time,Total_duration,Price,Seats_Available,Rating,Route_link,Route_name
0,WBTC (CTC) Kolkata - Digha - 124|14:40,Non AC Seater (2+3),14:40,19:05,04h 25m,INR 145,47 Seats available,3.1,https://www.redbus.in/bus-tickets/kolkata-to-d...,Kolkata to Digha
1,WBTC (CTC) Belgachia - Digha via Esplanade - 5...,Non AC Seater (2+3),14:45,20:10,05h 25m,INR 155,48 Seats available,3.3,https://www.redbus.in/bus-tickets/kolkata-to-d...,Kolkata to Digha
2,WBTC (CTC) Belgachia - Digha via Esplanade - 5...,Non AC Seater (2+3),15:45,20:10,04h 25m,INR 145,48 Seats available,3.6,https://www.redbus.in/bus-tickets/kolkata-to-d...,Kolkata to Digha
3,WBTC (CTC) Khidderpur - Digha - 97|16:00,Non AC Seater (2+3),16:00,21:00,05h 00m,INR 151,49 Seats available,3.3,https://www.redbus.in/bus-tickets/kolkata-to-d...,Kolkata to Digha
4,WBTC (CTC) Khidderpur - Digha - 97|16:45,Non AC Seater (2+3),16:45,21:00,04h 15m,INR 145,49 Seats available,3.7,https://www.redbus.in/bus-tickets/kolkata-to-d...,Kolkata to Digha
...,...,...,...,...,...,...,...,...,...,...
150,Shyamoli Paribahan Pvt Ltd,Scania Multi-Axle AC Semi Sleeper (2+2),15:00,19:00,04h 00m,INR 524,28 Seats available,4.5,https://www.redbus.in/bus-tickets/kolkata-to-a...,Kolkata to Asansol (West Bengal)
151,Express Line,Volvo A/C Seater (2+2),15:30,19:40,04h 10m,INR 524,43 Seats available,4.0,https://www.redbus.in/bus-tickets/kolkata-to-a...,Kolkata to Asansol (West Bengal)
152,Express Line (Karunamoyee),Volvo 9600 A/C Seater (2+2),15:45,20:45,05h 00m,INR 524,38 Seats available,,https://www.redbus.in/bus-tickets/kolkata-to-a...,Kolkata to Asansol (West Bengal)
153,Greenline,Volvo 9600 Multi Axle Semi-Sleeper (2+2),16:00,20:00,04h 00m,INR 524,32 Seats available,,https://www.redbus.in/bus-tickets/kolkata-to-a...,Kolkata to Asansol (West Bengal)
