In [11]:
######## code to scrape all bus details for a given state
import time
import csv
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (TimeoutException, NoSuchElementException, ElementNotInteractableException, 
                                         ElementClickInterceptedException, StaleElementReferenceException)
from selenium.webdriver.common.action_chains import ActionChains

# Start a Chrome session and open the redBus operator page whose route links are scraped below.
driver = webdriver.Chrome()
driver.get('https://www.redbus.in/online-booking/wbtc-ctc/?utm_source=rtchometile')  # wbtc-ctc operator page; change this URL to target another operator page

# Shared explicit wait with a 5-second timeout; the functions in this cell read it as a global.
wait = WebDriverWait(driver, 5)

# Accumulates (route_name, href) tuples; filled by scrape_current_page and read by scrape_bus_details.
all_route_links = []

def scroll_down(driver, pause=3, max_scrolls=None):
    """Scroll to the bottom of the page until its height stops growing.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver here).
        pause: Seconds to sleep after each scroll before the page height is
            measured again. Defaults to 3, the value that used to be hard-coded.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps scrolling until the height is stable, exactly
            as before.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls += 1
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Nothing new was appended after the last scroll: bottom reached.
            break
        last_height = new_height

def scrape_current_page(driver):
    """Collect the route links of the currently displayed page into ``all_route_links``.

    Waits, through the module-level ``wait``, for elements with class ``route``
    and appends one ``(route_name, href)`` tuple per element. Elements without
    an ``href`` are skipped (they cannot be opened later), and tuples that are
    already in the list are not appended again, so scraping the same page twice
    does not queue a route twice.

    Args:
        driver: Not used directly; the lookup goes through the module-level
            ``wait``. Kept so existing callers keep working.

    Errors are reported with ``print`` and never raised to the caller.
    """
    try:
        links = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'route')))
        for link in links:
            href = link.get_attribute('href')
            if not href:
                continue
            entry = (link.text, href)
            if entry not in all_route_links:
                all_route_links.append(entry)
    except TimeoutException:
        print("Error: Route links not found on the page.")
    except Exception as e:
        print(f"An error occurred while scraping the page: {e}")

def scrape_all_pages(driver):
    """Scrape the route links of every pagination tab of the current listing.

    Calls ``scrape_current_page`` for the page being shown, then clicks the
    next element with class ``DC_117_pageTabs`` and repeats until the last tab
    has been processed. The loop stops on the first navigation error so the
    same page is not scraped again.

    Args:
        driver: Selenium WebDriver positioned on the listing page.
    """
    # Imported here because this cell's import block does not provide these
    # names; without them the except clause below raised NameError as soon as
    # any navigation error occurred.
    from selenium.common.exceptions import ElementNotVisibleException, NoSuchWindowException

    page_index = 0
    while True:
        try:
            if driver.current_window_handle not in driver.window_handles:
                print("Window handle is no longer valid.")
                break
            scrape_current_page(driver)
            pages = driver.find_elements(By.CLASS_NAME, "DC_117_pageTabs")
            if page_index < len(pages) - 1:
                page_index += 1
                # A new chain per click, so no earlier queued action is involved.
                ActionChains(driver).move_to_element(pages[page_index]).click().perform()
                time.sleep(3)  # Give the next page time to render
            else:
                break
        except (NoSuchElementException, ElementNotInteractableException, ElementClickInterceptedException,
                StaleElementReferenceException, ElementNotVisibleException, NoSuchWindowException) as e:
            print(f"An error occurred while navigating pages: {e}")
            break  # Stop paginating instead of re-scraping the page that failed

def scrape_bus_details(driver):
    """Visit every collected route and write one CSV row per bus listing.

    Reads ``(route_name, href)`` tuples from the module-level
    ``all_route_links``, opens each link, tries to click two expander elements,
    scrolls to the bottom of the page and parses every
    ``clearfix bus-item-details`` element with regular expressions. Rows go to
    ``westbengal_details.csv`` and are echoed with ``print``. Errors for a
    single listing or a single route are printed and do not stop the run.

    Args:
        driver: Selenium WebDriver used for navigation and script execution.
    """
    # Absolute XPaths of the elements that are clicked (last one first) before scraping.
    expander_xpaths = [
        "/html/body/section/div[2]/div[4]/div/div[2]/div/div[2]/div[2]/div[1]/div/div[2]/div/div[4]/div[2]",
        "/html/body/section/div[2]/div[4]/div/div[2]/div/div[2]/div[2]/div[2]/div/div[2]/div/div[4]/div[2]"
    ]

    def click_elements_if_present(driver, xpaths, wait):
        """Best-effort JavaScript click on each XPath, walking the list backwards."""
        for xpath in reversed(xpaths):
            try:
                element = wait.until(EC.presence_of_element_located((By.XPATH, xpath)))
                driver.execute_script("arguments[0].scrollIntoView(true);", element)
                time.sleep(2)  # Let the element settle after scrolling
                if not (element.is_displayed() and element.is_enabled()):
                    continue
                driver.execute_script("arguments[0].click();", element)
                time.sleep(2)  # Let the click take effect
            except (TimeoutException, NoSuchElementException, ElementClickInterceptedException,
                    StaleElementReferenceException, ElementNotInteractableException):
                # A missing or unclickable element is simply skipped.
                continue

    def first_group(pattern, html):
        """Return capture group 1 of the first match of ``pattern`` in ``html``, or None."""
        found = re.search(pattern, html)
        return found.group(1) if found else None

    def parse_listing(html):
        """Extract the listing fields from one element's outer HTML, with defaults for misses."""
        name = first_group(r'class="travels lh-24 f-bold d-color">([^<]+)<', html)
        kind = first_group(r'<div class="bus-type f-12 m-top-16 l-color evBus">([^<]+)<', html)
        departs = first_group(r'<div class="dp-time f-19 d-color f-bold">(\d{2}:\d{2})</div>', html)
        travel_time = first_group(r'<div class="dur l-color lh-24">([^<]+)</div>', html)
        seats = first_group(r'<div class="seat-left m-top-30">(\d+)<span class="l-color"> Seats available</span></div>', html)
        fare = first_group(r'<div class="fare d-block">INR <span class="f-19 f-bold">(\d+)</span></div>', html)
        stars = first_group(r'<div class="lh-18 rating rat-green ">\s*<i class="icon icon-ic-star d-block"></i>\s*<span class="">(\d+(\.\d+)?)</span>', html)
        arrives = first_group(r'<div class="bp-time f-19 d-color disp-Inline">(\d{2}:\d{2})</div>', html)
        return (
            name.strip() if name is not None else '',
            kind.strip() if kind is not None else '',
            departs if departs is not None else '',
            travel_time.strip() if travel_time is not None else '',
            arrives if arrives is not None else '',
            float(stars) if stars is not None else 0.0,
            int(fare) if fare is not None else 0,
            int(seats) if seats is not None else 0,
        )

    with open('westbengal_details.csv', 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['ID', 'Routename', 'Busname', 'Bustype', 'Departingtime',
                            'Duration', 'Reachingtime', 'Starrating', 'Price', 'Seatsavailable'])
        next_id = 1

        for route_name, href in all_route_links:
            try:
                driver.get(href)
                print(f"Scraping bus details for route: {route_name}")
                time.sleep(1)  # Wait for the page to load

                click_elements_if_present(driver, expander_xpaths, wait)

                time.sleep(2)  # Wait for the clicked sections to load
                scroll_down(driver)

                listings = wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "clearfix.bus-item-details")))
                for listing in listings:
                    try:
                        (bus_name, bus_type, departing_time, duration,
                         reaching_time, star_rating, price, seats_available) = parse_listing(
                            listing.get_attribute('outerHTML'))

                        csvwriter.writerow([next_id, route_name, bus_name, bus_type, departing_time,
                                            duration, reaching_time, star_rating, price, seats_available])
                        next_id += 1  # Advance only after the row was written

                        print(f"ID: {next_id - 1}, Route name: {route_name}, Bus name: {bus_name}, Bus type: {bus_type}, "
                              f"Departing time: {departing_time}, Duration: {duration}, Reaching time: {reaching_time},  "
                              f"Star rating: {star_rating}, Price: {price}, Seats available: {seats_available}")
                    except Exception as e:
                        print(f"An error occurred while processing bus details: {e}")
            except Exception as e:
                print(f"An error occurred while scraping bus details for route '{route_name}': {e}")

# Collect the route links page by page, then scrape the bus details of each
# route into the CSV. The finally block closes the browser even when one of
# the steps raises, so no Chrome process is left running.
try:
    scrape_all_pages(driver)
    scrape_bus_details(driver)
finally:
    driver.quit()


Scraping bus details for route: Durgapur (West Bengal) to Kolkata
ID: 1, Route name: Durgapur (West Bengal) to Kolkata, Bus name: SHYAMOLI Pvt. Ltd.(Karunamoyee), Bus type: Scania Multi-Axle AC Semi Sleeper (2+2), Departing time: 16:45, Duration: 04h 00m, Reaching time: 20:45,  Star rating: 4.0, Price: 486, Seats available: 0
Scraping bus details for route: Digha to Barasat (West Bengal)
ID: 2, Route name: Digha to Barasat (West Bengal), Bus name: Satya Paribahan, Bus type: A/C Seater (2+3), Departing time: 23:00, Duration: 06h 10m, Reaching time: 05:10,  Star rating: 3.3, Price: 0, Seats available: 23
ID: 3, Route name: Digha to Barasat (West Bengal), Bus name: Snemita Paribahan (Shinjini), Bus type: AC Seater (2+3), Departing time: 23:35, Duration: 05h 25m, Reaching time: 05:00,  Star rating: 0.0, Price: 420, Seats available: 17
ID: 4, Route name: Digha to Barasat (West Bengal), Bus name: Aradhana Travels, Bus type: A/C Seater / Sleeper (2+2), Departing time: 18:35, Duration: 05h 20m

In [2]:
# CODE TO SCRAPE ALL THE ROUTE DETAILS AND LINKS OF ALL STATES AT ONCE AND STORE IN A .csv FILE
import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementNotInteractableException, ElementClickInterceptedException, StaleElementReferenceException, ElementNotVisibleException, NoSuchWindowException
from selenium.webdriver.common.action_chains import ActionChains

# redBus operator pages to visit; each one is opened in turn by the loop at the end of this cell.
urls = [
    'https://www.redbus.in/online-booking/apsrtc/?utm_source=rtchometile',
    'https://www.redbus.in/online-booking/tsrtc/?utm_source=rtchometile',
    'https://www.redbus.in/online-booking/ksrtc-kerala/?utm_source=rtchometile',
    'https://www.redbus.in/online-booking/ktcl/?utm_source=rtchometile',
    'https://www.redbus.in/online-booking/rsrtc/?utm_source=rtchometile',
    'https://www.redbus.in/online-booking/south-bengal-state-transport-corporation-sbstc/?utm_source=rtchometile',
    'https://www.redbus.in/online-booking/hrtc/?utm_source=rtchometile',
    'https://www.redbus.in/online-booking/astc/?utm_source=rtchometile',
    'https://www.redbus.in/online-booking/uttar-pradesh-state-road-transport-corporation-upsrtc/?utm_source=rtchometile',
    'https://www.redbus.in/online-booking/wbtc-ctc/?utm_source=rtchometile'
]

# One Chrome session shared by all URLs.
driver = webdriver.Chrome()

# Shared explicit wait with a 10-second timeout, plus an action chain bound to the same driver.
wait = WebDriverWait(driver, 10)
actions = ActionChains(driver)

# (route_name, href) tuples; a set, so a route seen more than once is stored only once.
unique_route_links = set()

# Function to scroll down the page
def scroll_down(driver, pause=3, max_scrolls=None):
    """Scroll to the bottom of the page until its height stops growing.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver here).
        pause: Seconds to sleep after each scroll before the page height is
            measured again. Defaults to 3, the value that used to be hard-coded.
        max_scrolls: Optional upper bound on the number of scrolls. ``None``
            (the default) keeps scrolling until the height is stable, exactly
            as before.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        scrolls += 1
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            # Nothing new was appended after the last scroll: bottom reached.
            break
        last_height = new_height

# Function to scrape data from the current page
def scrape_current_page(driver):
    """Add the route links of the displayed page to ``unique_route_links``.

    Waits, through the module-level ``wait``, for elements with class
    ``route``, stores one ``(route_name, href)`` tuple per element in the set,
    then prints how many elements this page had followed by every tuple
    collected so far. Failures are printed, not raised.

    Args:
        driver: Not used directly; the lookup goes through the module-level ``wait``.
    """
    try:
        route_anchors = wait.until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'route'))
        )

        for anchor in route_anchors:
            target = anchor.get_attribute('href')
            label = anchor.text
            # Set membership removes routes that were already seen.
            unique_route_links.add((label, target))

        # Debug output: page count first, then the whole accumulated set.
        print(f"Links collected on this page: {len(route_anchors)}")
        for collected in unique_route_links:
            print(collected)
    except TimeoutException:
        print("Error: Route links not found on the page.")
    except Exception as e:
        print(f"An error occurred while scraping the page: {e}")

# Function to handle pagination and scrape all pages
def scrape_all_pages(driver):
    """Scrape the route links of every pagination tab of the current listing.

    Calls ``scrape_current_page`` for the page being shown, then clicks the
    next element with class ``DC_117_pageTabs`` and repeats until the last tab
    has been processed or a navigation error occurs.

    Args:
        driver: Selenium WebDriver positioned on the listing page.
    """
    page_index = 0
    while True:
        try:
            # Scrape the page that is currently displayed
            scrape_current_page(driver)

            # Find the pagination elements
            pages = driver.find_elements(By.CLASS_NAME, "DC_117_pageTabs")

            # Check if there are more pages to navigate
            if page_index < len(pages) - 1:
                page_index += 1
                # Build a fresh chain for each click. Depending on the Selenium
                # version, a reused ActionChains instance can keep previously
                # queued actions after perform(), which would replay earlier
                # clicks against elements of pages that are no longer shown.
                ActionChains(driver).move_to_element(pages[page_index]).click().perform()
                print(f"Clicked on page {page_index + 1}")
                time.sleep(3)  # Wait for the page to load
            else:
                break  # Exit the loop if no more pages
        except (NoSuchElementException, ElementNotInteractableException, ElementClickInterceptedException, StaleElementReferenceException, ElementNotVisibleException, NoSuchWindowException) as e:
            print(f"An error occurred while navigating pages: {e}")
            break  # Exit the loop if an error occurs

# Visit each operator page and collect its route links. The finally block
# closes the browser even if one of the pages raises, so no Chrome process
# is left running.
try:
    for url in urls:
        driver.get(url)
        print(f"Scraping data from: {url}")
        scrape_all_pages(driver)
finally:
    driver.quit()

# Save the collected data to a CSV file. The set has no defined order, so the
# rows are sorted to make the file identical from run to run; `or ''` keeps the
# sort key comparable when an element had no href.
csv_filename = 'routes.csv'
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Route Name', 'Route Link'])
    writer.writerows(sorted(unique_route_links, key=lambda route: (route[0], route[1] or '')))

print(f"Data saved to CSV file '{csv_filename}'")

Scraping data from: https://www.redbus.in/online-booking/apsrtc/?utm_source=rtchometile
Links collected on this page: 10
('Vijayawada to Hyderabad', 'https://www.redbus.in/bus-tickets/vijayawada-to-hyderabad')
('Kadapa to Bangalore', 'https://www.redbus.in/bus-tickets/kadapa-to-bangalore')
('Visakhapatnam to Vijayawada', 'https://www.redbus.in/bus-tickets/visakhapatnam-to-vijayawada')
('Tirupati to Bangalore', 'https://www.redbus.in/bus-tickets/tirupathi-to-bangalore')
('Ongole to Hyderabad', 'https://www.redbus.in/bus-tickets/ongole-to-hyderabad')
('Chittoor (Andhra Pradesh) to Bangalore', 'https://www.redbus.in/bus-tickets/chittoor-andhra-pradesh-to-bangalore')
('Visakhapatnam to Kakinada', 'https://www.redbus.in/bus-tickets/visakhapatnam-to-kakinada')
('Anantapur (andhra pradesh) to Bangalore', 'https://www.redbus.in/bus-tickets/ananthapur-to-bangalore')
('Kakinada to Visakhapatnam', 'https://www.redbus.in/bus-tickets/kakinada-to-visakhapatnam')
('Hyderabad to Vijayawada', 'https://

In [4]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Load the CSV file
file_path = 'modified_busdetails_file.csv'
df = pd.read_csv(file_path)

# Database connection details.
# The password is no longer stored in the notebook: it is read from the
# REDBUS_DB_PASSWORD environment variable, with an interactive prompt as the
# fallback when the variable is not set.
user = 'root'
password = os.environ.get('REDBUS_DB_PASSWORD') or getpass('MySQL password: ')
host = 'localhost'  # or your database host
database = 'redbus'

# Create a connection to the database. User and password are URL-escaped so
# characters such as '@' or '/' cannot corrupt the connection URL.
engine = create_engine(f'mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}/{database}')

# Insert the DataFrame into the MySQL table; if_exists='replace' drops and
# recreates the table when it already exists.
table_name = 'bus_details'  # Name of the table where you want to insert the data
df.to_sql(table_name, con=engine, if_exists='replace', index=False)

print("DataFrame has been successfully added to the database.")


DataFrame has been successfully added to the database.


In [18]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Load the CSV file
file_path = 'merged_data.csv'
df = pd.read_csv(file_path)

# Database connection details.
# The password is no longer stored in the notebook: it is read from the
# REDBUS_DB_PASSWORD environment variable, with an interactive prompt as the
# fallback when the variable is not set.
user = 'root'
password = os.environ.get('REDBUS_DB_PASSWORD') or getpass('MySQL password: ')
host = 'localhost'  # or your database host
database = 'redbus'

# Create a connection to the database. User and password are URL-escaped so
# characters such as '@' or '/' cannot corrupt the connection URL.
engine = create_engine(f'mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}/{database}')

# Insert the DataFrame into the MySQL table; if_exists='replace' drops and
# recreates the table when it already exists. Failures are reported, not raised.
table_name = 'bus_details'  # Name of the table where you want to insert the data
try:
    df.to_sql(table_name, con=engine, if_exists='replace', index=False)
    print("DataFrame has been successfully added to the database.")
except Exception as e:
    print(f"An error occurred: {e}")

DataFrame has been successfully added to the database.


In [16]:
import pandas as pd

# The CSV is read from and written back to this same location.
merged_csv_path = r"C:\Users\suhas\merged_data.csv"

# Load the file and give the 'Route Link' column its space-free name.
df = pd.read_csv(merged_csv_path).rename(columns={'Route Link': 'RouteLink'})

# Overwrite the original file, without the index column.
df.to_csv(merged_csv_path, index=False)
print("DataFrame saved with the updated column name.")



DataFrame saved with the updated column name.
