Python with Selenium: workflow


![+ Flowchart - Python with Selenium Flow (3).jpg](<attachment:+ Flowchart - Python with Selenium Flow (3).jpg>)

Python with Selenium: code for scraping the redBus website


In [27]:
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import (
    ElementNotInteractableException,
    TimeoutException,
    ElementClickInterceptedException,
    NoSuchElementException
)

class Redbus:
    """Scrape routes and bus listings for one state operator on redbus.in.

    Instantiating the class performs the entire scrape: it starts Chrome,
    clicks the operator tile identified by ``Xpath`` on the home page,
    collects the route links from the paginated route list, then visits each
    route and records the buses listed there. The browser is always closed
    before ``__init__`` returns (or raises), so afterwards the instance is
    only a holder for the results.

    Attributes:
        Xpath: XPath of the operator tile that was clicked on the home page.
        Bus: ``{route name: {"Private": {...}, "Government": {...}}}``. Each
            inner dict maps the storage keys listed in ``_FIELDS`` to a list
            of values; the lists are not guaranteed to have equal lengths.
            "Government" stays ``{}`` when the switch button never becomes
            clickable for that route.
        driver: The Chrome WebDriver used for the scrape (already quit once
            construction has finished).
    """

    # One entry per scraped field:
    # (key used in self.Bus, label used in the progress output, XPath).
    # The order is the order in which fields are printed and stored.
    _FIELDS = (
        ("Bus_Name", "Bus travel Name", "//div[@class='travels lh-24 f-bold d-color']"),
        ("Bus_Type", "Bus Comfort Type", "//div[@class='bus-type f-12 m-top-16 l-color evBus']"),
        ("Departing_Time", "Bus Start Time", "//div[@class='dp-time f-19 d-color f-bold']"),
        ("Reaching_Time", "Bus End Time", "//div[@class='bp-time f-19 d-color disp-Inline']"),
        ("Duration", "Total Travel Time", "//div[@class='dur l-color lh-24']"),
        ("Star_Rating", "Rating", "//div[@class='rating-sec lh-24']//span"),
        ("Seat_availability", "Seat Availability", "//div[@class='seat-left m-top-30']"),
        ("Price", "Price", "//div[@class='fare d-block']"),
        ("Reach_date", "Reach Date", "//div[@class='next-day-dp-lbl m-top-16']"),
    )

    def __init__(self, Xpath):
        """Run the full scrape for the operator tile located by ``Xpath``.

        Args:
            Xpath: XPath expression locating the clickable operator tile on
                the redbus.in home page.

        Raises:
            selenium.common.exceptions.TimeoutException: If the operator tile
                is not present/clickable within 20 seconds.
            ValueError, IndexError: If a rating or seat-count text cannot be
                parsed as a number.
        """
        self.Xpath = Xpath
        self.Bus = {}

        # Initialize the Chrome driver
        self.driver = webdriver.Chrome()
        try:
            self._open_operator_page()
            for name, link in self._collect_routes():
                self._scrape_route(name, link)
        finally:
            # Close the WebDriver even when the scrape fails part-way, so a
            # crashed run does not leave a Chrome process behind.
            self.driver.quit()

    def _open_operator_page(self):
        """Open redbus.in and click the operator tile given by ``self.Xpath``."""
        self.driver.get('https://www.redbus.in/')
        time.sleep(5)

        # Bring the tile into view first; it may sit outside the viewport.
        target_element = WebDriverWait(self.driver, 20).until(
            EC.presence_of_element_located((By.XPATH, self.Xpath))
        )
        self.driver.execute_script(
            "arguments[0].scrollIntoView({block: 'center', inline: 'center'});",
            target_element,
        )
        time.sleep(2)

        # Click on the state bus link
        WebDriverWait(self.driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, self.Xpath))
        ).click()
        time.sleep(10)

    def _collect_routes(self):
        """Walk the paginated route list and return unique ``(name, link)`` pairs.

        Paging stops when the next page button does not exist or cannot be
        reached/clicked; the routes gathered up to that point are returned.
        """
        routes = []
        seen_links = set()

        page = 1
        while True:
            try:
                current_page_links = [
                    anchor.get_attribute('href')
                    for anchor in self.driver.find_elements(By.XPATH, "//div[@class='route_details']//a")
                ]
                current_page_names = [
                    anchor.text
                    for anchor in self.driver.find_elements(By.XPATH, "//a[@class='route']")
                ]

                # Only keep links not seen on an earlier page. Anchors without
                # an href are skipped because they cannot be visited later.
                for link, name in zip(current_page_links, current_page_names):
                    if link and link not in seen_links:
                        seen_links.add(link)
                        routes.append((name, link))

                # Print the bus routes and links to verify they are being collected
                print(f"Page {page} - Bus Routes and Links:")
                for name, link in routes:
                    print(f"Route Name: {name}, Route Link: {link}")
                print("\n")

                # Try to navigate to the next page
                next_page_xpath = f"//div[12]/div[{page + 1}]"

                if not self.driver.find_elements(By.XPATH, next_page_xpath):
                    print(f"Page {page + 1} does not exist. Exiting loop.")
                    break

                # Wait for the element to be present and visible
                element = WebDriverWait(self.driver, 25).until(
                    EC.visibility_of_element_located((By.XPATH, next_page_xpath))
                )

                # Scroll the element into view before clicking
                self.driver.execute_script("arguments[0].scrollIntoView(true);", element)
                time.sleep(1)

                element.click()
                time.sleep(5)

                page += 1

            except ElementNotInteractableException as e:
                print(f"Error navigating to page {page + 1}: {e}")
                break
            except ElementClickInterceptedException as e:
                print(f"Click intercepted when trying to navigate to page {page + 1}: {e}")
                break
            except (TimeoutException, NoSuchElementException) as e:
                print(f"Error loading page {page + 1}: {e}")
                break

        return routes

    def _scroll_to_bottom(self):
        """Scroll to the page bottom repeatedly until the page height stops growing."""
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            # Wait for new content to load
            time.sleep(2)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    @staticmethod
    def _parse_texts(key, raw_texts):
        """Convert the raw element texts of one field into the stored values.

        Args:
            key: Storage key of the field (first item of a ``_FIELDS`` entry).
            raw_texts: The ``.text`` of every element found for that field.

        Returns:
            A list with empty texts dropped: floats for "Star_Rating", ints
            (the leading number) for "Seat_availability", the text after the
            first three characters for "Price", stripped strings otherwise.
        """
        if key == "Price":
            # The first three characters are dropped - presumably the "INR"
            # currency prefix; verify against the live page.
            return [text[3:].strip() for text in raw_texts if text[3:] != '']

        non_empty = [text for text in raw_texts if text != '']
        if key == "Star_Rating":
            return [float(text.strip()) for text in non_empty]
        if key == "Seat_availability":
            return [int(text.split()[0]) for text in non_empty]
        return [text.strip() for text in non_empty]

    def _read_bus_details(self, heading):
        """Read every field in ``_FIELDS`` from the current page.

        Each element's text is fetched exactly once; the parsed values are
        printed under ``heading`` for verification and returned as a dict
        keyed by storage key.
        """
        # No explicit wait here: find_elements returns immediately (possibly
        # with empty lists), which is all the previous wait-on-a-tuple did.
        located = [
            (key, label, self.driver.find_elements(By.XPATH, xpath))
            for key, label, xpath in self._FIELDS
        ]
        details = {
            key: self._parse_texts(key, [element.text for element in elements])
            for key, _, elements in located
        }

        # Print the bus details to verify they are being scraped
        print(heading)
        for key, label, _ in located:
            print(f"{label}:", details[key])
        print("\n")
        return details

    def _scrape_route(self, name, link):
        """Visit one route page and store its bus listings in ``self.Bus[name]``."""
        self.driver.get(link)
        self.Bus[name] = {"Private": {}, "Government": {}}

        # Fixed pause to give the listing time to load.
        time.sleep(20)
        try:
            self._scroll_to_bottom()
            self.Bus[name]["Private"].update(
                self._read_bus_details(f"Scraped data for route: {name}")
            )

            # Click to switch to government buses
            try:
                button = WebDriverWait(self.driver, 15).until(
                    EC.element_to_be_clickable((By.XPATH, "//div[@class='button']"))
                )

                # Scroll the button into view, then click through JavaScript
                self.driver.execute_script("arguments[0].scrollIntoView(true);", button)
                time.sleep(1)
                self.driver.execute_script("arguments[0].click();", button)

            except ElementClickInterceptedException as e:
                print(f"Error clicking the button to switch to government buses: {e}")

            self._scroll_to_bottom()
            self.Bus[name]["Government"].update(
                self._read_bus_details(f"Scraped data for government buses on route: {name}")
            )

        except TimeoutException as e:
            # Raised when the switch button never becomes clickable; the
            # "Government" section for this route is then left empty.
            print(f"Error fetching bus details for {name}: {e}")

    def get_bus_data(self):
        """Return the scraped bus data (the ``self.Bus`` dict)."""
        return self.Bus

# Build the scraper for the WBSTC operator tile; constructing it runs the scrape
state_tile_xpath = "//div[@class='rtcNameMain']/div[@class='rtcName' and text()='WBSTC']"
Buses = Redbus(state_tile_xpath)

# Keep the scraped results at hand for the cells below
Value = Buses.get_bus_data()


Page 1 - Bus Routes and Links:
Route Name: Kolkata to Digha, Route Link: https://www.redbus.in/bus-tickets/kolkata-to-digha
Route Name: Digha to Kolkata, Route Link: https://www.redbus.in/bus-tickets/digha-to-kolkata
Route Name: Mandarmani to Kolkata, Route Link: https://www.redbus.in/bus-tickets/mandarmani-to-kolkata
Route Name: Kolkata to Mandarmani, Route Link: https://www.redbus.in/bus-tickets/kolkata-to-mandarmani
Route Name: Kolkata to Bakkhali, Route Link: https://www.redbus.in/bus-tickets/kolkata-to-bakkhali


Error loading page 2: Message: 

Scraped data for route: Kolkata to Digha
Bus travel Name: ['SBSTC-KARUNAMOYEE - DIGHA - VIA - KOLKATA - 22:30 (VOLVO - 3969', 'Shyamoli Paribahan Pvt Ltd', 'Express Line', 'Maa Chandi Travels', 'Shyamoli Paribahan Pvt Ltd', 'Sagufta Travels(Maa Chandi)', 'Maa Chandi Travels', 'Sagufta Travels (Bhorpet Travels)', 'Parameswar Travels', 'Sagufta Travels (Mayuri)', 'Sagufta Travels(Maa Chandi)', 'Parameswar Travels', 'Sagufta Travels (Atithi)'

Saving the scraped data to a JSON file for future use

In [30]:
# Persist the scraped results as pretty-printed JSON (2-space indent)
with open("WBSTC.json", "w") as output:
    output.write(json.dumps(Value, indent=2))

Serializing the scraped data to a JSON string and loading it back into a variable

In [29]:
# Serialize the scraped results to a JSON string ...
Dumping = json.dumps(Value)
# ... and parse that string back, giving an independent copy of the data
# built only from JSON-compatible types.
WBSTC_Buses= json.loads(Dumping)

In [28]:
Value

{'Kolkata to Digha': {'Private': {'Bus_Name': ['SBSTC-KARUNAMOYEE - DIGHA - VIA - KOLKATA - 22:30 (VOLVO - 3969',
    'Shyamoli Paribahan Pvt Ltd',
    'Express Line',
    'Maa Chandi Travels',
    'Shyamoli Paribahan Pvt Ltd',
    'Sagufta Travels(Maa Chandi)',
    'Maa Chandi Travels',
    'Sagufta Travels (Bhorpet Travels)',
    'Parameswar Travels',
    'Sagufta Travels (Mayuri)',
    'Sagufta Travels(Maa Chandi)',
    'Parameswar Travels',
    'Sagufta Travels (Atithi)',
    'Parameswar Travels',
    'Snemita Paribahan (Bhorpet)',
    'Parameswar Travels',
    'Basu Travels',
    'City Express (Shinjini)',
    'Parameswar Travels',
    'Sagufta Travels (Sagar Kanya)',
    'Snemita Paribahan(Shinjini)',
    'Haimanti Bus Service',
    'Santosh Bus Service',
    'Ankita Paribahan( Haimanti)',
    'Santosh Bus Service',
    'Bengal Surface Transport',
    'Basu Travels',
    'Snemita Parisheba',
    'Bengal Surface Transport',
    'Snemita Parisheba',
    'Mayuri Express',
    'Snemi

Loading the JSON file from its file path for database insertion

In [31]:
import json

# Absolute path of the WBSTC.json file to import
json_file_path = r'C:\Users\sandy\Desktop\Data science\RedBus Project 1\WBSTC.json'

# Read the whole file and parse it into a Python dict
with open(json_file_path) as file:
    WBSTC = json.loads(file.read())


SQL code for inserting the data into the database

In [32]:
import mysql.connector

class Importing:
    """Insert scraped bus data into a MySQL table, one row per bus.

    Instantiating the class connects to MySQL, creates the database and the
    table if they do not exist yet, and inserts every bus found in
    ``Table_insert``. The connection stays open afterwards (``self.mydb`` /
    ``self.mycursor``); call :meth:`close` or use the instance as a context
    manager to release it. If set-up or insertion fails, the connection is
    closed before the error is re-raised.

    ``Table_insert`` is expected to have the shape
    ``{route name: {"Private": {...}, "Government": {...}}}`` where each
    inner dict maps the keys in ``_JSON_KEYS`` to lists. An operator section
    with a missing key is reported and skipped. Lists of unequal length are
    truncated to the shortest one.
    """

    # Keys read from each operator section, in the order of the value columns
    # of the INSERT statement.
    _JSON_KEYS = (
        'Bus_Name',
        'Bus_Type',
        'Departing_Time',
        'Duration',
        'Reaching_Time',
        'Star_Rating',
        'Price',
        'Seat_availability',
    )

    def __init__(self, Bus_state_name, Table_insert, table_name, database_name,
                 host="localhost", user="root", password=""):
        """Connect, create the schema if needed and insert all rows.

        Args:
            Bus_state_name: Value stored in the ``Bus_state_name`` column of
                every inserted row.
            Table_insert: Scraped data, see the class docstring.
            table_name: Name of the target table.
            database_name: Name of the target database.
            host: MySQL host. Defaults to the previously hard-coded value.
            user: MySQL user. Defaults to the previously hard-coded value.
            password: MySQL password. Defaults to the previously hard-coded
                value.
        """
        self.Bus_state_name = Bus_state_name
        self.Table_insert = Table_insert
        self.table_name = table_name
        self.database_name = database_name

        # Connect to MySQL. With autocommit enabled every statement is
        # committed as soon as it runs, so no explicit commit is needed.
        self.mydb = mysql.connector.connect(
            host=host,
            user=user,
            password=password,
            autocommit=True
        )

        print(self.mydb)
        self.mycursor = self.mydb.cursor(buffered=True)

        try:
            self._create_schema()
            self._insert_rows()
        except Exception:
            # Do not leave a dangling connection behind when loading fails.
            self.close()
            raise

    @staticmethod
    def _quote_identifier(name):
        """Return ``name`` as a back-tick quoted MySQL identifier.

        Quoting lets names contain spaces or reserved words; an embedded
        backtick is doubled so it cannot terminate the identifier early.
        """
        return "`" + str(name).replace("`", "``") + "`"

    def _create_schema(self):
        """Create the database and table if missing and select the database."""
        database_sql = self._quote_identifier(self.database_name)
        table_sql = self._quote_identifier(self.table_name)

        self.mycursor.execute(f"CREATE DATABASE IF NOT EXISTS {database_sql}")
        self.mycursor.execute(f"USE {database_sql}")  # Switch to the database

        # Create the table with the specified schema, including Bus_route_name
        self.mycursor.execute(f"""
            CREATE TABLE IF NOT EXISTS {table_sql} (
                id INT NOT NULL AUTO_INCREMENT,
                Bus_state_name VARCHAR(100),
                Bus_route_name VARCHAR(100),
                Bus_Operator_type VARCHAR(15),
                BusName VARCHAR(50),
                BusType VARCHAR(50),
                Departing_Time TIME,
                Duration VARCHAR(30),
                Reaching_Time TIME,
                Star_rating FLOAT(5),
                Price FLOAT(10),
                Seats_available INT(5),
                PRIMARY KEY (id)
            )
        """)

    def _insert_rows(self):
        """Insert one row per bus for every route and operator type."""
        table_sql = self._quote_identifier(self.table_name)
        insert_query = (
            f"INSERT INTO {table_sql} "
            "(Bus_state_name, Bus_route_name, Bus_Operator_type, BusName, BusType, "
            "Departing_Time, Duration, Reaching_Time, Star_rating, Price, Seats_available) "
            "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
        )

        for bus_route_name, operator_data in self.Table_insert.items():
            for operator_type in ['Private', 'Government']:
                print(f"Inserting data for operator type: {operator_type} on route: {bus_route_name}")
                try:
                    columns = [operator_data[operator_type][key] for key in self._JSON_KEYS]
                except KeyError as e:
                    print(f"KeyError: {e} in route {bus_route_name} for operator type {operator_type}. Skipping...")
                    continue

                # The lists can differ in length; zip() below stops at the
                # shortest one, which is the number reported here.
                min_length = min(len(column) for column in columns)
                print(f"Minimum length of lists: {min_length}")

                # Insert each position of the lists as a separate row
                for row_number, values in enumerate(zip(*columns), start=1):
                    row = (self.Bus_state_name, bus_route_name, operator_type, *values)
                    print(f"Inserting row {row_number} for {bus_route_name} - {operator_type}")
                    print(row)
                    self.mycursor.execute(insert_query, row)

    def close(self):
        """Close the cursor and the MySQL connection."""
        self.mycursor.close()
        self.mydb.close()

    def __enter__(self):
        """Return the instance so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection when the ``with`` block ends."""
        self.close()

# Arguments: state bus name, variable holding the scraped data, table name, database name
importer = Importing(
    Bus_state_name='WBSTC-West Bengal State Transport',
    Table_insert=WBSTC,
    table_name="Bus_Data",
    database_name="RedBus",
)


<mysql.connector.connection_cext.CMySQLConnection object at 0x00000185A1FBE690>
Inserting data for operator type: Private on route: Kolkata to Digha
Minimum length of lists: 26
Inserting row 1 for Kolkata to Digha - Private
('WBSTC-West Bengal State Transport', 'Kolkata to Digha', 'Private', 'SBSTC-KARUNAMOYEE - DIGHA - VIA - KOLKATA - 22:30 (VOLVO - 3969', 'Volvo AC Seater (2+2)', '22:30', '05h 00m', '03:30', 3.6, '460', 29)
Inserting row 2 for Kolkata to Digha - Private
('WBSTC-West Bengal State Transport', 'Kolkata to Digha', 'Private', 'Shyamoli Paribahan Pvt Ltd', 'VE A/C Seater (2+2)', '23:30', '05h 00m', '04:30', 4.5, '749', 2)
Inserting row 3 for Kolkata to Digha - Private
('WBSTC-West Bengal State Transport', 'Kolkata to Digha', 'Private', 'Express Line', 'Volvo 9600 A/C Seater (2+2)', '23:45', '05h 00m', '04:45', 4.5, '500', 7)
Inserting row 4 for Kolkata to Digha - Private
('WBSTC-West Bengal State Transport', 'Kolkata to Digha', 'Private', 'Maa Chandi Travels', 'A/C Seater 