## DATA SCRAPING WITH SELENIUM AND PYTHON

### Importing required modules

In [2]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver import ActionChains as AC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

### Code to scrape bus details of each state

In [None]:
# XPath of each per-bus field on a route page, keyed by the output column it fills.
_BUS_FIELD_XPATHS = {
    "Bus_name": "//div[@class='travels lh-24 f-bold d-color']",
    "Bus_type": "//div[@class='bus-type f-12 m-top-16 l-color evBus']",
    "Departure": "//*[@class='dp-time f-19 d-color f-bold']",
    "Arrival": "//*[@class='bp-time f-19 d-color disp-Inline']",
    "Duration": "//*[@class='dur l-color lh-24']",
    "Price": "//*[@class='fare d-block']",
    "Ratings": "//div[@class='clearfix row-one']/div[@class='column-six p-right-10 w-10 fl']",
    "Seats_Available": "//div[contains(@class, 'seat-left')]",
}


def _scroll_to_page_end(driver, pause=5):
    """Press END repeatedly until ``driver.page_source`` stops changing between presses."""
    while True:
        source_before = driver.page_source
        AC(driver).send_keys(Keys.END).perform()  # jump to the end of the page
        time.sleep(pause)  # give the page time to add content
        if driver.page_source == source_before:
            return


def _collect_routes(url, pages, path):
    """Return ``(links, names)`` of the elements matching ``path`` on up to ``pages`` listing pages."""
    links = []
    routes = []

    driver = webdriver.Chrome()
    try:
        driver.get(url)
        time.sleep(2)  # Wait for the page to load
        driver.maximize_window()

        wait = WebDriverWait(driver, 10)  # upper bound for the pagination bar to appear

        for page in range(1, pages + 1):
            for element in driver.find_elements(By.XPATH, path):
                links.append(element.get_attribute("href"))
                routes.append(element.text)

            if page == pages:
                break  # last requested page collected; no need to navigate further

            try:
                # Wait until the pagination bar is present, then look up the next page tab.
                # The tab XPath starts with '//' and therefore searches the whole document,
                # so it is issued on the driver directly.
                wait.until(EC.presence_of_element_located((By.XPATH, '//*[@class="DC_117_paginationTable"]')))
                next_pg_btn = driver.find_element(By.XPATH, f'//div[@class="DC_117_pageTabs " and text()="{page + 1}"]')

                AC(driver).move_to_element(next_pg_btn).perform()  # move the cursor onto the tab
                time.sleep(2)

                next_pg_btn.click()  # load the next page
                time.sleep(2)

            # wait.until() signals a missing pagination bar with TimeoutException,
            # find_element() signals a missing tab with NoSuchElementException.
            except (NoSuchElementException, TimeoutException):
                print("No more pages to paginate")
                break
    finally:
        driver.quit()  # always release the browser, even when scraping fails

    return links, routes


def _scrape_route(driver, link):
    """Load one route page, expand it, and return ``{column: [text, ...]}`` for every bus field."""
    driver.get(link)
    time.sleep(2)

    # Click every anchor whose href contains this route's link (kept from the original flow)
    for element in driver.find_elements(By.XPATH, f"//a[contains(@href, '{link}')]"):
        element.click()
        time.sleep(5)

    # Click the first "view buses" button when the page has one
    try:
        driver.find_element(By.XPATH, "//div[@class='button']").click()
        time.sleep(5)
    except NoSuchElementException:
        pass  # no such button on this page; scrape what is already listed

    _scroll_to_page_end(driver)

    return {
        column: [element.text for element in driver.find_elements(By.XPATH, xpath)]
        for column, xpath in _BUS_FIELD_XPATHS.items()
    }


def get_route_links_and_names(name, url, pages, path):
    """Scrape the bus details of every route listed for one state.

    A first browser session walks up to ``pages`` route-listing pages starting at
    ``url`` and collects the href and text of each element matching ``path``.
    A second session opens every collected route link and reads the bus fields
    defined in ``_BUS_FIELD_XPATHS``.

    Parameters
    ----------
    name : str
        Value written to the ``State`` column of every returned row.
    url : str
        Address of the first route-listing page.
    pages : int
        Maximum number of listing pages to read.
    path : str
        XPath selecting the route anchors on a listing page.

    Returns
    -------
    pandas.DataFrame
        One row per scraped bus with the columns State, Bus_name, Bus_type,
        Departure, Arrival, Duration, Price, Ratings, Seats_Available,
        Route_link and Route_name. All scraped values are raw page text.

    Notes
    -----
    The fields are located independently of each other. If a route page yields a
    different number of values for some field than it yields bus names, that field
    is padded with ``None`` / truncated to the bus count and a warning is printed,
    instead of failing with a length-mismatch error after all routes were scraped.
    """
    links, routes = _collect_routes(url, pages, path)

    # Column name -> values accumulated over all routes (dict order fixes the column order)
    columns = {column: [] for column in _BUS_FIELD_XPATHS}
    columns["Route_link"] = []
    columns["Route_name"] = []

    driver = webdriver.Chrome()
    try:
        driver.maximize_window()

        # Loop through each link to get the respective bus details
        for link, route in zip(links, routes):
            fields = _scrape_route(driver, link)
            n_buses = len(fields["Bus_name"])

            for column, values in fields.items():
                if len(values) != n_buses:
                    print(f"Warning: {route}: found {len(values)} {column} values for {n_buses} buses; rows may be misaligned")
                    values = (values + [None] * n_buses)[:n_buses]
                columns[column].extend(values)

            columns["Route_link"].extend([link] * n_buses)
            columns["Route_name"].extend([route] * n_buses)
    finally:
        driver.quit()  # always release the browser, even when scraping fails

    # Return scraped data as a df; the scalar state name is broadcast to every row
    return pd.DataFrame({"State": name, **columns})
                        
# States to scrape: display name, URL of the first route-listing page and number of listing pages
# NOTE(review): the "Rajastan" spelling is kept because it is written verbatim into the
# State column; confirm nothing downstream depends on it before correcting it.
states = [
    {"name": "Kerala", "url": "https://www.redbus.in/online-booking/ksrtc-kerala/?utm_source=rtchometile", "pages": 2},
    {"name": "Goa", "url": "https://www.redbus.in/online-booking/ktcl/?utm_source=rtchometile", "pages": 4},
    {"name": "Rajastan", "url": "https://www.redbus.in/online-booking/rsrtc/?utm_source=rtchometile", "pages": 2},
    {"name": "Uttar Pradesh", "url": "https://www.redbus.in/online-booking/uttar-pradesh-state-road-transport-corporation-upsrtc/?utm_source=rtchometile", "pages": 5},
    # The trailing "'" that used to end this URL was a typo and has been removed
    {"name": "South Bengal", "url": "https://www.redbus.in/online-booking/south-bengal-state-transport-corporation-sbstc/?utm_source=rtchometile", "pages": 5},
    {"name": "West Bengal", "url": "https://www.redbus.in/online-booking/wbtc-ctc/?utm_source=rtchometile", "pages": 4},
    {"name": "Assam", "url": "https://www.redbus.in/online-booking/astc/?utm_source=rtchometile", "pages": 5},
    {"name": "Punjab", "url": "https://www.redbus.in/online-booking/pepsu/?utm_source=rtchometile", "pages": 2},
    {"name": "Telangana", "url": "https://www.redbus.in/online-booking/tsrtc/?utm_source=rtchometile", "pages": 3},
    {"name": "Andhra Pradesh", "url": "https://www.redbus.in/online-booking/apsrtc/?utm_source=rtchometile", "pages": 5},
]

# XPATH to find route links and names
route_xpath = "//a[@class='route']"

# Scrape each state, then combine the per-state frames with a single concat
# (concatenating inside the loop re-copies all earlier rows on every iteration)
state_frames = [
    get_route_links_and_names(state["name"], state["url"], state["pages"], route_xpath)
    for state in states
]
all_data = pd.concat(state_frames, ignore_index=True)

In [5]:
# Preview the combined scrape results for all states
all_data

Unnamed: 0,State,Bus_name,Bus_type,Departure,Arrival,Duration,Price,Ratings,Seats_Available,Route_link,Route_name
0,Kerala,MMK Travels,A/C Seater / Sleeper (2+1),22:30,07:30,09h 00m,INR 1299,4.3\n516,15 Seats available,https://www.redbus.in/bus-tickets/bangalore-to...,Bangalore to Kozhikode
1,Kerala,Kyros Connect,Bharat Benz A/C Semi Sleeper (2+2),23:00,09:00,10h 00m,INR 1699,4.3\n349,20 Seats available,https://www.redbus.in/bus-tickets/bangalore-to...,Bangalore to Kozhikode
2,Kerala,SAAM BUS,VE A/C Sleeper (2+1),21:45,06:45,09h 00m,INR 1800,4.4\n189,11 Seats available,https://www.redbus.in/bus-tickets/bangalore-to...,Bangalore to Kozhikode
3,Kerala,AdSAAM BUS,VE A/C Sleeper (2+1),20:10,06:45,10h 35m,INR 1800,4.3,9 Seats available,https://www.redbus.in/bus-tickets/bangalore-to...,Bangalore to Kozhikode
4,Kerala,NS Transports,Non A/C Seater / Sleeper (2+1),21:45,06:10,08h 25m,INR 1825,4.1\n284,2 Seats available,https://www.redbus.in/bus-tickets/bangalore-to...,Bangalore to Kozhikode
...,...,...,...,...,...,...,...,...,...,...,...
11693,Andhra Pradesh,True Bus,Volvo Multi Axle B9R A/C Sleeper (2+1),17:00,23:59,06h 59m,INR 4400,3.2,36 Seats available,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Nandyal
11694,Andhra Pradesh,DHANUNJAYA TRAVELS,A/C Sleeper (2+1),21:00,03:00,06h 00m,INR 1199,2.2,36 Seats available,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Nandyal
11695,Andhra Pradesh,DHANUNJAYA TRAVELS,A/C Sleeper (2+1),19:10,03:00,07h 50m,INR 1190,2.2,25 Seats available,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Nandyal
11696,Andhra Pradesh,Tranzindia Travels,Bharat Benz A/C Sleeper (2+1),21:45,02:50,05h 05m,INR 1200,1.7,20 Seats available,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Nandyal


### Data cleaning

In [6]:
# Inspect column dtypes and non-null counts before cleaning
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11698 entries, 0 to 11697
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   State            11698 non-null  object
 1   Bus_name         11698 non-null  object
 2   Bus_type         11698 non-null  object
 3   Departure        11698 non-null  object
 4   Arrival          11698 non-null  object
 5   Duration         11698 non-null  object
 6   Price            11698 non-null  object
 7   Ratings          11698 non-null  object
 8   Seats_Available  11698 non-null  object
 9   Route_link       11698 non-null  object
 10  Route_name       11698 non-null  object
dtypes: object(11)
memory usage: 1005.4+ KB


In [10]:
# Keep only the first run of digits of each price, e.g. 'INR 1299' -> 1299
# astype(str) lets this cell be re-run after Price is already numeric (.str only works on strings)
# The regular expression \d+ matches one or more digits; the parentheses () capture them
# expand=False makes str.extract return a Series instead of a one-column DataFrame
# astype(int) converts the extracted digits to an int datatype

all_data['Price'] = all_data['Price'].astype(str).str.extract(r'(\d+)', expand=False).astype(int)
all_data['Price']

0        1299
1        1699
2        1800
3        1800
4        1825
         ... 
11693    4400
11694    1199
11695    1190
11696    1200
11697    3000
Name: Price, Length: 11698, dtype: int32

In [12]:
# The raw rating text can hold the score and a second number on separate lines, e.g. '4.3\n516'
# astype(str) lets this cell be re-run after Ratings is already numeric (.str only works on strings)
# str.split() splits each value on any whitespace (including the newline); str[0] keeps the first token
# pd.to_numeric converts the token to a number; errors='coerce' turns unparsable values into NaN
# fillna(0.0) replaces NaN with 0.0, so 0.0 means "no numeric rating found"

all_data['Ratings'] = pd.to_numeric(
    all_data['Ratings'].astype(str).str.split().str[0],
    errors='coerce',
).fillna(0.0)
all_data['Ratings']

0        4.3
1        4.3
2        4.4
3        4.3
4        4.1
        ... 
11693    3.2
11694    2.2
11695    2.2
11696    1.7
11697    0.0
Name: Ratings, Length: 11698, dtype: float64

In [13]:
# Keep only the first run of digits of each value, e.g. '15 Seats available' -> 15
# astype(str) lets this cell be re-run after Seats_Available is already numeric (.str only works on strings)
# The regular expression \d+ matches one or more digits; the parentheses () capture them
# expand=False makes str.extract return a Series instead of a one-column DataFrame
# astype(int) converts the extracted digits to an int datatype

all_data['Seats_Available'] = all_data['Seats_Available'].astype(str).str.extract(r'(\d+)', expand=False).astype(int)
all_data['Seats_Available']

0        15
1        20
2        11
3         9
4         2
         ..
11693    36
11694    36
11695    25
11696    20
11697    40
Name: Seats_Available, Length: 11698, dtype: int32

In [14]:
# Rewrite durations such as '09h 00m' as '09:00:00':
# 'h' becomes ':', 'm' becomes ':00', and the remaining spaces are dropped

duration_hms = (
    all_data['Duration']
    .str.replace('h', ':')
    .str.replace('m', ':00')
    .str.replace(' ', '')
)
all_data['Duration'] = duration_hms
all_data['Duration']

0        09:00:00
1        10:00:00
2        09:00:00
3        10:35:00
4        08:25:00
           ...   
11693    06:59:00
11694    06:00:00
11695    07:50:00
11696    05:05:00
11697    06:19:00
Name: Duration, Length: 11698, dtype: object

In [16]:
# Re-check column dtypes and non-null counts after the cleaning steps
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11698 entries, 0 to 11697
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   State            11698 non-null  object 
 1   Bus_name         11698 non-null  object 
 2   Bus_type         11698 non-null  object 
 3   Departure        11698 non-null  object 
 4   Arrival          11698 non-null  object 
 5   Duration         11698 non-null  object 
 6   Price            11698 non-null  int32  
 7   Ratings          11698 non-null  float64
 8   Seats_Available  11698 non-null  int32  
 9   Route_link       11698 non-null  object 
 10  Route_name       11698 non-null  object 
dtypes: float64(1), int32(2), object(8)
memory usage: 914.0+ KB


In [15]:
# Write the data to 'Redbus_data.csv', leaving out the DataFrame index
all_data.to_csv(path_or_buf='Redbus_data.csv', index=False)