In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from datetime import datetime, timedelta
import requests
from bs4 import BeautifulSoup

In [None]:
def extract_route_link(url, buses):
  """Collect the landing-page link of each named bus operator.

  The page at ``url`` is downloaded and parsed once; every entry of ``buses``
  is then looked up as the exact text of an ``<a>`` tag on that page.

  Args:
    url: Address of the page that lists the operators.
    buses: Iterable of anchor texts to look up. Matching is exact, so any
      leading/trailing whitespace in an entry is significant.

  Returns:
    List of absolute links, in the order of ``buses``, for the operators
    whose anchor was found and carried an ``href``. Empty when the page
    could not be fetched with status 200.
  """
  # Local import: the name is only needed here and keeps the cell self-contained.
  from urllib.parse import urljoin

  # A timeout stops the notebook from hanging forever on a stalled connection.
  response = requests.get(url, timeout=30)
  if response.status_code != 200:
    print(f"Error getting the webpage. Status code: {response.status_code}")
    return []

  # Parse once; the document does not change between look-ups.
  soup = BeautifulSoup(response.content, 'html.parser')

  bus_route_link = []
  for name in buses:
    anchor_tag = soup.find('a', string=name)
    if anchor_tag is None:
      print(f"Couldn't find the anchor tag with the specified text: {name!r}")
      continue

    link = anchor_tag.get('href')
    if not link:
      print(f"Anchor tag for {name!r} has no href; skipping it.")
      continue

    # Resolve relative hrefs against the page address so the result can be
    # opened directly; absolute hrefs are returned unchanged by urljoin.
    bus_route_link.append(urljoin(url, link))
  return bus_route_link


# Page whose anchors are searched for the operator names below.
url = 'https://www.redbus.in'

# Anchor texts of the state transport operators to scrape.
# extract_route_link matches these as exact anchor text, so the leading and
# trailing spaces in some entries are kept as they were.
# NOTE(review): confirm each entry against the live page text.
buses = [
    "Andhra Pradesh State Road Transport Corporation ",
    " Assam State Transport Corporation",
    "Bihar State Tourism Development Corporation ",
    "Himachal Road Transport Corporation",
    # Was "ammu and Kashmir ..." (leading "J" missing), which could never match.
    "Jammu and Kashmir State Road Transport Corporation",
    "Kerala RTC",
    "Kadamba Transport Corporation",
    "Patiala and the East Punjab States Union",
    " Puducherry Road Transport Corporation",
    "Rajasthan State Road Transport Corporation",
    "South Bengal State Transport Corporation",
    "Uttarakhand Transport Corporation",
    "West Bengal Transport Corporation",
    "North Bengal State Transport Corporation ",
    "Chandigarh Transport Undertaking (CTU)",
]
bus_route_link = extract_route_link(url, buses)

In [None]:
def extract_links_by_class(url, class_name):
    """Open ``url`` in Chrome and list the elements carrying ``class_name``.

    Args:
        url: Page to load.
        class_name: Class name handed to ``By.CLASS_NAME`` to locate the
            route elements.

    Returns:
        List of ``(text, href)`` tuples, one per matching element, in the
        order Selenium returned them. ``href`` is whatever
        ``get_attribute('href')`` yields for the element.

    The browser is always closed, even when loading or querying the page
    raises; the exception itself still propagates to the caller.
    """
    driver = webdriver.Chrome()  # Replace with your preferred webdriver
    try:
        driver.get(url)
        # Scrape route names and their links
        return [
            (route_element.text, route_element.get_attribute('href'))
            for route_element in driver.find_elements(By.CLASS_NAME, class_name)
        ]
    finally:
        # Without this a failed page load would leave a Chrome process behind.
        driver.quit()
# Class name passed to extract_links_by_class for every operator page.
div_locator_1 = "route"

# One inner list of (route, route_link) tuples per operator page, kept in the
# same order as bus_route_link.
routes = []
for link in bus_route_link:
    operator_routes = extract_links_by_class(link, div_locator_1)
    routes.append(operator_routes)

In [None]:
def convert_to_datetime(time_str, reference_date):
    """Place an ``HH:MM`` clock time on the calendar day of ``reference_date``.

    Args:
        time_str: Clock time in ``%H:%M`` format.
        reference_date: Object supplying ``year``, ``month`` and ``day``.

    Returns:
        A ``datetime`` on the reference day at the given hour and minute
        (seconds and microseconds are zero), or ``None`` when ``time_str``
        is not a valid ``%H:%M`` time.
    """
    try:
        clock = datetime.strptime(time_str, '%H:%M')
        # Only hour and minute come from the parsed text; the date part is
        # taken from the reference.
        return datetime(
            reference_date.year,
            reference_date.month,
            reference_date.day,
            clock.hour,
            clock.minute,
        )
    except ValueError:
        return None

    
def _scroll_to_bottom(driver, pause=2, max_rounds=50):
    """Scroll down repeatedly until the document height stops growing.

    ``max_rounds`` bounds the loop so a page that keeps growing cannot stall
    the notebook; ``pause`` is the wait (seconds) after each scroll.
    """
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _round in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # Wait to load the page
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _field_text(bus, selector, default="N/A"):
    """Return the text of the first element matching ``selector`` inside ``bus``, or ``default``."""
    try:
        return bus.find_element(By.CSS_SELECTOR, selector).text
    except Exception:
        # Best effort: a missing or stale field must not abort the whole card.
        return default


def _parse_bus(bus, route, route_link):
    """Turn one bus card element into the 10-field tuple stored in ``bus_details``."""
    # One timestamp for both times, so they cannot land on different days
    # when the scrape straddles midnight.
    scraped_at = datetime.now()

    busname = _field_text(bus, "div.travels.lh-24.f-bold.d-color")
    bustype = _field_text(bus, "div.bus-type.f-12.m-top-16.l-color.evBus")
    duration = _field_text(bus, "div.dur.l-color.lh-24")

    departing_text = _field_text(bus, "div.dp-time.f-19.d-color.f-bold", default=None)
    departing_time_dt = (
        convert_to_datetime(departing_text, scraped_at) if departing_text is not None else None
    )

    reaching_text = _field_text(bus, "div.bp-time.f-19.d-color.disp-Inline", default=None)
    reaching_time_dt = (
        convert_to_datetime(reaching_text, scraped_at) if reaching_text is not None else None
    )
    # An arrival clock time earlier than the departure is treated as next-day.
    if reaching_time_dt and departing_time_dt and reaching_time_dt < departing_time_dt:
        reaching_time_dt += timedelta(days=1)

    try:
        star_rating = float(_field_text(bus, "div.rating-sec.lh-24"))
    except ValueError:
        # Covers the "N/A" placeholder as well as any non-numeric text.
        star_rating = 0.0

    try:
        price_text = _field_text(bus, "span.f-19.f-bold")
        price = float(price_text.replace('₹', '').replace(',', '').strip())
    except ValueError:
        price = None

    # Try the first selector for seats available, then the alternative one.
    seats_text = _field_text(bus, "div.seat-left.m-top-16", default=None)
    if seats_text is None:
        seats_text = _field_text(bus, "div.seat-left.m-top-30", default="")
    try:
        # The leading token of the text is used as the seat count.
        seats_available = int(seats_text.split()[0])
    except (IndexError, ValueError):
        seats_available = 0

    return (route, route_link, busname, bustype, departing_time_dt, duration,
            reaching_time_dt, star_rating, price, seats_available)


# One tuple per bus card found on every route page.
bus_details = []
for operator_routes in routes:
    for route, route_link in operator_routes:
        driver = None
        try:
            driver = webdriver.Chrome()  # Replace with your preferred webdriver
            driver.get(route_link)
            driver.maximize_window()
            time.sleep(5)

            try:
                view_buses_button = driver.find_element(By.CLASS_NAME, 'button')
                view_buses_button.click()
                time.sleep(5)
            except Exception as exc:
                # Best effort: carry on with whatever the page already shows.
                print(f"Could not click the view-buses button on {route_link}: {exc}")

            # Previously a stray `break` here left the route loop as soon as
            # the page height stopped changing, dropping this route and all
            # remaining routes of the operator.
            _scroll_to_bottom(driver)

            bus_elements = driver.find_elements(By.CSS_SELECTOR, "div.bus-item")
            if not bus_elements:
                print("No bus elements found")

            # Elements must be read before the browser is closed.
            for bus in bus_elements:
                bus_details.append(_parse_bus(bus, route, route_link))
        except Exception as exc:
            print(f"Failed to scrape {route_link}: {exc}")
        finally:
            # Always release the browser; one Chrome is started per route.
            if driver is not None:
                driver.quit()



In [None]:
import pandas as pd

# Column labels, in the same order as the fields of each tuple in bus_details.
columns = [
    'route',
    'route_link',
    'busname',
    'bustype',
    'departing_time_dt',
    'duration',
    'reaching_time_dt',
    'star_rating',
    'price',
    'seats_available',
]
redbus_data = pd.DataFrame(data=bus_details, columns=columns)

In [None]:
# Split every "<origin> to <destination>" route label into its two endpoints.
From = []
To = []
location_data = redbus_data['route']
for route_label in location_data:
    # partition() splits on the first " to " only and never raises: a label
    # without the separator yields an empty destination, and one containing
    # it more than once keeps the remainder in the destination. Unpacking
    # split(" to ") into two names raised ValueError in both cases.
    origin, _separator, destination = route_label.partition(" to ")
    From.append(origin)
    To.append(destination)
redbus_data["From"] = From
redbus_data["To"] = To

In [None]:
# Final column layout: endpoints and headline fields first, the raw route
# label and its link last.
column_order = [
    'From',
    'To',
    'busname',
    'price',
    'star_rating',
    'bustype',
    'departing_time_dt',
    'duration',
    'reaching_time_dt',
    'seats_available',
    'route',
    'route_link',
]
redbus_data = redbus_data[column_order]

In [None]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine
import mysql.connector

# Connection settings come from the environment so credentials need not be
# stored in the notebook; the defaults reproduce the previous hard-coded values.
DB_HOST = os.environ.get("REDBUS_DB_HOST", "localhost")
DB_USER = os.environ.get("REDBUS_DB_USER", "root")
DB_PASSWORD = os.environ.get("REDBUS_DB_PASSWORD", "")
DB_NAME = os.environ.get("REDBUS_DB_NAME", "redbus_data")

# Your MySQL connection (server level, no database selected).
# It was previously opened and then neither used nor closed. It is now used to
# make sure the target database exists, which is presumably what it was opened
# for (TODO confirm), and is closed afterwards.
mydb = mysql.connector.connect(
 host=DB_HOST,
 user=DB_USER,
 password=DB_PASSWORD,
)
try:
    cursor = mydb.cursor()
    try:
        cursor.execute(f"CREATE DATABASE IF NOT EXISTS `{DB_NAME}`")
    finally:
        cursor.close()
finally:
    mydb.close()

# Create a SQLAlchemy engine
# User and password are URL-quoted so special characters cannot break the URL.
engine = create_engine(
    f"mysql+mysqlconnector://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}@{DB_HOST}/{DB_NAME}"
)

try:
    # Upload the DataFrame to MySQL; an existing route_data table is replaced.
    redbus_data.to_sql('route_data', con=engine, if_exists='replace', index=False)
finally:
    # Release pooled connections; the engine object itself stays usable.
    engine.dispose()

print("Data uploaded successfully!")
