## 1. Set Up Environment

In [1]:
from bs4 import BeautifulSoup as bs
from datetime import date
from datetime import datetime
from datetime import timedelta
import pandas as pd
import pyodbc
import random
import re
import requests
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import undetected_chromedriver as uc

## 2. Skyscanner Scraper

### 2.1 Change Some Preliminary Options For ChromeDriver To Try And Avoid Bot Detection

In [5]:
options = webdriver.ChromeOptions()
# options.add_argument("--headless")
options.add_argument('--user-data-dir=C:\\Users\\Weihan\\AppData\\Local\\Google\\Chrome\\User Data\\Default')

useragentarray = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36",
]
random_user_agent = random.choice(useragentarray)

options.add_argument(f"--user-agent={random_user_agent}")

options.add_argument("--disable-blink-features=AutomationControlled")

options.add_argument("--incognito")

chrome_driver = "C:/ChromeDrive/chromedriver"

### 2.2 Create The Main Function To Scrape Skyscanner Flights

In [7]:
flight_inputs = {'economy': 'Economy', 
                 'premium economy': 'Premium economy', 
                 'business class': 'Business Class', 
                 'first class': 'First Class'}


def skyscanner_flight(type_='roundtrip', class_='economy', leave="tvr", going="tpe", 
                   depart=date.today(), return_=date.today()+timedelta(days=1), 
                   trav=[1,0], nonstop=False):
    
    ### Handle out of bound dates ###################################################################################
    if (depart - date.today()).days < 0:
        print("depart date cannot be earlier than current date")
        return
    elif (return_ - depart).days < 0:
        print("return date cannot be earlier than depart date")
        return
    elif ((depart - date.today()).days > 360) or ((return_ - date.today()).days > 360):
        print("Selected dates are too far in the future")
        return
    else:
        pass
    
    ### Handle invalid traveller counts #############################################################################
    if trav[0] < 1:
        print("must have at least 1 adult selected")
        return
    elif sum(trav) > 16 or trav[0] > 8 or trav[1] > 8:
        print("total travellers can't exceed 16")
        return
    else:
        pass
    
    ### If there are children selected ##############################################################################
    if trav[1] > 0:
        num = 1
        child_ages = []
        for i in range(0, trav[1]):
            print(f"Enter the age of child {num} (ages: 0-15)")
            child_ages.append(input())
            num += 1
    
    ### Request the webpage #########################################################################################
    browser = uc.Chrome(executable_path = chrome_driver, options=options)
    browser.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    url = "https://www.skyscanner.ca/"
    browser.get(url)
    
    ### Select ticket "class" and Travellers ########################################################################
    time.sleep(random.uniform(1, 2))
    trav_class_xpath = '//button[@aria-label="Select number of travelers and cabin class"]'
    trav_class_element = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, trav_class_xpath)))
    trav_class_element.click()
    time.sleep(random.uniform(1, 2))
    select = Select(browser.find_element(By.ID, "search-controls-cabin-class-dropdown"))
    select.select_by_visible_text(flight_inputs[class_])
    time.sleep(random.uniform(1, 2))
    
    trav_add_xpath = {1: '//button[@aria-controls="adult-nudger"][2]',
                     2: '//button[@aria-controls="children-nudger"][2]'}
    
    x = 1
    for traveller_type in trav:
        if x == 1:
            for i in range(0, traveller_type - 1):
                browser.find_element(By.XPATH, trav_add_xpath[x]).click()
                time.sleep(random.uniform(1, 2))
        else:
            for i in range(0, traveller_type):
                browser.find_element(By.XPATH, trav_add_xpath[x]).click()
                time.sleep(1)
                select = Select(browser.find_element(By.XPATH, f'//select[@aria-label="Age of child {i+1}"]'))
                select.select_by_value(child_ages[i])
                time.sleep(random.uniform(1, 2))
    
    ### Select Leaving from and Going to locations ##################################################################
    leave_xpath = '//input[@aria-controls="originInput-menu"]'
    leave_element = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, leave_xpath)))
    leave_element.clear
    leave_element.click()
    time.sleep(random.uniform(1, 2))
    leave_element.send_keys(leave)
    time.sleep(1)
    leave_element.send_keys(Keys.DOWN, Keys.ENTER)
    
    going_xpath = '//input[@aria-controls="destinationInput-menu"]'
    going_element = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, going_xpath)))
    going_element.clear
    going_element.click()
    time.sleep(1)
    going_element.send_keys(going)
    time.sleep(random.uniform(1, 2))
    going_element.send_keys(Keys.DOWN, Keys.ENTER)
    
    ### Select Departure & Return Dates #############################################################################
    
    # Toggle direct flights option before dates, because the last popup will change "select" to "search"
    if nonstop:
        nonstop_xpath = '//input[@aria-label="Direct flights"]'
        nonstop_element = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, nonstop_xpath)))
        nonstop_element.click()
        time.sleep(random.uniform(1, 2))
    
    # Calculate the difference in months from our desired date vs display (skyscanner uses today's date)
    month_diff_depart = (depart.year - date.today().year)*12 + (depart.month - date.today().month)
    month_diff_return = (return_.year - depart.year)*12 + (return_.month - depart.month)
    
    depart_format = depart.strftime("%A, %B %#d, %Y")
    return_format = return_.strftime("%A, %B %#d, %Y")
    
    depart_xpath = '//button[@aria-label="Select departure date"]'
    depart_element = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, depart_xpath)))
    depart_element.click()
    time.sleep(random.uniform(1, 2))
    
    if type_ == 'roundtrip':
        # Find and click the depart date
        if month_diff_depart > 0:
            dateNext_xpath = '//button[contains(@aria-label, "Next month")]'
            dateNext_element = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, dateNext_xpath)))
            for i in range(0, month_diff_depart):
                time.sleep(1)
                dateNext_element.click()
                time.sleep(random.uniform(1, 2))
        else:
            pass
        pickDepart_xpath = f'//button[contains(@aria-label, "{depart_format}")]'
        time.sleep(1)
        browser.find_element(By.XPATH, pickDepart_xpath).click()
        time.sleep(1)  
        
        # Find and click the return date
        if month_diff_return > 0:
            dateNext_xpath = '//button[contains(@aria-label, "Next month")]'
            dateNext_element = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, dateNext_xpath)))
            for i in range(0, month_diff_return):
                time.sleep(random.uniform(1, 2))
                dateNext_element.click()
                time.sleep(1)
        else:
            pass
        pickReturn_xpath = f'//button[contains(@aria-label, "{return_format}")]'
        time.sleep(1)
        browser.find_element(By.XPATH, pickReturn_xpath).click()
        time.sleep(1)  
        
    elif type_ == 'one-way':
        if month_diff_depart > 0:
            dateNext_xpath = '//button[contains(@aria-label, "Next month")]'
            dateNext_element = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, dateNext_xpath)))
            for i in range(0, month_diff_depart):
                time.sleep(1)
                dateNext_element.click()
                time.sleep(random.uniform(1, 2))
        else:
            pass
        pickDepart_xpath = f'//button[contains(@aria-label, "{depart_format}")]'
        time.sleep(1)
        browser.find_element(By.XPATH, pickDepart_xpath).click()
        time.sleep(random.uniform(1, 2))
    
    else:
        print('something broke')
        browser.quit()
        return
    
    ### Click "Search" and wait for results ########################################################################    
    search_xpath = '//button[contains(text(), "Search")]'
    search_element = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, search_xpath)))
    search_element.click()
    time.sleep(random.uniform(6, 7))
    
    select = Select(browser.find_element(By.ID, "secondarySort"))
    select.select_by_visible_text("Cheapest first")
    
    show_more_xpath = '//button[contains(text(), "Show more results")]'
    show_more_element = WebDriverWait(browser, 5).until(EC.presence_of_element_located((By.XPATH, show_more_xpath)))
    show_more_element.click()
    time.sleep(random.uniform(3, 4))
    
    ### Getting the Info and placing it in SQL #####################################################################
    date_scrape = []
    airline = []
    ticket_type = [type_]
    ticket_class = [class_]
    
    ticket_css_selector = '.BpkTicket_bpk-ticket__main--horizontal__YjNmY'
    content = browser.find_elements(By.CSS_SELECTOR, ticket_css_selector) 
    
    price_css_selector = '.BpkTicket_bpk-ticket__stub--horizontal__ZmQzY'
    price_info = browser.find_elements(By.CSS_SELECTOR, price_css_selector)
    
    # Check if there are any available flights
    if len(content) > 0:
        for e in content:
            start = e.get_attribute('innerHTML')
            soup = bs(start, features='lxml')
            raw = soup.get_text().strip()
            
            print(raw)
            print('***')
            
            # Get the time of scrape
            date_scrape.append(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            
        for p in price_info:
            start = p.get_attribute('innerHTML')
            soup = bs(start, features='lxml')
            raw = soup.get_text().strip()
            
            print(raw)
            print('***')

    browser.quit()

In [8]:
skyscanner_flight(class_='economy', leave='yvr', going='tpe', 
                  depart=date(2024,3,3), return_=date(2024,3,23), 
                  trav=[1,0], nonstop=False)

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="secondarySort"]"}
  (Session info: chrome=118.0.5993.70); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00284DE3+43907]
	(No symbol) [0x00210741]
	(No symbol) [0x001033ED]
	(No symbol) [0x00136760]
	(No symbol) [0x00136C2B]
	(No symbol) [0x00166F62]
	(No symbol) [0x00152BA4]
	(No symbol) [0x001655CA]
	(No symbol) [0x00152956]
	(No symbol) [0x0012E17E]
	(No symbol) [0x0012F32D]
	GetHandleVerifier [0x00535AF9+2865305]
	GetHandleVerifier [0x0057E78B+3163435]
	GetHandleVerifier [0x00578441+3138017]
	GetHandleVerifier [0x0030E0F0+605840]
	(No symbol) [0x0021A64C]
	(No symbol) [0x00216638]
	(No symbol) [0x0021675F]
	(No symbol) [0x00208DB7]
	BaseThreadInitThunk [0x7553FCC9+25]
	RtlGetAppContainerNamedObjectPath [0x77187C6E+286]
	RtlGetAppContainerNamedObjectPath [0x77187C3E+238]
	(No symbol) [0x00000000]
