In [1]:
# Import dependencies
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.firefox import GeckoDriverManager
from selenium.common.exceptions import StaleElementReferenceException
import os 

In [2]:
# Start a visible (non-headless) Firefox session controlled through splinter.
# GeckoDriverManager().install() hands back the path of a geckodriver binary,
# which is forwarded to Browser as its `executable_path` keyword.
executable_path = dict(executable_path=GeckoDriverManager().install())
browser = Browser('firefox', headless=False, **executable_path)



Current firefox version is 103.0
Get LATEST geckodriver version for 103.0 firefox
Driver [/Users/zcvalis/.wdm/drivers/geckodriver/macos/v0.31.0/geckodriver] found in cache


In [3]:
# Open the Formula 1 results archive landing page
url = 'https://www.formula1.com/en/results.html'
browser.visit(url)

# Poll for up to 5 seconds until the results filter panel is in the DOM;
# the boolean result is the cell's displayed output.
browser.is_element_present_by_css(
    'div.resultsarchive-filter-wrap',
    wait_time=5,
)

True

In [4]:
# Snapshot the page currently rendered in the browser and parse it
html = browser.html
news_soup = soup(html, features='html.parser')

# Keep only the filter container; the race links are looked up inside it later
slide_elem = news_soup.select_one('div.resultsarchive-filter-container')

In [5]:
# Click accept cookies
# Calling .click() on an empty lookup result raises, which would abort a
# Restart & Run All whenever the consent banner has not rendered yet or is not
# shown at all. Wait up to 5 seconds for the button and click only if it
# actually appears; with no banner there is nothing to dismiss.
if browser.is_element_present_by_id('truste-consent-button', wait_time=5):
    browser.find_by_id('truste-consent-button').click()

In [6]:
# Collect the relative result-page links of the races to scrape.
# `class_` is passed as the full two-class string, so only anchors whose class
# attribute is exactly that string are returned.
partial_link = slide_elem.find_all(
    'a',
    class_='resultsarchive-filter-item-link FilterTrigger',
)

# The fixed [-22:-9] window keeps at most 13 anchors counted from the end of
# the list. NOTE(review): the window is tied to the length of the filter list
# at the time the notebook was run -- re-check it whenever the page changes.
race_partial_hrefs = [
    anchor.get('href')
    for anchor in partial_link[-22:-9]
]

In [7]:
# Show every collected race link together with its position in the list
for count, value in enumerate(race_partial_hrefs):
    print(f"Count: {count}", f"Value: {value}", sep="\n")

Count: 0
Value: /en/results.html/2022/races/1124/bahrain/race-result.html
Count: 1
Value: /en/results.html/2022/races/1125/saudi-arabia/race-result.html
Count: 2
Value: /en/results.html/2022/races/1108/australia/race-result.html
Count: 3
Value: /en/results.html/2022/races/1109/italy/race-result.html
Count: 4
Value: /en/results.html/2022/races/1110/miami/race-result.html
Count: 5
Value: /en/results.html/2022/races/1111/spain/race-result.html
Count: 6
Value: /en/results.html/2022/races/1112/monaco/race-result.html
Count: 7
Value: /en/results.html/2022/races/1126/azerbaijan/race-result.html
Count: 8
Value: /en/results.html/2022/races/1113/canada/race-result.html
Count: 9
Value: /en/results.html/2022/races/1114/great-britain/race-result.html
Count: 10
Value: /en/results.html/2022/races/1115/austria/race-result.html
Count: 11
Value: /en/results.html/2022/races/1116/france/race-result.html
Count: 12
Value: /en/results.html/2022/races/1117/hungary/race-result.html


In [8]:
# Function adapted to Python from http://darrellgrainger.blogspot.com/2012/06/staleelementexception.html
def retryingFindClick(link):
    """Look up a link by partial href and click it, tolerating one stale lookup.

    The lookup and click go through the module-level ``browser``. Each attempt
    performs a fresh lookup, so an element that went stale between lookup and
    click is simply located again.

    Args:
        link: Partial href passed to ``browser.links.find_by_partial_href``.

    Returns:
        True as soon as a click succeeds; False when both attempts raised
        StaleElementReferenceException. Any other exception propagates.
    """
    for _ in range(2):
        try:
            browser.links.find_by_partial_href(link).click()
        except StaleElementReferenceException:
            # Element was replaced under us; look it up again on the next pass
            continue
        return True

    return False

In [9]:
race_result_data = pd.DataFrame()
side_col_data_df = pd.DataFrame()
f1_url = 'https://www.formula1.com'
os.makedirs('./data/', exist_ok=True)

# Scrape data
# For each race: open its result page in the browser, write the race-result
# table to ./data/{count}_0_df.csv, then write the table behind every side-nav
# link of that race to ./data/{count}_{n}_df.csv with n counting from 1.
for count, race_href in enumerate(race_partial_hrefs):
    print(f"{count} iterate")
    print(f"race link | {race_href}")

    # Get full race results link
    race_result_full_url = f'{f1_url}{race_href}'

    # The side-nav links of a race share the race link's directory (everything
    # before the last path segment), so this selector can only match once the
    # side navigation of THIS race is in the DOM. Both classes are chained with
    # dots; a space would instead look for a nested <ArchiveLink> tag and never
    # match.
    race_dir = race_href.rsplit('/', 1)[0]
    side_nav_css = f'a.side-nav-item-link.ArchiveLink[href^="{race_dir}/"]'

    # Click link using partial. If both click attempts went stale, or the
    # race's side navigation does not show up in time, load the page directly
    # so the snapshot taken below cannot be the previous race's page.
    clicked = retryingFindClick(race_href)
    if not (clicked and browser.is_element_present_by_css(side_nav_css, wait_time=5)):
        browser.visit(race_result_full_url)
        browser.is_element_present_by_css(side_nav_css, wait_time=5)
    print(race_result_full_url)

    # Export race result table to CSV (pandas fetches the URL itself,
    # independently of the browser session)
    race_result_data = pd.read_html(race_result_full_url)[0]
    race_result_data.to_csv(f'./data/{count}_0_df.csv', index=False)

    # Get side column data
    # Snapshot the page only now, after the waits above, and parse it
    html = browser.html
    news_soup = soup(html, 'html.parser')
    slide_elem = news_soup.select_one('div.resultsarchive-wrapper')

    # `class_` is the full two-class string, so only anchors whose class
    # attribute is exactly that string are returned. The logged run lists no
    # race-result link here -- presumably the active link carries an extra
    # class (TODO confirm). Do not switch this to a CSS class match: the extra
    # link would shift the CSV numbering.
    side_col_data = slide_elem.find_all('a', class_='side-nav-item-link ArchiveLink')

    # Loop through side column data; numbering starts at 1 because index 0 is
    # taken by the race-result CSV written above
    for i, side_link in enumerate(side_col_data, start=1):
        # Get URL to read table into DF
        side_col_data_partial_link = side_link.get('href')
        print(side_col_data_partial_link)
        side_col_data_full_url = f'{f1_url}{side_col_data_partial_link}'
        # Export side column tables to CSVs
        side_col_data_df = pd.read_html(side_col_data_full_url)[0]
        side_col_data_df.to_csv(f'./data/{count}_{i}_df.csv', index=False)

0 iterate
race link | /en/results.html/2022/races/1124/bahrain/race-result.html
https://www.formula1.com/en/results.html/2022/races/1124/bahrain/race-result.html
/en/results.html/2022/races/1124/bahrain/fastest-laps.html
/en/results.html/2022/races/1124/bahrain/pit-stop-summary.html
/en/results.html/2022/races/1124/bahrain/starting-grid.html
/en/results.html/2022/races/1124/bahrain/qualifying.html
/en/results.html/2022/races/1124/bahrain/practice-3.html
/en/results.html/2022/races/1124/bahrain/practice-2.html
/en/results.html/2022/races/1124/bahrain/practice-1.html
1 iterate
race link | /en/results.html/2022/races/1125/saudi-arabia/race-result.html
https://www.formula1.com/en/results.html/2022/races/1125/saudi-arabia/race-result.html
/en/results.html/2022/races/1125/saudi-arabia/fastest-laps.html
/en/results.html/2022/races/1125/saudi-arabia/pit-stop-summary.html
/en/results.html/2022/races/1125/saudi-arabia/starting-grid.html
/en/results.html/2022/races/1125/saudi-arabia/qualifying.ht