# Tyler Albert, Jake Angelucci
## Data Wrangling Project Fall 2024
### Website Scraped: https://www.pro-football-reference.com/years/2010/draft.htm
### Website homepage: https://www.pro-football-reference.com/
#### In this notebook we scrape the URL above for data on players drafted into the NFL from 2009 through 2019. For each player we collect the year they were drafted, the overall pick number, the player's name, and their position. Note that our code builds the URL with an f-string so that the year segment (shown as "2010" in the example URL above) changes to whichever year is currently being scraped, which keeps the scrape within the chosen year range. The website's homepage is listed above; from there we navigated to the "draft" page and selected the year 2009 to start our scrape.

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import random

# Build a headless Chrome session; webdriver_manager provides the driver path
chrome_options = Options()
for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    # "--headless" runs Chrome without a visible window
    chrome_options.add_argument(flag)
service = Service(ChromeDriverManager().install())
browser = webdriver.Chrome(service=service, options=chrome_options)

# Function to scroll down the page randomly
def random_scroll(browser, total_wait_time=5):
    """Scroll down the page in a random number of equal steps, pausing after each.

    The number of steps is drawn from 3 to 7. Each pause is drawn uniformly
    from 0.5x to 1.5x of ``total_wait_time / steps``. After the last step the
    page is scrolled to its current bottom.

    Args:
        browser: Object exposing ``execute_script`` (the Selenium driver here).
        total_wait_time: Base number of seconds split across the pauses.
    """
    page_height = browser.execute_script("return document.body.scrollHeight")
    n_steps = random.randint(3, 7)
    step_px = page_height // n_steps
    base_pause = total_wait_time / n_steps
    shortest, longest = 0.5 * base_pause, 1.5 * base_pause

    for _ in range(n_steps):
        browser.execute_script("window.scrollBy(0, arguments[0]);", step_px)
        time.sleep(random.uniform(shortest, longest))

    # Integer division may leave the last step short of the bottom, so jump there
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Function to scrape draft data from the page
def scrape_draft(browser):
    """Read pick, player and position from the draft table on the current page.

    Waits up to 10 seconds for a table whose id contains "drafts", then walks
    its body rows. A row is kept only if it has more than three ``td`` cells
    and all three extracted fields are non-empty. Errors are printed instead
    of raised, so a failure returns whatever was collected before it.

    Args:
        browser: Selenium driver already loaded with a draft page.

    Returns:
        Tuple ``(picks, players, positions)`` of equal-length lists.
    """
    picks, players, positions = [], [], []
    table_xpath = '//table[contains(@id,"drafts")]'
    rows_xpath = '//table[contains(@id,"drafts")]//tbody//tr'

    try:
        # Block until the table exists in the DOM
        wait = WebDriverWait(browser, 10)
        wait.until(EC.presence_of_element_located((By.XPATH, table_xpath)))

        for row in browser.find_elements(By.XPATH, rows_xpath):
            try:
                cells = row.find_elements(By.TAG_NAME, 'td')

                # Rows without enough td cells are not data rows
                if len(cells) <= 3:
                    continue

                # td indexes: 0 = pick, 2 = player, 3 = position
                draft_pick = cells[0].text
                player = cells[2].text
                position = cells[3].text

                # Drop the row unless every required field has content
                if not (draft_pick and player and position):
                    continue

                picks.append(draft_pick)
                players.append(player)
                positions.append(position)
            except Exception as e:
                print(f"Error encountered while processing row: {e}")
    except Exception as e:
        print(f"Error encountered while locating table rows: {e}")

    return picks, players, positions

# Accumulators for the combined results; position i in each list describes one pick
all_picks = []
all_players = []
all_positions = []
all_years = []

# Years to scrape (2009 through 2019 inclusive)
years = list(range(2009, 2020))

# try/finally guarantees the Chrome session is shut down even when the run is
# interrupted or building/saving the DataFrame fails.
try:
    # Scrape data for each year
    for year in years:
        try:
            url = f'https://www.pro-football-reference.com/years/{year}/draft.htm'
            print(f"Accessing URL: {url}")
            browser.get(url)
            # Random pause and scrolling between page loads
            time.sleep(random.uniform(3, 7))
            random_scroll(browser)

            picks, players, positions = scrape_draft(browser)
            all_picks.extend(picks)
            all_players.extend(players)
            all_positions.extend(positions)
            # Tag every row scraped from this page with its draft year
            all_years.extend([year] * len(players))
        except Exception as e:
            # Best effort: report the failed year and move on to the next one
            print(f"Error encountered for year {year}: {e}")

    # Create DataFrame
    draft_data = pd.DataFrame({
        'Year': all_years,
        'Overall Pick': all_picks,
        'Player Name': all_players,
        'Position': all_positions
    })

    # Save to CSV (skipped when nothing was scraped)
    if not draft_data.empty:
        draft_data.to_csv('players_draft_data_2009_2019.csv', index=False)
        print("Data saved to players_draft_data_2009_2019.csv")
    else:
        print("No data scraped. The DataFrame is empty.")
finally:
    # Close the browser
    browser.quit()

Accessing URL: https://www.pro-football-reference.com/years/2009/draft.htm
Accessing URL: https://www.pro-football-reference.com/years/2010/draft.htm
Accessing URL: https://www.pro-football-reference.com/years/2011/draft.htm
Accessing URL: https://www.pro-football-reference.com/years/2012/draft.htm
Accessing URL: https://www.pro-football-reference.com/years/2013/draft.htm
Accessing URL: https://www.pro-football-reference.com/years/2014/draft.htm
Accessing URL: https://www.pro-football-reference.com/years/2015/draft.htm
Accessing URL: https://www.pro-football-reference.com/years/2016/draft.htm
Accessing URL: https://www.pro-football-reference.com/years/2017/draft.htm
Accessing URL: https://www.pro-football-reference.com/years/2018/draft.htm
Accessing URL: https://www.pro-football-reference.com/years/2019/draft.htm
Data saved to players_draft_data_2009_2019.csv


In [3]:
# Load the draft CSV from disk and show its first five rows
scraped_draft = pd.read_csv(filepath_or_buffer='players_draft_data_2009_2019.csv')
display(scraped_draft.head(n=5))

Unnamed: 0,Year,Overall Pick,Player Name,Position
0,2009,1,Matthew Stafford,QB
1,2009,2,Jason Smith,T
2,2009,3,Tyson Jackson,DE
3,2009,4,Aaron Curry,LB
4,2009,5,Mark Sanchez,QB
