In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import pandas as pd

In [2]:
#selenium path to chromedriver
path = Service("C:\\Users\\siyun\\Downloads\\chromedriver-win64\\chromedriver-win64\\chromedriver.exe")

In [3]:
#scrapes headers from nfl combine website to be used as column names for dataframe
NFL_url = 'https://nflcombineresults.com/nflcombinedata.php?'

driver = webdriver.Chrome(service=path)
driver.get(NFL_url)

headers = driver.find_elements(By.TAG_NAME, "thead")
for element in headers:
    text = element.text

col_names = text.split("\n")
col_names = col_names[:-1] + col_names[-1].split(' ')
#col_names.remove('Wonderlic')

driver.quit()

In [4]:
#create data table to hold all scraped values
data_table = []

In [5]:
#scrape player combine data
for year in range(1987, 2024):
    
    NFL_url = 'https://nflcombineresults.com/nflcombinedata.php?year={yrs}&pos=&college='.format(yrs=year)
    driver = webdriver.Chrome(service=path)
    driver.get(NFL_url)

    table = driver.find_element(By.ID, 'datatable')
    rows = table.find_elements(By.TAG_NAME, 'tr')

    for row in rows[1:]:  # Skip the header row
        cols = row.find_elements(By.TAG_NAME, 'td')
        player_data = [col.text if col.text.strip() != '' else '0' for col in cols]
        data_table.append(player_data)
    driver.quit()

In [13]:
#transform data table into dataframe
df_all_players = pd.DataFrame(data_table, columns=col_names)
#export dataframe to csv file to checkpoint progress
df_all_players.to_csv('csv_files//combine_dataset.csv', index=False)

In [3]:
#create data table to hold all pro bowl player data
all_pro_players = []

In [4]:
for year in range(1987, 1988):

    NFL_url ='https://www.pro-football-reference.com/years/{yrs}/probowl.htm'.format(yrs=year)
    driver = webdriver.Chrome(service=path)
    driver.get(NFL_url)

    table = driver.find_element(By.ID, 'pro_bowl')
    rows = table.find_elements(By.TAG_NAME, 'tr')

    for row in rows[1:]:  # Skip the header row
        cols = row.find_elements(By.TAG_NAME, 'td')
        pos = row.find_element(By.TAG_NAME, 'th')
        player_data = [col.text if col.text.strip() != '' else '0' for col in cols]
        player_data.append(year)
        player_data.append(pos)
        all_pro_players.append(player_data)
    driver.quit()

KeyboardInterrupt: 

In [4]:
#scrape pro bowl player data
for year in range(1987, 2023):
    NFL_url ='https://www.pro-football-reference.com/years/{yrs}/probowl.htm'.format(yrs=year)
    driver = webdriver.Chrome(service=path)
    driver.get(NFL_url)

    # Use a try-except block to handle potential errors
    try:
        table = driver.find_element(By.ID, 'pro_bowl')
        rows = table.find_elements(By.TAG_NAME, 'tr')

        for row in rows[1:]:  # Skip the header row
            cols = row.find_elements(By.TAG_NAME, 'td')
            # Use the 'th' element's 'scope' attribute to check if it's a row header
            pos = row.find_element(By.TAG_NAME, 'th').get_attribute('innerText').strip() if row.find_element(By.TAG_NAME, 'th').get_attribute('scope') == 'row' else ''
            player_data = [col.get_attribute('innerText').strip() if col.get_attribute('innerText').strip() != '' else '0' for col in cols]
            if pos:  # Only add rows that have a position element
                player_data.insert(0, pos)  # Insert the position at the start of the list
                player_data.append(year)
                all_pro_players.append(player_data)
    except Exception as e:
        print(f"An error occurred for year {year}: {e}")
    driver.quit() 

In [None]:
print(all_pro_players.__str__())

[['LILB', 'Fredd Young', 'AFC', 'SEA', '26', '3', '13', '13', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '9.0', '1', 'PFW: 1st Tm All-Conf., UPI: 1st Tm All-Conf., AP: 1st Tm, FW: 1st Tm, NEA: 1st Tm, PFW: 1st Tm, SN: 1st Tm', 1987], ['P', 'Jim Arnold', 'NFC', 'DET', '26', '4', '11', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0.0', '0', 'PFW: 1st Tm All-Conf., UPI: 1st Tm All-Conf., AP: 1st Tm, FW: 1st Tm, NEA: 1st Tm, PFW: 1st Tm, SN: 1st Tm', 1987], ['SS', 'Joey Browner', 'NFC', 'MIN', '27', '4', '12', '12', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1.0', '6', 'PFW: 1st Tm All-Conf., UPI: 1st Tm All-Conf., AP: 1st Tm, FW: 1st Tm, NEA: 1st Tm, PFW: 1st Tm, SN: 1st Tm', 1987], ['LDE', 'Reggie White', 'NFC', 'PHI', '26', '2', '12', '12', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '76', '21.0', '0', 'PFW: 1st Tm All-Conf., UPI: 1st Tm All-Conf., AP: 1st Tm, FW: 1st Tm, NEA: 1st Tm, PFW: 1st Tm, SN: 1st Tm', 1987], ['K

In [5]:
#gather tuples of just the player name, age of probowl and year of probowl inside list called temp_list
temp_list = []
for inner_list in all_pro_players:
    if (len(inner_list)>2):
        temp_list.append([inner_list[0], inner_list[1], inner_list[4], inner_list[-1]])

In [6]:
print(temp_list[-1])

['ST', 'Justin Hardee', '28', 2022]


In [7]:
df_pro_players = pd.DataFrame(temp_list, columns=['POS','Name', 'Age', 'Year'])
print(df_pro_players.head())

    POS             Name Age  Year
0  LILB      Fredd Young  26  1987
1     P       Jim Arnold  26  1987
2    SS     Joey Browner  27  1987
3   LDE     Reggie White  26  1987
4     K  Morten Andersen  27  1987


In [8]:
for ind, val in df_pro_players[['Name']].iterrows():
    if '+' in val[0]:
        df_pro_players['Name'][ind] = df_pro_players['Name'][ind].replace('+', '')
    elif '%' in val[0]:
        df_pro_players['Name'][ind] = df_pro_players['Name'][ind].replace('%', '')
print(df_pro_players.__str__())

       POS             Name Age  Year
0     LILB      Fredd Young  26  1987
1        P       Jim Arnold  26  1987
2       SS     Joey Browner  27  1987
3      LDE     Reggie White  26  1987
4        K  Morten Andersen  27  1987
...    ...              ...  ..   ...
3749   RET  KaVontae Turpin  26  2022
3750   RET      Jamal Agnew  27  2022
3751   RET   Devin Duvernay  25  2022
3752    ST    Jeremy Reaves  26  2022
3753    ST    Justin Hardee  28  2022

[3754 rows x 4 columns]


  if '+' in val[0]:
  elif '%' in val[0]:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pro_players['Name'][ind] = df_pro_players['Name'][ind].replace('+', '')
  if '+' in val[0]:
  elif '%' in val[0]:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pro_players['Name'][ind] = df_pro_players['Name'][ind].replace('+', '')
  if '+' in val[0]:
  elif '%' in val[0]:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pro_players['Name'][ind] = df_pro_players['Name'][ind].replace('+', '')
  if '+' in val[0]:
  eli

In [45]:
df_pro_players.to_csv('pro_players.csv', index=False)