In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import time

# Setup WebDriver
service = Service("/opt/homebrew/Caskroom/chromedriver/125.0.6422.60/chromedriver-mac-arm64/chromedriver")
driver = webdriver.Chrome(service=service)

# Navigate to the login page
driver.get("https://stathead.com/users/login.cgi")  # Update if the login URL differs
print("Please log in to the website. Proceed to the next cell once logged in.")


Please log in to the website. Proceed to the next cell once logged in.


In [2]:
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException


def fetch_csv_data(url):
    driver.get(url)
    wait = WebDriverWait(driver, 10)  # Setup wait for up to 10 seconds

    # Check for "no results" message before proceeding
    no_results_text = "Sorry, there are no results for your search."
    page_content = driver.page_source
    if no_results_text in page_content:
        print("No more results to fetch.")
        return None

    try:
        # Find and hover over the parent element to make the export button visible
        hover_element = wait.until(EC.visibility_of_element_located((By.XPATH, '//li[@class="hasmore"]/span[text()="Export Data"]')))
        ActionChains(driver).move_to_element(hover_element).perform()

        # Now wait for the specific export button to become clickable after the hover
        export_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//button[@class="tooltip" and contains(@tip, "suitable for use with Excel") and text()="Get table as CSV (for Excel)"]')))
        export_button.click()

        # Wait for the CSV data to become visible
        wait.until(EC.visibility_of_element_located((By.ID, 'csv_stats')))
    except TimeoutException as e:
        print("Timeout while waiting for elements:", e)
        return ""
    except Exception as e:
        print("Error interacting with page elements:", e)
        return ""

    # Extract CSV data
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    pre_tag = soup.find('pre', id='csv_stats')
    return pre_tag.text if pre_tag else ""

In [3]:
# NBA foreign
# base_url = "https://stathead.com/basketball/player-season-finder.cgi?request=1&draft_pick_type=overall&comp_type=reg&order_by=ws&match=player_season&season_start=1&display_type=totals&season_end=-1&locationMatch=isnot&pob=USA&comp_id=NBA&year_min=1947&offset="
# NBA all
# base_url = "https://stathead.com/basketball/player-season-finder.cgi?request=1&draft_pick_type=overall&comp_type=reg&order_by=ws&match=player_season&season_start=1&year_max=2024&display_type=totals&season_end=-1&comp_id=NBA&year_min=1947&offset="
# NFL all
# base_url = "https://stathead.com/football/player-season-finder.cgi?request=1&draft_pick_type=overall&comp_type=reg&order_by=av&match=player_season&season_start=1&season_end=-1&weight_max=500&rookie=N&year_min=2012&cstat[1]=pass_cmp&ccomp[1]=gt&cval[1]=0&cstat[2]=rush_att&ccomp[2]=gt&cval[2]=0&cstat[3]=targets&ccomp[3]=gt&cval[3]=0&cstat[4]=all_td&ccomp[4]=gt&cval[4]=0&cstat[5]=sacks&ccomp[5]=gt&cval[5]=0&cstat[6]=def_int&ccomp[6]=gt&cval[6]=0&cstat[7]=punt&ccomp[7]=gt&cval[7]=0&cstat[8]=touches&ccomp[8]=gt&cval[8]=0&offset="
# MLB Batting foreign
# base_url = "https://stathead.com/baseball/player-batting-season-finder.cgi?request=1&draft_pick_type=overall&location=pob&comp_type=reg&order_by=b_war&match=player_season&season_start=1&weight_max=500&season_end=-1&locationMatch=isnot&pob=USA&exactness=anymarked&offset="
# MLB Pitching foreign
# base_url = "https://stathead.com/baseball/player-pitching-season-finder.cgi?request=1&draft_pick_type=overall&location=pob&comp_type=reg&order_by=p_war&match=player_season&season_start=1&p_g=x&weight_max=500&season_end=-1&locationMatch=isnot&pob=USA&offset="
# MLS foreign
base_url = "https://stathead.com/fbref/player-season-finder.cgi?request=1&height_type=height_feet&force_min_year=1&height_max=84&comp_type=c-22&order_by=plus_minus&match=player_season&per90_type=player&weight_max=500&locationMatch=isnot&pob=USA&phase_id=0&comp_gender=m&per90min_val=5&weight_type=lbs&cstat[1]=assisted_shots&ccomp[1]=gt&cval[1]=0&cstat[2]=tackles_won&ccomp[2]=gt&cval[2]=0&cstat[3]=minutes_per_game&ccomp[3]=gt&cval[3]=0&cstat[4]=fouls&ccomp[4]=gt&cval[4]=0&cstat[5]=on_xg_for&ccomp[5]=gt&cval[5]=0&offset="
# NHL foreign
# base_url = "https://stathead.com/hockey/player-season-finder.cgi?request=1&draft_pick_type=overall&comp_type=reg&order_by=goals&match=player_season&season_start=1&season_end=-1&locationMatch=isnot&pob=USA&rookie=N&pos=S&comp_id=NHL&cstat[1]=ops&ccomp[1]=gt&cval[1]=0&cstat[2]=goals_per_game&ccomp[2]=gt&cval[2]=0&offset="
offset = 0
raw_data_list = []

while True:
    url = f"{base_url}{offset}"
    csv_data = fetch_csv_data(url)
    if csv_data is None or "Sorry, there are no results for your search." in csv_data:
        print("Finished fetching all available data.")
        break
    raw_data_list.append(csv_data)
    offset += 200

# Combine all fetched data into one large string
all_csv_data = "\n".join(raw_data_list)


No more results to fetch.
Finished fetching all available data.


In [4]:
def clean_csv_data(all_csv_data):
    lines = all_csv_data.split('\n')
    cleaned_lines = []
    header_found = False
    for line in lines:
        if 'Rk,Player,' in line and header_found:
            continue  # Skip duplicate headers
        elif '--- When using SR' in line or '</' in line or '<a href' in line:
            continue  # Skip lines containing footer content or HTML tags
        cleaned_lines.append(line)
        if 'Rk,Player,' in line:
            header_found = True  # Mark header as found
    
    return '\n'.join(cleaned_lines)

cleaned_csv = clean_csv_data(all_csv_data)


In [5]:
dataframe = pd.read_csv(StringIO(cleaned_csv))
dataframe

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Performance.5,Performance.6,Performance.7,Performance.8,Performance.9,Performance.10,Performance.11,Unnamed: 82,Unnamed: 83,-additional
0,Rk,Player,+/-,KP,TklW,Mn/MP,Fls,onxG,Season,Wt.,...,Gls,Ast,G+A,G-PK,PK,PKatt,PKm,Pos,Birth Location,-9999
1,1,Carlos Vela,+48,92,15,88,12,73.6,2019,161,...,34,10,44,25,9,11,2,FW,Cancún MEX,e0cd04e0
2,2,Mark-Anthony Kaye,+47,51,43,81,49,67.9,2019,154,...,4,8,12,4,0,0,0,MF,Toronto ON,36a38acd
3,3,Eduard Atuesta,+46,48,46,88,44,70.0,2019,157,...,3,8,11,3,0,0,0,MF,Vélez COL,60990b81
4,4,Jakob Glesnes,+46,7,26,90,18,59.3,2022,176,...,0,2,2,0,0,0,0,DF,Bergen NOR,ac65aef3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2670,2657,Justin Hoyte,-28,5,5,87,11,16.1,2019,165,...,0,0,0,0,0,0,0,DF,London GBR,b57b7582
2671,2658,Brenner,-29,27,8,84,18,37.0,2021,154,...,8,1,9,6,2,3,1,FW,Cuiabá BRA,279508bf
2672,2659,Andreas Maxsø,-29,5,5,89,17,35.9,2023,154,...,2,0,2,2,0,0,0,DF,Smørumnedre DNK,52fdf512
2673,2660,Federico Bernardeschi,-29,55,11,84,33,28.0,2023,165,...,5,2,7,4,1,1,0,FWMF,Carrara ITA,ee93c1a9


In [6]:
# filename = 'NBA_all.csv'
# filename = 'NFL_all.csv'
# filename = 'MLB_B_foreign.csv'
# filename = 'MLB_P_foreign.csv'
filename = 'MLS_foreign.csv'
# filename = 'NHL_foreign.csv'
dataframe.to_csv(filename, index=False)
print(f"Data saved to {filename}.")


Data saved to MLS_foreign.csv.
