In [1]:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd

In [3]:
#All urls to the Baseball Reference SEC pages for each respective year in the study
sec2022 = 'https://www.baseball-reference.com/register/league.cgi?id=bbbb899c'
sec2023 = 'https://www.baseball-reference.com/register/league.cgi?id=f57c40ea'
sec2024 = 'https://www.baseball-reference.com/register/league.cgi?id=968c06c1'
sec2025 = 'https://www.baseball-reference.com/register/league.cgi?id=82d1384a'


def _fetch_season_page(url, timeout=30):
    """Request a season page and return the response.

    Parameters:
        url: address of the season page to download.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The requests response object for the page.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error page is never stored and later parsed as if it held data.
        requests.Timeout: if the server does not answer within `timeout`.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response


#Page requests to all of the seasons, pausing 2 seconds between consecutive requests
page_sec2022 = _fetch_season_page(sec2022)
print('2022')
time.sleep(2)

page_sec2023 = _fetch_season_page(sec2023)
print('2023')
time.sleep(2)

page_sec2024 = _fetch_season_page(sec2024)
print('2024')
time.sleep(2)

page_sec2025 = _fetch_season_page(sec2025)
print('2025')

2022
2023
2024
2025


In [4]:
#Parse the body of every season response into a BeautifulSoup tree, ordered 2022 through 2025
sec_soups = [
    BeautifulSoup(response.text, 'html')
    for response in (page_sec2022, page_sec2023, page_sec2024, page_sec2025)
]

In [6]:
#Accumulates one dictionary of stats per player-season as the scrape runs
loading_list = list()

#Maps each stat that is kept to the position of its cell within a stats-table row
stat_index_dict = dict([
    ('age', 0),
    ('team', 2),
    ('conference', 3),
    ('PA', 7),
    ('BB', 17),
    ('SO', 18),
    ('OBP', 20),
    ('SLG', 21),
    ('OPS', 22),
])

#Player ids that were already processed, used to avoid collecting the same player twice
seen_ids = set()

In [20]:
#Number of characters in the '/register/player.fcgi?id=' prefix that precedes the id in a player link
PLAYER_ID_OFFSET = len('/register/player.fcgi?id=')

#Seconds to pause after every team or player request
REQUEST_DELAY = 6


def _fetch_soup(url, timeout=30):
    """Download url, pause REQUEST_DELAY seconds, and return the parsed page.

    Raises:
        requests.HTTPError: on a 4xx/5xx response, so an error page is never
            parsed as if it were a real team or player page.
        requests.Timeout: if the server does not answer within `timeout` seconds.
    """
    response = requests.get(url, timeout=timeout)
    #Pause before checking the status so the delay between requests is kept even on failure
    time.sleep(REQUEST_DELAY)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html')


#season is the soup for a particular SEC season page
for season in sec_soups:

    #team is the link for a particular team in a season
    for team in season.find_all('a', title=True):

        team_url = 'https://www.baseball-reference.com{0}'.format(team['href'])
        team_soup = _fetch_soup(team_url)

        #Links to the player pages found on the team page
        players = [plyr for plyr in team_soup.find_all('a', href=True) if "/register/player.fcgi?" in plyr['href']]

        #player is the link to the player's college career page
        for player in players:

            #this substring extracts the uniquely identifiable code for a player from the url
            player_id = player['href'][PLAYER_ID_OFFSET:]

            if player_id in seen_ids:
                continue

            id_url = 'https://www.baseball-reference.com/register/player.fcgi?id={0}'.format(player_id)
            player_soup = _fetch_soup(id_url)

            #Record the player as soon as the page was fetched successfully: a player
            #without any matching rows would otherwise be requested again on every later link
            seen_ids.add(player_id)

            #Rows of the player's page that carry the "college full" class, one per season
            #NOTE(review): stat_index_dict assumes every such row shares one column order - confirm
            season_rows = player_soup.find_all('tr', class_="college full")
            if not season_rows:
                continue

            #The page heading holds the player's name; it is the same for every row
            player_name = player_soup.find('h1').get_text().strip()

            for plyr_season in season_rows:

                #Look the cells up once per row instead of once per stat
                cells = plyr_season.find_all('td')

                temp_dict = {}
                temp_dict["player_id"] = player_id
                temp_dict["name"] = player_name
                #The text of the first link in the row is stored as the year
                temp_dict["year"] = plyr_season.find_all('a')[0].get_text()
                for stat, index in stat_index_dict.items():
                    temp_dict[stat] = cells[index].get_text()

                loading_list.append(temp_dict)
                print(temp_dict)
                

{'player_id': 'beck--001jor', 'name': 'Jordan Beck', 'year': '2020', 'age': '19', 'team': 'Tennessee', 'conference': 'SEC', 'PA': '48', 'BB': '8', 'SO': '11', 'OBP': '.396', 'SLG': '.475', 'OPS': '.871'}
{'player_id': 'beck--001jor', 'name': 'Jordan Beck', 'year': '2021', 'age': '20', 'team': 'Tennessee', 'conference': 'SEC', 'PA': '289', 'BB': '24', 'SO': '60', 'OBP': '.336', 'SLG': '.523', 'OPS': '.859'}
{'player_id': 'beck--001jor', 'name': 'Jordan Beck', 'year': '2022', 'age': '21', 'team': 'Tennessee', 'conference': 'SEC', 'PA': '297', 'BB': '37', 'SO': '62', 'OBP': '.391', 'SLG': '.595', 'OPS': '.986'}
{'player_id': 'booker000kyl', 'name': 'Kyle Booker', 'year': '2021', 'age': '19', 'team': 'Tennessee', 'conference': 'SEC', 'PA': '68', 'BB': '7', 'SO': '18', 'OBP': '.382', 'SLG': '.448', 'OPS': '.831'}
{'player_id': 'booker000kyl', 'name': 'Kyle Booker', 'year': '2022', 'age': '20', 'team': 'Tennessee', 'conference': 'SEC', 'PA': '55', 'BB': '7', 'SO': '12', 'OBP': '.364', 'SLG':

In [21]:
#Build a single table from the collected player-season dictionaries (one row per dictionary)
df = pd.DataFrame(data=loading_list)

In [22]:
#Preview the assembled table; pandas abbreviates a long frame with an ellipsis row
df

Unnamed: 0,player_id,name,year,age,team,conference,PA,BB,SO,OBP,SLG,OPS
0,beck--001jor,Jordan Beck,2020,19,Tennessee,SEC,48,8,11,.396,.475,.871
1,beck--001jor,Jordan Beck,2021,20,Tennessee,SEC,289,24,60,.336,.523,.859
2,beck--001jor,Jordan Beck,2022,21,Tennessee,SEC,297,37,62,.391,.595,.986
3,booker000kyl,Kyle Booker,2021,19,Tennessee,SEC,68,7,18,.382,.448,.831
4,booker000kyl,Kyle Booker,2022,20,Tennessee,SEC,55,7,12,.364,.356,.719
...,...,...,...,...,...,...,...,...,...,...,...,...
1984,patter004chr,Chris Patterson,2025,18,Missouri,SEC,125,10,41,.304,.393,.697
1985,picare000bra,Brady Picarelli,2025,19,Missouri,SEC,56,7,15,.393,.604,.997
1986,seals-000pie,Pierre Seals,2024,20,Memphis,Amer,241,31,63,.436,.553,.988
1987,seals-000pie,Pierre Seals,2025,21,Missouri,SEC,172,17,58,.374,.446,.820


In [23]:
#Write the unprocessed scrape results to disk, leaving the row index out of the file
output_csv = 'raw_sec_player_stats.csv'
df.to_csv(output_csv, index=False)