In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

def PEAR_Win_Prob(home_pr, away_pr):
    """Return the first team's win probability as a percentage.

    Uses a base-10 logistic curve on the rating gap: a 10-point edge
    corresponds to 10:1 odds in favor of the higher-rated side.

    Args:
        home_pr: Rating of the team whose win probability is returned.
        away_pr: Rating of the opposing team.

    Returns:
        Win probability on a 0-100 scale, rounded to two decimals.
    """
    exponent = -(home_pr - away_pr) / 10
    return round(1 / (1 + 10 ** exponent) * 100, 2)

# Root of the NCAA site; the stat links scraped below are appended to it
base_url = "https://www.ncaa.com"
# Landing page for Division I baseball statistics
stats_page = base_url + "/stats/baseball/d1"

# Function to get page content
def get_soup(url, timeout=30):
    """Download *url* and return its HTML parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on a stalled connection.

    Returns:
        A BeautifulSoup document built with the "html.parser" backend.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        requests.exceptions.Timeout: If no response arrives within *timeout* seconds.
    """
    response = requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=timeout,
    )
    response.raise_for_status()  # Ensure request was successful
    return BeautifulSoup(response.text, "html.parser")

# Get main page content
soup = get_soup(stats_page)

# Find the dropdown container and extract stat URLs
dropdown = soup.find("select", {"id": "select-container-team"})
if dropdown is None:
    # soup.find returns None when the element is absent; fail with a clear
    # message instead of an AttributeError on the next line.
    raise RuntimeError(
        f"Team stat dropdown 'select-container-team' not found on {stats_page}"
    )
options = dropdown.find_all("option")

# Map each stat's display name to its absolute URL; options without a
# "value" attribute (or with an empty one) are skipped.
stat_links = {
    option.text.strip(): base_url + option["value"]
    for option in options if option.get("value")
}

# Scrape the NCAA RPI rankings table into the `rpi` DataFrame
url = "https://www.ncaa.com/rankings/baseball/d1/rpi"
response = requests.get(url)
response.raise_for_status()  # Ensure request was successful
soup = BeautifulSoup(response.text, "html.parser")
table = soup.find("table", class_="sticky")
if not table:
    print("Table not found.")
else:
    headers = [th.text.strip() for th in table.find_all("th")]
    # The first <tr> is skipped as the header row
    data = [
        [col.text.strip() for col in row.find_all("td")]
        for row in table.find_all("tr")[1:]
    ]
    rpi = pd.DataFrame(data, columns=headers).drop(columns=['Previous'])
    rpi = rpi.rename(columns={"School": "Team"})

# Scrape the College Baseball Ratings team list into the `cbr` DataFrame
url = "https://www.collegebaseballratings.com/"
response = requests.get(url)
response.raise_for_status()  # Raise an error for failed requests
soup = BeautifulSoup(response.text, "html.parser")
table = soup.find("table", {"id": "teamList"})
headers = [th.text.strip() for th in table.find("thead").find_all("th")]
data = [
    [td.text.strip() for td in row.find_all("td")]
    for row in table.find("tbody").find_all("tr")
]
# The first header is dropped; presumably it has no matching <td> in the
# body rows -- TODO confirm against the live page.
cbr = pd.DataFrame(data, columns=headers[1:]).rename(columns={"Rank": "CBRank"})
# Rewrite CBR team names with literal substring replacements. ORDER MATTERS:
# later entries deliberately repair the output of earlier ones
# (e.g. 'State' -> 'St.' and then 'NC St.' -> 'NC State').
_cbr_replacements_first = [
    ('State', 'St.'),
    ('Southern Miss', 'Southern Miss.'),
    ('NC St.', 'NC State'),
    ('Appalachian St.', 'App State'),
    ('Dallas Baptist', 'DBU'),
    ('USC', 'Southern California'),
    ('Charleston', 'Col. of Charleston'),
    ('Col. of Charleston Southern', 'Charleston So.'),
    ('Georgia Southern', 'Ga. Southern'),
    ('UNC Wilmington', 'UNCW'),
    ('Southern Illinois', 'Southern Ill.'),
    ('Florida Atlantic', 'Fla. Atlantic'),
    ('Lamar', 'Lamar University'),
    ('Western Kentucky', 'Western Ky.'),
    ('Southern California Upstate', 'USC Upstate'),
    ('Southeast Missouri', 'Southeast Mo. St.'),
    ("St. John's", "St. John's (NY)"),
    ('Southeastern Louisiana', 'Southeastern La.'),
    ('Kennesaw', 'Kennesaw St.'),
    ('Louisiana Monroe', 'ULM'),
    ('Western Carolina', 'Western Caro.'),
    ('USF', 'South Fla.'),
    ('Loyola Marymount', 'LMU (CA)'),
    ('Army', 'Army West Point'),
    ('Incarnate Word', 'UIW'),
    ('Central Michigan', 'Central Mich.'),
    ('Eastern Illinois', 'Eastern Ill.'),
    ('Western Michigan', 'Western Mich.'),
    ('Central Arkansas', 'Central Ark.'),
    ('Middle Tennessee', 'Middle Tenn.'),
    ('Monmouth (NJ)', 'Monmouth'),
    ('Northern Kentucky', 'Northern Ky.'),
    ('North Carolina A&T', 'N.C. A&T'),
    ('Texas A&M-Corpus Christi', 'A&M-Corpus Christi'),
    ("Saint Joseph's (PA)", "Saint Joseph's"),
    ('Eastern Kentucky', 'Eastern Ky.'),
    ('Seattle', 'Seattle U'),
    ('Eastern Michigan', 'Eastern Mich.'),
    ('North Alabama', 'North Ala.'),
    ('Northern Colorado', 'Northern Colo.'),
    ('Stephen F. Austin', 'SFA'),
    ('Western Illinois', 'Western Ill.'),
    ('Prairie View A&M', 'Prairie View'),
]
_cbr_replacements_last = [
    ('Arkansas-Pine Bluff', 'Ark.-Pine Bluff'),
    ('Maryland Eastern Shore', 'UMES'),
    ('Mississippi Valley St.', 'Mississippi Val.'),
    ('Alcorn St.', 'Alcorn'),
]

for _old, _new in _cbr_replacements_first:
    cbr['Team'] = cbr['Team'].str.replace(_old, _new, regex=False)

# Whole-name match only, so names that merely contain 'Southern' are untouched
cbr['Team'] = cbr['Team'].map(
    lambda name: 'Southern U.' if name == 'Southern' else name
)

for _old, _new in _cbr_replacements_last:
    cbr['Team'] = cbr['Team'].str.replace(_old, _new, regex=False)

In [None]:
def get_stat_dataframe(stat_name):
    """Collect every page of one NCAA stat table into a single DataFrame.

    Pages are requested as ``<stat url>``, ``<stat url>/p2``, ``<stat url>/p3``
    and so on until a request raises an HTTP error, a page has no table, or
    any other exception occurs.

    Args:
        stat_name: Key into the module-level ``stat_links`` mapping.

    Returns:
        A DataFrame whose "Team" column is left as text and whose other
        columns are converted with ``pd.to_numeric`` (unparseable values
        become NaN), or ``None`` when the stat is unknown or no rows were
        collected.
    """
    if stat_name not in stat_links:
        print(f"Stat '{stat_name}' not found. Available stats: {list(stat_links.keys())}")
        return None

    base_link = stat_links[stat_name]
    collected_rows = []
    current_page = 1

    while True:
        # Page 1 lives at the bare link; later pages append /p<N>
        page_url = base_link if current_page <= 1 else f"{base_link}/p{current_page}"

        try:
            page = get_soup(page_url)
            stat_table = page.find("table")
            if not stat_table:
                print(f"No table found for {stat_name} on page {current_page}")
                break

            # Column names come from the most recently parsed page
            column_names = [th.text.strip() for th in stat_table.find_all("th")]

            # Build the page's rows fully before adding them, so a failure
            # part-way through a page contributes nothing.
            page_rows = [
                [cell.text.strip() for cell in tr.find_all("td")]
                for tr in stat_table.find_all("tr")[1:]  # Skip header row
            ]
            collected_rows.extend(page_rows)
        except requests.exceptions.HTTPError:
            # An HTTP error is taken to mean the previous page was the last
            print(f"{stat_name} Done")
            break
        except Exception as e:
            print(f"An error occurred: {e}")
            break

        current_page += 1

    if not collected_rows:
        print("No data collected.")
        return None

    df = pd.DataFrame(collected_rows, columns=column_names)
    for col in df.columns:
        if col == "Team":
            continue
        df[col] = pd.to_numeric(df[col], errors="coerce")  # invalid values become NaN
    return df

# Example usage
# Batting average table plus per-game / per-at-bat hit rates
stat_name_input = "Batting Average"  # Change this to the desired stat
ba = get_stat_dataframe(stat_name_input)
ba = ba.assign(
    HPG=ba["H"] / ba["G"],
    ABPG=ba["AB"] / ba["G"],
    HPAB=ba["H"] / ba["AB"],
).drop(columns=['Rank'])

# Walks drawn, with a per-game rate
stat_name_input = "Base on Balls"
bb = get_stat_dataframe(stat_name_input)
bb = bb.assign(BBPG=bb["BB"] / bb["G"]).drop(columns=['Rank', 'G'])

# Double plays per game
stat_name_input = "Double Plays Per Game"
dp = get_stat_dataframe(stat_name_input)
dp = dp.rename(columns={"PG": "DPPG"}).drop(columns=['Rank', 'G'])

# ERA table; "R" is renamed to "RA" so it cannot clash with runs scored
stat_name_input = "Earned Run Average"
era = get_stat_dataframe(stat_name_input)
era = era.rename(columns={"R": "RA"}).drop(columns=['Rank', 'G'])

# Fielding percentage with assists and errors per game
stat_name_input = "Fielding Percentage"
fp = get_stat_dataframe(stat_name_input)
fp = fp.assign(
    APG=fp["A"] / fp["G"],
    EPG=fp["E"] / fp["G"],
).drop(columns=['Rank', 'G'])

# Hits allowed per nine innings (stored under the name HAPG)
stat_name_input = "Hits Allowed Per Nine Innings"
ha = get_stat_dataframe(stat_name_input)
ha = ha.rename(columns={"PG": "HAPG"}).drop(columns=['Rank', 'G', 'IP'])

# Home runs per game
stat_name_input = "Home Runs Per Game"
hr = get_stat_dataframe(stat_name_input)
hr = hr.rename(columns={"PG": "HRPG"}).drop(columns=['Rank', 'G'])

# Some teams can appear more than once in this table. For each such team keep
# only the row with the smallest HR value, then stitch the frame back together.
is_repeated = hr.duplicated('Team', keep=False)
duplicate_teams = hr[is_repeated]
lowest_hr_index = duplicate_teams.groupby('Team')["HR"].idxmin()
filtered_teams = duplicate_teams.loc[lowest_hr_index]
hr_cleaned = hr[~hr["Team"].isin(duplicate_teams["Team"])]
hr = pd.concat([hr_cleaned, filtered_teams], ignore_index=True)

# On-base percentage, plus hit-by-pitch per game
stat_name_input = "On Base Percentage"
obp = get_stat_dataframe(stat_name_input)
obp = obp.rename(columns={"PCT": "OBP"})
obp = obp.assign(HBPPG=obp["HBP"] / obp["G"]).drop(
    columns=['Rank', 'G', 'AB', 'H', 'BB', 'SF', 'SH']
)

# Runs scored ("R" becomes "RS") and runs per game
stat_name_input = "Runs"
runs = get_stat_dataframe(stat_name_input)
runs = (
    runs.assign(RPG=runs["R"] / runs["G"])
    .rename(columns={"R": "RS"})
    .drop(columns=['Rank', 'G'])
)

# Sacrifice bunts: the source column "SH" is stored as "SB" in this notebook
stat_name_input = "Sacrifice Bunts"
sb = get_stat_dataframe(stat_name_input)
sb = sb.rename(columns={"SH": "SB"})
sb = sb.assign(SBPG=sb["SB"] / sb["G"]).drop(columns=['Rank', 'G'])

# Sacrifice flies with a per-game rate
stat_name_input = "Sacrifice Flies"
sf = get_stat_dataframe(stat_name_input)
sf = sf.assign(SFPG=sf["SF"] / sf["G"]).drop(columns=['Rank', 'G'])

# Slugging percentage
stat_name_input = "Slugging Percentage"
slg = get_stat_dataframe(stat_name_input)
slg = slg.rename(columns={"SLG PCT": "SLG"}).drop(columns=['Rank', 'G', 'AB'])

# Stolen bases: success rate and per-game rates; "SB" here becomes "STL"
stat_name_input = "Stolen Bases"
stl = get_stat_dataframe(stat_name_input)
steal_attempts = stl["SB"] + stl["CS"]
stl = (
    stl.assign(
        STLP=stl["SB"] / steal_attempts,
        STLPG=stl["SB"] / stl["G"],
        CSPG=stl["CS"] / stl["G"],
        SAPG=steal_attempts / stl["G"],
    )
    .rename(columns={"SB": "STL"})
    .drop(columns=['Rank', 'G'])
)

# Strikeout-to-walk ratio; pitching walks are stored as "PBB"
stat_name_input = "Strikeout-to-Walk Ratio"
kbb = get_stat_dataframe(stat_name_input)
# NOTE(review): IP is rounded here but dropped three lines below
kbb["IP"] = kbb["IP"].round()
kbb = kbb.rename(columns={"K/BB": "KBB", "BB": "PBB"})
kbb = kbb.drop(columns=['Rank', 'App', 'IP'])

# Strikeouts per nine innings
stat_name_input = "Strikeouts Per Nine Innings"
kp9 = get_stat_dataframe(stat_name_input)
kp9 = kp9.rename(columns={"K/9": "KP9"}).drop(columns=['Rank', 'G', 'IP', 'SO'])

# Walks allowed per nine innings
stat_name_input = "Walks Allowed Per Nine Innings"
wp9 = get_stat_dataframe(stat_name_input)
wp9 = wp9.rename(columns={"PG": "WP9"}).drop(columns=['Rank', 'G', 'IP', 'BB'])

# WHIP: only the ratio column is kept alongside Team
stat_name_input = "WHIP"
whip = get_stat_dataframe(stat_name_input)
whip = whip.drop(columns=['Rank', 'HA', 'IP', 'BB'])

# Frames that feed the combined table; every one must carry a "Team" column
dfs = [ba, bb, era, fp, obp, runs, slg, kp9, wp9, whip, cbr]

# Trim stray whitespace from the join key (modifies each frame in place)
for df in dfs:
    df["Team"] = df["Team"].str.strip()

# Inner-join everything on Team, so only teams present in all sources remain
df_combined = dfs[0]
for df in dfs[1:]:
    df_combined = df_combined.merge(df, on="Team", how="inner")

first_occurrence = ~df_combined.columns.duplicated()
baseball_stats = (
    df_combined.loc[:, first_occurrence]
    .sort_values('Team')
    .reset_index(drop=True)
)

# Derived metrics: OPS and a Pythagorean expectation with exponent 1.83
baseball_stats['OPS'] = baseball_stats['SLG'] + baseball_stats['OBP']
rs_pow = baseball_stats['RS'] ** 1.83
ra_pow = baseball_stats['RA'] ** 1.83
baseball_stats['PYTHAG'] = rs_pow / (rs_pow + ra_pow)

Batting Average Done
Base on Balls Done
Double Plays Per Game Done
Earned Run Average Done
Fielding Percentage Done
Hits Allowed Per Nine Innings Done
Home Runs Per Game Done
On Base Percentage Done
Runs Done
Sacrifice Bunts Done
Sacrifice Flies Done
Slugging Percentage Done
Stolen Bases Done
Strikeout-to-Walk Ratio Done
Strikeouts Per Nine Innings Done
Walks Allowed Per Nine Innings Done
WHIP Done


In [7]:
# RPI ranks loaded from a local CSV (file name suggests end of 2024 season)
rpi_2024 = pd.read_csv("./PEAR/PEAR Baseball/rpi_end_2024.csv")

# Feature subset used for the rating model
modeling_stats = baseball_stats[[
    'Team', 'HPG', 'BBPG', 'ERA', 'PCT', 'KP9',
    'WP9', 'OPS', 'WHIP', 'PYTHAG', 'CBRank',
]]
# Left join: teams missing from the CSV keep a NaN Rank
modeling_stats = modeling_stats.merge(rpi_2024[['Team', 'Rank']], on='Team', how='left')
for rank_col in ("Rank", "CBRank"):
    modeling_stats[rank_col] = modeling_stats[rank_col].apply(pd.to_numeric, errors='coerce')

# Map rank 1 to 1.0, falling linearly with rank over the number of teams
n_teams = len(modeling_stats)
modeling_stats['Rank_pct'] = 1 - (modeling_stats['Rank'] - 1) / (n_teams - 1)

higher_better = ["HPG", "BBPG", "PCT", "KP9", "OPS", "Rank_pct", 'PYTHAG']
lower_better = ["ERA", "WP9", "WHIP"]

# Put every feature on a 1-100 scale; lower-is-better stats are negated first
# so that 100 always means "best".
scaler = MinMaxScaler(feature_range=(1, 100))
modeling_stats[higher_better] = scaler.fit_transform(modeling_stats[higher_better])
modeling_stats[lower_better] = scaler.fit_transform(-modeling_stats[lower_better])

# Hand-picked weights for the initial in-house power rating
weights = {
    'HPG': 8, 'BBPG': 8, 'ERA': 22, 'PCT': 8,
    'KP9': 8, 'WP9': 8, 'OPS': 22, 'WHIP': 8, 'PYTHAG': 22, 'Rank_pct': 50
}
weighted_total = 0
for stat, weight in weights.items():
    weighted_total = weighted_total + modeling_stats[stat] * weight
modeling_stats['in_house_pr'] = weighted_total

In [8]:
# Rescale in_house_pr to span 25 points, with the lowest team at 0
desired_range = 25
centered_pr = modeling_stats['in_house_pr'] - modeling_stats['in_house_pr'].mean()
current_range = centered_pr.max() - centered_pr.min()
scaling_factor = desired_range / current_range
scaled_pr = (centered_pr * scaling_factor).round(4)
modeling_stats['in_house_pr'] = scaled_pr - scaled_pr.min()

In [11]:
import pandas as pd
import numpy as np
import pandas as pd
from scipy.optimize import minimize
import numpy as np
from scipy.optimize import differential_evolution
from tqdm import tqdm
# Progress bar sized to the optimizer's iteration cap (maxiter=500)
pbar = tqdm(total=500, desc="Optimization Progress")


def progress_callback(xk, convergence):
    """Advance the progress bar once per optimizer iteration.

    Intended as the ``callback`` of ``scipy.optimize.differential_evolution``.

    Args:
        xk: Best solution found so far (unused).
        convergence: Fractional population convergence reported by SciPy.
            It grows toward 1 as the population converges, and the solver
            halts once it exceeds 1.

    Returns:
        None, so the callback never asks the solver to stop early.
    """
    pbar.update(1)
    # SciPy signals convergence with a value at or above 1. The earlier
    # ``convergence < 1e-4`` test compared against ``tol`` in the wrong
    # direction, i.e. it matched a population that is still far from converged.
    if convergence >= 1:
        pbar.close()

def objective_function(weights):
    """Score a weight vector by how closely its ranking follows CBRank.

    Side effects: overwrites the 'power_ranking', 'calculated_rank' and
    'combined_rank' columns of the module-level ``modeling_stats`` frame on
    every call.

    Args:
        weights: Exactly ten coefficients, ordered HPG, BBPG, ERA, PCT, KP9,
            WP9, WHIP, OPS, PYTHAG, in_house_pr.

    Returns:
        The negated Spearman correlation between the computed rank and
        CBRank, so that minimizing this value maximizes the correlation.
    """
    # Tuple unpacking enforces the expected length of ten
    (w_hpg, w_bbpg, w_era, w_pct, w_kp9,
     w_wp9, w_whip, w_ops, w_pythag, w_in_house) = weights

    weight_by_column = {
        'HPG': w_hpg,
        'BBPG': w_bbpg,
        'ERA': w_era,
        'PCT': w_pct,
        'KP9': w_kp9,
        'WP9': w_wp9,
        'WHIP': w_whip,
        'OPS': w_ops,
        'PYTHAG': w_pythag,
        'in_house_pr': w_in_house,
    }

    # Add the weighted columns left to right, in the order listed above
    terms = [w * modeling_stats[column] for column, w in weight_by_column.items()]
    power = terms[0]
    for term in terms[1:]:
        power = power + term
    modeling_stats['power_ranking'] = power

    # Highest power ranking gets rank 1
    modeling_stats['calculated_rank'] = modeling_stats['power_ranking'].rank(ascending=False)
    modeling_stats['combined_rank'] = modeling_stats['CBRank']

    rank_corr = modeling_stats[['calculated_rank', 'combined_rank']].corr(method='spearman')
    return -rank_corr.loc['calculated_rank', 'combined_rank']

# Search space: the nine stat weights may take either sign, while the
# in_house_pr weight (last entry) is constrained to be non-negative.
bounds = [(-1, 1)] * 9 + [(0, 1)]
result = differential_evolution(objective_function, bounds, strategy='best1bin', maxiter=500, tol=1e-4, seed=42, callback=progress_callback)
# Make sure the bar is finalized even when the run stops at maxiter
pbar.close()
optimized_weights = result.x

# objective_function stores 'power_ranking' as a side effect of EVERY
# evaluation. The optimizer's final evaluation (a population trial or a
# polishing step) is not guaranteed to be the best one, so recompute the
# column with the winning weights before it is used.
objective_function(optimized_weights)
modeling_stats = modeling_stats.sort_values('power_ranking', ascending=False).reset_index(drop=True)

Optimization Progress:  43%|████▎     | 213/500 [01:26<01:30,  3.16it/s]

In [12]:
# Turn power_ranking into a mean-centered Rating spanning 15 points
desired_range = 15
centered_rating = modeling_stats['power_ranking'] - modeling_stats['power_ranking'].mean()
current_range = centered_rating.max() - centered_rating.min()
scaling_factor = desired_range / current_range
scaled_rating = (centered_rating * scaling_factor).round(4)
shifted_rating = scaled_rating - scaled_rating.min()
final_rating = (shifted_rating - shifted_rating.mean()).round(2)
# NOTE(review): this second rounding repeats the one above; kept for parity
modeling_stats['Rating'] = final_rating.round(2)

In [16]:
# Attach the final Rating to the full stat table, best-rated team first
ending_data = (
    baseball_stats.merge(modeling_stats[['Team', 'Rating']], on="Team", how="inner")
    .sort_values('Rating', ascending=False)
    .reset_index(drop=True)
)
ending_data.index += 1  # 1-based index

# Split the "W-L" record string into integer win and loss counts
win_loss = ending_data['Rec'].str.split('-', expand=True).astype(int)
ending_data[['Wins', 'Losses']] = win_loss
games_decided = ending_data['Wins'] + ending_data['Losses']
ending_data['WIN%'] = (ending_data['Wins'] / games_decided).round(3)

# Gap between actual win percentage and the Pythagorean expectation
ending_data['Wins_Over_Pythag'] = ending_data['WIN%'] - ending_data['PYTHAG']
ending_data.sort_values('Wins_Over_Pythag', ascending=False).head(25)

Unnamed: 0,Team,G,AB,H,BA,HPG,ABPG,HPAB,BB,BBPG,...,RPI,Prev,Trend,OPS,PYTHAG,Rating,Wins,Losses,WIN%,Wins_Over_Pythag
59,UNC Greensboro,4,109,22,0.202,5.5,27.25,0.201835,18,4.5,...,,99,23,0.622,0.312527,3.32,3,1,0.75,0.437473
269,Mississippi Val.,1,38,16,0.421,16.0,38.0,0.421053,11,11.0,...,,306,0,1.198,0.582647,-5.47,1,0,1.0,0.417353
226,UNC Asheville,4,120,35,0.292,8.75,30.0,0.291667,17,4.25,...,,204,-4,0.774,0.341008,-3.49,3,1,0.75,0.408992
112,Bowling Green,3,117,31,0.265,10.333333,39.0,0.264957,9,3.0,...,,157,-1,0.741,0.266924,1.55,2,1,0.667,0.400076
84,Western Ky.,2,67,17,0.254,8.5,33.5,0.253731,11,5.5,...,,107,3,0.715,0.617778,2.5,2,0,1.0,0.382222
231,Lipscomb,3,100,18,0.18,6.0,33.333333,0.18,16,5.333333,...,,173,-1,0.554,0.304252,-3.8,2,1,0.667,0.362748
121,Houston,3,92,19,0.207,6.333333,30.666667,0.206522,14,4.666667,...,,119,5,0.649,0.313841,1.22,2,1,0.667,0.353159
96,Arizona St.,4,125,33,0.264,8.25,31.25,0.264,22,5.5,...,,45,-12,0.869,0.651446,2.03,4,0,1.0,0.348554
176,Northwestern,3,99,22,0.222,7.333333,33.0,0.222222,15,5.0,...,,199,-21,0.722,0.322567,-1.25,2,1,0.667,0.344433
140,Stony Brook,3,96,24,0.25,8.0,32.0,0.25,15,5.0,...,,188,4,0.788,0.677433,0.15,3,0,1.0,0.322567


In [13]:
# Show the first 25 rows of the sorted modeling table
modeling_stats.head(25)

Unnamed: 0,Team,HPG,BBPG,ERA,PCT,KP9,WP9,OPS,WHIP,PYTHAG,CBRank,Rank,Rank_pct,in_house_pr,power_ranking,calculated_rank,combined_rank,Rating
0,Florida,74.646341,35.615385,92.294811,68.32,100.0,91.242879,72.017724,96.748175,93.938852,18,11,96.743421,22.9136,10.550835,1.0,18,6.95
1,Florida St.,61.969512,35.615385,95.505307,86.14,72.716535,87.829085,53.639925,92.412409,98.319055,9,5,98.697368,22.1943,10.462993,2.0,9,6.85
2,LSU,63.780488,61.923077,96.089033,81.19,91.425197,100.0,68.693097,100.0,100.0,11,15,95.440789,23.5451,10.154737,3.0,11,6.51
3,UC Santa Barbara,51.103659,14.846154,85.40684,65.35,75.834646,91.985007,48.837687,88.076642,83.131037,33,17,94.789474,19.8141,9.897287,4.0,33,6.23
4,Virginia,49.292683,34.230769,88.850825,91.09,63.362205,83.747376,40.064366,88.076642,83.131037,8,10,97.069079,20.1514,9.877861,5.0,8,6.2
5,South Carolina,42.04878,46.0,90.71875,92.08,85.968504,97.105697,46.713619,100.0,96.159807,13,25,92.184211,21.3991,9.776591,6.0,13,6.09
6,North Carolina,43.859756,43.923077,81.028892,79.21,60.244094,78.997751,37.386194,75.069343,77.60131,22,4,99.023026,19.2342,9.59265,7.0,22,5.89
7,Mississippi St.,51.103659,41.846154,89.84316,60.4,82.850394,85.602699,57.795709,92.051095,94.333229,15,19,94.138158,21.185,9.460131,8.0,15,5.74
8,Arkansas,52.914634,56.384615,88.850825,85.15,53.228346,94.87931,65.737873,90.605839,94.013295,3,6,98.371711,22.1574,9.446654,9.0,3,5.73
9,Vanderbilt,49.292683,41.430769,78.985849,89.11,82.070866,76.771364,47.821828,75.430657,60.713827,19,26,91.858553,18.6973,9.345597,10.0,19,5.61


Data Dictionary

- G: Games
- AB: At Bats
- H: Hits
- BA: Batting Average
- HPG: Hits Per Game
- ABPG: At Bats Per Game
- HPAB: Hits Per At Bat
- BB: Walks
- BBPG: Walks Per Game
- DP: Double Plays
- DPPG: Double Plays Per Game
- IP: Innings Pitched
- RA: Runs Allowed
- ER: Earned Runs
- ERA: Earned Run Average
- PO: Put Outs
- A: Assists
- E: Errors
- PCT: Fielding Percentage
- APG: Assists Per Game
- EPG: Errors Per Game
- HA: Hits Allowed
- HAPG: Hits Allowed Per Nine Innings
- HR: Home Runs Hit
- HRPG: Home Runs Hit Per Game
- HBP: Hit By Pitch
- OBP: On Base Percentage
- HBPPG: Hit By Pitch Per Game
- RS: Runs Scored
- RPG: Runs Scored Per Game
- SB: Sacrifice Bunts
- SBPG: Sacrifice Bunts Per Game
- SF: Sacrifice Flies
- SFPG: Sacrifice Flies Per Game
- TB: Total Bases
- SLG: Slugging Percentage
- STL: Stolen Bases
- CS: Caught Stealing
- STLP: Stolen Bases Success Percentage
- STLPG: Stolen Bases Per Game
- CSPG: Caught Stealing Per Game
- SAPG: Stealing Attempts Per Game
- SO: Pitching Strike Outs
- PBB: Pitching Walks
- KBB: Strikeouts to Walk Ratio
- KP9: Strikeouts Per Nine
- WP9: Walks Allowed Per Nine
- WHIP: Walks plus Hits per Inning Pitched

In [None]:
# # Scrape all stats at once
# for stat_name, url in stat_links.items():
#     print(f"Scraping: {stat_name} ({url})")
    
#     # Get stats page content
#     soup = get_soup(url)
    
#     # Locate table
#     table = soup.find("table")
#     if not table:
#         print(f"No table found for {stat_name}")
#         continue

#     # Extract table headers
#     headers = [th.text.strip() for th in table.find_all("th")]

#     # Extract table rows
#     data = []
#     for row in table.find_all("tr")[1:]:  # Skip header row
#         cols = row.find_all("td")
#         data.append([col.text.strip() for col in cols])

#     # Convert to DataFrame and save
#     df = pd.DataFrame(data, columns=headers)
#     # df.to_csv(f"{stat_name}.csv", index=False)
#     print(f"Saved {stat_name}.csv")

# print("Scraping completed!")