In [65]:
from bs4 import BeautifulSoup
import requests
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('raw_sec_player_stats.csv')
df

Unnamed: 0,player_id,name,year,age,team,conference,PA,BB,SO,OBP,SLG,OPS
0,beck--001jor,Jordan Beck,2020,19,Tennessee,SEC,48,8.0,11,0.396,0.475,0.871
1,beck--001jor,Jordan Beck,2021,20,Tennessee,SEC,289,24.0,60,0.336,0.523,0.859
2,beck--001jor,Jordan Beck,2022,21,Tennessee,SEC,297,37.0,62,0.391,0.595,0.986
3,booker000kyl,Kyle Booker,2021,19,Tennessee,SEC,68,7.0,18,0.382,0.448,0.831
4,booker000kyl,Kyle Booker,2022,20,Tennessee,SEC,55,7.0,12,0.364,0.356,0.719
...,...,...,...,...,...,...,...,...,...,...,...,...
1984,patter004chr,Chris Patterson,2025,18,Missouri,SEC,125,10.0,41,0.304,0.393,0.697
1985,picare000bra,Brady Picarelli,2025,19,Missouri,SEC,56,7.0,15,0.393,0.604,0.997
1986,seals-000pie,Pierre Seals,2024,20,Memphis,Amer,241,31.0,63,0.436,0.553,0.988
1987,seals-000pie,Pierre Seals,2025,21,Missouri,SEC,172,17.0,58,0.374,0.446,0.820


In [3]:
df['conference'].unique()

array(['SEC', 'SUMT', 'MVC', 'WAC', 'P12', 'ACC', 'SLC', 'Amer', 'BigW',
       'CUSA', 'B12', 'MAC', 'SBC', 'GLVC', 'SoCo', 'BigS', 'BTen',
       'ASun', 'SSC', 'CAA', 'MWC', 'OVC', 'AEC', 'MIAA', 'IVY', 'WCC',
       'CCar', 'GSC', 'A10', 'BigE', 'NE10', 'UAA', 'LSC', 'MEAC', 'SWAC',
       'SCIA', 'D1IN', 'Horz', 'MAAC', 'PBC', 'CACC', 'SAC', 'SAA',
       'GNAC', 'AMWC'], dtype=object)

In [4]:
conference_name_map = {
    "ACC": "ACC",
    "America-East": "AEC",
    "American-Athletic": "Amer",
    "ASUN": "ASun",
    "Atlantic-10": "A10",
    "Big-12": "B12",
    "Big-East": "BigE",
    "Big-South": "BigS",
    "Big-Ten": "BTen",
    "Big-West": "BigW",
    "Coastal-Athletic": "CAA",  # same as Colonial
    "Colonial-Athletic": "CAA",  # same as Coastal
    "Conference-USA": "CUSA",
    "Horizon-League": "Horz",
    "Independent": "D1IN",
    "Ivy-League": "IVY",
    "MAAC": "MAAC",
    "Mid-American": "MAC",
    "Missouri-Valley": "MVC",
    "Mountain-West": "MWC",
    "Northeast": "NEC",  # not in your list — placeholder if needed
    "Ohio-Valley": "OVC",
    "Patriot-League": "Patriot",  # not in your list — placeholder if needed
    "SEC": "SEC",
    "Southern": "SoCo",
    "Southland": "SLC",
    "Sun-Belt": "SBC",
    "SWAC": "SWAC",
    "The-Summit-League": "SUMT",
    "West-Coast": "WCC",
    "Western-Athletic": "WAC",
    "Pac-12": "P12",
    "MEAC": "MEAC"
}

In [5]:
conf_rpi_map = {
    2021: {},
    2022: {},
    2023: {},
    2024: {}
}

In [6]:
url = 'https://www.warrennolan.com/baseball/2021/rpi-conference'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html')

In [7]:
trs_with_conf_logo = [
    tr for tr in soup.find_all('tr')
    if tr.find('ul', class_='conf-logo') is not None
]


html_snippet = ''.join(str(tr) for tr in trs_with_conf_logo)
sub_soup = BeautifulSoup(html_snippet, 'html.parser')

for conf in sub_soup.find_all('tr'):
    conference = conf.find('a')['class'][0]
    rpi = conf.find_all('td')[2].text
    conf_rpi_map[2021].update({conference_name_map[conference]: rpi})
    

In [8]:
conf_rpi_map

{2021: {'SEC': '0.5661',
  'B12': '0.5584',
  'ACC': '0.5507',
  'P12': '0.5430',
  'Amer': '0.5154',
  'CUSA': '0.5129',
  'MVC': '0.5093',
  'MWC': '0.5054',
  'SoCo': '0.5046',
  'SBC': '0.4988',
  'BTen': '0.4972',
  'WCC': '0.4950',
  'BigE': '0.4919',
  'BigW': '0.4919',
  'MAAC': '0.4899',
  'CAA': '0.4875',
  'OVC': '0.4846',
  'AEC': '0.4833',
  'SLC': '0.4819',
  'BigS': '0.4804',
  'A10': '0.4769',
  'ASun': '0.4758',
  'SUMT': '0.4749',
  'Horz': '0.4739',
  'MAC': '0.4731',
  'WAC': '0.4590',
  'Patriot': '0.4572',
  'NEC': '0.4550',
  'SWAC': '0.4062',
  'MEAC': '0.3201',
  'IVY': '0.0610'},
 2022: {},
 2023: {},
 2024: {}}

In [9]:
url2 = 'https://www.warrennolan.com/baseball/2022/rpi-conference'
page2 = requests.get(url2)
soup2 = BeautifulSoup(page2.text, 'html')

In [10]:
soup2.find_all('tr')[1:]

[<tr>
 <td class="data-cell data-medium">
 <div class="logo-name-container">
 <div class="logo-subcontainer"><ul class="conf-logo"><li><a class="SEC" href="/baseball/2022/conference/SEC"></a></li></ul></div>
 <div class="name-subcontainer"><a class="blue-black" href="/baseball/2022/conference/SEC">SEC</a></div>
 </div>
 </td>
 <td class="data-cell data-center data-medium">0.5760</td>
 <td class="data-cell data-center data-medium cell-right-black">1</td>
 <td class="data-cell data-center data-medium">264-76</td>
 <td class="data-cell data-center data-medium cell-right-black">0.7765</td>
 <td class="data-cell data-medium">
 <div class="logo-name-container">
 <div class="logo-subcontainer"><ul class="team-logo"><li><a class="Tennessee" href="/baseball/2022/schedule/Tennessee"></a></li></ul></div>
 <div class="name-subcontainer"><a class="blue-black" href="/baseball/2022/schedule/Tennessee">Tennessee</a></div>
 </div>
 </td>
 <td class="data-cell data-center data-medium">1</td>
 </tr>,
 <t

In [11]:
for conf in soup2.find_all('tr')[1:]:
    conference = conf.find('a')['class'][0]
    rpi = conf.find_all('td', 'data-cell data-center data-medium')[0].text
    conf_rpi_map[year].update({conference_name_map[conference]: rpi})

NameError: name 'year' is not defined

In [12]:
for year in range(2022,2025):
    url = 'https://www.warrennolan.com/baseball/' + str(year) + '/rpi-conference'
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html')

    for conf in soup.find_all('tr')[1:]:
        conference = conf.find('a')['class'][0]
        rpi = conf.find_all('td', 'data-cell data-center data-medium')[0].text
        conf_rpi_map[year].update({conference_name_map[conference]: rpi})
    
    time.sleep(5)

In [13]:
conf_rpi_map

{2021: {'SEC': '0.5661',
  'B12': '0.5584',
  'ACC': '0.5507',
  'P12': '0.5430',
  'Amer': '0.5154',
  'CUSA': '0.5129',
  'MVC': '0.5093',
  'MWC': '0.5054',
  'SoCo': '0.5046',
  'SBC': '0.4988',
  'BTen': '0.4972',
  'WCC': '0.4950',
  'BigE': '0.4919',
  'BigW': '0.4919',
  'MAAC': '0.4899',
  'CAA': '0.4875',
  'OVC': '0.4846',
  'AEC': '0.4833',
  'SLC': '0.4819',
  'BigS': '0.4804',
  'A10': '0.4769',
  'ASun': '0.4758',
  'SUMT': '0.4749',
  'Horz': '0.4739',
  'MAC': '0.4731',
  'WAC': '0.4590',
  'Patriot': '0.4572',
  'NEC': '0.4550',
  'SWAC': '0.4062',
  'MEAC': '0.3201',
  'IVY': '0.0610'},
 2022: {'SEC': '0.5760',
  'ACC': '0.5672',
  'B12': '0.5556',
  'P12': '0.5480',
  'CUSA': '0.5222',
  'SBC': '0.5182',
  'Amer': '0.5173',
  'BTen': '0.5166',
  'MVC': '0.5145',
  'SoCo': '0.5052',
  'WCC': '0.5048',
  'CAA': '0.5037',
  'BigE': '0.5021',
  'MWC': '0.4943',
  'ASun': '0.4933',
  'OVC': '0.4924',
  'IVY': '0.4889',
  'SLC': '0.4857',
  'WAC': '0.4840',
  'BigW': '0.4

In [38]:
conf_rpi_map.update({2025: {'SEC': 0.5858}})

In [39]:
conf_rpi_map

{2021: {'SEC': '0.5661',
  'B12': '0.5584',
  'ACC': '0.5507',
  'P12': '0.5430',
  'Amer': '0.5154',
  'CUSA': '0.5129',
  'MVC': '0.5093',
  'MWC': '0.5054',
  'SoCo': '0.5046',
  'SBC': '0.4988',
  'BTen': '0.4972',
  'WCC': '0.4950',
  'BigE': '0.4919',
  'BigW': '0.4919',
  'MAAC': '0.4899',
  'CAA': '0.4875',
  'OVC': '0.4846',
  'AEC': '0.4833',
  'SLC': '0.4819',
  'BigS': '0.4804',
  'A10': '0.4769',
  'ASun': '0.4758',
  'SUMT': '0.4749',
  'Horz': '0.4739',
  'MAC': '0.4731',
  'WAC': '0.4590',
  'Patriot': '0.4572',
  'NEC': '0.4550',
  'SWAC': '0.4062',
  'MEAC': '0.3201',
  'IVY': '0.0610'},
 2022: {'SEC': '0.5760',
  'ACC': '0.5672',
  'B12': '0.5556',
  'P12': '0.5480',
  'CUSA': '0.5222',
  'SBC': '0.5182',
  'Amer': '0.5173',
  'BTen': '0.5166',
  'MVC': '0.5145',
  'SoCo': '0.5052',
  'WCC': '0.5048',
  'CAA': '0.5037',
  'BigE': '0.5021',
  'MWC': '0.4943',
  'ASun': '0.4933',
  'OVC': '0.4924',
  'IVY': '0.4889',
  'SLC': '0.4857',
  'WAC': '0.4840',
  'BigW': '0.4

In [16]:
mreg_df = pd.read_csv('player_comparison.csv')

In [17]:
mreg_df

Unnamed: 0,player_id,prev year,next year,class transition,type,prev conf,next conf,prev BB%,next BB%,prev K%,next K%,prev OBP,next OBP,prev SLG,next SLG,prev OPS,next OPS
0,beck--001jor,2021,2022,2 to 3,Returning,SEC,SEC,8.3,12.5,20.8,20.9,0.336,0.391,0.523,0.595,0.859,0.986
1,burke-000bla,2023,2024,2 to 3,Returning,SEC,SEC,9.2,10.8,17.4,14.9,0.369,0.449,0.527,0.702,0.896,1.151
2,ensley000hun,2023,2024,3 to 4,Returning,SEC,SEC,10.3,10.7,18.5,19.0,0.391,0.390,0.425,0.532,0.816,0.923
3,ensley000hun,2024,2025,4 to 5,Returning,SEC,SEC,10.7,10.6,19.0,14.9,0.390,0.418,0.532,0.531,0.923,0.950
4,gilber002and,2021,2022,2 to 3,Returning,SEC,SEC,5.0,13.5,13.2,13.1,0.341,0.455,0.437,0.673,0.778,1.128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,sulliv000noa,2024,2025,3 to 4,Transfer,BigS,SEC,14.5,13.7,12.1,16.5,0.453,0.475,0.613,0.645,1.066,1.120
246,hage--000col,2024,2025,3 to 4,Transfer,IVY,SEC,13.2,11.9,13.7,15.1,0.471,0.444,0.658,0.574,1.129,1.019
247,lawren001luk,2024,2025,2 to 3,Transfer,MVC,SEC,8.4,7.5,13.6,12.7,0.399,0.397,0.465,0.425,0.864,0.822
248,hensel000wya,2024,2025,4 to 5,Transfer,IVY,SEC,11.4,7.5,11.4,16.2,0.465,0.423,0.755,0.562,1.220,0.985


In [29]:
mreg_df[mreg_df['type'] == 'Returning'][['prev BB%','prev K%', 'prev OBP', 'prev SLG', 'next BB%']].corr()['next BB%']

prev BB%    0.452738
prev K%     0.080261
prev OBP    0.370612
prev SLG    0.365454
next BB%    1.000000
Name: next BB%, dtype: float64

In [52]:
mreg_df = pd.get_dummies(mreg_df, columns=['class transition'], drop_first=True)

In [49]:
def compute_conf_jump(row):
    prev_score = conf_rpi_map[row['prev year']][row['prev conf']]
    next_score = conf_rpi_map[row['next year']][row['next conf']]
    return float(next_score) - float(prev_score)

In [50]:
mreg_df['conf jump score'] = mreg_df.apply(compute_conf_jump, axis=1)

In [53]:
mreg_df

Unnamed: 0,player_id,prev year,next year,type,prev conf,next conf,prev BB%,next BB%,prev K%,next K%,...,next OBP,prev SLG,next SLG,prev OPS,next OPS,conf jump score,class transition_2 to 3,class transition_3 to 4,class transition_4 to 5,class transition_5 to 6
0,beck--001jor,2021,2022,Returning,SEC,SEC,8.3,12.5,20.8,20.9,...,0.391,0.523,0.595,0.859,0.986,0.0099,True,False,False,False
1,burke-000bla,2023,2024,Returning,SEC,SEC,9.2,10.8,17.4,14.9,...,0.449,0.527,0.702,0.896,1.151,0.0011,True,False,False,False
2,ensley000hun,2023,2024,Returning,SEC,SEC,10.3,10.7,18.5,19.0,...,0.390,0.425,0.532,0.816,0.923,0.0011,False,True,False,False
3,ensley000hun,2024,2025,Returning,SEC,SEC,10.7,10.6,19.0,14.9,...,0.418,0.532,0.531,0.923,0.950,-0.0028,False,False,True,False
4,gilber002and,2021,2022,Returning,SEC,SEC,5.0,13.5,13.2,13.1,...,0.455,0.437,0.673,0.778,1.128,0.0099,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,sulliv000noa,2024,2025,Transfer,BigS,SEC,14.5,13.7,12.1,16.5,...,0.475,0.613,0.645,1.066,1.120,0.1100,False,True,False,False
246,hage--000col,2024,2025,Transfer,IVY,SEC,13.2,11.9,13.7,15.1,...,0.444,0.658,0.574,1.129,1.019,0.1274,False,True,False,False
247,lawren001luk,2024,2025,Transfer,MVC,SEC,8.4,7.5,13.6,12.7,...,0.397,0.465,0.425,0.864,0.822,0.0771,True,False,False,False
248,hensel000wya,2024,2025,Transfer,IVY,SEC,11.4,7.5,11.4,16.2,...,0.423,0.755,0.562,1.220,0.985,0.1274,False,False,True,False


In [55]:
mreg_df = pd.get_dummies(mreg_df, columns=['type'], drop_first=True)


In [56]:
mreg_df

Unnamed: 0,player_id,prev year,next year,prev conf,next conf,prev BB%,next BB%,prev K%,next K%,prev OBP,...,prev SLG,next SLG,prev OPS,next OPS,conf jump score,class transition_2 to 3,class transition_3 to 4,class transition_4 to 5,class transition_5 to 6,type_Transfer
0,beck--001jor,2021,2022,SEC,SEC,8.3,12.5,20.8,20.9,0.336,...,0.523,0.595,0.859,0.986,0.0099,True,False,False,False,False
1,burke-000bla,2023,2024,SEC,SEC,9.2,10.8,17.4,14.9,0.369,...,0.527,0.702,0.896,1.151,0.0011,True,False,False,False,False
2,ensley000hun,2023,2024,SEC,SEC,10.3,10.7,18.5,19.0,0.391,...,0.425,0.532,0.816,0.923,0.0011,False,True,False,False,False
3,ensley000hun,2024,2025,SEC,SEC,10.7,10.6,19.0,14.9,0.390,...,0.532,0.531,0.923,0.950,-0.0028,False,False,True,False,False
4,gilber002and,2021,2022,SEC,SEC,5.0,13.5,13.2,13.1,0.341,...,0.437,0.673,0.778,1.128,0.0099,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,sulliv000noa,2024,2025,BigS,SEC,14.5,13.7,12.1,16.5,0.453,...,0.613,0.645,1.066,1.120,0.1100,False,True,False,False,True
246,hage--000col,2024,2025,IVY,SEC,13.2,11.9,13.7,15.1,0.471,...,0.658,0.574,1.129,1.019,0.1274,False,True,False,False,True
247,lawren001luk,2024,2025,MVC,SEC,8.4,7.5,13.6,12.7,0.399,...,0.465,0.425,0.864,0.822,0.0771,True,False,False,False,True
248,hensel000wya,2024,2025,IVY,SEC,11.4,7.5,11.4,16.2,0.465,...,0.755,0.562,1.220,0.985,0.1274,False,False,True,False,True


In [59]:
df_filt = mreg_df.drop(columns=['player_id','prev year','next year', 'prev conf', 'next conf'])

In [60]:
df_filt

Unnamed: 0,prev BB%,next BB%,prev K%,next K%,prev OBP,next OBP,prev SLG,next SLG,prev OPS,next OPS,conf jump score,class transition_2 to 3,class transition_3 to 4,class transition_4 to 5,class transition_5 to 6,type_Transfer
0,8.3,12.5,20.8,20.9,0.336,0.391,0.523,0.595,0.859,0.986,0.0099,True,False,False,False,False
1,9.2,10.8,17.4,14.9,0.369,0.449,0.527,0.702,0.896,1.151,0.0011,True,False,False,False,False
2,10.3,10.7,18.5,19.0,0.391,0.390,0.425,0.532,0.816,0.923,0.0011,False,True,False,False,False
3,10.7,10.6,19.0,14.9,0.390,0.418,0.532,0.531,0.923,0.950,-0.0028,False,False,True,False,False
4,5.0,13.5,13.2,13.1,0.341,0.455,0.437,0.673,0.778,1.128,0.0099,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,14.5,13.7,12.1,16.5,0.453,0.475,0.613,0.645,1.066,1.120,0.1100,False,True,False,False,True
246,13.2,11.9,13.7,15.1,0.471,0.444,0.658,0.574,1.129,1.019,0.1274,False,True,False,False,True
247,8.4,7.5,13.6,12.7,0.399,0.397,0.465,0.425,0.864,0.822,0.0771,True,False,False,False,True
248,11.4,7.5,11.4,16.2,0.465,0.423,0.755,0.562,1.220,0.985,0.1274,False,False,True,False,True


In [67]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

stats = ['BB%', 'K%', 'OBP', 'SLG', 'OPS']

# Columns that are shared across all regressions
shared_features = [
    'conf jump score',
    'class transition_2 to 3',
    'class transition_3 to 4',
    'class transition_4 to 5',
    'class transition_5 to 6',
    'type_Transfer'
]

models = {}

for stat in stats:
    # Set up target and features
    target_col = f'next {stat}'
    prev_col = f'prev {stat}'
    features = [prev_col] + shared_features

    # Drop rows with missing values in relevant columns
    data = df_filt.dropna(subset=[target_col] + features)

    X = data[features]
    y = data[target_col]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Fit regression model
    model = LinearRegression()
    model.fit(X_scaled, y)
    y_pred = model.predict(X)

    
    # Get coefficients
    coefficients = pd.Series(model.coef_, index=features)
    print(coefficients.sort_values(key=abs, ascending=False))

    # Store model and print results
    models[stat] = model
    print(f"\nStat: {stat}")
    print(f"  R²:  {r2_score(y, y_pred):.3f}")
    print(f"  MAE: {mean_absolute_error(y, y_pred):.3f}")

    print(f"\nMultipliers for {stat}:")
    for feature, coef in zip(features, model.coef_):
        print(f"  {feature}: {coef:.4f}")
    print(f"  Intercept: {model.intercept_:.4f}")


prev BB%                   1.622737
type_Transfer              0.361627
class transition_5 to 6    0.334154
class transition_2 to 3    0.327489
conf jump score           -0.285106
class transition_3 to 4   -0.149659
class transition_4 to 5    0.118388
dtype: float64

Stat: BB%
  R²:  -29.318
  MAE: 18.358

Multipliers for BB%:
  prev BB%: 1.6227
  conf jump score: -0.2851
  class transition_2 to 3: 0.3275
  class transition_3 to 4: -0.1497
  class transition_4 to 5: 0.1184
  class transition_5 to 6: 0.3342
  type_Transfer: 0.3616
  Intercept: 12.3872
prev K%                    3.721668
type_Transfer              0.447613
class transition_2 to 3    0.385525
class transition_3 to 4    0.356525
class transition_4 to 5    0.343598
conf jump score            0.111917
class transition_5 to 6    0.003946
dtype: float64

Stat: K%
  R²:  -175.400
  MAE: 70.992

Multipliers for K%:
  prev K%: 3.7217
  conf jump score: 0.1119
  class transition_2 to 3: 0.3855
  class transition_3 to 4: 0.3565
  c

