In [None]:
import pybaseball as pyb
import pandas as pd
from sklearn import metrics
import requests

# Objective:
### To create an xWAR using statcast data

# Formulas:
### xRuns = xwRAA + Fielding Runs Prevented + BsR
### Runs per Win = 9*(Lg Runs/ Lg IP) * 1.5 + 3
### xWAR = (xRuns - Positional Adjustment) / Runs per Win

# Results:
### Looking at 2021 and 2022 seasons, the R squared score for WAR is 0.292, the R squared score for xWAR is 0.345. This shows a noticeable increase in the reliability of using xWAR as a predictive stat compared to WAR. 

### Based off the 2022 season, the correlation between Runs and WAR was 0.905 while it was 0.895 for xWAR. This shows that xWAR has just a marginal lesser relationship to runs than WAR.

# Interpretation:
### Judging from the results, one can make the claim that xWAR is a better statistic than WAR when predicting a players future value due to the much greater increase in year-to-year R squared while maintaining a similar correlation to runs scored.

# What is Next:
### This project is still in the early stages. 
### The next step would be to use Statcast's sprint speed numbers to better represent base running runs. 
### Furthermore, it would be insightful to see the isolated effects of each different source of runs; wRAA to xwRaa, BSR to perspective Statcast's sprint speed based statistic, UZR to FRP

In [2]:
def get_xwar(year, lgxwOBA, woba_scale):
    
    # Get batting and fielding statistics for the given year
    bat = pyb.batting_stats(year, qual=0)
    field = pyb.statcast_outs_above_average(year, pos = "all", min_att = 0)
    
    # Rename the "Name" column in the fielding dataframe
    field['Name'] = field['first_name'].str[1:]+' '+field['last_name']
    
    # Calculate xwRAA (Expected Weighted Runs Above Average) for each player
    bat['xwRAA'] = ((bat['xwOBA'] - lgxwOBA) / woba_scale) * bat['PA']
    
    # Merge the batting and fielding dataframes and fill missing values with 0
    stats = pd.merge(bat, field, on=['Name'], how='left').fillna(0)
    
    # Calculate BSR (Base Running Runs) for each player
    stats['BSR'] = stats['wSB'] + stats['UBR'] + stats['wGDP']
    
    # Rename the "fielding_runs_prevented" column in the merged dataframe
    stats = stats.rename(columns={'fielding_runs_prevented':'FRP'})
    
    # Calculate xRuns for each player
    stats['xRuns'] = stats['xwRAA'] + stats['FRP'] + stats['BSR']
    
    # Get league-wide batting and pitching statistics for the given year
    lg_bat = pyb.team_batting(year).sum()
    lg_pitch = pyb.team_pitching(year).sum()
    
    # Calculate runs per win (RPW) for the given year
    RPW = 9*(lg_bat['R'] / lg_pitch['IP'])*1.5 + 3
    
    # Calculate xWAR for each player
    stats['xWAR'] = (stats['xRuns'] - stats['Pos'])/RPW
    
    return stats


In [57]:
def get_team_xwar(year, lgxwOBA, woba_scale):
    
    teams_dict = {
    "Angels": "LAA",
    "Astros": "HOU",
    "Athletics": "OAK",
    "Blue Jays": "TOR",
    "Braves": "ATL",
    "Brewers": "MIL",
    "Cardinals": "STL",
    "Cubs": "CHC",
    "D-backs": "ARI",
    "Dodgers": "LAD",
    "Giants": "SFG",
    "Guardians": "CLE",
    "Mariners": "SEA",
    "Marlins": "MIA",
    "Mets": "NYM",
    "Nationals": "WSN",
    "Orioles": "BAL",
    "Padres": "SDP",
    "Phillies": "PHI",
    "Pirates": "PIT",
    "Rangers": "TEX",
    "Rays": "TBR",
    "Red Sox": "BOS",
    "Reds": "CIN",
    "Rockies": "COL",
    "Royals": "KCR",
    "Tigers": "DET",
    "Twins": "MIN",
    "White Sox": "CHW",
    "Yankees": "NYY"
    }


    
    
    url = f'https://baseballsavant.mlb.com/league?season={year}#statcastHitting'

    # Use the `requests` library to fetch the webpage
    response = requests.get(url)

    # Extract the HTML table from the webpage
    dfs = pd.read_html(response.text)

    # Get the first table in the list of tables
    bat = dfs[0]

    bat.columns = bat.columns.droplevel(0)
    
    bat['Team'] = bat['Team'].map(teams_dict)
    
    bat = bat.set_index('Team')

    bat['xwRAA'] = ((bat['XWOBA'] - lgxwOBA) / woba_scale) * bat['PA']
            
    # Calculate BSR (Base Running Runs) for each player
    
    team_bat = pyb.team_batting(year).set_index('Team')
    
    field = pd.read_csv(f'team_oaa_{year}.csv', index_col = 0).set_index('Team')
    
    field.index = field.index.map(teams_dict)

    pitch = pyb.team_pitching(year)
    
    pitch = pitch.set_index('Team')
    
    stats = bat.join(field[['Runs Prevented']]).join(pitch[['W','ERA','FIP']]).join(team_bat[['WAR','wSB','UBR',
                                     'wGDP','R']]).join(pyb.batting_stats(year).groupby('Team').sum()['Pos'])

    
    stats['BSR'] = stats['wSB'] + stats['UBR'] + stats['wGDP']


    stats['xRuns'] = stats['xwRAA'] + stats['Runs Prevented'] + stats['BSR']
    
    
    # Get league-wide batting and pitching statistics for the given year
    lg_bat = pyb.team_batting(year).sum()
    lg_pitch = pyb.team_pitching(year).sum()

    # Calculate runs per win (RPW) for the given year
    RPW = 9*(lg_bat['R'] / lg_pitch['IP'])*1.5 + 3

    # Calculate xWAR for each player
    stats['xWAR'] = (stats['xRuns'] - stats['Pos'])/RPW
        
    stats = stats.dropna(subset = ['xWAR'])    
    
    return stats

In [4]:
def get_xwar_two_years(year1, lgxwOBA1, woba_scale1, year2, lgxwOBA2, woba_scale2):
    # Get xWAR data for the first year
    xwar1 = get_xwar(year1, lgxwOBA1, woba_scale1)
    
    # Get xWAR data for the second year
    xwar2 = get_xwar(year2, lgxwOBA2, woba_scale2)
    
    # Merge the data for the two years, keeping only rows that are present in both dataframes
    combined = pd.merge(xwar1, xwar2, on=['Name'], how='inner')
    
    return combined


In [37]:
# Call the get_xwar_two_years() function to get the combined xWAR data for the two years
combined_xwar = get_xwar_two_years(2021, .317, 1.209, 2022, .309, 1.259)

# Calculate the R^2 scores for WAR and xWAR
war_r2 = metrics.r2_score(combined_xwar['WAR_x'], combined_xwar['WAR_y'])
xwar_r2 = metrics.r2_score(combined_xwar['xWAR_x'], combined_xwar['xWAR_y'])

# Print the R^2 scores
print(f'WAR R^2 score: {war_r2}')
print(f'xWAR R^2 score: {xwar_r2}')

WAR R^2 score: 0.2923369522971567
xWAR R^2 score: 0.3448454811908307


In [61]:
year = 2022
lgxwOBA = .309
woba_scale = 1.259

team_xwar22 = get_team_xwar(year, lgxwOBA, woba_scale)

In [81]:
team_xwar22[['R','WAR','xWAR']].corr()

Unnamed: 0,R,WAR,xWAR
R,1.0,0.905294,0.895882
WAR,0.905294,1.0,0.917256
xWAR,0.895882,0.917256,1.0
