In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from pybaseball import pitching_stats
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from datetime import date

In [2]:
# Season window: from START through the last calendar year before today.
START = 2002
END = date.today().year - 1

# Fetch the pitching table (qual=50 is forwarded to pitching_stats) and keep
# an untouched copy on disk; a later cell reads "pitching.csv" back in.
pitching = pitching_stats(START, END, qual=50)
pitching.to_csv("pitching.csv")

# Keep only players (grouped by IDfg) that have more than one row, since a
# single row cannot supply a following-season target.
pitching = (
    pitching
    .groupby("IDfg", group_keys=False)
    .filter(lambda seasons: len(seasons) > 1)
)

def next_season(player, consecutive_only=False):
    """Attach a ``Next_WAR`` column holding the WAR of the player's next row.

    The rows are sorted by ``Season`` and ``WAR`` is shifted up by one, so each
    row receives the ``WAR`` of the following *recorded* season and the last
    row receives NaN. The input frame is not modified.

    Parameters
    ----------
    player : pandas.DataFrame
        Rows for a single player; must contain ``Season`` and ``WAR``.
    consecutive_only : bool, default False
        When False (the original behaviour) the next recorded season is used
        even if seasons in between are missing. When True, ``Next_WAR`` is
        set to NaN unless the following row's ``Season`` is exactly
        ``Season + 1``, so a gap year never supplies a target.

    Returns
    -------
    pandas.DataFrame
        A copy of ``player`` sorted by ``Season`` with ``Next_WAR`` added.
    """
    ordered = player.sort_values("Season")
    upcoming_war = ordered["WAR"].shift(-1)

    if consecutive_only:
        # Distance (in seasons) to the following row; NaN for the last row.
        season_gap = ordered["Season"].shift(-1) - ordered["Season"]
        upcoming_war = upcoming_war.where(season_gap == 1)

    return ordered.assign(Next_WAR=upcoming_war)

# Build the Next_WAR target one player at a time; group_keys=False keeps the
# original row index instead of adding IDfg as an index level.
# NOTE(review): newer pandas releases change how apply() treats the grouping
# column -- confirm "IDfg" is still present in the result on your version.
by_player = pitching.groupby("IDfg", group_keys=False)
pitching = by_player.apply(next_season)

In [3]:
# Keep only the columns that have no missing values, plus the target column.
# "Next_WAR" is filtered out of the complete-column list before being appended
# so that re-running this cell (when Next_WAR no longer has nulls) does not
# create a duplicate "Next_WAR" column.
null_count = pitching.isnull().sum()
complete_cols = [c for c in pitching.columns[null_count == 0] if c != "Next_WAR"]
pitching = pitching[complete_cols + ["Next_WAR"]].copy()

# Remove columns that are not used downstream. errors="ignore" keeps the cell
# re-runnable and tolerant of a column that the null filter already removed
# (the previous `del` statements raised KeyError in both situations).
pitching = pitching.drop(columns=["Dollars", "Age Rng", "Team"], errors="ignore")

# Drop rows without a target (each player's last recorded season).
pitching = pitching.dropna()

df = pitching.copy()
# Exclude rows whose feature season is 2020.
# NOTE(review): rows from the season before 2020 still carry a Next_WAR taken
# from the following recorded season -- confirm that is intended.
df2 = df[df['Season'] != 2020]

In [4]:
def evaluate(y_true, y_pred):
    """Score predictions against targets.

    Returns a dict with two entries: ``'RMSE'`` (square root of
    ``mean_squared_error``) and ``'R2'`` (``r2_score``).
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        'RMSE': np.sqrt(squared_error),
        'R2': r2_score(y_true, y_pred),
    }


In [5]:
def lasso_predict_next_war(
    df,
    test_feature_year=2023,
    alpha=0.01,
    max_iter=20000,
    random_state=42,
    test_features_df=None,
    compute_metrics='auto',
    sort_pred_desc=True
):
    """Fit a standardised Lasso on past seasons and predict next-season WAR.

    Training rows are those with ``Season <= test_feature_year - 1``; the
    features are every numeric column of ``df`` except ``Next_WAR``,
    ``Season``, ``Name``, ``Team`` and ``IDfg``, and the target is
    ``Next_WAR``.

    Parameters
    ----------
    df : pandas.DataFrame
        Modelling table containing at least ``Season`` and ``Next_WAR``.
    test_feature_year : int
        Season whose features are used to predict ``test_feature_year + 1``.
    alpha, max_iter, random_state
        Passed straight to ``sklearn.linear_model.Lasso``.
    test_features_df : pandas.DataFrame or None
        If None, the rows of ``df`` with ``Season == test_feature_year`` are
        predicted. Otherwise this frame is predicted; its numeric columns are
        aligned to the training columns, and any training column it lacks is
        filled with 0.0.
    compute_metrics : True, False or 'auto'
        ``True`` forces metrics (requires a ``Next_WAR`` column in the test
        data), ``False`` disables them, anything else computes them only when
        targets are present and all finite.
    sort_pred_desc : bool
        Sort the output by predicted WAR, highest first, and reset the index.

    Returns
    -------
    (out, metrics, pipe)
        ``out`` is a table of identifying columns plus the prediction (and
        actual/error columns when metrics are computed); ``metrics`` is a dict
        with ``RMSE``, ``R2``, ``train_upto`` and ``test_year``; ``pipe`` is
        the fitted scaler + Lasso pipeline.

    Raises
    ------
    RuntimeError
        If ``test_features_df`` is None and no row has the requested season.
    ValueError
        If the training split is empty, or if ``compute_metrics=True`` but the
        test data has no ``Next_WAR`` column.
    """
    df_ = df.copy()

    exclude = {'Next_WAR', 'Season', 'Name', 'Team', 'IDfg'}
    feature_cols = [c for c in df_.columns if c not in exclude]
    X_all = df_[feature_cols].select_dtypes(include=[np.number]).copy()
    y_all = df_['Next_WAR'].to_numpy()

    train_cutoff = test_feature_year - 1
    # Positional boolean masks avoid any dependence on the index labels.
    train_mask = (df_['Season'] <= train_cutoff).to_numpy()
    X_train, y_train = X_all.loc[train_mask], y_all[train_mask]
    if X_train.empty:
        raise ValueError(
            f"No training data: need rows with Season <= {train_cutoff} "
            f"and at least one numeric feature column."
        )

    if test_features_df is None:
        test_mask = (df_['Season'] == test_feature_year).to_numpy()
        test_df = df_.loc[test_mask].copy()
        if test_df.empty:
            raise RuntimeError(
                f"No rows with Season == {test_feature_year}. "
                f"If you’re predicting a future year, pass test_features_df."
            )
        X_test = X_all.loc[test_mask]
    else:
        test_df = test_features_df.copy()
        # Align to exactly the columns the model is trained on. Previously
        # 'Age' was dropped here although it remained a training feature, so
        # the reindex silently replaced every player's age with 0.0.
        X_test = (test_df.select_dtypes(include=[np.number])
                         .reindex(columns=X_train.columns, fill_value=0.0))
    y_test = test_df['Next_WAR'].to_numpy() if 'Next_WAR' in test_df.columns else None

    if compute_metrics is True and y_test is None:
        # Fail early with a clear message instead of crashing inside evaluate().
        raise ValueError(
            "compute_metrics=True requires a 'Next_WAR' column in the test data."
        )

    scaler = ColumnTransformer([('num', StandardScaler(), X_train.columns.tolist())], remainder='drop')
    lasso  = Lasso(alpha=alpha, max_iter=max_iter, random_state=random_state)
    pipe   = Pipeline([('prep', scaler), ('model', lasso)])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    next_year = test_feature_year + 1
    pred_col = f'Predicted_{next_year}_WAR'

    meta_cols = [c for c in ['IDfg','Name','Team','Age','Season','IP','WAR','G','GS'] if c in test_df.columns]
    out = test_df[meta_cols].copy()
    if 'Season' in out.columns:
        out = out.rename(columns={'Season': 'Season (features)'})
    out[pred_col] = y_pred

    if compute_metrics is True:
        want_metrics = True
    elif compute_metrics is False:
        want_metrics = False
    else:
        # 'auto': only score when every target is present and finite.
        want_metrics = (y_test is not None) and bool(np.isfinite(y_test).all())

    if want_metrics:
        m = evaluate(y_test, y_pred)
        metrics = {'RMSE': float(m['RMSE']), 'R2': float(m['R2']),
                   'train_upto': int(train_cutoff), 'test_year': int(next_year)}
        actual_col = f'Actual_{next_year}_WAR'
        out[actual_col] = y_test
        out['Error'] = out[pred_col] - out[actual_col]
        out['AbsError'] = out['Error'].abs()
    else:
        metrics = {'RMSE': 'N/A', 'R2': 'N/A',
                   'train_upto': int(train_cutoff), 'test_year': int(next_year)}

    if sort_pred_desc:
        out = out.sort_values(pred_col, ascending=False).reset_index(drop=True)

    return out, metrics, pipe

In [6]:
def build_corr_pruned_df(
    df,
    threshold=0.9,
    exclude=('Next_WAR','Season','Name','Team','IDfg'),
    protect=('IP',),
    verbose=True
):
    """Drop numeric feature columns that are highly correlated with an earlier one.

    The absolute correlation matrix of the numeric columns (minus ``exclude``)
    is computed, and a column is dropped when its absolute correlation with
    any column that precedes it in column order is ``>= threshold``. Columns
    listed in ``protect`` are never dropped. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to prune.
    threshold : float
        Absolute-correlation cut-off (inclusive).
    exclude : iterable of str
        Columns left out of the correlation analysis (and therefore kept).
    protect : iterable of str
        Columns that take part in the analysis but are never dropped.
    verbose : bool
        When True, print a one-line summary of what was dropped.

    Returns
    -------
    (df_pruned, to_drop, corr_abs, dropped_report)
        The pruned frame, the list of dropped column names, the absolute
        correlation matrix, and a frame with one row per dropped column giving
        the column it is most correlated with and that absolute correlation.
    """
    excl = set(exclude) & set(df.columns)
    prot = set(protect) & set(df.columns)

    feat_cols = [c for c in df.select_dtypes(include=[np.number]).columns if c not in excl]
    corr_abs = df[feat_cols].corr().abs()

    # Keep only the strict upper triangle so each pair is inspected once and a
    # column is compared only against the columns that come before it.
    upper_mask = np.triu(np.ones(corr_abs.shape, dtype=bool), k=1)
    upper = corr_abs.where(upper_mask)

    to_drop = [
        col for col in upper.columns
        if col not in prot and (upper[col] >= threshold).any()
    ]

    # A dropped column always has at least one other column it correlates
    # with, so the partner lookup below can never be empty.
    report_rows = []
    for col in to_drop:
        partners = corr_abs[col].drop(index=col)
        partner = partners.idxmax()
        report_rows.append((col, partner, float(partners[partner])))

    df_pruned = df.drop(columns=to_drop, errors='ignore')
    dropped_report = pd.DataFrame(report_rows, columns=['dropped_col','max_corr_with','abs_corr'])

    if verbose:
        print(
            f"Dropped {len(to_drop)} of {len(feat_cols)} numeric feature columns "
            f"with |corr| >= {threshold}: {to_drop}"
        )

    return df_pruned, to_drop, corr_abs, dropped_report

In [7]:
# Prune highly correlated numeric features from the 2020-free table; the
# extra return values are kept for inspection.
(
    df3,
    dropped_cols_df3,
    corr_df3,
    dropped_report_df3,
) = build_corr_pruned_df(df2, threshold=0.9, verbose=True)

In [8]:
# Reload the raw download saved earlier and take the END season as the
# feature rows to predict from.
raw = pd.read_csv("pitching.csv", index_col=0)
is_latest_season = raw['Season'] == END
features_pre = raw[is_latest_season].copy()

# Train on the pruned table and predict the season after END.
pred_fin_table, metrics_fin, pipe_fin = lasso_predict_next_war(
    df3,
    test_feature_year=END,
    test_features_df=features_pre,
    compute_metrics='auto',
    sort_pred_desc=True,
)

In [9]:
# Name of the prediction column produced for the season after END.
next_year = END + 1
pred_col = f'Predicted_{next_year}_WAR'

# Round predictions to one decimal place in the table, then export it.
rounded_predictions = pred_fin_table[pred_col].round(1)
pred_fin_table[pred_col] = rounded_predictions
pred_fin_table.to_csv('war_predictions.csv', index=False)