In [32]:

import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats

START = 2002
END = 2022

batting = batting_stats(START, END, qual=200)
batting.to_csv("batting.csv")

batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] >1)

#set up target, WAR for next season

def next_season(player):
  player = player.sort_values("Season")
  player["Next_WAR"] = player["WAR"].shift(-1)
  return player

batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

batting[["Name", "Season", "WAR", "Next_WAR"]]

#cleaning the data

null_count = batting.isnull().sum()

complete_cols = list(batting.columns[null_count ==0])

batting = batting[complete_cols + ["Next_WAR"]].copy()

batting.dtypes[batting.dtypes == "object"]

del batting["Dol"]
del batting["Age Rng"]

batting["team_code"] = batting["Team"].astype("category").cat.codes

batting_full = batting.copy()

batting = batting.dropna()

#selecting useful features/columns using feautre selector
!pip install scikit-learn
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs = SequentialFeatureSelector(rr, n_features_to_select=20, direction ="forward", cv=split, n_jobs=4)
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])

sfs.fit(batting[selected_columns],batting["Next_WAR"])

predictors = list(selected_columns[sfs.get_support()])
#Making Predictions with ML

def backtest(data, model, predictors, start=5, step=1):
  all_predictions = []
  years = sorted(data["Season"].unique())

  for i in range(start, len(years), step):
        current_year = years[i]

        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]
#intwesting line|
        model.fit(train[predictors], train["Next_WAR"])
        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]
        all_predictions.append(combined)
  return pd.concat(all_predictions)
predictions = backtest(batting, rr, predictors)
predictions

from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])
batting["Next_WAR"].describe()

#sqrt of mse lower than std deviation
2.7671807143292371 ** .5

#improving accuracy

def player_history(df):
  df = df.sort_values("Season")
  df["player_season"] = range(0,df.shape[0])
  df["war_corr"] = list(df[["player_season", "WAR"]].expanding().corr().loc[(slice(None), "player_season"), "WAR"])
  df["war_corr"].fillna(1,inplace=True)

  df["war_diff"] = df["Next_WAR"] / df["WAR"].shift(1)
  df["war_diff"].fillna(1, inplace=True)

  df["war_diff"][df["war_diff"] == np.inf] = 1
  return df

batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

def group_averages(df):
  return df["WAR"] / df["WAR"].mean()

batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]
predictions = backtest(batting, rr, new_predictors)
mean_squared_error(predictions["actual"], predictions["prediction"])

pd.Series(rr.coef_, index=new_predictors).sort_values()



ValueError: Input X contains infinity or a value too large for dtype('float64').