# Baseline Model
## This notebook contains the code for the baseline model.

In [1]:
import numpy as np  #importing necessary libraries
import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline
from sportsreference.mlb.roster import Player
from sportsreference.mlb.roster import Roster
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import scipy.stats as scs
sns.set(style="whitegrid")
from sklearn.multioutput import RegressorChain
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout,LSTM
from keras.optimizers import RMSprop
from keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Conv2D,MaxPool2D,Dropout,Flatten,Dense,BatchNormalization

In [3]:
# Load the filtered MLB roster data; the first CSV column is used as the index.
df = pd.read_csv(
    'Mlb_Rosters_2000_2020_filtered.csv',
    index_col=0,
)

In [4]:
# Inspect the column names available in the loaded dataframe.
df.columns

Index(['Unnamed: 0.1', 'index', 'name', 'season', 'team_abbreviation',
       'assists', 'at_bats', 'bases_on_balls', 'batting_average', 'birth_date',
       'complete_games', 'defensive_chances',
       'defensive_runs_saved_above_average',
       'defensive_runs_saved_above_average_per_innings', 'double_plays_turned',
       'doubles', 'errors', 'fielding_percentage', 'games', 'games_catcher',
       'games_center_fielder', 'games_designated_hitter',
       'games_first_baseman', 'games_in_batting_order',
       'games_in_defensive_lineup', 'games_left_fielder', 'games_outfielder',
       'games_pinch_hitter', 'games_pinch_runner', 'games_pitcher',
       'games_right_fielder', 'games_second_baseman', 'games_shortstop',
       'games_started', 'games_third_baseman', 'grounded_into_double_plays',
       'height', 'hits', 'home_runs', 'innings_played',
       'intentional_bases_on_balls', 'league_fielding_percentage',
       'league_range_factor_per_game', 'league_range_factor_per_nine

In [5]:
# Keep only players with at least 6 rows in df (treated as 6+ years of
# experience; this assumes one row per player-season -- TODO confirm).
#
# A single grouped count replaces re-filtering the whole frame once per player.
# sort=False keeps the groups in first-appearance order, so the resulting list
# has the same order as iterating over df['name'].unique().
MIN_SEASONS = 6
season_counts = df.groupby('name', sort=False).size()
six_year_players = season_counts[season_counts >= MIN_SEASONS].index.tolist()

In [6]:
# Compile every row of the 6+ season players into one large dataframe.
#
# The rows are laid out player by player, in the order of six_year_players,
# with each player's rows kept in their original df order.
# One grouped lookup of row positions replaces a full-frame filter per player.
row_positions_by_player = df.groupby('name', sort=False).indices
six_year_positions = np.concatenate(
    [row_positions_by_player[player] for player in six_year_players]
)
# .copy() makes this an independent frame, so the columns added to it in later
# cells are not flagged as writes to a slice of df.
Six_year_players = df.iloc[six_year_positions].copy()

In [7]:
# Shape (rows, columns) of the dataframe holding the 6+ season players.
Six_year_players.shape

(7581, 76)

In [8]:
# Collect the rows holding the actual OPS+ values during the arbitration
# seasons: each player's 4th, 5th and 6th row (0-based positions 3, 4, 5).
#
# NOTE(review): this relies on each player's rows being in chronological
# order -- nothing here sorts by 'season'; confirm against the CSV.
#
# cumcount() numbers the rows within each player once, replacing three
# full-frame filters per player plus three concatenations.
season_position = Six_year_players.groupby('name', sort=False).cumcount().to_numpy()
Next_Season_six = Six_year_players.loc[season_position == 3]
Next_Next_Season_six = Six_year_players.loc[season_position == 4]
Next_Next_Next_Season_six = Six_year_players.loc[season_position == 5]

# The next 6 cells collect each player's actual OPS+ for each of the next 3 seasons (the 4th, 5th and 6th rows per player) and add them to the 6 year player dataframe as additional columns

In [9]:
# Map each player name to the OPS+ recorded in their Next_Season_six row.
#
# Values are stored as plain scalars. Previously each value was a 1-element
# array, so the later astype('float64') depended on implicit array-to-scalar
# conversion. Building the dict from the two columns directly also avoids
# filtering the frame once per player.
next_year_dict = (
    Next_Season_six
    .set_index('name')['on_base_plus_slugging_percentage_plus']
    .to_dict()
)

In [10]:
# Look up every row's player in next_year_dict and store the result as float64.
next_year_ops = Six_year_players['name'].map(next_year_dict)
Six_year_players['Next_Year_OPS+'] = next_year_ops.astype('float64')

In [11]:
# Map each player name to the OPS+ recorded in their Next_Next_Season_six row.
#
# Values are stored as plain scalars. Previously each value was a 1-element
# array, so the later astype('float64') depended on implicit array-to-scalar
# conversion. Building the dict from the two columns directly also avoids
# filtering the frame once per player.
next_next_year_dict = (
    Next_Next_Season_six
    .set_index('name')['on_base_plus_slugging_percentage_plus']
    .to_dict()
)

In [12]:
# Look up every row's player in next_next_year_dict and store the result as float64.
two_year_ops = Six_year_players['name'].map(next_next_year_dict)
Six_year_players['Two_Year_OPS+'] = two_year_ops.astype('float64')

In [13]:
# Map each player name to the OPS+ recorded in their Next_Next_Next_Season_six row.
#
# Values are stored as plain scalars. Previously each value was a 1-element
# array, so the later astype('float64') depended on implicit array-to-scalar
# conversion. Building the dict from the two columns directly also avoids
# filtering the frame once per player.
next_next_next_year_dict = (
    Next_Next_Next_Season_six
    .set_index('name')['on_base_plus_slugging_percentage_plus']
    .to_dict()
)

In [14]:
# Look up every row's player in next_next_next_year_dict and store the result as float64.
three_year_ops = Six_year_players['name'].map(next_next_next_year_dict)
Six_year_players['Three_Year_OPS+'] = three_year_ops.astype('float64')

In [15]:
# Feature engineering (see master notebook for details):
# 1 for rows whose slugging percentage is above .46, 0 otherwise.
is_elite_slugger = Six_year_players.slugging_percentage > .46
Six_year_players['Elite_Slugging'] = np.where(is_elite_slugger, 1, 0)

In [16]:
# Keep only the first three rows (seasons) of each player in the 6 year df.
#
# groupby(...).head(3) selects the first three rows of every player in one
# pass and returns them in the frame's existing row order, replacing a
# full-frame filter per player followed by a concatenation.
First_three_six = Six_year_players.groupby('name', sort=False).head(3)

In [17]:
# Preview the first rows of the frame holding each player's first three seasons.
First_three_six.head()

Unnamed: 0,Unnamed: 0.1,index,name,season,team_abbreviation,assists,at_bats,bases_on_balls,batting_average,birth_date,complete_games,defensive_chances,defensive_runs_saved_above_average,defensive_runs_saved_above_average_per_innings,double_plays_turned,doubles,errors,fielding_percentage,games,games_catcher,games_center_fielder,games_designated_hitter,games_first_baseman,games_in_batting_order,games_in_defensive_lineup,games_left_fielder,games_outfielder,games_pinch_hitter,games_pinch_runner,games_pitcher,games_right_fielder,games_second_baseman,games_shortstop,games_started,games_third_baseman,grounded_into_double_plays,height,hits,home_runs,innings_played,intentional_bases_on_balls,league_fielding_percentage,league_range_factor_per_game,league_range_factor_per_nine_innings,nationality,on_base_percentage,on_base_plus_slugging_percentage,on_base_plus_slugging_percentage_plus,plate_appearances,player_id,position,putouts,range_factor_per_game,range_factor_per_nine_innings,runs,runs_batted_in,sacrifice_flies,sacrifice_hits,slugging_percentage,stolen_bases,times_caught_stealing,times_hit_by_pitch,times_struck_out,total_bases,total_fielding_runs_above_average,total_fielding_runs_above_average_per_innings,triples,weight,birth_year,Age,Max_OPS+,Best_season,Height,Cumulative_Games_started,Cluster,Relative_performance_to_cluster_avg,Next_Year_OPS+,Two_Year_OPS+,Three_Year_OPS+,Elite_Slugging
0,0,1,Damian Miller,1998.0,MIL,26.0,168.0,11.0,0.286,1969-10-13,41.0,285.0,,,2.0,14.0,4.0,0.986,57.0,46.0,0.0,2.0,1.0,57.0,48.0,0.0,2.0,10.0,2.0,0.0,2.0,0.0,0.0,43.0,0.0,2.0,6-3,48.0,3.0,388.1,2.0,0.99,7.47,7.51,United States of America,0.337,0.783,104.0,183.0,milleda02,C,255.0,6.11,6.51,17.0,14.0,0.0,2.0,0.446,1.0,0.0,2.0,43.0,75.0,-2.0,-6.0,2.0,202,1969,29.0,104.0,1,6.25,43.0,3,1.094737,89.0,94.0,76.0,0
1,1,2,Damian Miller,1999.0,MIL,61.0,296.0,19.0,0.27,1969-10-13,71.0,689.0,,,9.0,19.0,6.0,0.991,86.0,86.0,0.0,0.0,0.0,86.0,86.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,78.0,0.0,6.0,6-3,80.0,11.0,715.1,3.0,0.99,7.35,7.41,United States of America,0.316,0.762,90.0,320.0,milleda02,C,622.0,7.94,8.59,35.0,47.0,3.0,0.0,0.446,0.0,0.0,2.0,78.0,132.0,3.0,5.0,0.0,202,1969,30.0,104.0,0,6.25,121.0,3,0.947368,89.0,94.0,76.0,0
2,2,3,Damian Miller,2000.0,MIL,47.0,324.0,36.0,0.275,1969-10-13,84.0,734.0,,,4.0,24.0,6.0,0.992,100.0,97.0,0.0,0.0,2.0,100.0,99.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,91.0,0.0,6.0,6-3,89.0,10.0,805.2,4.0,0.99,7.38,7.45,United States of America,0.347,0.788,95.0,364.0,milleda02,C,681.0,7.51,8.13,43.0,44.0,2.0,1.0,0.441,2.0,2.0,1.0,74.0,143.0,9.0,13.0,0.0,202,1969,31.0,104.0,0,6.25,212.0,3,1.0,89.0,94.0,76.0,0
10,10,12,Greg Colbrunn,1992.0,ARI,29.0,168.0,6.0,0.268,1969-07-26,25.0,395.0,,,24.0,8.0,3.0,0.992,52.0,0.0,0.0,0.0,47.0,52.0,47.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,44.0,0.0,1.0,6-0,45.0,2.0,372.2,1.0,0.993,9.76,9.75,United States of America,0.294,0.646,83.0,180.0,colbrgr01,1B,363.0,8.34,9.47,12.0,18.0,4.0,0.0,0.351,3.0,2.0,2.0,34.0,59.0,3.0,8.0,0.0,190,1969,23.0,148.0,0,6.0,44.0,0,0.912088,99.0,105.0,88.0,0
11,11,13,Greg Colbrunn,1993.0,ARI,27.0,153.0,6.0,0.255,1969-07-26,25.0,401.0,,,31.0,9.0,2.0,0.995,70.0,0.0,0.0,0.0,61.0,70.0,61.0,0.0,0.0,13.0,2.0,0.0,0.0,0.0,0.0,33.0,0.0,1.0,6-0,39.0,4.0,356.1,1.0,0.992,9.61,9.68,United States of America,0.282,0.674,76.0,164.0,colbrgr01,1B,372.0,6.54,10.08,15.0,23.0,3.0,1.0,0.392,4.0,2.0,1.0,33.0,60.0,-2.0,-8.0,0.0,190,1969,24.0,148.0,0,6.0,77.0,0,0.835165,99.0,105.0,88.0,0


In [18]:
# Relevant columns for modelling. Order matters: later cells slice this list
# by position (the last three entries are the labels).
cols_selected = [
    # season-level batting and playing-time statistics
    'season',
    'assists',
    'at_bats',
    'bases_on_balls',
    'batting_average',
    'doubles',
    'games',
    'games_started',
    'hits',
    'home_runs',
    'intentional_bases_on_balls',
    'on_base_plus_slugging_percentage',
    'on_base_plus_slugging_percentage_plus',
    'plate_appearances',
    'runs_batted_in',
    'sacrifice_flies',
    'sacrifice_hits',
    'slugging_percentage',
    'total_bases',
    'triples',
    'weight',
    # player attributes and engineered features
    'birth_year',
    'Age',
    'Max_OPS+',
    'Best_season',
    'Height',
    'Cumulative_Games_started',
    'Cluster',
    'Relative_performance_to_cluster_avg',
    'Elite_Slugging',
    # labels
    'Next_Year_OPS+',
    'Two_Year_OPS+',
    'Three_Year_OPS+',
]

In [19]:
# Group by player and average the first three seasons into one row per player.
#
# The modelling columns are selected *before* aggregating. Averaging the whole
# frame first also tries to average the text columns (team, position,
# birth date, ...), which raises under the pandas >= 2.0 numeric_only=False
# default. Selecting first yields the same table and works on both old and
# new pandas.
modeling_data = First_three_six.groupby('name')[cols_selected].mean()
# Check for nulls: number of columns that contain at least one missing value.
modeling_data.isna().any().sum()

1

In [20]:
# Drop every row that contains a missing value, then show the resulting shape.
modeling_data = modeling_data.dropna(axis=0)
modeling_data.shape

(738, 33)

In [21]:
# Labels: the last three columns of modeling_data.
label_columns = modeling_data.columns[-3:]
Y = modeling_data[label_columns]
Y.columns

Index(['Next_Year_OPS+', 'Two_Year_OPS+', 'Three_Year_OPS+'], dtype='object')

# Dummy Linear Regression

In [68]:
# Baseline features: every column of modeling_data except the last 12,
# standardised with StandardScaler.
dummy_cols = modeling_data.columns[:-12]
X_dummy_unscaled = modeling_data[dummy_cols]
dummy_scaler = StandardScaler()
scaled_values = dummy_scaler.fit_transform(X_dummy_unscaled)
# NOTE(review): no index is passed, so X_dummy gets a fresh default index while
# Y keeps modeling_data's index; any row dropped below is NOT dropped from Y.
X_dummy = pd.DataFrame(data=scaled_values, columns=dummy_cols)
X_dummy = X_dummy.dropna(axis=0)
X_dummy.shape  # training data after scaling

(738, 21)

In [69]:
# Fitting the baseline model: a RegressorChain of linear regressions over the
# three label columns (chain order 0, 1, 2), scored with repeated k-fold
# cross-validation and then fitted on all rows.
lr_dummy = LinearRegression()
wrapper_dummy = RegressorChain(lr_dummy, order=[0, 1, 2])
cv = RepeatedKFold(n_repeats=3, random_state=1)
n_scores = cross_val_score(
    wrapper_dummy,
    X_dummy,
    Y,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
dummy_wrapper_fit = wrapper_dummy.fit(X_dummy, Y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:    0.1s finished


In [70]:
# Evaluating the model.
#
# NOTE: preds are produced on X_dummy, the same rows the model was fitted on,
# so the first three figures are in-sample (training) errors.
preds = dummy_wrapper_fit.predict(X_dummy)
mae = metrics.mean_absolute_error(Y, preds)
mse = metrics.mean_squared_error(Y, preds)
rmse = np.sqrt(mse)  # reuse mse rather than computing it a second time
print('Mean Absolute Error:', mae )
print('Mean Squared Error:',  mse)
print('Root Mean Squared Error:' , rmse)

# Out-of-sample estimate: n_scores holds the repeated k-fold results from the
# fitting cell. They were scored with 'neg_mean_squared_error', so the sign is
# flipped to get MSE. Previously these scores were computed but never shown.
cv_mse = -n_scores
print('Cross-Validated Mean Squared Error:', cv_mse.mean(), '+/-', cv_mse.std())
print('Cross-Validated Root Mean Squared Error:', np.sqrt(cv_mse.mean()))

Mean Absolute Error: 16.39258179121738
Mean Squared Error: 437.35732878864883
Root Mean Squared Error: 20.913089890990495


In [None]:
# This baseline model will be used as a reference when evaluating other models