<a href="https://colab.research.google.com/github/tleitch/BDML/blob/main/assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Build a model to create a metric for judging team based on team level information ##

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
pd.options.mode.chained_assignment = None  # default='warn'
from mip import Model, xsum, maximize, BINARY

In [None]:
teams = pd.read_csv("teams.csv")
teams = teams[(teams.yearID >=1961) & (teams.yearID <= 2001)]

## Extracting features for teams ##

In [None]:
teams["BB"] = teams["BB"]/teams["G"]
teams["singles"] = (teams["H"] - teams["X2B"] - teams["X3B"] - teams["HR"])/teams["G"]
teams["doubles"] =  teams["X2B"]/teams["G"]
teams["triples"] = teams["X3B"]/teams["G"]
teams["HR"] = teams["HR"]/teams["G"]
teams["R"] = teams["R"]/teams["G"]

In [None]:
teams.shape

(1026, 52)

## Model building ##

In [None]:
team_features = teams[["BB","singles","doubles","triples","HR"]]
team_runs = teams["R"]
model = BayesianRidge()
model.fit(team_features, team_runs)
#reg = LinearRegression().fit(team_features, team_runs)
#reg.score(team_features,team_runs)

BayesianRidge()

Other possible models which can be built :

randomforest : https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html Gradient boosting : https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html support vector machine : https://scikit-learn.org/stable/auto_examples/svm/plot_svm_regression.html kNN fit : https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsRegressor.html

Instead of using batting average, or just number of HR,as a measure of picking players, we can use our fitted model to form a metric that relates more directly to run production.

Specifically, to define a metric for player A, we imagine a team made up of players just like player A and use our fitted regression model to predict how many runs this team would produce 

To define a player-specific metric, we have a bit more work to do. A challenge here is that we derived the metric for teams, based on team-level summary statistics. 

For example, the HR value that is entered into the equation is HR per game for the entire team 

We compute the per-plate-appearance rates for players available in 2002 on data from 1997-2001. 
To avoid small sample artifacts, we filter players with less than 1000 plate appearances per year.


Reference : https://rafalab.github.io/dsbook/linear-models.html#linear-regression-in-the-tidyverse

In [None]:
batting=pd.read_csv("Batting.csv")

In [None]:
def extract_pa_per_game(df):
    
    pa_per_game = (df['AB'].sum() + df["BB"].sum())/df["G"].max()
    
    return pa_per_game


In [None]:
pa_per_game=batting[batting.yearID ==2002].groupby('teamID').apply(extract_pa_per_game)
average_pa_teamwise = pa_per_game.mean()

In [None]:
average_pa_teamwise

38.74656866970645

In [None]:
batting["PA"] = batting["AB"] + batting["BB"]
batting["singles"] = batting["H"] - batting["X2B"] - batting["X3B"] - batting["HR"]

In [None]:
players = batting[(batting.yearID >= 1997) & (batting.yearID <=2001)].groupby('playerID').agg(PA_sum = ("PA",sum),HR_sum=("HR",sum),BB_sum=("BB",sum),singles_sum=("singles",sum),doubles_sum=("X2B",sum),triples_sum=("X3B",sum),AB_sum=("AB",sum),H_sum=("H",sum))
players["Average_PA"] = players["PA_sum"]/average_pa_teamwise
players["HR"] = players["HR_sum"]/players["Average_PA"]
players["BB"] = players["BB_sum"]/players["Average_PA"]
players["singles"] = players["singles_sum"]/players["Average_PA"]
players["doubles"] = players["doubles_sum"]/players['Average_PA']
players["triples"] = players["triples_sum"]/players["Average_PA"]
players["Average"] = players["H_sum"]/players["AB_sum"]
players = players[players.PA_sum >= 1000]

In [None]:
players.head()

Unnamed: 0_level_0,PA_sum,HR_sum,BB_sum,singles_sum,doubles_sum,triples_sum,AB_sum,H_sum,Average_PA,HR,BB,singles,doubles,triples,Average
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
abreubo01,2815,96,420,444,164,33,2395,737,72.651595,1.321375,5.781016,6.111359,2.257349,0.454223,0.307724
agbaybe01,1060,35,123,172,51,6,937,264,27.357261,1.279368,4.496064,6.287179,1.864222,0.21932,0.28175
alfoned01,3063,96,359,535,158,7,2704,796,79.052161,1.214388,4.541305,6.767683,1.99868,0.088549,0.294379
alicelu01,1954,24,216,339,82,22,1738,467,50.430272,0.475905,4.283142,6.722153,1.626007,0.436246,0.2687
alomaro01,3090,91,342,583,173,20,2748,867,79.748997,1.14108,4.288455,7.310437,2.169306,0.250787,0.315502


In [None]:
players_features = players[["HR","BB","singles","doubles","triples"]]
players_features.head()

Unnamed: 0_level_0,HR,BB,singles,doubles,triples
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
abreubo01,1.321375,5.781016,6.111359,2.257349,0.454223
agbaybe01,1.279368,4.496064,6.287179,1.864222,0.21932
alfoned01,1.214388,4.541305,6.767683,1.99868,0.088549
alicelu01,0.475905,4.283142,6.722153,1.626007,0.436246
alomaro01,1.14108,4.288455,7.310437,2.169306,0.250787


In [None]:
#players_features["R_hat"]=(reg.predict(players_features))
players_features["R_hat"]=(model.predict(players_features))
players_features.head()

Unnamed: 0_level_0,HR,BB,singles,doubles,triples,R_hat
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
abreubo01,1.321375,5.781016,6.111359,2.257349,0.454223,8.876176
agbaybe01,1.279368,4.496064,6.287179,1.864222,0.21932,7.505707
alfoned01,1.214388,4.541305,6.767683,1.99868,0.088549,7.852762
alicelu01,0.475905,4.283142,6.722153,1.626007,0.436246,7.451847
alomaro01,1.14108,4.288455,7.310437,2.169306,0.250787,8.557009


The player-specific predicted runs computed here can be interpreted as the number of runs we predict a team  will score if all batters are exactly like that player 

## Adding salary information ##

In [None]:
Salaries=pd.read_csv("salaries.csv")

In [None]:
salaries_yr_2002 =Salaries[Salaries.yearID==2002]
salaries_yr_2002= salaries_yr_2002[["playerID","salary"]]

In [None]:
player_insights = pd.merge(salaries_yr_2002, players_features, on='playerID')
player_insights.head()

Unnamed: 0,playerID,salary,HR,BB,singles,doubles,triples,R_hat
0,anderga01,5000000,1.245384,1.676031,7.25116,2.234707,0.197865,7.197128
1,erstada01,6250000,0.982139,3.197939,7.198358,2.024164,0.227569,7.632801
2,fabrejo01,500000,0.583429,2.230759,6.623638,1.132539,0.171597,5.360105
3,fullmbr01,4000000,1.433642,2.504095,5.906606,2.676132,0.133807,7.111337
4,glaustr01,4000000,2.105016,5.440932,4.299228,2.015821,0.053517,6.717143


## Adding position information ##

In [None]:
appearances = pd.read_csv("appearances.csv")
            

In [None]:
appearances.head()

Unnamed: 0.1,Unnamed: 0,yearID,teamID,lgID,playerID,G_all,GS,G_batting,G_defense,G_p,...,G_2b,G_3b,G_ss,G_lf,G_cf,G_rf,G_of,G_dh,G_ph,G_pr
0,1,1871,TRO,,abercda01,1,1.0,1,1.0,0,...,0,0,1,0,0,0,0,0.0,0.0,0.0
1,2,1871,RC1,,addybo01,25,25.0,25,25.0,0,...,22,0,3,0,0,0,0,0.0,0.0,0.0
2,3,1871,CL1,,allisar01,29,29.0,29,29.0,0,...,2,0,0,0,29,0,29,0.0,0.0,0.0
3,4,1871,WS3,,allisdo01,27,27.0,27,27.0,0,...,0,0,0,0,0,0,0,0.0,0.0,0.0
4,5,1871,RC1,,ansonca01,25,25.0,25,25.0,0,...,2,20,0,1,0,0,1,0.0,0.0,0.0


In [None]:
append_str = 'G_'
position_names = ["p","c","1b","2b","3b","ss","lf","cf","rf", "dh"]
position_names = [append_str + sub for sub in position_names]

In [None]:
operations_dict = {}
for term in position_names:
    operations_dict[term] = "sum"

In [None]:
result = appearances[appearances.yearID == 2002].groupby("playerID").agg(operations_dict)

In [None]:
result.head()

Unnamed: 0_level_0,G_p,G_c,G_1b,G_2b,G_3b,G_ss,G_lf,G_cf,G_rf,G_dh
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
abbotpa01,7,0,0,0,0,0,0,0,0,0.0
abernbr01,0,0,0,116,0,0,0,0,0,1.0
abreubo01,0,0,0,0,0,0,0,18,148,0.0
acevejo01,6,0,0,0,0,0,0,0,0,0.0
aceveju01,65,0,0,0,0,0,0,0,0,0.0


In [None]:
def max_position(x):
    
    position_counts = [x[term] for term in position_names]
    return position_counts.index(max(position_counts))

In [None]:
result["most_played_position"] = result.apply(lambda x : position_names[max_position(x)][2:],axis=1)

In [None]:
result.head()


Unnamed: 0_level_0,G_p,G_c,G_1b,G_2b,G_3b,G_ss,G_lf,G_cf,G_rf,G_dh,most_played_position
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
abbotpa01,7,0,0,0,0,0,0,0,0,0.0,p
abernbr01,0,0,0,116,0,0,0,0,0,1.0,2b
abreubo01,0,0,0,0,0,0,0,18,148,0.0,rf
acevejo01,6,0,0,0,0,0,0,0,0,0.0,p
aceveju01,65,0,0,0,0,0,0,0,0,0.0,p


In [None]:
player_salary_position = pd.merge(player_insights, result, on='playerID')
player_salary_position.drop(position_names,axis=1,inplace=True)
player_salary_position = player_salary_position[player_salary_position["most_played_position"]!="p"]

In [None]:
position_names = ["p","c","1b","2b","3b","ss","lf","cf","rf", "dh"]
for position in position_names:
    player_salary_position["chronicle_delta_" + str(position)] = player_salary_position.apply(lambda x : 1 if x.most_played_position == position else 0,axis=1)

## Select the players given the maximum budget as 40 million dollars ##


Please see reference link before proceeding the code : 
https://docs.python-mip.com/en/latest/examples.html

In [None]:
from mip import Model, xsum, maximize, BINARY

In [None]:
p = player_salary_position["R_hat"]
w = player_salary_position["salary"]
y =player_salary_position["chronicle_delta_" + str(position)]
c, I = 40000000, range(len(w))

In [None]:
m = Model("knapsack")

x = [m.add_var(var_type=BINARY) for i in I]

m.objective = maximize(xsum(p[i] * x[i] for i in I))

m += xsum(w[i] * x[i] for i in I) <= c


for position in position_names:
    m += xsum(player_salary_position["chronicle_delta_" + str(position)][i]*x[i] for i in I) == 1
    


m.optimize()

selected = [i for i in I if x[i].x >= 0.99]
print("selected items: {}".format(selected))


selected items: [39, 57, 84, 103, 108, 121, 191, 217, 218]


## Selected team ##

In [None]:
player_salary_position.drop(["chronicle_delta_" + str(position) for position in position_names],axis=1,inplace=True)
selected_team=player_salary_position.iloc[selected]
selected_team

Unnamed: 0,playerID,salary,HR,BB,singles,doubles,triples,R_hat,most_played_position
39,garcino01,9000000,1.730946,2.766555,7.323235,2.618611,0.384655,8.741697,ss
57,deshide01,1250000,0.602038,4.260579,6.715043,1.821552,0.524854,7.850101,2b
84,heltoto01,5000000,2.24035,4.739202,6.218408,2.742993,0.157973,8.929569,1b
103,millake01,900000,1.41555,3.818693,6.057238,2.304384,0.395037,7.822572,lf
108,berkmla01,500000,1.932416,5.240449,5.502471,2.783988,0.196517,8.629489,cf
121,maynebr01,2500000,0.436905,3.932144,7.565354,2.092545,0.022995,7.884126,c
191,abreubo01,6333333,1.321375,5.781016,6.111359,2.257349,0.454223,8.876176,rf
217,cirilje01,6375000,0.800994,3.873461,7.782788,2.283429,0.119551,8.530867,3b
218,martied01,7086668,1.741558,6.311602,6.410414,2.309725,0.049406,9.019139,dh
