In [1]:
#Import data/datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, GridSearchCV
from sklearn import neighbors
from sklearn.metrics import mean_squared_error, make_scorer, r2_score

In [2]:
historical_df = pd.read_csv('https://raw.githubusercontent.com/dribbleanalytics/ml-mvp-predict/master/2019-20-season/final-csv-data/historical-data.csv')


In [3]:
prediction_set = {
    'player':['Stephen Curry', 'Kevin Durant', 'Giannis Antetokounmpo', 'Nikola Jokic', 'Chris Paul', 'Jimmy Butler', 'Rudy Gobert', 'James Harden', 'Deandre Ayton', 'DeMar DeRozan'],
    'age':[33, 33, 27, 26, 36, 32, 29, 32, 23, 32],
    'g':[22, 22, 21, 17, 23, 17, 22, 23, 17, 24],
    'team_wins':[19, 16, 15, 11, 19, 14, 16, 16, 19, 16],
    'overall_seed':[1, 3, 6, 18, 2, 7, 4, 3, 2, 4],
    'mp':[34.4, 36.0, 32.7, 32.3, 32.3, 34.2, 31.6, 35.7, 30.6, 35.3],
    'fgm':[9, 10.4, 9.8, 10.2, 5.0, 7.9, 5.4, 5.9, 7.4, 9.5],
    'fga':[20.6, 19.5, 18.4, 17.4, 10.5, 15.3, 7.3, 14.6, 11.9, 19.0],
    'fg_perc':[0.437, 0.533, 0.532, 0.590, 0.479, 0.519, 0.739, 0.403, 0.619, 0.498],
    '3pm':[5.4, 1.8, 1.1, 1.7, 1.0, 0.4, 0, 2.5, 0.1, 0.8],
    '3pa':[13.2, 4.6, 4.1, 4.4, 3.0, 1.8, 1, 7.3, 0.3, 2.5],
    '3p_perc':[0.41, 0.382, 0.276, 0.387, 0.338, 0.200, 0, 0.347, 0.200, 0.333],
    '2pm':[3.6, 8.6, 8.7, 8.5, 4.0, 7.6, 5.4, 3.3, 7.3, 8.6],
    '2pa':[7.4, 14.8, 14.3, 12.9, 7.6, 13.5, 7.3, 7.3, 11.6, 16.5],
    '2p_perc':[0.485, 0.580, 0.607, 0.659, 0.534, 0.561, 0.744, 0.458, 0.629, 0.523],
    'efg':[0.568, 0.578, 0.563, 0.573, 0.522, 0.531, 0.739, 0.490, 0.621, 0.520],
    'ftm':[4.1, 6.1, 6.9, 3.9, 3.2, 7.4, 4.5, 6.3, 1.7, 6.6],
    'fta':[4.5, 7.0, 10.2, 5.1, 3.8, 8.6, 6.6, 7.2, 2.8, 7.5],
    'ft_perc':[0.929, 0.870, 0.673, 0.759, 0.841, 0.850, 0.676, 0.885, 0.617, 0.888],
    'orb':[0.6, 0.5, 1.9, 2.6, 0.3, 1.6, 2.9, 1.0, 3.0, 0.6],
    'drb':[5.1, 7.3, 10.0, 7.3, 3.7, 4.2, 11.5, 6.9, 8.2, 4.7],
    'trb':[5.7, 7.7, 11.8, 9.9, 4.0, 5.8, 14.4, 7.9, 11.2, 5.3],
    'ast':[6.5, 5.5, 6.0, 6.0, 10.1, 5.3, 0.9, 9.5, 1.4, 4.1],
    'stl':[1.8, 0.6, 1.1, 1.1, 2.0, 2.1, 1.0, 1.5, 0.8, 0.9],
    'blk':[0.5, 0.7, 1.7, 0.7, 0.3, 0.3, 2.1, 0.7, 0.7, 0.4],
    'tov':[3.1, 3.0, 3.2, 2.6, 2.4, 1.9, 2.0, 4.8, 1.6, 2.0],
    'pf':[1.8, 1.6, 2.9, 2.8, 2.0, 1.7, 2.7, 2.4, 2.4, 2.3],
    'pts':[27.5, 28.6, 27.6, 26.1, 14.3, 23.6, 15.3, 20.6, 16.5, 26.4],
    'per':[25.4, 26.1, 31.5, 34.5, 21.4, 27.8, 25.1, 20.7, 21.5, 23.6],
    'ts':[0.611, 0.634, 0.603, 0.665, 0.586, 0.618, 0.747, 0.581, 0.629, 0.592],
    '3par':[0.640, 0.238, 0.225, 0.254, 0.281, 0.115, 0.006, 0.499, 0.025, 0.592],
    'ftr':[0.216, 0.360, 0.553, 0.295, 0.364, 0.565, 0.901, 0.493, 0.233, 0.393],
    'orb_perc':[1.8, 1.4, 6.0, 9.1, 0.9, 5.3, 10.7, 3.1, 10.6, 2.0],
    'drb_perc':[13.6, 20.4, 30.7, 37.7, 11.8, 13.7, 37.8, 19.5, 27.1, 14.4],
    'ast_perc':[29.5, 28.1, 34.4, 38.4, 43.4, 26.7, 4.2, 11.6, 6.9, 19.7],
    'stl_perc':[2.5, 0.9, 1.6, 1.7, 3.0, 3.1, 1.5, 2.0, 1.3, 1.3],
    'blk_perc':[1.4, 1.7, 5.0, 2.2, 0.7, 1.0, 5.7, 1.7, 2.0, 0.9],
    'tov_perv':[12.1, 11.7, 12.2, 14.2, 16.4, 9.2, 16.4, 21.4, 10.8, 8.4],
    'usg_perc':[30.3, 31.2, 34.2, 31.9, 19.6, 27.2, 16.8, 27.8, 20.8, 30.3],
    'ws':[4.0, 3.7, 4.1, 3.6, 3.4, 3.4, 4.0, 2.7, 2.2, 3.5],
    'ws_per_48':[0.255, 0.225, 0.288, 0.316, 0.221, 0.284, 0.276, 0.158, 0.200, 0.200],
    'vorp':[2.0, 1.8, 2.2, 2.3, 1.2, 1.6, 1.1, 1.2, 0.5, 1.1],
    'bpm':[8.6, 6.9, 10.5, 14.1, 4.1, 8.8, 4.5, 3.7, 1.5, 2.9],
    'offensive_raptor':[7.2, 4.4, 4.8, 7.3, 3.8, 3.5, -0.5, 5.2, 0.3, 1.9],
    'defensive_raptor':[0, -0.5, 3.2, 2.7, -1.0, 1.9, 6.3, -0.8, 1.0, -1.4],
    'num_mvps_won':[2, 1, 2, 1, 0, 0, 0, 1, 0, 0],
    'num_top_rank':[8, 10, 6, 4, 9, 2, 2, 9, 0, 2],
    'consecutive_mvps':[0, 0, 0, 1, 0, 0, 0, 0, 0, 0]
}
predictions_names = pd.DataFrame(prediction_set['player'], columns=['Name'])
prediction_set = pd.DataFrame(prediction_set).drop(['age', 'player'], axis=1)

In [4]:
player_mvps = {}
player_listed = {}
previous_player = ['', 0]
num_won_mvps = []
num_listed_mvps = []
consecutive_mvps = []


for index, row in historical_df[:].iterrows():
  if row['player'] in player_listed.keys():
    player_listed[row['player']] += 1
    if row['player'] == previous_player[0]:
      consecutive_mvps.append(previous_player[1])
    else:
      consecutive_mvps.append(0)
    if row['won_mvp'] == 1:
      player_mvps[row['player']] += 1
      if previous_player[0] == row['player']:
        previous_player[1] += 1
      else:
        previous_player = [row['player'], 1]
  else:
    player_listed[row['player']] = 1
    player_mvps[row['player']] = 0
    consecutive_mvps.append(0)
    if row['won_mvp'] == 1:
      player_mvps[row['player']] += 1
      previous_player = [row['player'], 1]
    
  num_won_mvps.append(player_mvps[row['player']])
  num_listed_mvps.append(player_listed[row['player']])

  

In [5]:
X = historical_df.copy()
X['num_mvps_won'] = num_won_mvps
X['num_top_rank'] = num_listed_mvps
X['consecutive_mvps'] = consecutive_mvps
X.tail(20)

Unnamed: 0,rank,won_mvp,player,season_start,age,pos,pos_number,tm,pts_won,pts_max,...,usg_perc,ws,ws_per_48,vorp,bpm,offensive_raptor,defensive_raptor,num_mvps_won,num_top_rank,consecutive_mvps
329,2,0,LeBron James,2017,33,PF,4,CLE,738.0,1010,...,31.6,14.0,0.221,8.9,9.6,7.460088,-1.692611,4,15,0
330,3,0,Anthony Davis,2017,24,PF,4,NOP,445.0,1010,...,30.0,13.7,0.241,4.9,5.2,1.259732,4.004133,0,3,0
331,4,0,Damian Lillard,2017,27,PG,1,POR,207.0,1010,...,30.6,12.6,0.227,5.9,6.7,6.154617,-0.127381,0,2,0
332,5,0,Russell Westbrook,2017,29,PG,1,OKC,76.0,1010,...,34.1,10.1,0.166,7.5,8.2,4.340246,0.089067,1,5,0
333,6,0,Giannis Antetokounmpo,2017,23,PF,4,MIL,75.0,1010,...,31.2,11.9,0.207,5.4,5.8,3.305364,2.243496,0,2,0
334,7,0,Kevin Durant,2017,29,SF,3,GSW,66.0,1010,...,30.4,10.4,0.215,4.5,5.6,5.897184,-0.73724,1,8,0
335,8,0,DeMar DeRozan,2017,28,SG,2,TOR,32.0,1010,...,29.6,9.6,0.17,2.6,1.8,3.436637,-2.129784,0,1,0
336,9,0,LaMarcus Aldridge,2017,32,C,5,SAS,6.0,1010,...,29.1,10.9,0.209,3.3,3.3,2.59819,1.373439,0,3,0
337,10,0,Jimmy Butler,2017,28,SG,2,MIN,5.0,1010,...,24.9,8.9,0.198,3.8,5.0,5.663651,2.404067,0,1,0
338,10,0,Stephen Curry,2017,29,PG,1,GSW,5.0,1010,...,31.0,9.1,0.267,4.4,8.6,8.05785,-0.500107,2,5,0


In [6]:
X = X.drop(['rank', 'season_start', 'age', 'player', 'pos', 'pos_number', 'tm', 'pts_won', 'pts_max', 'all_star_votes', 'preseason_odds_rank', 'won_mvp'], axis=1)
y = X['vote_share']
X = X.drop(['vote_share'], axis=1)

In [20]:
copy = X.copy()
copy['season_start'] = historical_df['season_start']
copy['player'] = historical_df['player']
y1984 = copy[copy['season_start']==1984]
y1985 = copy[copy['season_start']==1985]
y1986= copy[copy['season_start']==1986]
y1987= copy[copy['season_start']==1987]
y1988= copy[copy['season_start']==1988]
y1989= copy[copy['season_start']==1989]
y1990= copy[copy['season_start']==1990]
y1991= copy[copy['season_start']==1991]
y1992= copy[copy['season_start']==1992]
y1993= copy[copy['season_start']==1993]
y1994= copy[copy['season_start']==1994]
y1995= copy[copy['season_start']==1995]
y1996= copy[copy['season_start']==1996]
y1997= copy[copy['season_start']==1997]
y1998= copy[copy['season_start']==1998]
y1999= copy[copy['season_start']==1999]
y2000=copy[copy['season_start']==2000]
y2001=copy[copy['season_start']==2001]
y2002=copy[copy['season_start']==2002]
y2003=copy[copy['season_start']==2003]
y2004=copy[copy['season_start']==2004]
y2005=copy[copy['season_start']==2005]
y2006=copy[copy['season_start']==2006]
y2007=copy[copy['season_start']==2007]
y2008=copy[copy['season_start']==2008]
y2009=copy[copy['season_start']==2009]
y2010=copy[copy['season_start']==2010]
y2011=copy[copy['season_start']==2011]
y2012=copy[copy['season_start']==2012]
y2013=copy[copy['season_start']==2013]
y2014=copy[copy['season_start']==2014]
y2015=copy[copy['season_start']==2015]
y2016=copy[copy['season_start']==2016]
y2017=copy[copy['season_start']==2017]
y2018=copy[copy['season_start']==2018]

In [7]:
kf = KFold(n_splits=5, random_state=42, shuffle=True)

In [8]:
X = X.drop(['bpm', 'ws', 'fgm', 'mp', 'fg_perc', '3pm', '3pa', '3p_perc', '2pm', '2pa', '2p_perc', 'ftm', 'fta', 'ft_perc', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', '3par', 'ftr', 'orb_perc', 'drb_perc', 'ast_perc', 'stl_perc', 'blk_perc', 'tov_perv', 'usg_perc', 'g', 'team_wins', 'trb_perc', 'num_mvps_won', 'num_top_rank', 'consecutive_mvps', 'fga', 'efg', 'defensive_raptor', 'offensive_raptor'], axis=1)
prediction_set = prediction_set.drop(['bpm', 'ws', 'fgm', 'mp', 'fg_perc', '3pm', '3pa', '3p_perc', '2pm', '2pa', '2p_perc', 'ftm', 'fta', 'ft_perc', 'orb', 'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', '3par', 'ftr', 'orb_perc', 'drb_perc', 'ast_perc', 'stl_perc', 'blk_perc', 'tov_perv', 'usg_perc', 'g', 'team_wins', 'num_mvps_won', 'num_top_rank', 'consecutive_mvps', 'fga', 'efg','offensive_raptor', 'defensive_raptor'], axis=1)

In [9]:
X.head()

Unnamed: 0,overall_seed,pts,per,ts,ws_per_48,vorp
0,1,28.7,26.5,0.585,0.238,8.4
1,2,18.3,23.2,0.637,0.22,5.2
2,4,24.6,22.5,0.577,0.193,2.6
3,2,22.0,22.9,0.628,0.204,4.6
4,3,23.6,22.1,0.536,0.189,4.0


In [10]:
params={
    'n_neighbors':range(1,100), 
    'metric':['euclidean', 'manhattan', 'minkowski', 'chebyshev', 'wminkowski', 'seuclidiean', 'mahalanobis'], 
    'weights':['uniform', 'distance'], 'algorithm':['ball_tree', 'kd_tree', 'brute']
}

In [11]:
knn = neighbors.KNeighborsRegressor()
grid = GridSearchCV(estimator=knn, param_grid=params, n_jobs=5, cv=kf, scoring='neg_mean_squared_error', verbose=1)

In [12]:
grid.fit(X,y)
best_params = grid.best_params_
model = grid.best_estimator_
score = grid.best_score_
print(best_params)
print(score)

Fitting 5 folds for each of 4158 candidates, totalling 20790 fits
{'algorithm': 'brute', 'metric': 'chebyshev', 'n_neighbors': 7, 'weights': 'uniform'}
-0.03311643398656357


 -0.05535293]


In [13]:
grid = GridSearchCV(estimator=knn, param_grid=params, n_jobs=5, cv=kf, scoring='r2', verbose=1)
grid.fit(X,y)
best_params = grid.best_params_
model = grid.best_estimator_
score = grid.best_score_
print(best_params)
print(score)

Fitting 5 folds for each of 4158 candidates, totalling 20790 fits
{'algorithm': 'ball_tree', 'metric': 'chebyshev', 'n_neighbors': 15, 'weights': 'distance'}
0.6086778487576863




In [19]:
knn = neighbors.KNeighborsRegressor(algorithm='brute', metric='euclidean', n_neighbors=7, weights='uniform')
knn.fit(X,y)
prediction = pd.DataFrame(knn.predict(prediction_set), columns=['predicted voteshare'])
# prediction.merge(predictions_names)
results = pd.concat([predictions_names, prediction], axis=1)
results = results.sort_values(by=['predicted voteshare'], ascending=False)
results

Unnamed: 0,Name,predicted voteshare
6,Rudy Gobert,0.251
0,Stephen Curry,0.240857
2,Giannis Antetokounmpo,0.213143
4,Chris Paul,0.184286
8,Deandre Ayton,0.179714
9,DeMar DeRozan,0.178571
3,Nikola Jokic,0.175571
5,Jimmy Butler,0.135143
1,Kevin Durant,0.094857
7,James Harden,0.090286


In [36]:
Selected_features = {'overall_seed', 'pts', 'per', 'ts', 'vorp', 'ws'}

KeyError: "['ws'] not in index"

In [34]:
xLR = y1984[Selected_features]
predic = knn.predict(xLR)

prediction84 = pd.DataFrame({"Year": y1984['season_start'],
                      "Player": y1984['player'],
                      "Prediction": predic })
prediction84 = prediction84[prediction84['Prediction'] >= 0]
prediction84 = prediction84.sort_values('Prediction', ascending=False)
prediction84

KeyError: "['ws'] not in index"

In [None]:
preds = [prediction84['Player'].iloc[0], prediction85['Player'].iloc[0], prediction86['Player'].iloc[0], prediction87['Player'].iloc[0], prediction88['Player'].iloc[0], prediction89['Player'].iloc[0], prediction90['Player'].iloc[0], prediction91['Player'].iloc[0], prediction92['Player'].iloc[0], prediction93['Player'].iloc[0], prediction94['Player'].iloc[0], prediction95['Player'].iloc[0], prediction96['Player'].iloc[0], prediction97['Player'].iloc[0], prediction98['Player'].iloc[0], prediction99['Player'].iloc[0], prediction00['Player'].iloc[0], prediction01['Player'].iloc[0], prediction02['Player'].iloc[0], prediction03['Player'].iloc[0], prediction04['Player'].iloc[0], prediction05['Player'].iloc[0], prediction06['Player'].iloc[0], prediction07['Player'].iloc[0], prediction08['Player'].iloc[0], prediction09['Player'].iloc[0], prediction10['Player'].iloc[0], prediction11['Player'].iloc[0], prediction12['Player'].iloc[0], prediction13['Player'].iloc[0], prediction14['Player'].iloc[0], prediction15['Player'].iloc[0], prediction16['Player'].iloc[0], prediction17['Player'].iloc[0], prediction18['Player'].iloc[0]]

true= pd.DataFrame({"Year": historical_df['season_start'],
                     "MVP": historical_df['player'],
                     "i": historical_df['won_mvp']})
true = true[true['i'] == 1]
true = true.drop(['i'], axis=1)
true['LR_Predicted'] = preds

def highlight_col(x):
    #copy df to new - original data are not changed
    df = x.copy()
    #set default values to all values
    df.loc[:,:] = 'background-color: ""'
    #set by condition
    for i in range(35):
      if x.iloc[i,2]!=x.iloc[i,1]:
          df.iloc[i,2] = 'color: red'
      else:
          df.iloc[i,2] = 'color: black'
    return df    

true.style.apply(highlight_col, axis=None)

# OTHER

In [193]:
X.columns.values
X2 = X.drop(['trb_perc', 'team_wins'], axis=1)
X3 = X[['overall_seed', 'mp', 'fg_perc', 'efg', 'tov', 'pts', 'per', 'ts', 'usg_perc', 'ws_per_48', 'vorp', 'bpm', 'offensive_raptor', 'defensive_raptor', 'num_mvps_won', 'num_top_rank', 'consecutive_mvps']].copy()

In [196]:
prediction_set.columns.values
prediction_set2 = prediction_set.drop(['team_wins'], axis=1).copy()
prediction_set2 = prediction_set2[['overall_seed', 'mp', 'fg_perc', 'efg', 'tov', 'pts', 'per', 'ts', 'usg_perc', 'ws_per_48', 'vorp', 'bpm', 'offensive_raptor', 'defensive_raptor', 'num_mvps_won', 'num_top_rank', 'consecutive_mvps']].copy()

In [205]:
grid.fit(X3,y)
best_params = grid.best_params_
model = grid.best_estimator_
score = grid.best_score_
print(best_params)
print(score)

Fitting 5 folds for each of 4158 candidates, totalling 20790 fits
{'algorithm': 'ball_tree', 'metric': 'euclidean', 'n_neighbors': 8, 'weights': 'distance'}
0.6135865227862689




In [107]:
knn = neighbors.KNeighborsRegressor(algorithm='brute', metric='euclidean', n_neighbors=7, weights='uniform')
knn.fit(X,y)
prediction = pd.DataFrame(knn.predict(prediction_set), columns=['predicted voteshare'])
# prediction.merge(predictions_names)
results = pd.concat([predictions_names, prediction], axis=1)
results

Unnamed: 0,Name,predicted voteshare
0,Stephen Curry,0.240857
1,Kevin Durant,0.094857
2,Giannis Antetokounmpo,0.213143
3,Nikola Jokic,0.175571
4,Chris Paul,0.184286
5,Jimmy Butler,0.135143
6,Rudy Gobert,0.251
7,James Harden,0.090286
8,Deandre Ayton,0.179714
9,DeMar DeRozan,0.178571


In [110]:
results.sort_values(by=['predicted voteshare'], ascending=False)

Unnamed: 0,Name,predicted voteshare
6,Rudy Gobert,0.251
0,Stephen Curry,0.240857
2,Giannis Antetokounmpo,0.213143
4,Chris Paul,0.184286
8,Deandre Ayton,0.179714
9,DeMar DeRozan,0.178571
3,Nikola Jokic,0.175571
5,Jimmy Butler,0.135143
1,Kevin Durant,0.094857
7,James Harden,0.090286
