# Load packages

In [1]:
import pandas as pd
import numpy as np

# Clean up your training set

In [2]:
training_set = pd.read_csv('nba_mvp_training.csv')

In [3]:
# Fill our blanks with zero's
training_set.fillna(0, inplace=True)

# Format the win loss records as numeric rather than hypen-seperated
training_set['Overall_W'] = training_set['Overall'].apply(lambda x: int(x.split('-')[0]))
training_set['Overall_L'] = training_set['Overall'].apply(lambda x: int(x.split('-')[1]))
training_set['Home_W'] = training_set['Home'].apply(lambda x: int(x.split('-')[0]))
training_set['Home_L'] = training_set['Home'].apply(lambda x: int(x.split('-')[1]))
training_set['Road_W'] = training_set['Road'].apply(lambda x: int(x.split('-')[0]))
training_set['Road_L'] = training_set['Road'].apply(lambda x: int(x.split('-')[1]))

In [4]:
# Drop the unnecessary columns
training_set_cleaned = training_set.drop(['Player', 'Overall', 'Team',
                                          'Road', 'Home', 'Neutral'], axis=1)

In [5]:
# Get dummy variables for factors
training_set_cleaned = pd.get_dummies(training_set_cleaned)

Training Set Created!

# Set up your test set (the most recent season's data)

In [6]:
stand_2017 = pd.read_csv('2017_standings.csv')
players_2017 = pd.read_csv('2017_players.csv')
teams = pd.read_csv('teams_historic.csv')

In [7]:
# Bring together players with their records
teams_merged = pd.merge(stand_2017, teams,
                        left_on=['Team'], right_on=['name'])
players_merged = pd.merge(teams_merged, players_2017,
                          left_on=['abbreviation'], right_on=['Tm'])

In [8]:
# The same win - loss conversion as before
players_merged['Overall_W'] = players_merged['Overall'].apply(lambda x: int(x.split('-')[0]))
players_merged['Overall_L'] = players_merged['Overall'].apply(lambda x: int(x.split('-')[1]))
players_merged['Home_W'] = players_merged['Home'].apply(lambda x: int(x.split('-')[0]))
players_merged['Home_L'] = players_merged['Home'].apply(lambda x: int(x.split('-')[1]))
players_merged['Road_W'] = players_merged['Road'].apply(lambda x: int(x.split('-')[0]))
players_merged['Road_L'] = players_merged['Road'].apply(lambda x: int(x.split('-')[1]))

In [9]:
# Drop extra columns
test_set_cleaned = players_merged.drop(['Rk_x','Overall','Team','Road','Home','E','W','A','C',
                                        'SE','NW','SW','P','Pre','Post', 'Player',
                                        '≤3', '≥10', 'Oct', 'Nov', 'Dec', 'Jan','Feb',
                                        'Mar', 'Apr', 'name', 'abbreviation', 'Rk_y'],
                                        axis=1)

In [10]:
test_set_cleaned = pd.get_dummies(test_set_cleaned)

In [11]:
# Iterate through list to ensure that all training columns are in the test set

for i in training_set_cleaned:
    if i in test_set_cleaned.columns:
        x=1
    else:
        test_set_cleaned[i]=0

In [12]:
# Iterate to make sure everything in the test set is in the training set

for i in test_set_cleaned:
    if i in training_set_cleaned.columns:
        x=1
    else:
        test_set_cleaned.drop(i,axis=1,inplace=True)

In [13]:
# Fill in any last blanks
test_set_cleaned.fillna(0, inplace=True)

In [14]:
from sklearn import linear_model
from sklearn import ensemble
from sklearn import tree

In [15]:
# Remove voting results from our inputs
X = training_set_cleaned.drop(['First', 'Points', 'Pointsmax', 'Share'], axis=1)

# Create a response, which in our case is the share of the vote
y = training_set_cleaned['Share']

# Train your basic tree
mod = tree.DecisionTreeRegressor()
mod.fit(X,y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [16]:
# Feature importance
importance = pd.DataFrame()
importance['features'] = X.columns
importance['scores'] = mod.feature_importances_

importance.sort_values('scores', ascending=False).head()

Unnamed: 0,features,scores
25,PTS,0.2956
28,Overall_L,0.208527
19,TRB,0.059688
20,AST,0.057244
24,PF,0.052399


In [17]:
importance[importance['features']=='Tm_LAL']

Unnamed: 0,features,scores
66,Tm_LAL,0.00014


In [18]:
importance[importance['features']=='Tm_LAC']

Unnamed: 0,features,scores
65,Tm_LAC,0.000339


In [19]:
# Evaluate your tree
mod.score(X,y)

0.99997764733650707

In [20]:
X_test = test_set_cleaned.drop(['First', 'Points', 'Pointsmax', 'Share'], axis=1)
test_pred = mod.predict(X_test)
test_results = pd.DataFrame(players_merged.Player)
test_results['score'] = test_pred

In [21]:
test_results.sort_values('score', ascending=False).head()

Unnamed: 0,Player,score
72,LeBron James,0.773
2,Stephen Curry,0.773
42,James Harden,0.773
63,Isaiah Thomas,0.773
104,DeMar DeRozan,0.773


In [22]:
# Try again with Random Forest

mod = ensemble.RandomForestRegressor(n_estimators=25)
mod.fit(X,y)
test_pred = mod.predict(X_test)
test_results = pd.DataFrame(players_merged.Player)
test_results['score'] = test_pred
test_results.sort_values('score', ascending=False).head()

Unnamed: 0,Player,score
170,Russell Westbrook,0.46164
333,Anthony Davis,0.42348
268,Damian Lillard,0.37712
384,Karl-Anthony Towns,0.35124
72,LeBron James,0.28988


In [23]:
# Find the results for a given year

results = pd.DataFrame()
results['rf_score'] = mod.predict(X)
results['player'] = training_set['Player']
results['year'] = training_set['year']
results['share'] = training_set['Share']
results_2016 = results[results['year'] == 2016]

In [24]:
results_2016.sort_values('share', ascending=False).head()

Unnamed: 0,rf_score,player,year,share
107,0.91396,Stephen Curry,2016,1.0
274,0.237646,Kawhi Leonard,2016,0.484
231,0.43232,LeBron James,2016,0.482
473,0.29072,Russell Westbrook,2016,0.371
127,0.27084,Kevin Durant,2016,0.112


In [25]:
# What if we trim our variable lists...
X_abbreviated = X[['PTS', 'Overall_W', 'TRB', 'AST']]
X_test_abbreviated = X_test[['PTS', 'Overall_W', 'TRB', 'AST']]
mod = ensemble.RandomForestRegressor(n_estimators=250)
mod.fit(X_abbreviated,y)
test_pred = mod.predict(X_test_abbreviated)
test_results = pd.DataFrame(players_merged.Player)
test_results['score'] = test_pred
test_results.sort_values('score', ascending=False).head()

Unnamed: 0,Player,score
2,Stephen Curry,0.533612
170,Russell Westbrook,0.53188
42,James Harden,0.423128
63,Isaiah Thomas,0.224128
72,LeBron James,0.18434


In [26]:
# Gradient Boosting Models

mod = ensemble.GradientBoostingRegressor()
mod.fit(X,y)
test_pred = mod.predict(X_test)
test_results = pd.DataFrame(players_merged.Player)
test_results['score'] = test_pred
test_results.sort_values('score', ascending=False).head()

Unnamed: 0,Player,score
333,Anthony Davis,0.547647
384,Karl-Anthony Towns,0.506487
170,Russell Westbrook,0.478448
2,Stephen Curry,0.410033
72,LeBron James,0.36595


*All data via http://www.basketball-reference.com/