<a href="https://colab.research.google.com/github/TonyLiu836/NBA-MVP-Predictor/blob/main/NBA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nba_api
!pip install requests

In [None]:
import pandas as pd
import requests

## Data Collection

#### Get player stats by season

In [None]:
from nba_api.stats.endpoints import teamplayerdashboard
from nba_api.stats.endpoints import leaguedashteamstats
from nba_api.stats.endpoints import playerdashboardbyyearoveryear
import random
import numpy as np

# Get list of teams that played in a season
def get_teams(start_year):
    """Return the TEAM_ID column for every team listed for one season.

    Args:
        start_year: Calendar year in which the season starts (e.g. 2020 for
            the "2020-21" season).

    Returns:
        The 'TEAM_ID' column of the first data frame returned by the
        LeagueDashTeamStats endpoint.
    """
    # The API expects seasons formatted like "2020-21" (two-digit end year).
    season_label = f'{start_year}-{(start_year + 1) % 100:02}'
    response = leaguedashteamstats.LeagueDashTeamStats(season=season_label)
    team_table = response.get_data_frames()[0]
    return team_table['TEAM_ID']

# Get list of players that played > 1000 min and scored > 600 points for a team in a season.
# 1000 min and 600 points total is ~12 mpg and 7 ppg so this filters out players that don't get much playtime.
def get_players(team_id, start_year):
    """Return the regular contributors of one team in one season.

    Args:
        team_id: Team identifier passed to the TeamPlayerDashboard endpoint.
        start_year: Calendar year in which the season starts.

    Returns:
        DataFrame with columns 'PLAYER_ID' and 'PLAYER_NAME' for players whose
        'MIN' exceeds 1000 and whose 'PTS' exceeds 600.
    """
    season_label = f'{start_year}-{(start_year + 1) % 100:02}'
    dashboard = teamplayerdashboard.TeamPlayerDashboard(team_id, season=season_label)
    # Second frame of the response; presumably the per-player season rows.
    roster = dashboard.get_data_frames()[1]
    enough_playtime = (roster['MIN'] > 1000) & (roster['PTS'] > 600)
    return roster.loc[enough_playtime, ['PLAYER_ID', 'PLAYER_NAME']]

# Get player stats for a season
def get_player_stats(player_id, start_year):
    """Return one player's per-game stat rows for a single season.

    Args:
        player_id: Player identifier passed to the
            PlayerDashboardByYearOverYear endpoint.
        start_year: Calendar year in which the season starts.

    Returns:
        The rows of the endpoint's second data frame whose 'GROUP_VALUE'
        equals the season label (e.g. "2020-21"), without the bookkeeping
        columns GROUP_SET, TEAM_ID, MAX_GAME_DATE, CFID and CFPARAMS.
    """
    season_label = f'{start_year}-{(start_year + 1) % 100:02}'
    dashboard = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(
        player_id, per_mode_detailed='PerGame')
    by_year = dashboard.get_data_frames()[1]
    season_rows = by_year.loc[by_year['GROUP_VALUE'] == season_label]
    bookkeeping_cols = ['GROUP_SET', 'TEAM_ID', 'MAX_GAME_DATE', 'CFID', 'CFPARAMS']
    return season_rows.drop(columns=bookkeeping_cols)

# Smoke test: fetch the stats of one random qualifying player from one random
# 2020-21 team. Indices are drawn from the actual table sizes, because the
# number of teams and of qualifying players per team is not fixed.
teams_2020 = get_teams(2020)
rand_team = get_players(teams_2020.iat[random.randrange(len(teams_2020))], 2020)
print(rand_team.shape)
if rand_team.empty:
    # No player on the sampled team passed the minutes/points filter.
    print("No qualifying players on the sampled team; re-run to sample another team.")
else:
    rand_player = rand_team.iloc[random.randrange(len(rand_team)), :]
    print(rand_player)
    stats = get_player_stats(rand_player.iat[0], 2020)
    print(stats)
    print(stats.columns)

In [None]:
import pandas as pd
import time
from nba_api.stats.endpoints import teamplayerdashboard
from nba_api.stats.endpoints import leaguedashteamstats

# Iterate through each season and save data to csv
# For every season starting 2012..2021: collect the per-game stats of each
# qualifying player on each team and write them to "<season>.csv".
for start_year in range(2012, 2022):
    end_year = (start_year + 1) % 100
    season_label = f'{start_year}-{end_year:02}'
    print(season_label)

    data = []
    for team_id in get_teams(start_year):
        for player in get_players(team_id, start_year).itertuples():
            print(player)
            # Pause between player requests to avoid hammering the stats API.
            time.sleep(1)
            stats = get_player_stats(player.PLAYER_ID, start_year)
            stats.insert(0, "PLAYER_NAME", player.PLAYER_NAME)
            data.append(stats)

    big_data = pd.concat(data)
    big_data.to_csv(f'{season_label}.csv')

#### Get MVP List, MVP Votes, ROY Votes

In [None]:
from bs4 import BeautifulSoup

URL = "https://www.nba.com/news/history-mvp-award-winners"
# Fetched once at cell level; get_MVP_List() parses this response.
# The timeout prevents the cell from hanging forever on a stalled connection,
# and raise_for_status() surfaces HTTP errors before any parsing is attempted.
page = requests.get(URL, timeout=30)
page.raise_for_status()

#webscrape NBA.com's mvp page for MVP names and corresponding years
def get_MVP_List():
    """Parse the fetched NBA.com MVP history page into [season, name, team] rows.

    Reads the module-level ``page`` response. Every <p> element inside the
    article container(s) is collected; the first two are discarded and the
    rest are split on whitespace.

    Returns:
        list of [season, name, team] lists, where season is token 0, name is
        tokens 2-3 and team is tokens 4-5 of each paragraph (token 1 is
        skipped; presumably a separator character, verify against the page).
    """
    soup = BeautifulSoup(page.content, "html.parser")
    root = soup.find(id="__next")
    articles = root.find_all("div", class_="Article_article__2Ue3h")
    paragraphs = [str(p) for article in articles for p in article.find_all("p")]

    mvp_rows = []
    for html in paragraphs[2:]:
        # Strip the leading "<p>" and trailing "</p>" before tokenizing.
        tokens = html[3:-4].split()
        mvp_rows.append([
            tokens[0],
            tokens[2] + " " + tokens[3],
            tokens[4] + " " + tokens[5],
        ])

    return mvp_rows

import os

# Scrape the MVP history and persist it as data/MVP_List.csv.
mvp_data = get_MVP_List()

mvp_dataframe = pd.DataFrame(mvp_data, columns = ["season", "Name", "Team"])
# to_csv does not create missing directories, so make sure "data" exists.
os.makedirs("data", exist_ok=True)
mvp_dataframe.to_csv("data/MVP_List.csv", index = False)



In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import time, requests

# Load the saved MVP list and keep the start year (first four characters) of
# every season from 1996-97 onwards.
data_dir = "data" if 'google.colab' not in str(get_ipython()) else "/home/data/data"
mvp_list = pd.read_csv(f'{data_dir}/MVP_List.csv')
samples = mvp_list[mvp_list["season"] >= "1996-97"]
seasons = [label[:4] for label in samples["season"]]

def getAwardVotes(seasons):
    """Scrape MVP and ROY voting rows from basketball-reference.com.

    Args:
        seasons: Iterable of season start years (e.g. "2013"). For each one
            the awards page of the following year is fetched.

    Returns:
        list of rows ``[season, award, player_name, first_votes, pts_won,
        pts_max, award_share]`` where season is formatted like "2013-14",
        award is "mvp" or "roy", and the remaining values are the cell texts
        as scraped (strings).

    Raises:
        requests.HTTPError: if an awards page responds with an error status.
    """
    award_names = ["mvp", "roy"]
    vote_fields = ["votes_first", "points_won", "points_max", "award_share"]
    table_data = []
    for season in seasons:
        year = int(season) + 1
        print(year)
        time.sleep(1)  # be polite to the site between page requests
        URL = "https://www.basketball-reference.com/awards/awards_" + str(year) + ".html"
        page = requests.get(URL, timeout=30)
        # Fail with a clear HTTP error (e.g. when rate limited) instead of an
        # AttributeError further down when the expected tables are absent.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")
        season_label = f"{season}-{str(year)[-2:]}"

        for award in award_names:
            print(award)
            table = soup.find("table", id=award)
            body = table.find("tbody") if table is not None else None
            if body is None:
                print(f"warning: no '{award}' table found for {year}; skipping")
                continue

            for row in body.find_all("tr"):
                player_cell = row.find("td", {"data-stat": "player"})
                if player_cell is None:
                    # Not a player row (e.g. a header/spacer row); nothing to record.
                    continue
                link = player_cell.find("a")
                player_name = (link if link is not None else player_cell).get_text()

                votes = []
                for field in vote_fields:
                    cell = row.find("td", {"data-stat": field})
                    votes.append(cell.get_text() if cell is not None else "")

                table_data.append([season_label, award, player_name, *votes])
    return table_data

# Scrape the voting tables for every selected season and save them as CSV.
award_columns = ["season", "Award", "Name", "First Votes", "Points Won", "Max Points", "Award Shares"]
awards_data = getAwardVotes(seasons)
awards_dataframe = pd.DataFrame(data=awards_data, columns=award_columns)
awards_dataframe.to_csv("data/Awards_Voting_Data.csv", index=False)

2021
mvp
roy
2020
mvp
roy
2019
mvp
roy
2018
mvp
roy
2017
mvp
roy
2016
mvp
roy
2015
mvp
roy
2014
mvp
roy
2013
mvp
roy
2012
mvp
roy
2011
mvp
roy
2010
mvp
roy
2009
mvp
roy
2008
mvp
roy
2007
mvp
roy
2006
mvp
roy
2005
mvp
roy
2004
mvp
roy
2003
mvp
roy
2002
mvp
roy
2001
mvp
roy
2000
mvp
roy
1999
mvp
roy
1998
mvp
roy
1997
mvp
roy


In [None]:
# Colab-only setup: mount Google Drive, switch to the project folder and
# unpack data.zip under /home/data (the CSVs end up in /home/data/data,
# which is the Colab value of data_dir used by the other cells).
from google.colab import drive
drive.mount('drive')
%cd /content/drive/MyDrive/NBA_MLProject
!pwd
!unzip data.zip -d /home/data

Mounted at drive
/content/drive/MyDrive/NBA_MLProject
/content/drive/MyDrive/NBA_MLProject
Archive:  data.zip
   creating: /home/data/data/
  inflating: /home/data/data/1996-97.csv  
  inflating: /home/data/data/1997-98.csv  
  inflating: /home/data/data/1998-99.csv  
  inflating: /home/data/data/1999-00.csv  
  inflating: /home/data/data/2000-01.csv  
  inflating: /home/data/data/2001-02.csv  
  inflating: /home/data/data/2002-03.csv  
  inflating: /home/data/data/2003-04.csv  
  inflating: /home/data/data/2004-05.csv  
  inflating: /home/data/data/2005-06.csv  
  inflating: /home/data/data/2006-07.csv  
  inflating: /home/data/data/2007-08.csv  
  inflating: /home/data/data/2008-09.csv  
  inflating: /home/data/data/2009-10.csv  
  inflating: /home/data/data/2010-11.csv  
  inflating: /home/data/data/2011-12.csv  
  inflating: /home/data/data/2012-13.csv  
  inflating: /home/data/data/2013-14.csv  
  inflating: /home/data/data/2014-15.csv  
  inflating: /home/data/data/2015-16.csv  


## Split Years Using K-Fold 

In [None]:
import pandas as pd
from sklearn.model_selection import KFold

# Number of cross-validation folds the training seasons are split into.
num_folds = 5
# Location of the CSV files: the unzipped copy on Colab, ./data otherwise.
data_dir = "/home/data/data" if 'google.colab' in str(get_ipython()) else "data"
def splitDataYears(seed=None):
    """Split the seasons from 1996-97 onwards into CV folds and a test set.

    Reads ``MVP_List.csv`` from the module-level ``data_dir``, samples 80% of
    the seasons for training and holds out the rest for testing, then splits
    the training seasons into ``num_folds`` shuffled K-fold partitions.

    Args:
        seed: Optional random seed applied to both the 80/20 sampling and the
            K-fold shuffle. The default (None) gives a different split on
            every call; pass an int for a reproducible split.

    Returns:
        (trainingFolds, testData) where trainingFolds is a list with one
        ``[train_seasons, validation_seasons]`` pair per fold and testData is
        a list of the held-out season labels.
    """
    mvp_list = pd.read_csv(f'{data_dir}/MVP_List.csv')
    # Season labels such as "1996-97" compare correctly as strings because
    # they all start with a four-digit year.
    years = mvp_list.loc[mvp_list["season"] >= "1996-97", "season"]

    trainingSamples = years.sample(frac=0.8, random_state=seed)  # 80/20 train/test split
    testSamples = years.drop(trainingSamples.index)

    kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    trainingFolds = [
        [trainingSamples.iloc[train_idx].tolist(), trainingSamples.iloc[valid_idx].tolist()]
        for train_idx, valid_idx in kf.split(trainingSamples)
    ]
    return trainingFolds, testSamples.tolist()

trainingSet, testingSet = splitDataYears()
# trainingSet: one [train_seasons, validation_seasons] pair per fold (num_folds pairs in total)
# testingSet: flat list of the held-out season labels (the 20% of seasons not sampled for training)
  

In [None]:
import pandas as pd

data_dir = "/home/data/data" if 'google.colab' in str(get_ipython()) else "data"

# Every "<stat>_RANK" column is dropped along with a few columns that are not
# used as features.
ranked_stats = ['GP', 'W', 'L', 'W_PCT', 'MIN',
                'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
                'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB',
                'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD',
                'PTS', 'PLUS_MINUS', 'NBA_FANTASY_PTS', 'DD2', 'TD3']
drop_cols = (['Unnamed: 0', 'TEAM_ABBREVIATION', 'NBA_FANTASY_PTS']
             + [stat + '_RANK' for stat in ranked_stats])

# Align the voting-data column names with the player-stat frames.
mvp_rename_cols = {"Name": "PLAYER_NAME", "season": "GROUP_VALUE", "Award Shares": "MVP_SHARES"}
roy_rename_cols = {"Name": "PLAYER_NAME", "season": "GROUP_VALUE", "Award Shares": "ROY_SHARES"}

vote_data = pd.read_csv(f"{data_dir}/Awards_Voting_Data.csv")

mvp_vote_data = vote_data.loc[vote_data['Award'] == "mvp"].rename(columns=mvp_rename_cols)
roy_vote_data = vote_data.loc[vote_data["Award"] == "roy"].rename(columns=roy_rename_cols)


def _load_season(season):
    """Read one season CSV, drop unused columns, keep one row per player name."""
    raw = pd.read_csv(f"{data_dir}/{season}.csv")
    return raw.drop(columns=drop_cols).drop_duplicates(subset=['PLAYER_NAME'])


# train[k] / val[k]: per-season frames of fold k; test: one list of held-out seasons.
train = [[_load_season(season) for season in trainingSet[k][0]] for k in range(num_folds)]

val = [[_load_season(season) for season in trainingSet[k][1]] for k in range(num_folds)]

test = [[_load_season(season) for season in testingSet]]

print(train[0][0])

         PLAYER_NAME GROUP_VALUE  GP   W   L  ...  PFD   PTS  PLUS_MINUS  DD2  TD3
0    DeMarre Carroll     2013-14  73  37  36  ...  1.8  11.1         1.4    3    0
1        Jeff Teague     2013-14  79  37  42  ...  4.1  16.5         1.5   13    0
2        Kyle Korver     2013-14  71  37  34  ...  1.2  12.0         3.0    0    0
3       Lou Williams     2013-14  60  26  34  ...  2.4  10.4        -2.8    0    0
4         Mike Scott     2013-14  80  36  44  ...  1.4   9.6        -2.4    1    0
..               ...         ...  ..  ..  ..  ...  ...   ...         ...  ...  ...
168        John Wall     2013-14  82  44  38  ...  4.0  19.3         2.4   29    2
169    Marcin Gortat     2013-14  81  44  37  ...  2.6  13.2         3.6   37    0
170  Martell Webster     2013-14  78  42  36  ...  1.6   9.7        -0.2    1    0
171             Nene     2013-14  53  30  23  ...  4.8  14.2         2.2    2    0
172     Trevor Ariza     2013-14  77  41  36  ...  2.1  14.4         2.5   11    0

[16

#### Add Award Shares Data

In [None]:
# Attach each player's MVP award share to every season frame in train/val/test.
mvp = mvp_vote_data.loc[:, ["PLAYER_NAME", "GROUP_VALUE", "MVP_SHARES"]]


def _with_mvp_shares(season_df):
    """Return season_df with an MVP_SHARES column (0 for players without votes).

    Frames that already carry the column are returned unchanged, so re-running
    the cell is a no-op instead of failing or producing suffixed duplicate
    columns.
    """
    if "MVP_SHARES" in season_df:
        return season_df
    # Left join: keep every player row; players absent from the voting data
    # get NaN, which is then replaced with 0.
    merged = season_df.merge(mvp, how='left', on=["PLAYER_NAME", "GROUP_VALUE"])
    merged["MVP_SHARES"] = merged["MVP_SHARES"].fillna(0.)
    return merged


for split in (train, val, test):
    for fold in split:
        # Update the fold lists in place so existing references stay valid.
        fold[:] = [_with_mvp_shares(season_df) for season_df in fold]

AssertionError: ignored

In [None]:
# Display the rows of the first season in the first training fold whose MVP share exceeds 0.01.
train[0][0].loc[train[0][0]["MVP_SHARES"] > 0.01]

Unnamed: 0,PLAYER_NAME,GROUP_VALUE,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,DD2,TD3,MVP_SHARES
17,Al Jefferson,2013-14,73,38,35,0.521,35.0,9.6,18.8,0.509,0.0,0.2,0.2,2.6,3.8,0.69,2.1,8.7,10.8,2.1,1.7,0.9,1.1,1.0,2.4,2.9,21.8,0.4,42,0,0.027
24,Joakim Noah,2013-14,80,48,32,0.6,35.2,4.8,10.0,0.475,0.0,0.0,0.0,3.1,4.2,0.737,3.5,7.7,11.3,5.4,2.4,1.2,1.5,0.9,3.1,3.6,12.6,2.9,47,4,0.258
53,Stephen Curry,2013-14,78,50,28,0.641,36.5,8.4,17.7,0.471,3.3,7.9,0.424,3.9,4.5,0.885,0.6,3.7,4.3,8.5,3.8,1.6,0.2,0.4,2.5,4.0,24.0,7.4,28,4,0.053
56,James Harden,2013-14,73,48,25,0.658,38.0,7.5,16.5,0.456,2.4,6.6,0.366,7.9,9.1,0.866,0.8,3.9,4.7,6.1,3.6,1.6,0.4,0.8,2.4,5.8,25.4,5.6,14,1,0.068
63,Paul George,2013-14,80,54,26,0.675,36.2,7.2,17.0,0.424,2.3,6.3,0.364,5.0,5.8,0.864,0.8,6.0,6.8,3.5,2.8,1.9,0.3,0.8,2.5,4.8,21.7,4.6,12,1,0.026
65,Blake Griffin,2013-14,80,56,24,0.7,35.8,9.0,17.0,0.528,0.2,0.6,0.273,6.0,8.4,0.715,2.4,7.1,9.5,3.9,2.8,1.2,0.6,0.8,3.3,7.3,24.1,7.1,43,1,0.347
66,Chris Paul,2013-14,62,44,18,0.71,35.0,6.5,14.0,0.467,1.3,3.4,0.368,4.8,5.6,0.855,0.6,3.7,4.3,10.7,2.3,2.5,0.1,0.5,2.5,4.5,19.1,8.5,39,0,0.036
81,LeBron James,2013-14,77,52,25,0.675,37.7,10.0,17.6,0.567,1.5,4.0,0.379,5.7,7.6,0.75,1.1,5.9,6.9,6.3,3.5,1.6,0.3,0.5,1.6,6.0,27.1,5.3,12,1,0.713
91,Kevin Love,2013-14,77,39,38,0.506,36.3,8.4,18.5,0.457,2.5,6.6,0.376,6.8,8.2,0.821,2.9,9.6,12.5,4.4,2.5,0.8,0.5,0.8,1.8,6.4,26.1,4.6,65,3,0.02
106,Kevin Durant,2013-14,81,58,23,0.716,38.5,10.5,20.8,0.503,2.4,6.1,0.391,8.7,9.9,0.873,0.7,6.7,7.4,5.5,3.5,1.3,0.7,0.5,2.1,6.4,32.0,6.3,27,3,0.986


#### Process Data

In [None]:
def _normalize_season(season):
    """Drop the identifier columns and min-max scale every feature column.

    Each column other than MVP_SHARES is rescaled to [0, 1] using that
    season's own min and max. A column that is constant within the season is
    mapped to 0 rather than NaN (which 0/0 would otherwise produce).
    """
    stats = season.drop(columns=["PLAYER_NAME", "GROUP_VALUE"])
    feature_cols = [col for col in stats.columns if col != "MVP_SHARES"]
    # Cast first so integer columns (e.g. GP, W, L) can hold the scaled floats.
    values = stats[feature_cols].astype(float)
    low = values.min()
    span = values.max() - low
    span = span.where(span != 0, 1.0)  # avoid division by zero
    stats[feature_cols] = (values - low) / span
    return stats


def _process_split(split):
    """Normalize every season frame in every fold of a split.

    Frames without a PLAYER_NAME column are skipped.
    """
    return [
        [_normalize_season(season) for season in fold if "PLAYER_NAME" in season]
        for fold in split
    ]


train_processed = _process_split(train)
val_processed = _process_split(val)
test_processed = _process_split(test)

In [None]:
# Display the normalized rows of the first season in the first training fold whose MVP share exceeds 0.01.
train_processed[0][0].loc[train_processed[0][0]["MVP_SHARES"] > 0.01]

Unnamed: 0,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,FG3_PCT,FTM,FTA,FT_PCT,OREB,DREB,REB,AST,TOV,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,DD2,TD3,MVP_SHARES
17,0.769231,0.538462,0.418182,0.540958,0.828704,0.88,0.833333,0.448845,0.0,0.025316,0.2,0.246914,0.32967,0.521073,0.365385,0.891566,0.768595,0.165049,0.34375,0.272727,0.392857,0.6,0.481481,0.323529,0.581967,0.517857,0.646154,0.0,0.027
24,0.948718,0.730769,0.363636,0.66306,0.837963,0.24,0.246667,0.336634,0.0,0.0,0.0,0.308642,0.373626,0.611111,0.634615,0.771084,0.809917,0.485437,0.5625,0.409091,0.535714,0.533333,0.740741,0.426471,0.204918,0.666667,0.723077,0.8,0.258
53,0.897436,0.769231,0.290909,0.72643,0.898148,0.72,0.76,0.323432,1.0,1.0,0.424,0.407407,0.406593,0.894636,0.076923,0.289157,0.231405,0.786408,1.0,0.590909,0.071429,0.2,0.518519,0.485294,0.672131,0.934524,0.430769,0.8,0.053
56,0.769231,0.730769,0.236364,0.752705,0.967593,0.6,0.68,0.273927,0.727273,0.835443,0.366,0.901235,0.912088,0.858238,0.115385,0.313253,0.264463,0.553398,0.9375,0.590909,0.142857,0.466667,0.481481,0.75,0.729508,0.827381,0.215385,0.2,0.068
63,0.948718,0.846154,0.254545,0.77898,0.884259,0.56,0.713333,0.168317,0.69697,0.797468,0.364,0.54321,0.549451,0.854406,0.115385,0.566265,0.438017,0.300971,0.6875,0.727273,0.107143,0.466667,0.518519,0.602941,0.577869,0.767857,0.184615,0.2,0.026
65,0.948718,0.884615,0.218182,0.81762,0.865741,0.8,0.713333,0.511551,0.060606,0.075949,0.273,0.666667,0.835165,0.568966,0.423077,0.698795,0.661157,0.339806,0.6875,0.409091,0.214286,0.466667,0.814815,0.970588,0.67623,0.916667,0.661538,0.2,0.347
66,0.487179,0.653846,0.109091,0.833076,0.828704,0.466667,0.513333,0.310231,0.393939,0.43038,0.368,0.518519,0.527473,0.837165,0.076923,0.289157,0.231405,1.0,0.53125,1.0,0.035714,0.266667,0.518519,0.558824,0.471311,1.0,0.6,0.0,0.036
81,0.871795,0.807692,0.236364,0.77898,0.953704,0.933333,0.753333,0.640264,0.454545,0.506329,0.379,0.62963,0.747253,0.636015,0.173077,0.554217,0.446281,0.572816,0.90625,0.590909,0.107143,0.266667,0.185185,0.779412,0.79918,0.809524,0.184615,0.2,0.713
91,0.871795,0.557692,0.472727,0.517774,0.888889,0.72,0.813333,0.277228,0.757576,0.835443,0.376,0.765432,0.813187,0.772031,0.519231,1.0,0.909091,0.38835,0.59375,0.227273,0.178571,0.466667,0.259259,0.838235,0.758197,0.767857,1.0,0.6,0.02
106,0.974359,0.923077,0.2,0.842349,0.990741,1.0,0.966667,0.429043,0.727273,0.772152,0.391,1.0,1.0,0.871648,0.096154,0.650602,0.487603,0.495146,0.90625,0.454545,0.25,0.266667,0.37037,0.838235,1.0,0.869048,0.415385,0.6,0.986


## PyTorch Dataset

In [None]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from numpy import vstack
from numpy import argmax
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset
import pandas as pd


class MVPDataset(Dataset):
    """Map-style dataset that wraps an indexable collection of samples.

    Args:
        samples: Any sized, indexable collection (e.g. the list of
            preprocessed season frames of one fold).
    """

    def __init__(self, samples):
        super().__init__()
        # Keep the samples themselves; __getitem__ indexes into them.
        self.samples = samples
        self.n_samples = len(samples)

    def __len__(self):
        """Return the number of samples."""
        return self.n_samples

    def __getitem__(self, index):
        """Return the sample stored at ``index``."""
        return self.samples[index]


def main():
    """Build and print the training and validation datasets of every fold.

    Uses the module-level ``train``, ``val`` and ``num_folds``.
    """
    for i in range(num_folds):
        trainingDataset = MVPDataset(train[i])    # train - replace with results of preprocessing
        validDataset = MVPDataset(val[i])
        print(trainingDataset)
        print("   ")
        print(validDataset)


main()