<a href="https://colab.research.google.com/github/TonyLiu836/NBA-MVP-Predictor/blob/main/NBA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nba_api
!pip install requests

In [None]:
import pandas as pd
import requests

## Data Collection

#### Get player stats by season

In [None]:
from nba_api.stats.endpoints import teamplayerdashboard
from nba_api.stats.endpoints import leaguedashteamstats
from nba_api.stats.endpoints import playerdashboardbyyearoveryear
import random
import numpy as np

# Get list of teams that played in a season
def get_teams(start_year):
    end_year = (start_year + 1) % 100
    leagueteams = leaguedashteamstats.LeagueDashTeamStats(season=f'{start_year}-{end_year:02}')
    teams = leagueteams.get_data_frames()[0]
    return teams.loc[:, 'TEAM_ID']

# Get list of players that played > 1000 min and scored > 600 points for a team in a season.
# 1000 min and 600 points total is ~12 mpg and 7 ppg so this filters out players that don't get much playtime.
def get_players(team_id, start_year):
    end_year = (start_year + 1) % 100
    teamplayers = teamplayerdashboard.TeamPlayerDashboard(team_id, season=f'{start_year}-{end_year:02}')
    players = teamplayers.get_data_frames()[1]
    good_players = players[players.loc[:,'MIN'] > 1000]
    good_players = good_players[good_players.loc[:, 'PTS'] > 600]
    return good_players.loc[:, ['PLAYER_ID', 'PLAYER_NAME']]

# Get player stats for a season
def get_player_stats(player_id, start_year):
    end_year = (start_year + 1) % 100
    playerdashboard = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(player_id, per_mode_detailed='PerGame')
    stats = playerdashboard.get_data_frames()[1]
    seasonstats = stats[stats['GROUP_VALUE'] == f'{start_year}-{end_year:02}']
    return seasonstats.drop(columns=['GROUP_SET', 'TEAM_ID', 'MAX_GAME_DATE', 'CFID', 'CFPARAMS'])

teams_2020 = get_teams(2020)
rand_team = get_players(teams_2020.iat[random.randint(0, 29)], 2020)
print(rand_team.shape)
rand_player = rand_team.iloc[random.randint(0, 5), :]
print(rand_player)
stats = get_player_stats(rand_player.iat[0], 2020)
print(stats)
print(stats.columns)

In [None]:
import pandas as pd
import time
from nba_api.stats.endpoints import teamplayerdashboard
from nba_api.stats.endpoints import leaguedashteamstats

# Iterate through each season and save data to csv
for start_year in range(2012, 2022):
    data = []
    end_year = (start_year + 1) % 100
    print(f'{start_year}-{end_year:02}')
    leagueteams = get_teams(start_year)
    for team_id in leagueteams:
        players = get_players(team_id, start_year)
        for player in players.itertuples():
            print(player)
            time.sleep(1)
            stats = get_player_stats(player[1], start_year)
            stats.insert(0, "PLAYER_NAME", player[2])
            data.append(stats)
    
    big_data = pd.concat(data)
    big_data.to_csv(f'{start_year}-{end_year:02}.csv')

#### Get MVP List, MVP Votes, ROY Votes

In [None]:
from bs4 import BeautifulSoup

URL = "https://www.nba.com/news/history-mvp-award-winners"
page = requests.get(URL)

#webscrape NBA.com's mvp page for MVP names and corresponding years
def get_MVP_List():
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find(id = "__next")
    players = results.find_all("div", class_="Article_article__2Ue3h")
    rawData = []
    for sample in players:
      mvps = sample.find_all("p")
      for mvp in mvps:
        rawData.append(str(mvp))

    rawData = rawData[2:]
    mvp_list = []
    for j in rawData:
      j = j[3:]                   #get rid of <p> and </p>
      j = j[:-4]
      info = j.split()  
      mvp_list.append([info[0], info[2] + " " + info[3], info[4] + " " + info[5]])

    return mvp_list

mvp_data = get_MVP_List()

mvp_dataframe = pd.DataFrame(mvp_data, columns = ["season", "Name", "Team"])
mvp_dataframe.to_csv("MVP_List.csv", index = False)



In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import time, requests
#mvp_list = pd.read_csv (r'/content/drive/MyDrive/NBA_MLProject/MVP_List.csv')
mvp_list = pd.read_csv('data/MVP_List.csv')
samples = mvp_list.loc[mvp_list["season"] >= "1996-97"]
seasons = samples["season"]
seasons = [season[0:4] for season in seasons]

def getAwardVotes(seasons):
    award_names = ["mvp", "roy"]
    table_data = []
    for season in seasons:
        year = int(season) + 1
        print(year)
        time.sleep(1)
        URL = "https://www.basketball-reference.com/awards/awards_" + str(year) + ".html"
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, "html.parser")

        for award in award_names:
            print(award)
            tables = soup.find("table", id = award)
            stats = tables.find("tbody").find_all("tr")

            for row in stats:
                player_name = row.find("td", {"data-stat":"player"}).find("a").get_text()
                first_votes = row.find("td", {"data-stat":"votes_first"}).get_text()
                pts_won = row.find("td", {"data-stat":"points_won"}).get_text()
                pts_max = row.find("td", {"data-stat":"points_max"}).get_text()
                award_share = row.find("td", {"data-stat":"award_share"}).get_text()
                table_data.append([season + "-" + str(year)[-2:], award,player_name, first_votes,pts_won, pts_max, award_share])
    return table_data

awards_data = getAwardVotes(seasons)
awards_dataframe = pd.DataFrame(awards_data, columns = ["season","Award", "Name", "First Votes", "Points Won", "Max Points", "Award Shares"])
awards_dataframe.to_csv("data\Awards_Voting_Data.csv", index=False)

2021
mvp
roy
2020
mvp
roy
2019
mvp
roy
2018
mvp
roy
2017
mvp
roy
2016
mvp
roy
2015
mvp
roy
2014
mvp
roy
2013
mvp
roy
2012
mvp
roy
2011
mvp
roy
2010
mvp
roy
2009
mvp
roy
2008
mvp
roy
2007
mvp
roy
2006
mvp
roy
2005
mvp
roy
2004
mvp
roy
2003
mvp
roy
2002
mvp
roy
2001
mvp
roy
2000
mvp
roy
1999
mvp
roy
1998
mvp
roy
1997
mvp
roy


## Split Years Using K-Fold 

In [103]:
import pandas as pd
from sklearn.model_selection import KFold

num_folds = 5

def splitDataYears():
    #mvp_list = pd.read_csv(r'/content/drive/MyDrive/MVP_List.csv')          #make sure to change path to MVP_List.csv file
    mvp_list = pd.read_csv('data/MVP_List.csv')
    samples = mvp_list.loc[mvp_list["season"] >= "1996-97"]
    years = samples["season"]
    trainingSamples = years.sample(frac=0.8)    #80/20 split for training/testing data 
    testSamples = years.drop(trainingSamples.index)
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=None)   #split training data into 5 folds, each fold contains 4 seasons

    trainingFolds = []
    for train, valid in kf.split(trainingSamples):
      trainingFolds.append([trainingSamples.iloc[train].values.tolist(), trainingSamples.iloc[valid].values.tolist()])
      
    testData = []
    for i in range(testSamples.size):
      testData.append(testSamples.iloc[i])
    return trainingFolds, testData

trainingSet, testingSet = splitDataYears()
# trainingSet = [[80% of seasons (train)],[20% of seasons (test)]] * 5
# testingSet = [season] * 5
  

In [176]:
import pandas as pd
data_dir = "/home/data/data" if 'google.colab' in str(get_ipython()) else "data"
drop_cols = ['Unnamed: 0', 'TEAM_ABBREVIATION', 'NBA_FANTASY_PTS',
                'GP_RANK', 'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK',
                'FGM_RANK', 'FGA_RANK', 'FG_PCT_RANK', 'FG3M_RANK', 'FG3A_RANK', 'FG3_PCT_RANK', 
                'FTM_RANK', 'FTA_RANK', 'FT_PCT_RANK', 'OREB_RANK', 'DREB_RANK', 'REB_RANK',
                'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK', 'PF_RANK', 'PFD_RANK',
                'PTS_RANK', 'PLUS_MINUS_RANK', 'NBA_FANTASY_PTS_RANK', 'DD2_RANK', 'TD3_RANK']
mvp_rename_cols = {"Name": "PLAYER_NAME", "season": "GROUP_VALUE", "Award Shares": "MVP_SHARES"}
roy_rename_cols = {"Name": "PLAYER_NAME", "season": "GROUP_VALUE", "Award Shares": "ROY_SHARES"}

vote_data = pd.read_csv(f"{data_dir}/Awards_Voting_Data.csv")

mvp_vote_data = vote_data.loc[vote_data['Award'] == "mvp"]
mvp_vote_data = mvp_vote_data.rename(columns=mvp_rename_cols)
print(mvp_vote_data)
roy_vote_data = vote_data.loc[vote_data["Award"] == "roy"]
roy_vote_data = roy_vote_data.rename(columns=roy_rename_cols)

train = [[pd.read_csv(f"{data_dir}/{season}.csv").drop(columns=drop_cols).drop_duplicates(subset=['PLAYER_NAME'])
         for season in trainingSet[k][0]] for k in range(num_folds)]

val = [[pd.read_csv(f"{data_dir}/{season}.csv").drop(columns=drop_cols).drop_duplicates(subset=['PLAYER_NAME'])
         for season in trainingSet[k][1]] for k in range(num_folds)]

test = [[pd.read_csv(f"{data_dir}/{season}.csv").drop(columns=drop_cols).drop_duplicates(subset=['PLAYER_NAME'])
         for season in testingSet]]

print(train[0][0])

    GROUP_VALUE Award            PLAYER_NAME  First Votes  Points Won  \
0       2020-21   mvp           Nikola Jokić         91.0       971.0   
1       2020-21   mvp            Joel Embiid          1.0       586.0   
2       2020-21   mvp          Stephen Curry          5.0       453.0   
3       2020-21   mvp  Giannis Antetokounmpo          1.0       348.0   
4       2020-21   mvp             Chris Paul          2.0       139.0   
..          ...   ...                    ...          ...         ...   
545     1996-97   mvp        Charles Barkley          0.0         2.0   
546     1996-97   mvp          Tom Gugliotta          0.0         1.0   
547     1996-97   mvp          Allen Iverson          0.0         1.0   
548     1996-97   mvp          Kevin Johnson          0.0         1.0   
549     1996-97   mvp            Steve Smith          0.0         1.0   

     Max Points  MVP_SHARES  
0          1010       0.961  
1          1010       0.580  
2          1010       0.449  
3  

#### Add Award Shares Data

In [177]:
assert("MVP_SHARES" not in train[0][0]), "MVP_SHARES already added to df"
mvp = mvp_vote_data.loc[:,["PLAYER_NAME", "GROUP_VALUE", "MVP_SHARES"]]

for i in range(len(train)):
    for j in range(len(train[i])):
        train[i][j] = train[i][j].merge(mvp, how='left', on=["PLAYER_NAME", "GROUP_VALUE"])
        train[i][j].fillna(value={"MVP_SHARES": 0.}, inplace=True)

for i in range(len(val)):
    for j in range(len(val[i])):
        val[i][j] = val[i][j].merge(mvp, how='left', on=["PLAYER_NAME", "GROUP_VALUE"])
        val[i][j].fillna(value={"MVP_SHARES": 0.}, inplace=True)
                
for i in range(len(test)):
    for j in range(len(test[i])):
        test[i][j] = test[i][j].merge(mvp, how='left', on=["PLAYER_NAME", "GROUP_VALUE"])
        test[i][j].fillna(value={"MVP_SHARES": 0.}, inplace=True)

In [178]:
train[0][0].loc[train[0][0]["MVP_SHARES"] > 0.01]

Unnamed: 0,PLAYER_NAME,GROUP_VALUE,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,...,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,DD2,TD3,MVP_SHARES
50,Kevin Durant,2018-19,78,54,24,0.692,34.6,9.2,17.7,0.521,...,0.7,1.1,0.5,2.0,4.9,26.0,8.4,16,2,0.025
52,Stephen Curry,2018-19,69,52,17,0.754,33.8,9.2,19.4,0.472,...,1.3,0.4,0.5,2.4,3.6,27.3,10.0,3,0,0.173
57,James Harden,2018-19,78,51,27,0.654,36.8,10.8,24.5,0.442,...,2.0,0.7,1.4,3.1,7.2,36.1,4.6,34,7,0.768
87,Giannis Antetokounmpo,2018-19,72,56,16,0.778,32.8,10.0,17.3,0.578,...,1.3,1.5,1.4,3.2,7.7,27.7,9.1,54,5,0.932
107,Paul George,2018-19,77,46,31,0.597,36.9,9.2,21.0,0.438,...,2.2,0.4,1.0,2.8,5.5,28.0,6.4,24,1,0.352
119,Joel Embiid,2018-19,64,43,21,0.672,33.7,9.1,18.7,0.484,...,0.7,1.9,1.2,3.3,7.7,27.5,5.8,58,2,0.049
128,Damian Lillard,2018-19,80,51,29,0.638,35.5,8.5,19.2,0.444,...,1.1,0.4,1.1,1.9,4.9,25.8,6.2,13,0,0.068
146,Kawhi Leonard,2018-19,60,41,19,0.683,34.0,9.3,18.8,0.496,...,1.8,0.4,0.8,1.5,6.0,26.6,5.9,14,0,0.013


#### Process Data

In [179]:
train_processed = []
for fold in train:
    fold_processed = []
    for season in fold:
        if "PLAYER_NAME" in season:
            season = season.drop(columns = ["PLAYER_NAME", "GROUP_VALUE"])
            season.loc[:, season.columns != "MVP_SHARES"] =\
                season.loc[:, season.columns != "MVP_SHARES"]\
                .apply(lambda x: (x - x.min()) / (x.max() - x.min()))
            fold_processed.append(season)
    train_processed.append(fold_processed)

val_processed = []
for fold in val:
    fold_processed = []
    for season in fold:
        if "PLAYER_NAME" in season:
            season = season.drop(columns = ["PLAYER_NAME", "GROUP_VALUE"])
            season.loc[:, season.columns != "MVP_SHARES"] =\
                season.loc[:, season.columns != "MVP_SHARES"]\
                .apply(lambda x: (x - x.min()) / (x.max() - x.min()))
            fold_processed.append(season)
    val_processed.append(fold_processed)


test_processed = []
for fold in test:
    fold_processed = []
    for season in fold:
        if "PLAYER_NAME" in season:
            season = season.drop(columns = ["PLAYER_NAME", "GROUP_VALUE"])
            season.loc[:, season.columns != "MVP_SHARES"] =\
                season.loc[:, season.columns != "MVP_SHARES"]\
                .apply(lambda x: (x - x.min()) / (x.max() - x.min()))
            fold_processed.append(season)
    test_processed.append(fold_processed)

In [187]:
train_processed[0][0].loc[train_processed[0][0]["MVP_SHARES"] > 0.01]

Unnamed: 0,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,...,STL,BLK,BLKA,PF,PFD,PTS,PLUS_MINUS,DD2,TD3,MVP_SHARES
50,0.92,0.88,0.25,0.859935,0.883249,0.807229,0.643979,0.505017,0.352941,0.378788,...,0.285714,0.407407,0.266667,0.333333,0.6,0.649306,0.913514,0.231884,0.058824,0.025
52,0.74,0.84,0.115385,0.960912,0.84264,0.807229,0.732984,0.341137,1.0,0.886364,...,0.571429,0.148148,0.266667,0.481481,0.414286,0.694444,1.0,0.043478,0.0,0.173
57,0.92,0.82,0.307692,0.798046,0.994924,1.0,1.0,0.240803,0.941176,1.0,...,0.904762,0.259259,0.866667,0.740741,0.928571,1.0,0.708108,0.492754,0.205882,0.768
87,0.8,0.92,0.096154,1.0,0.791878,0.903614,0.623037,0.695652,0.137255,0.212121,...,0.571429,0.555556,0.866667,0.777778,1.0,0.708333,0.951351,0.782609,0.147059,0.932
107,0.9,0.72,0.384615,0.705212,1.0,0.807229,0.816754,0.227425,0.745098,0.742424,...,1.0,0.148148,0.6,0.62963,0.685714,0.71875,0.805405,0.347826,0.029412,0.352
119,0.64,0.66,0.192308,0.827362,0.837563,0.795181,0.696335,0.381271,0.235294,0.310606,...,0.285714,0.703704,0.733333,0.814815,1.0,0.701389,0.772973,0.84058,0.058824,0.049
128,0.96,0.82,0.346154,0.771987,0.928934,0.722892,0.722513,0.247492,0.588235,0.606061,...,0.47619,0.148148,0.666667,0.296296,0.6,0.642361,0.794595,0.188406,0.0,0.068
146,0.56,0.62,0.153846,0.845277,0.852792,0.819277,0.701571,0.421405,0.372549,0.378788,...,0.809524,0.148148,0.466667,0.148148,0.757143,0.670139,0.778378,0.202899,0.0,0.013
