<a href="https://colab.research.google.com/github/TonyLiu836/NBA-MVP-Predictor/blob/main/NBA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nba_api
!pip install requests

In [None]:
import pandas as pd
import requests

## Data Collection

#### Get player stats by season

In [None]:
from nba_api.stats.endpoints import teamplayerdashboard
from nba_api.stats.endpoints import leaguedashteamstats
from nba_api.stats.endpoints import playerdashboardbyyearoveryear
import random
import numpy as np

# Look up every team listed in the league-wide team stats of one season.
def get_teams(start_year):
    """Return the TEAM_ID column for the season that starts in ``start_year``.

    The season label is built as 'YYYY-YY' (2020 -> '2020-21') and handed to the
    LeagueDashTeamStats endpoint; the first data frame of the response is used.
    """
    season_label = f'{start_year}-{(start_year + 1) % 100:02}'
    response = leaguedashteamstats.LeagueDashTeamStats(season=season_label)
    team_table = response.get_data_frames()[0]
    return team_table['TEAM_ID']

# Get list of players that played > min_minutes and scored > min_points for a team in a season.
# The defaults (1000 min and 600 points total) are ~12 mpg and 7 ppg, which filters out players that don't get much playtime.
def get_players(team_id, start_year, min_minutes=1000, min_points=600):
    """Return PLAYER_ID / PLAYER_NAME of a team's regular contributors.

    Args:
        team_id: team identifier passed to the TeamPlayerDashboard endpoint.
        start_year: first calendar year of the season (2020 -> '2020-21').
        min_minutes: a player must have strictly more than this in 'MIN'.
        min_points: a player must have strictly more than this in 'PTS'.

    Returns:
        DataFrame with the columns PLAYER_ID and PLAYER_NAME for every player
        exceeding both thresholds.

    NOTE(review): the thresholds assume the dashboard reports season totals
    rather than per-game values - confirm against the endpoint's default mode.
    """
    season_label = f'{start_year}-{(start_year + 1) % 100:02}'
    teamplayers = teamplayerdashboard.TeamPlayerDashboard(team_id, season=season_label)
    # The second data frame of the response is the one holding the per-player rows.
    players = teamplayers.get_data_frames()[1]
    qualifies = (players['MIN'] > min_minutes) & (players['PTS'] > min_points)
    return players.loc[qualifies, ['PLAYER_ID', 'PLAYER_NAME']]

# Fetch one player's per-game stat line(s) for a single season.
def get_player_stats(player_id, start_year):
    """Return the player's by-year dashboard rows whose GROUP_VALUE is the requested season.

    The year-over-year dashboard is requested in 'PerGame' mode, its second data
    frame is filtered to the 'YYYY-YY' season label, and the bookkeeping columns
    listed in ``unwanted`` are removed.
    """
    season_label = f'{start_year}-{(start_year + 1) % 100:02}'
    dashboard = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(
        player_id, per_mode_detailed='PerGame')
    by_year = dashboard.get_data_frames()[1]
    unwanted = ['GROUP_SET', 'TEAM_ID', 'MAX_GAME_DATE', 'CFID', 'CFPARAMS']
    in_season = by_year['GROUP_VALUE'] == season_label
    return by_year[in_season].drop(columns=unwanted)

# Smoke test: fetch the stats of one randomly chosen qualifying player from a
# randomly chosen team of the 2020-21 season.
teams_2020 = get_teams(2020)
# Draw the indices from the actual lengths instead of fixed bounds: a team can
# have fewer than six players passing the get_players() filter, and a fixed
# upper bound would then raise IndexError. randrange() raises ValueError if the
# selected team has no qualifying player at all.
rand_team = get_players(teams_2020.iat[random.randrange(len(teams_2020))], 2020)
print(rand_team.shape)
rand_player = rand_team.iloc[random.randrange(len(rand_team)), :]
print(rand_player)
# The first field of the selected row is PLAYER_ID.
stats = get_player_stats(rand_player.iat[0], 2020)
print(stats)
print(stats.columns)

In [None]:
import pandas as pd
import time
from nba_api.stats.endpoints import teamplayerdashboard
from nba_api.stats.endpoints import leaguedashteamstats

# For every season, collect the per-game stats of each qualifying player and
# write them to one CSV file named after the season ('YYYY-YY.csv').
for start_year in range(2012, 2022):
    end_year = (start_year + 1) % 100
    season_label = f'{start_year}-{end_year:02}'
    print(season_label)

    data = []
    leagueteams = get_teams(start_year)
    for team_id in leagueteams:
        players = get_players(team_id, start_year)
        for player in players.itertuples():
            print(player)
            # Pause before every player request (presumably to stay under the
            # stats API's rate limit).
            time.sleep(1)
            stats = get_player_stats(player.PLAYER_ID, start_year)
            # Put the player's name in front of the stat columns.
            stats.insert(0, "PLAYER_NAME", player.PLAYER_NAME)
            data.append(stats)

    big_data = pd.concat(data)
    big_data.to_csv(f'{season_label}.csv')

#### Get MVP List, MVP Votes, ROY Votes

In [None]:
from bs4 import BeautifulSoup

# Page listing the MVP of every season.
URL = "https://www.nba.com/news/history-mvp-award-winners"
# Fail fast: the timeout keeps the cell from hanging on a stalled connection and
# raise_for_status() stops an HTTP error page from being handed to the parser.
page = requests.get(URL, timeout=30)
page.raise_for_status()

# Scrape NBA.com's MVP history page for the season, name and team of each MVP.
def get_MVP_List():
    """Return a list of [season, name, team] entries parsed from the global ``page``.

    Every paragraph inside the article container is rendered to its HTML string;
    the first two paragraphs are skipped (presumably introductory text - verify
    against the live page) and each remaining one is split on whitespace.

    NOTE(review): the parsing assumes each line has the shape
    "season  separator  first last  team-word team-word"; names or teams with a
    different number of words will be cut off or misaligned.
    """
    soup = BeautifulSoup(page.content, "html.parser")
    root = soup.find(id="__next")
    # The class name looks build-generated and may change when the site is redeployed.
    paragraphs = [
        str(paragraph)
        for article in root.find_all("div", class_="Article_article__2Ue3h")
        for paragraph in article.find_all("p")
    ]

    entries = []
    for html in paragraphs[2:]:
        # Strip the three leading and four trailing characters, i.e. the bare
        # opening and closing paragraph tags.
        tokens = html[3:-4].split()
        season = tokens[0]
        # tokens[1] is skipped (presumably the separator between season and name).
        name = f"{tokens[2]} {tokens[3]}"
        team = f"{tokens[4]} {tokens[5]}"
        entries.append([season, name, team])

    return entries

# Scrape the MVP list; each entry is [season, name, team].
mvp_data = get_MVP_List()

# Tabulate the entries and persist them as CSV without the index column.
# NOTE(review): this writes MVP_List.csv into the working directory, while the
# later cells read 'data/MVP_List.csv' - confirm the file is moved there.
mvp_dataframe = pd.DataFrame(mvp_data, columns = ["season", "Name", "Team"])
mvp_dataframe.to_csv("MVP_List.csv", index = False)



In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import time, requests
# Load the scraped MVP list. On Colab, read the Drive copy instead
# (e.g. /content/drive/MyDrive/NBA_MLProject/MVP_List.csv).
mvp_list = pd.read_csv('data/MVP_List.csv')
# Keep the seasons from 1996-97 onward; 'YYYY-YY' labels sort chronologically
# when compared as strings.
from_1996 = mvp_list["season"] >= "1996-97"
samples = mvp_list[from_1996]
# Reduce every label to its four-character starting year (still a string).
seasons = [label[:4] for label in samples["season"]]

def getAwardVotes(seasons):
    """Scrape the MVP and ROY voting tables from basketball-reference.com.

    Args:
        seasons: iterable of season starting years as strings
            (e.g. "2020" for the 2020-21 season).

    Returns:
        A list of rows
        [season label, award id, player name, first-place votes, points won,
        max points, award share]; all scraped values are kept as strings.

    Raises:
        requests.HTTPError: if an awards page answers with an error status
            (for example when the site rate-limits the scraper).
        ValueError: if a page does not contain the expected voting table.
    """
    award_names = ["mvp", "roy"]
    # data-stat attributes of the cells copied from every table row, in output order.
    stat_names = ["votes_first", "points_won", "points_max", "award_share"]
    table_data = []
    for season in seasons:
        # The awards page is keyed by the year in which the season ends.
        year = int(season) + 1
        print(year)
        time.sleep(1)   # pause between page requests
        URL = f"https://www.basketball-reference.com/awards/awards_{year}.html"
        # Surface network problems here instead of as an opaque AttributeError
        # while parsing an error page.
        page = requests.get(URL, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")
        season_label = f"{season}-{str(year)[-2:]}"

        for award in award_names:
            print(award)
            table = soup.find("table", id=award)
            if table is None:
                raise ValueError(f"No '{award}' voting table found at {URL}")

            for row in table.find("tbody").find_all("tr"):
                player_name = row.find("td", {"data-stat": "player"}).find("a").get_text()
                votes = [row.find("td", {"data-stat": stat}).get_text() for stat in stat_names]
                table_data.append([season_label, award, player_name, *votes])
    return table_data

# Scrape the voting results for the selected seasons and tabulate them.
awards_data = getAwardVotes(seasons)
awards_dataframe = pd.DataFrame(awards_data, columns = ["season","Award", "Name", "First Votes", "Points Won", "Max Points", "Award Shares"])
# Use a forward slash: the previous "data\A..." literal contained an invalid
# escape sequence and was only a directory separator on Windows; elsewhere it
# produced a file literally named with a backslash. This matches the 'data/'
# directory the MVP list is read from.
awards_dataframe.to_csv("data/Awards_Voting_Data.csv", index=False)

2021
mvp
roy
2020
mvp
roy
2019
mvp
roy
2018
mvp
roy
2017
mvp
roy
2016
mvp
roy
2015
mvp
roy
2014
mvp
roy
2013
mvp
roy
2012
mvp
roy
2011
mvp
roy
2010
mvp
roy
2009
mvp
roy
2008
mvp
roy
2007
mvp
roy
2006
mvp
roy
2005
mvp
roy
2004
mvp
roy
2003
mvp
roy
2002
mvp
roy
2001
mvp
roy
2000
mvp
roy
1999
mvp
roy
1998
mvp
roy
1997
mvp
roy


# Split Years Using K-Fold 

In [None]:
import pandas as pd
from sklearn.model_selection import KFold

# Number of cross-validation folds; read by splitDataYears() and by the cells
# that load the per-fold training/validation season files.
num_folds = 5

def splitDataYears(random_state=None):
    """Split the seasons from 1996-97 onward into cross-validation folds and a test set.

    80% of the seasons are sampled as training seasons and the rest are held
    out for testing. The training seasons are then divided into ``num_folds``
    shuffled folds (4 validation seasons per fold when there are 25 seasons in
    total).

    Args:
        random_state: optional seed forwarded to both the 80/20 sampling and the
            K-fold shuffle so a split can be reproduced. The default ``None``
            keeps the previous behaviour of drawing a new random split per call.

    Returns:
        (trainingFolds, testData) where trainingFolds is a list of
        [training season labels, validation season labels] pairs, one per fold,
        and testData is the list of held-out season labels.
    """
    # On Colab, point this at the Drive copy of MVP_List.csv instead.
    mvp_list = pd.read_csv('data/MVP_List.csv')
    # 'YYYY-YY' labels sort chronologically when compared as strings.
    years = mvp_list.loc[mvp_list["season"] >= "1996-97", "season"]
    trainingSamples = years.sample(frac=0.8, random_state=random_state)    # 80/20 split for training/testing data
    testSamples = years.drop(trainingSamples.index)
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)

    # kf.split yields positional indices, hence .iloc.
    trainingFolds = [
        [trainingSamples.iloc[train_idx].values.tolist(),
         trainingSamples.iloc[valid_idx].values.tolist()]
        for train_idx, valid_idx in kf.split(trainingSamples)
    ]
    return trainingFolds, list(testSamples)

# Draw a random train/validation/test split of the seasons.
trainingSet, testingSet = splitDataYears()
# trainingSet: one [training seasons, validation seasons] pair per fold
#              (num_folds pairs; with 5 folds each pair splits the training
#              seasons roughly 80/20).
# testingSet:  flat list of the held-out season labels, i.e. the seasons that
#              were not sampled into the 80% used for the folds.
  

In [None]:
# Display the training seasons of the first fold.
trainingSet[0][0]

In [None]:
import pandas as pd
# Columns removed from every season frame before normalisation: the CSV index
# column, identifier/label columns, the fantasy score, and the '<STAT>_RANK'
# column of every ranked stat listed below.
_ranked_stats = (
    'GP', 'W', 'L', 'W_PCT', 'MIN',
    'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT',
    'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB', 'REB',
    'AST', 'TOV', 'STL', 'BLK', 'BLKA', 'PF', 'PFD',
    'PTS', 'PLUS_MINUS', 'NBA_FANTASY_PTS', 'DD2', 'TD3',
)
drop_cols = (
    ['Unnamed: 0', 'GROUP_VALUE', 'TEAM_ABBREVIATION', 'NBA_FANTASY_PTS']
    + [f'{stat}_RANK' for stat in _ranked_stats]
)

# Load the scraped award votes and separate them by award.
vote_data = pd.read_csv("/home/data/data/Awards_Voting_Data.csv")

award_column = vote_data["Award"]
mvp_vote_data = vote_data[award_column == "mvp"]
roy_vote_data = vote_data[award_column == "roy"]


# Read one DataFrame per season label from the per-season stat CSVs.
def _read_seasons(season_labels):
    return [pd.read_csv(f"/home/data/data/{label}.csv") for label in season_labels]


# train[k] / val[k]: frames of the training / validation seasons of fold k.
# Column dropping and de-duplication happen later, when the frames are processed.
train = [_read_seasons(trainingSet[k][0]) for k in range(num_folds)]

val = [_read_seasons(trainingSet[k][1]) for k in range(num_folds)]

# test holds a single list with the frames of the held-out seasons.
test = [_read_seasons(testingSet)]


# Give every season frame of every split an "award_shares" column set to 0.
for split in (train, val, test):
    for fold_frames in split:
        for season_frame in fold_frames:
            season_frame["award_shares"] = 0


In [None]:
# Process the training, validation and test frames identically: attach the MVP
# award shares, drop the unused columns and duplicate players, then min-max
# scale every feature column.

def _attach_award_shares(season_df, votes):
    """Write each voted player's award share into ``season_df`` in place.

    Rows are matched on PLAYER_NAME == vote "Name" and GROUP_VALUE == vote
    "season". Frames that no longer carry those columns (already cleaned by an
    earlier run of this cell) are left untouched.
    """
    if "PLAYER_NAME" not in season_df or "GROUP_VALUE" not in season_df:
        return
    # award_shares is initialised with integer 0; make it float up front so
    # that storing fractional shares does not rely on pandas silently
    # upcasting the column during .loc assignment (deprecated behaviour).
    season_df["award_shares"] = season_df["award_shares"].astype(float)
    # Only votes for a season present in this frame can match any row.
    relevant = votes[votes["season"].isin(season_df["GROUP_VALUE"])]
    for name, season, share in zip(relevant["Name"], relevant["season"], relevant["Award Shares"]):
        matches = (season_df["PLAYER_NAME"] == name) & (season_df["GROUP_VALUE"] == season)
        season_df.loc[matches, "award_shares"] = share


def _clean_and_normalize(season_df):
    """Return the frame with unused columns removed and features scaled to [0, 1].

    The "DD2_RANK" / "PLAYER_NAME" presence checks make the step idempotent, so
    re-running the cell does not drop or scale anything twice.
    """
    if "DD2_RANK" in season_df:
        # Keep a single row per player after removing the unused columns.
        season_df = season_df.drop(columns=drop_cols).drop_duplicates(subset=["PLAYER_NAME"])
    if "PLAYER_NAME" not in season_df:
        return season_df
    season_df = season_df.drop(columns=["PLAYER_NAME"])

    # Everything except the target column is treated as a feature.
    feature_cols = [col for col in season_df.columns if col != "award_shares"]
    features = season_df[feature_cols]
    lowest = features.min()
    spread = features.max() - lowest
    # A constant column has zero spread; divide by 1 instead so it scales to 0
    # rather than turning into NaN through 0/0.
    spread = spread.where(spread != 0, 1)
    # Whole-column replacement (not .loc) so integer columns can become float.
    season_df[feature_cols] = (features - lowest) / spread
    return season_df


for split in (train, val, test):
    for fold_frames in split:
        for position, season_df in enumerate(fold_frames):
            _attach_award_shares(season_df, mvp_vote_data)
            fold_frames[position] = _clean_and_normalize(season_df)
        