<a href="https://colab.research.google.com/github/TonyLiu836/NBA-MVP-Predictor/blob/NN/NBA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install nba_api
!pip install requests

In [None]:
import pandas as pd
import requests

# Data Collection

### Get player stats by season

In [None]:
from nba_api.stats.endpoints import teamplayerdashboard
from nba_api.stats.endpoints import leaguedashteamstats
from nba_api.stats.endpoints import playerdashboardbyyearoveryear
import random
import numpy as np

# Get list of teams that played in a season
def get_teams(start_year):
    """Return the TEAM_ID column of the league team-stats table for one season.

    start_year is the calendar year the season starts in; it is turned into
    a season label such as '2020-21' before querying the endpoint.
    """
    season_label = f'{start_year}-{(start_year + 1) % 100:02}'
    response = leaguedashteamstats.LeagueDashTeamStats(season=season_label)
    team_table = response.get_data_frames()[0]
    return team_table.loc[:, 'TEAM_ID']

# Get list of players that played > 1000 min and scored > 600 points for a team in a season.
# 1000 min and 600 points total is ~12 mpg and 7 ppg so this filters out players that don't get much playtime.
def get_players(team_id, start_year):
    """Return PLAYER_ID / PLAYER_NAME for a team's players above the cut-offs.

    A player is kept only when both MIN > 1000 and PTS > 600 hold in the
    second data frame returned by the team player dashboard for that season.
    """
    season_label = f'{start_year}-{(start_year + 1) % 100:02}'
    dashboard = teamplayerdashboard.TeamPlayerDashboard(team_id, season=season_label)
    roster = dashboard.get_data_frames()[1]
    enough_minutes = roster['MIN'] > 1000
    enough_points = roster['PTS'] > 600
    qualified = roster[enough_minutes & enough_points]
    return qualified.loc[:, ['PLAYER_ID', 'PLAYER_NAME']]

# Get player stats for a season
def get_player_stats(player_id, start_year):
    """Return a player's per-game stat rows for a single season.

    The year-over-year dashboard is requested in 'PerGame' mode, filtered to
    the rows whose GROUP_VALUE equals the season label (e.g. '2020-21'), and
    returned without the bookkeeping columns listed in ``unused_cols``.
    """
    season_label = f'{start_year}-{(start_year + 1) % 100:02}'
    dashboard = playerdashboardbyyearoveryear.PlayerDashboardByYearOverYear(
        player_id, per_mode_detailed='PerGame'
    )
    by_year = dashboard.get_data_frames()[1]
    in_season = by_year['GROUP_VALUE'] == season_label
    unused_cols = ['GROUP_SET', 'TEAM_ID', 'MAX_GAME_DATE', 'CFID', 'CFPARAMS']
    return by_year[in_season].drop(columns=unused_cols)

# Smoke test: fetch the stats of one random qualifying player from a random
# team of the 2020-21 season.
teams_2020 = get_teams(2020)
# Pick indices from the actual sizes of the returned tables. The previous
# fixed bounds (0-29 teams, 0-5 players) raised IndexError whenever a team had
# fewer than six players passing the minutes/points filter.
rand_team = get_players(teams_2020.iat[random.randrange(len(teams_2020))], 2020)
print(rand_team.shape)
# randrange raises ValueError if the chosen team has no qualifying player.
rand_player = rand_team.iloc[random.randrange(len(rand_team)), :]
print(rand_player)
stats = get_player_stats(rand_player.iat[0], 2020)
print(stats)
print(stats.columns)

In [None]:
import pandas as pd
import time
from nba_api.stats.endpoints import teamplayerdashboard
from nba_api.stats.endpoints import leaguedashteamstats

# Iterate through each season and save data to csv
for start_year in range(2012, 2022):
    # Season label such as '2012-13'; it is also the name of the CSV written.
    end_year = (start_year + 1) % 100
    season_label = f'{start_year}-{end_year:02}'
    print(season_label)

    data = []
    leagueteams = get_teams(start_year)
    for team_id in leagueteams:
        players = get_players(team_id, start_year)
        # Each itertuples() row is (index, PLAYER_ID, PLAYER_NAME).
        for player in players.itertuples():
            print(player)
            time.sleep(1)  # pause before every per-player stats request
            player_id, player_name = player[1], player[2]
            stats = get_player_stats(player_id, start_year)
            stats.insert(0, "PLAYER_NAME", player_name)
            data.append(stats)

    # One combined frame per season, written with its index column.
    big_data = pd.concat(data)
    big_data.to_csv(f'{season_label}.csv')

### Get MVP List, MVP Votes, ROY Votes

In [None]:
from bs4 import BeautifulSoup

URL = "https://www.nba.com/news/history-mvp-award-winners"
# A timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() prevents an HTTP error page from being parsed as if
# it were the article.
page = requests.get(URL, timeout=30)
page.raise_for_status()

#webscrape NBA.com's mvp page for MVP names and corresponding years
def get_MVP_List():
    """Turn the downloaded MVP article into a list of [season, name, team].

    Reads the module-level ``page`` response, collects every <p> element
    found inside the article containers, skips the first two, and splits the
    remaining paragraphs on whitespace.
    """
    soup = BeautifulSoup(page.content, "html.parser")
    root = soup.find(id = "__next")
    articles = root.find_all("div", class_="Article_article__2Ue3h")

    # Serialised markup of every paragraph, in document order.
    paragraphs = [str(p) for article in articles for p in article.find_all("p")]

    mvp_list = []
    for markup in paragraphs[2:]:
        # Cut the leading "<p>" (3 chars) and the trailing "</p>" (4 chars).
        tokens = markup[3:-4].split()
        # Token 0 is stored as the season, tokens 2-3 as the name and tokens
        # 4-5 as the team; token 1 is ignored.
        # NOTE(review): only two tokens are kept for the team, so a team name
        # of three words would presumably be truncated - confirm against the
        # page format.
        mvp_list.append([
            tokens[0],
            tokens[2] + " " + tokens[3],
            tokens[4] + " " + tokens[5],
        ])

    return mvp_list

import os

mvp_data = get_MVP_List()

mvp_dataframe = pd.DataFrame(mvp_data, columns = ["season", "Name", "Team"])
# to_csv does not create missing directories, so make sure "data/" exists
# before writing into it.
os.makedirs("data", exist_ok=True)
mvp_dataframe.to_csv("data/MVP_List.csv", index = False)



In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import time, requests

# CSVs live under /home/data/data on Colab and under ./data elsewhere.
data_dir = "data" if 'google.colab' not in str(get_ipython()) else "/home/data/data"
mvp_list = pd.read_csv(f'{data_dir}/MVP_List.csv')
# Season labels are compared as strings; keep "1996-97" and everything after.
samples = mvp_list[mvp_list["season"] >= "1996-97"]
# Keep only the first four characters (the starting year) of each label.
seasons = [label[:4] for label in samples["season"]]

def getAwardVotes(seasons):
    """Scrape the MVP and ROY voting tables from basketball-reference.com.

    Args:
        seasons: iterable of season starting years as strings (e.g. "1996").
            The awards page of the following year is requested for each.

    Returns:
        A list of rows ``[season, award, player_name, first_votes, pts_won,
        pts_max, award_share]``. ``season`` is formatted like "1996-97",
        ``award`` is "mvp" or "roy", and the remaining values are the cell
        text exactly as scraped.

    Raises:
        requests.HTTPError: if an awards page answers with an error status.
        ValueError: if a page contains no table (or table body) for an award.
    """
    award_names = ["mvp", "roy"]
    # data-stat attributes of the cells copied from each row, in output order.
    stat_fields = ["votes_first", "points_won", "points_max", "award_share"]
    table_data = []
    for season in seasons:
        year = int(season) + 1
        season_label = season + "-" + str(year)[-2:]
        print(year)
        time.sleep(1)
        URL = "https://www.basketball-reference.com/awards/awards_" + str(year) + ".html"
        # Fail fast on stalled connections and on HTTP error responses rather
        # than parsing an error page and failing later on a missing table.
        page = requests.get(URL, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.content, "html.parser")

        for award in award_names:
            print(award)
            table = soup.find("table", id = award)
            body = table.find("tbody") if table is not None else None
            if body is None:
                raise ValueError(f"no '{award}' voting table found at {URL}")

            for row in body.find_all("tr"):
                player_cell = row.find("td", {"data-stat":"player"})
                link = player_cell.find("a") if player_cell is not None else None
                if link is None:
                    # Row without a linked player cell: nothing to record.
                    continue
                values = [row.find("td", {"data-stat": field}).get_text() for field in stat_fields]
                table_data.append([season_label, award, link.get_text(), *values])
    return table_data

awards_data = getAwardVotes(seasons)
awards_dataframe = pd.DataFrame(awards_data, columns = ["season","Award", "Name", "First Votes", "Points Won", "Max Points", "Award Shares"])
# Write into data_dir (the folder MVP_List.csv was read from above) instead of
# a hard-coded relative "data/" folder, so the file ends up where the loading
# code reads f"{data_dir}/Awards_Voting_Data.csv" from. Outside Colab
# data_dir is "data", so the path is unchanged there.
awards_dataframe.to_csv(f"{data_dir}/Awards_Voting_Data.csv", index=False)

FileNotFoundError: ignored

In [2]:
from google.colab import drive
# Mount Google Drive at the relative path "drive", then switch to the project
# folder inside it.
drive.mount('drive')
%cd /content/drive/MyDrive/NBA_MLProject
!pwd
# Extract the archived CSVs under /home/data; the archive holds a data/
# folder, so the files end up in /home/data/data.
!unzip data.zip -d /home/data


Mounted at drive
/content/drive/MyDrive/NBA_MLProject
/content/drive/MyDrive/NBA_MLProject
Archive:  data.zip
   creating: /home/data/data/
  inflating: /home/data/data/1996-97.csv  
  inflating: /home/data/data/1997-98.csv  
  inflating: /home/data/data/1998-99.csv  
  inflating: /home/data/data/1999-00.csv  
  inflating: /home/data/data/2000-01.csv  
  inflating: /home/data/data/2001-02.csv  
  inflating: /home/data/data/2002-03.csv  
  inflating: /home/data/data/2003-04.csv  
  inflating: /home/data/data/2004-05.csv  
  inflating: /home/data/data/2005-06.csv  
  inflating: /home/data/data/2006-07.csv  
  inflating: /home/data/data/2007-08.csv  
  inflating: /home/data/data/2008-09.csv  
  inflating: /home/data/data/2009-10.csv  
  inflating: /home/data/data/2010-11.csv  
  inflating: /home/data/data/2011-12.csv  
  inflating: /home/data/data/2012-13.csv  
  inflating: /home/data/data/2013-14.csv  
  inflating: /home/data/data/2014-15.csv  
  inflating: /home/data/data/2015-16.csv  


In [6]:
# Copy the scraped voting data from the current directory to /home/data.
# NOTE(review): the loading cell reads f"{data_dir}/Awards_Voting_Data.csv"
# with data_dir = "/home/data/data" on Colab, which is one level below this
# destination - confirm the target folder is the intended one.
!cp Awards_Voting_Data.csv -d /home/data

# Data Preprocessing

#### Split years using K-fold 

In [112]:
import pandas as pd
from sklearn.model_selection import KFold

# Number of cross-validation folds the training seasons are split into.
num_folds = 5
# Read CSVs from /home/data/data when running under Colab, otherwise from ./data.
data_dir = "/home/data/data" if 'google.colab' in str(get_ipython()) else "data"
def splitDataYears(random_state=None):
    '''
    Splits the available seasons (1996-97 onward, taken from MVP_List.csv)
    into a held-out test set and K-fold train/validation splits, with the
    seasons in random order.

    Inputs:
        random_state = optional int seed forwarded to both the 80/20 sampling
            and the KFold shuffle so a split can be reproduced. The default
            (None) keeps the previous behaviour: a new random split per call.
    Outputs: (trainingFolds, testData)
        trainingFolds = [ [[training seasons], [validation seasons]] * num_folds ]
        testData = [testing seasons]
    '''
    mvp_list = pd.read_csv(f'{data_dir}/MVP_List.csv')
    # Season labels are compared as strings.
    years = mvp_list.loc[mvp_list["season"] >= "1996-97", "season"]
    # 80/20 split between training and testing seasons.
    trainingSamples = years.sample(frac=0.8, random_state=random_state)
    testSamples = years.drop(trainingSamples.index)
    # The training seasons are then divided into num_folds folds.
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)

    # kf.split yields positional indices into trainingSamples.
    trainingFolds = [
        [trainingSamples.iloc[train_idx].tolist(), trainingSamples.iloc[valid_idx].tolist()]
        for train_idx, valid_idx in kf.split(trainingSamples)
    ]
    return trainingFolds, testSamples.tolist()

# Draw the season split once; trainingSet[k] is [training seasons, validation
# seasons] for fold k and testingSet is the list of held-out seasons.
trainingSet, testingSet = splitDataYears()
  

### Load data from csv

In [113]:
import pandas as pd

data_dir = "/home/data/data" if 'google.colab' in str(get_ipython()) else "data"

# Columns removed from every season file right after it is read.
drop_cols = [
    'Unnamed: 0', 'TEAM_ABBREVIATION', 'PFD', 'NBA_FANTASY_PTS',
    'GP_RANK', 'W_RANK', 'L_RANK', 'W_PCT_RANK', 'MIN_RANK',
    'FGM_RANK', 'FGA_RANK', 'FG_PCT_RANK',
    'FG3M_RANK', 'FG3A_RANK', 'FG3_PCT_RANK',
    'FTM_RANK', 'FTA_RANK', 'FT_PCT_RANK',
    'OREB_RANK', 'DREB_RANK', 'REB_RANK',
    'AST_RANK', 'TOV_RANK', 'STL_RANK', 'BLK_RANK', 'BLKA_RANK',
    'PF_RANK', 'PFD_RANK', 'PTS_RANK', 'PLUS_MINUS_RANK',
    'NBA_FANTASY_PTS_RANK', 'DD2_RANK', 'TD3_RANK',
]

# Voting-file column names mapped onto the names used by the player stat files.
mvp_rename_cols = {"Name": "PLAYER_NAME", "season": "GROUP_VALUE", "Award Shares": "MVP_SHARES"}
roy_rename_cols = {"Name": "PLAYER_NAME", "season": "GROUP_VALUE", "Award Shares": "ROY_SHARES"}

vote_data = pd.read_csv(f"{data_dir}/Awards_Voting_Data.csv")

# One frame per award, renamed so it can be merged with the player data.
mvp_vote_data = vote_data[vote_data['Award'] == "mvp"].rename(columns=mvp_rename_cols)
roy_vote_data = vote_data[vote_data["Award"] == "roy"].rename(columns=roy_rename_cols)


def _load_season(season):
    """Read one season CSV, drop the unused columns, keep one row per player.

    drop_duplicates keeps the first row for each PLAYER_NAME; this is meant
    to keep the season-total row of a player listed for several teams
    (assumes that row comes first in the file - TODO confirm).
    """
    frame = pd.read_csv(f"{data_dir}/{season}.csv")
    frame = frame.drop(columns=drop_cols)
    return frame.drop_duplicates(subset=['PLAYER_NAME'])


# train[k] / val[k]: list of season frames for fold k; test: list of held-out seasons.
train = [[_load_season(season) for season in trainingSet[k][0]] for k in range(num_folds)]

val = [[_load_season(season) for season in trainingSet[k][1]] for k in range(num_folds)]

test = [_load_season(season) for season in testingSet]

print(train[0][0])

         PLAYER_NAME GROUP_VALUE  GP   W   L  W_PCT   MIN  FGM   FGA  FG_PCT  \
0      Al Harrington     2005-06  76  24  52  0.316  36.6  7.3  16.1   0.452   
1        Joe Johnson     2005-06  82  26  56  0.317  40.7  7.7  17.0   0.453   
2     Josh Childress     2005-06  74  24  50  0.324  30.4  3.8   6.8   0.552   
3         Josh Smith     2005-06  80  26  54  0.325  32.0  4.1   9.7   0.425   
4    Marvin Williams     2005-06  79  25  54  0.316  24.7  3.0   6.7   0.443   
..               ...         ...  ..  ..  ..    ...   ...  ...   ...     ...   
157      Mehmet Okur     2005-06  82  41  41  0.500  35.9  6.3  13.8   0.460   
158   Antawn Jamison     2005-06  82  42  40  0.512  40.1  8.0  18.2   0.442   
159  Antonio Daniels     2005-06  80  42  38  0.525  28.5  2.9   6.9   0.418   
160     Caron Butler     2005-06  75  40  35  0.533  36.1  6.6  14.5   0.455   
161   Gilbert Arenas     2005-06  80  40  40  0.500  42.3  9.3  20.9   0.447   

     ...  AST  TOV  STL  BLK  BLKA   PF

In [114]:
# check for NaNs
# Print every season frame that contains at least one missing value, tagged
# with the split it belongs to. The flat test list is wrapped as a single fold
# so all three splits share one loop.
for tag, folds in (("train:", train), ("val:", val), ("test:", [test])):
    for fold in folds:
        for season in fold:
            has_missing = season.isnull().values.any()
            if has_missing:
                print(tag, season)

#### Add award shares data

In [115]:
assert("MVP_SHARES" not in train[0][0]), "MVP_SHARES already added to df"
mvp = mvp_vote_data[["PLAYER_NAME", "GROUP_VALUE", "MVP_SHARES"]]


def _with_mvp_shares(season_df):
    """Left-join the MVP vote shares onto one season frame.

    Players without a matching (PLAYER_NAME, GROUP_VALUE) row in the voting
    data get NaN from the join, which is then replaced by 0.
    """
    merged = season_df.merge(mvp, how='left', on=["PLAYER_NAME", "GROUP_VALUE"])
    merged.fillna(value={"MVP_SHARES": 0.}, inplace=True)
    return merged


# Replace each season frame in place inside the existing train/val/test lists.
for fold in train:
    for j, season_df in enumerate(fold):
        fold[j] = _with_mvp_shares(season_df)

for fold in val:
    for j, season_df in enumerate(fold):
        fold[j] = _with_mvp_shares(season_df)

for i, season_df in enumerate(test):
    test[i] = _with_mvp_shares(season_df)

In [116]:
# Display the rows of the first training season whose MVP share is above 0.001.
train[0][0].loc[train[0][0]["MVP_SHARES"] > 0.001]

Unnamed: 0,PLAYER_NAME,GROUP_VALUE,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,...,TOV,STL,BLK,BLKA,PF,PTS,PLUS_MINUS,DD2,TD3,MVP_SHARES
22,LeBron James,2005-06,79,47,32,0.595,42.5,11.1,23.1,0.48,...,3.3,1.6,0.8,0.8,2.3,31.4,3.5,21,5,0.55
24,Dirk Nowitzki,2005-06,81,60,21,0.741,38.1,9.3,19.3,0.48,...,1.9,0.7,1.0,0.9,2.0,26.6,6.3,35,0,0.435
35,Chauncey Billups,2005-06,81,64,17,0.79,36.1,5.2,12.5,0.418,...,2.1,0.9,0.1,0.7,2.0,18.5,7.3,27,0,0.344
57,Elton Brand,2005-06,79,44,35,0.557,39.2,9.6,18.2,0.527,...,2.2,1.0,2.5,0.8,2.9,24.7,2.3,45,0,0.04
61,Kobe Bryant,2005-06,80,45,35,0.563,41.0,12.2,27.2,0.45,...,3.1,1.8,0.4,1.0,2.9,35.4,4.0,4,0,0.386
70,Dwyane Wade,2005-06,75,48,27,0.64,38.6,9.3,18.8,0.495,...,3.6,1.9,0.8,0.9,2.9,27.2,6.5,16,2,0.07
117,Steve Nash,2005-06,79,54,25,0.684,35.4,6.8,13.4,0.512,...,3.5,0.8,0.2,0.4,1.5,18.8,6.0,43,1,0.739
131,Tim Duncan,2005-06,80,61,19,0.763,34.8,7.2,14.8,0.484,...,2.5,0.9,2.0,0.8,2.7,18.6,6.5,52,0,0.026
132,Tony Parker,2005-06,80,61,19,0.763,33.9,7.8,14.2,0.548,...,3.1,1.0,0.1,1.0,2.0,18.9,6.4,8,0,0.007


#### Process Data

In [117]:
def _normalize_season(season):
    """Return a new frame for one season with the features min-max scaled.

    PLAYER_NAME and GROUP_VALUE are dropped when present. Every remaining
    column except MVP_SHARES is rescaled to [0, 1] using that season's own
    minimum and maximum; MVP_SHARES is left untouched. The input frame is
    never modified.
    """
    id_cols = [c for c in ("PLAYER_NAME", "GROUP_VALUE") if c in season.columns]
    scaled = season.drop(columns=id_cols)
    feature_cols = scaled.columns[scaled.columns != "MVP_SHARES"]
    features = scaled[feature_cols]
    lowest = features.min()
    spread = features.max() - lowest
    # A column holding a single distinct value has zero spread; dividing by
    # it would turn the whole column into NaN (0/0). Dividing by 1 instead
    # maps such a column to 0.
    spread = spread.replace(0, 1)
    # Assigning whole columns (rather than writing through .loc) lets integer
    # columns become float without an in-place dtype conflict.
    scaled[feature_cols] = (features - lowest) / spread
    return scaled


# Same nesting as train / val / test: lists of per-season frames per fold.
train_processed = [[_normalize_season(season) for season in fold] for fold in train]

val_processed = [[_normalize_season(season) for season in fold] for fold in val]

test_processed = [_normalize_season(season) for season in test]

In [118]:
# Display the normalized rows of the first training season whose MVP share is above 0.001.
train_processed[0][0].loc[train_processed[0][0]["MVP_SHARES"] > 0.001]

Unnamed: 0,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,FG3M,FG3A,...,TOV,STL,BLK,BLKA,PF,PTS,PLUS_MINUS,DD2,TD3,MVP_SHARES
22,0.914286,0.638298,0.409091,0.627273,0.976834,0.885417,0.803828,0.457014,0.470588,0.571429,...,0.875,0.608696,0.242424,0.461538,0.392857,0.856631,0.683453,0.33871,0.625,0.55
24,0.971429,0.914894,0.159091,0.892727,0.80695,0.697917,0.62201,0.457014,0.411765,0.392857,...,0.4375,0.217391,0.30303,0.538462,0.285714,0.684588,0.884892,0.564516,0.0,0.435
35,0.971429,1.0,0.068182,0.981818,0.72973,0.270833,0.296651,0.176471,0.676471,0.619048,...,0.5,0.304348,0.030303,0.384615,0.285714,0.394265,0.956835,0.435484,0.0,0.344
57,0.914286,0.574468,0.477273,0.558182,0.849421,0.729167,0.569378,0.669683,0.0,0.0,...,0.53125,0.347826,0.757576,0.461538,0.607143,0.616487,0.597122,0.725806,0.0,0.04
61,0.942857,0.595745,0.477273,0.569091,0.918919,1.0,1.0,0.321267,0.676471,0.77381,...,0.8125,0.695652,0.121212,0.615385,0.607143,1.0,0.719424,0.064516,0.0,0.386
70,0.8,0.659574,0.295455,0.709091,0.826255,0.697917,0.598086,0.524887,0.058824,0.119048,...,0.96875,0.73913,0.242424,0.538462,0.607143,0.706093,0.899281,0.258065,0.25,0.07
117,0.914286,0.787234,0.25,0.789091,0.702703,0.4375,0.339713,0.60181,0.558824,0.511905,...,0.9375,0.26087,0.060606,0.153846,0.107143,0.405018,0.863309,0.693548,0.125,0.739
131,0.942857,0.93617,0.113636,0.932727,0.679537,0.479167,0.406699,0.475113,0.0,0.011905,...,0.625,0.304348,0.606061,0.461538,0.535714,0.397849,0.899281,0.83871,0.0,0.026
132,0.942857,0.93617,0.113636,0.932727,0.644788,0.541667,0.37799,0.764706,0.029412,0.059524,...,0.8125,0.347826,0.030303,0.615385,0.285714,0.408602,0.892086,0.129032,0.0,0.007


In [119]:
# Print any normalized training season that still contains a missing value.
for season in (frame for fold in train_processed for frame in fold):
    if season.isna().to_numpy().any():
        print(season)

# Model Definition and Training

In [126]:
import torch 
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd


class MVPDataset(Dataset):
    def __init__(self, samples):
        all_samples = pd.concat(samples)
        all_votes = all_samples.loc[all_samples['MVP_SHARES'] > 0.001]
        self.samples = all_votes.loc[:, :'MVP_SHARES']
        self.labels = all_votes.loc[:,'MVP_SHARES']
        self.n_samples = len(self.samples)

    def __len__(self):
        return self.n_samples

    def __getitem__(self, index):
        data = torch.tensor(self.samples.iloc[index].values, dtype=torch.float)
        label = torch.tensor(self.labels.iloc[index], dtype=torch.float)
        return data, label


train_datasets = []
for i in range(num_folds):
    trainingDataset = MVPDataset(train_processed[i])
    train_datasets.append(trainingDataset)
    
val_datasets = []
for i in range(num_folds):
    validationDataset = MVPDataset(val_processed[i])
    val_datasets.append(validationDataset)


class NeuralNetwork(nn.Module):
    def __init__(self, inputSize):
        super(NeuralNetwork, self).__init__()
        self.linear1 = nn.Linear(inputSize, 32)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(32, 64)
        self.linear3 = nn.Linear(64,1)
        

    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.linear3(x)
        return x

In [137]:
loss_hist = []
num_epochs = 100

model = NeuralNetwork(28)
optimizer = optim.Adam(model.parameters(), lr=0.0001)
lossfn = nn.MSELoss()

# Train on each fold's training set in turn, num_epochs epochs per fold.
# NOTE(review): the same model and optimizer carry over from one fold to the
# next, so every fold continues from the previous fold's weights - confirm
# this is intended rather than a fresh model per fold.
for i in range(num_folds):
    print(f"\n------------------ Fold {i} ------------------")
    train_dl = DataLoader(train_datasets[i], batch_size=20, shuffle=True)
    for epoch in range(num_epochs):
        epoch_loss = []
        for features, targets in train_dl:
            # Abort on the first batch that contains a NaN input.
            if torch.isnan(features).any().item():
                print(features)
                raise Exception
            predictions = model(features).flatten()
            loss = lossfn(predictions, targets)
            epoch_loss.append(loss.item())
            print(".", end="")  # one dot per batch
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        loss_hist.extend(epoch_loss)
        mean = np.mean(epoch_loss)
        e_max = max(epoch_loss)
        # Every 20th epoch prints the batch-loss summary; other epochs only
        # end the line of dots.
        summary = f"\nAvg: {mean} \tMax: {e_max}" if epoch % 20 == 19 else ""
        print(summary)



------------------ Fold 0 ------------------
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
Avg: 0.07745991423726081 	Max: 0.1686161309480667
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
Avg: 0.05078512597829103 	Max: 0.09772029519081116
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
Avg: 0.02793507035821676 	Max: 0.05962264537811279
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........
..........