In [1]:
import pandas as pd
import numpy as np

#### Import NBA statistics from the NBA API (player statistics and team standings)

In [2]:
from nba_api.stats.endpoints import leaguedashplayerstats, leaguestandings

This script uses the RandomForestClassifier from the scikit-learn library to train the model.

In [3]:
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Collect data from NBA API

Get team standings dataset 

In [4]:
# Query the league-standings endpoint for the 2022-23 season and keep the
# first DataFrame it returns.
standings_endpoint = leaguestandings.LeagueStandings(season='2022-23')
standings = standings_endpoint.get_data_frames()[0]

In [5]:
# Show the standings table as this cell's output.
standings

Unnamed: 0,LeagueID,SeasonID,TeamID,TeamCity,TeamName,Conference,ConferenceRecord,PlayoffRank,ClinchIndicator,Division,...,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,PreAS,PostAS
0,0,22022,1610612743,Denver,Nuggets,West,34-18,1,- w,Northwest,...,,,,,,4-3,10-4,9-5,41-18,12-11
1,0,22022,1610612749,Milwaukee,Bucks,East,35-17,1,- e,Central,...,,,,,,6-0,9-5,8-7,41-17,17-7
2,0,22022,1610612738,Boston,Celtics,East,34-18,2,- a,Atlantic,...,,,,,,4-2,14-2,8-6,42-17,15-8
3,0,22022,1610612763,Memphis,Grizzlies,West,30-22,2,- sw,Southwest,...,,,,,,4-3,8-6,10-4,35-22,16-9
4,0,22022,1610612758,Sacramento,Kings,West,32-20,3,- p,Pacific,...,,,,,,2-4,9-5,8-6,32-25,16-9
5,0,22022,1610612755,Philadelphia,76ers,East,34-18,3,- x,Atlantic,...,,,,,,4-4,8-6,9-4,38-19,16-9
6,0,22022,1610612739,Cleveland,Cavaliers,East,34-18,4,- x,Central,...,,,,,,5-1,9-7,9-6,38-23,13-8
7,0,22022,1610612756,Phoenix,Suns,West,30-22,4,- x,Pacific,...,,,,,,5-1,10-5,5-11,32-28,13-9
8,0,22022,1610612752,New York,Knicks,East,32-20,5,- x,Atlantic,...,,,,,,3-3,7-9,9-6,33-27,14-8
9,0,22022,1610612746,LA,Clippers,West,27-25,5,- x,Pacific,...,,,,,,3-4,10-6,8-7,33-28,11-10


Get player stats data

In [6]:
# Request per-game player statistics for the 2022-23 regular season and keep
# the first DataFrame the endpoint returns.
player_stats_endpoint = leaguedashplayerstats.LeagueDashPlayerStats(
    season='2022-23',
    season_type_all_star='Regular Season',
    per_mode_detailed='PerGame',
)
player_stats = player_stats_endpoint.get_data_frames()[0]
# Bare expression so the table is rendered as the cell output.
player_stats

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK
0,1630639,A.J. Lawson,A.J.,1610612742,DAL,22.0,15,5,10,0.333,...,491,358,475,457,429,462,482,253,39,466
1,1631260,AJ Green,AJ,1610612749,MIL,23.0,35,27,8,0.771,...,491,498,454,518,392,309,455,253,39,436
2,1631100,AJ Griffin,AJ,1610612737,ATL,19.0,72,34,38,0.472,...,372,268,394,422,217,169,287,253,39,273
3,203932,Aaron Gordon,Aaron,1610612743,DEN,27.0,68,45,23,0.662,...,65,32,205,58,82,4,77,56,39,89
4,1628988,Aaron Holiday,Aaron,1610612737,ATL,26.0,63,32,31,0.508,...,345,315,377,356,415,212,413,253,39,416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,1628380,Zach Collins,Zach,1610612759,SAS,25.0,63,20,43,0.317,...,63,123,19,113,142,486,130,70,39,131
535,203897,Zach LaVine,Zach,1610612741,CHI,28.0,77,38,39,0.494,...,305,27,167,47,21,216,44,155,39,33
536,1630192,Zeke Nnaji,Zeke,1610612743,DEN,22.0,53,34,19,0.642,...,162,339,181,282,351,391,380,191,39,389
537,1630533,Ziaire Williams,Ziaire,1610612763,MEM,21.0,37,21,16,0.568,...,385,432,298,312,332,420,387,253,39,367


# Prepare the dataset for training and testing 

Merge team standings with player stats by Team ID 

In [7]:
# Attach each team's standings row to its players. TEAM_ID (player stats) and
# TeamID (standings) are the join keys; an outer join keeps unmatched rows
# from either side, leaving NaN in the columns that side lacks.
df = player_stats.merge(
    standings,
    how='outer',
    left_on='TEAM_ID',
    right_on='TeamID',
)

In [8]:
# Show the merged player + standings table as this cell's output.
df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,PreAS,PostAS
0,1630639,A.J. Lawson,A.J.,1610612742,DAL,22.0,15,5,10,0.333,...,,,,,,3-3,7-7,11-6,31-29,7-15
1,1629735,Chris Silva,Chris,1610612742,DAL,26.0,1,1,0,1.000,...,,,,,,3-3,7-7,11-6,31-29,7-15
2,1626174,Christian Wood,Christian,1610612742,DAL,27.0,67,33,34,0.493,...,,,,,,3-3,7-7,11-6,31-29,7-15
3,202722,Davis Bertans,Davis,1610612742,DAL,30.0,45,20,25,0.444,...,,,,,,3-3,7-7,11-6,31-29,7-15
4,203939,Dwight Powell,Dwight,1610612742,DAL,31.0,76,37,39,0.487,...,,,,,,3-3,7-7,11-6,31-29,7-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,1629011,Mitchell Robinson,Mitchell,1610612752,NYK,25.0,59,35,24,0.593,...,,,,,,3-3,7-9,9-6,33-27,14-8
535,1630167,Obi Toppin,Obi,1610612752,NYK,25.0,67,37,30,0.552,...,,,,,,3-3,7-9,9-6,33-27,14-8
536,1629656,Quentin Grimes,Quentin,1610612752,NYK,22.0,71,41,30,0.577,...,,,,,,3-3,7-9,9-6,33-27,14-8
537,1629628,RJ Barrett,RJ,1610612752,NYK,22.0,73,40,33,0.548,...,,,,,,3-3,7-9,9-6,33-27,14-8


### Feature engineering
The two engineered features below, `AssistRatio` and `WinStreak`, are derived from existing columns. Including them in the machine learning model can help to capture important information about a team's performance that is not captured by other features such as points per game or rebounds per game.

In [9]:
# AssistRatio: share of a player's assists + turnovers that were assists, i.e.
# how well they create for teammates without giving the ball away.
# When both AST and TOV are 0 the division is 0/0 and the result is NaN.
assists = df['AST']
turnovers = df['TOV']
df['AssistRatio'] = assists / (assists + turnovers)

# WinStreak: wins minus losses (W - L). Despite the name this is a net win
# differential, not a count of consecutive wins: positive means more wins
# than losses, negative means more losses than wins.
df['WinStreak'] = df['W'].sub(df['L'])


Feature selection 

In [10]:
# Model inputs: per-game box-score columns, shooting and win percentages, and
# the two engineered columns (AssistRatio, WinStreak).
feature_columns = [
    'PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV',
    'FG_PCT', 'FT_PCT', 'FG3_PCT', 'W_PCT',
    'AssistRatio', 'WinStreak',
]

# Target: the PlayoffRank column from the standings side of the merge.
y = df.loc[:, 'PlayoffRank']
X = df.loc[:, feature_columns]

In [11]:
# Show the selected feature matrix (before imputation) as this cell's output.
X

Unnamed: 0,PTS,REB,AST,STL,BLK,TOV,FG_PCT,FT_PCT,FG3_PCT,W_PCT,AssistRatio,WinStreak
0,3.7,1.4,0.1,0.1,0.0,0.2,0.500,0.250,0.400,0.333,0.333333,-5
1,2.0,0.0,0.0,0.0,0.0,1.0,1.000,0.000,0.000,1.000,0.000000,1
2,16.6,7.3,1.8,0.4,1.1,1.8,0.515,0.772,0.376,0.493,0.500000,-1
3,4.6,1.2,0.5,0.2,0.2,0.2,0.431,0.867,0.390,0.444,0.714286,-5
4,6.7,4.1,0.9,0.6,0.3,0.9,0.732,0.667,0.000,0.487,0.500000,-2
...,...,...,...,...,...,...,...,...,...,...,...,...
534,7.4,9.4,0.9,0.9,1.8,0.7,0.671,0.484,0.000,0.593,0.562500,11
535,7.4,2.8,1.0,0.3,0.2,0.6,0.446,0.809,0.344,0.552,0.625000,7
536,11.3,3.2,2.1,0.7,0.4,1.0,0.468,0.796,0.386,0.577,0.677419,11
537,19.6,5.0,2.8,0.4,0.2,2.2,0.434,0.740,0.310,0.548,0.560000,7


In [12]:
# Impute missing values with mean value
# Every NaN in a feature column is replaced by that column's mean.
# fit_transform returns a plain NumPy array, so both the column labels and the
# original row index are restored explicitly. Without index=X.index the frame
# would be renumbered 0..n-1 and could silently fall out of alignment with
# y and df whenever their index is not already a default RangeIndex.
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test-set values contribute to the imputed means — consider fitting on the
# training rows only.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

In [13]:
# Show the feature matrix again, now after mean imputation.
X

Unnamed: 0,PTS,REB,AST,STL,BLK,TOV,FG_PCT,FT_PCT,FG3_PCT,W_PCT,AssistRatio,WinStreak
0,3.7,1.4,0.1,0.1,0.0,0.2,0.500,0.250,0.400,0.333,0.333333,-5.0
1,2.0,0.0,0.0,0.0,0.0,1.0,1.000,0.000,0.000,1.000,0.000000,1.0
2,16.6,7.3,1.8,0.4,1.1,1.8,0.515,0.772,0.376,0.493,0.500000,-1.0
3,4.6,1.2,0.5,0.2,0.2,0.2,0.431,0.867,0.390,0.444,0.714286,-5.0
4,6.7,4.1,0.9,0.6,0.3,0.9,0.732,0.667,0.000,0.487,0.500000,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
534,7.4,9.4,0.9,0.9,1.8,0.7,0.671,0.484,0.000,0.593,0.562500,11.0
535,7.4,2.8,1.0,0.3,0.2,0.6,0.446,0.809,0.344,0.552,0.625000,7.0
536,11.3,3.2,2.1,0.7,0.4,1.0,0.468,0.796,0.386,0.577,0.677419,11.0
537,19.6,5.0,2.8,0.4,0.2,2.2,0.434,0.740,0.310,0.548,0.560000,7.0


In [14]:
# Show the target Series (PlayoffRank) as this cell's output.
y

0      11
1      11
2      11
3      11
4      11
       ..
534     5
535     5
536     5
537     5
538     5
Name: PlayoffRank, Length: 539, dtype: int64

 Split dataset into training and testing sets

In [15]:
# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Train a machine learning model

### Instantiate and train the RandomForestClassifier

In [16]:
# Random forest settings: 100 trees, each limited to depth 10, with a fixed
# seed so repeated runs build the same forest.
forest_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
}
clf = RandomForestClassifier(**forest_params)
# Fit on the training rows only; left as the last expression so the fitted
# estimator is the cell's output.
clf.fit(X_train, y_train)

# Evaluate the model's prediction accuracy on the test set

Predict the test set results

In [17]:
# Predict a PlayoffRank label for every row of the held-out test features.
y_pred = clf.predict(X_test)

In [18]:
# Fraction of held-out rows whose predicted PlayoffRank equals the true value.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.23703703703703705


In [19]:
# Predict the champion
# Predict a rank for every row of the feature matrix. clf.predict returns an
# array in X's row order, which is assigned positionally onto df.
df['predicted_rank'] = clf.predict(X)

# Aggregate per team instead of reading the team off the single first row with
# the lowest predicted rank: that let one player's prediction (and row order on
# ties) decide the result. The team with the lowest mean predicted rank wins.
# Rows without a TeamName are ignored by groupby.
# NOTE(review): assumes TeamName is unique per team, and PlayoffRank appears to
# be a per-conference seed (two teams show rank 1 in the standings output), so
# this picks the best predicted seed rather than a cross-conference winner.
team_mean_rank = df.groupby('TeamName')['predicted_rank'].mean()
predicted_champion = team_mean_rank.idxmin()
print(f"Predicted NBA Champion: {predicted_champion}")

Predicted NBA Champion: Bucks
