In [1]:
import pandas as pd
import numpy as np

#### Import NBA statistics from the NBA API (player statistics and team standings)

In [2]:
from nba_api.stats.endpoints import leaguedashplayerstats, leaguestandings

This script uses the RandomForestClassifier from the scikit-learn library to train the model.

In [5]:
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Collect data from NBA API

Get team standings dataset 

In [4]:
# Request the 2022-23 league standings and keep the first frame the endpoint returns.
standings = (
    leaguestandings
    .LeagueStandings(season='2022-23')
    .get_data_frames()[0]
)

In [5]:
# Render the standings frame fetched in the previous cell.
standings

Unnamed: 0,LeagueID,SeasonID,TeamID,TeamCity,TeamName,Conference,ConferenceRecord,PlayoffRank,ClinchIndicator,Division,...,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,PreAS,PostAS
0,0,22022,1610612743,Denver,Nuggets,West,32-14,1,- nw,Northwest,...,,,,,,4-3,10-4,9-5,41-18,10-7
1,0,22022,1610612749,Milwaukee,Bucks,East,32-16,1,- c,Central,...,,,,,,6-0,9-5,8-7,41-17,14-5
2,0,22022,1610612738,Boston,Celtics,East,31-17,2,- x,Atlantic,...,,,,,,4-2,14-2,8-6,42-17,11-7
3,0,22022,1610612763,Memphis,Grizzlies,West,28-20,2,- sw,Southwest,...,,,,,,4-3,8-6,10-4,35-22,13-6
4,0,22022,1610612758,Sacramento,Kings,West,30-16,3,- x,Pacific,...,,,,,,2-4,9-5,8-6,32-25,14-5
5,0,22022,1610612755,Philadelphia,76ers,East,30-16,3,- x,Atlantic,...,,,,,,4-4,8-6,9-4,38-19,12-7
6,0,22022,1610612739,Cleveland,Cavaliers,East,31-16,4,- x,Central,...,,,,,,5-1,9-7,9-6,38-23,10-6
7,0,22022,1610612756,Phoenix,Suns,West,26-20,4,,Pacific,...,,,,,,5-1,10-5,5-11,32-28,9-7
8,0,22022,1610612752,New York,Knicks,East,29-19,5,,Atlantic,...,,,,,,3-3,7-9,9-6,33-27,11-6
9,0,22022,1610612746,LA,Clippers,West,24-23,5,,Pacific,...,,,,,,3-4,10-6,8-7,33-28,8-8


Get player stats data

In [6]:
# Request 2022-23 regular-season player statistics in per-game mode and keep
# the first frame the endpoint returns, then render it.
player_stats = leaguedashplayerstats.LeagueDashPlayerStats(
    season='2022-23',
    season_type_all_star='Regular Season',
    per_mode_detailed='PerGame',
).get_data_frames()[0]
player_stats

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK
0,1630639,A.J. Lawson,A.J.,1610612742,DAL,22.0,13,5,8,0.385,...,474,393,469,474,445,338,480,232,35,468
1,1631260,AJ Green,AJ,1610612749,MIL,23.0,33,26,7,0.788,...,474,475,432,494,361,316,428,232,35,401
2,1631100,AJ Griffin,AJ,1610612737,ATL,19.0,67,31,36,0.463,...,343,286,375,405,213,162,274,232,35,262
3,203932,Aaron Gordon,Aaron,1610612743,DEN,27.0,64,44,20,0.688,...,69,34,203,58,82,3,79,59,35,87
4,1628988,Aaron Holiday,Aaron,1610612737,ATL,26.0,58,30,28,0.517,...,327,293,352,331,387,225,386,232,35,391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518,1628380,Zach Collins,Zach,1610612759,SAS,25.0,61,18,43,0.295,...,66,125,25,112,142,485,128,71,35,133
519,203897,Zach LaVine,Zach,1610612741,CHI,28.0,72,35,37,0.486,...,286,22,153,45,21,245,44,149,35,31
520,1630192,Zeke Nnaji,Zeke,1610612743,DEN,22.0,49,33,16,0.673,...,148,337,200,260,341,420,363,180,35,371
521,1630533,Ziaire Williams,Ziaire,1610612763,MEM,21.0,36,21,15,0.583,...,387,407,283,319,329,402,381,232,35,367


# Prepare the dataset for training and testing 

Merge team standings with player stats by Team ID 

In [7]:
# Attach the matching standings row to every player row by team id
# (player TEAM_ID == standings TeamID). The outer join keeps unmatched rows
# from either side.
df = player_stats.merge(
    standings,
    how='outer',
    left_on='TEAM_ID',
    right_on='TeamID',
)

In [8]:
# Render the merged player + standings frame.
df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,PreAS,PostAS
0,1630639,A.J. Lawson,A.J.,1610612742,DAL,22.0,13,5,8,0.385,...,,,,,,3-3,7-7,11-6,31-29,6-11
1,1629735,Chris Silva,Chris,1610612742,DAL,26.0,1,1,0,1.000,...,,,,,,3-3,7-7,11-6,31-29,6-11
2,1626174,Christian Wood,Christian,1610612742,DAL,27.0,64,32,32,0.500,...,,,,,,3-3,7-7,11-6,31-29,6-11
3,202722,Davis Bertans,Davis,1610612742,DAL,30.0,42,20,22,0.476,...,,,,,,3-3,7-7,11-6,31-29,6-11
4,203939,Dwight Powell,Dwight,1610612742,DAL,31.0,72,36,36,0.500,...,,,,,,3-3,7-7,11-6,31-29,6-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
518,1629011,Mitchell Robinson,Mitchell,1610612752,NYK,24.0,55,32,23,0.582,...,,,,,,3-3,7-9,9-6,33-27,11-6
519,1630167,Obi Toppin,Obi,1610612752,NYK,25.0,62,34,28,0.548,...,,,,,,3-3,7-9,9-6,33-27,11-6
520,1629656,Quentin Grimes,Quentin,1610612752,NYK,22.0,66,38,28,0.576,...,,,,,,3-3,7-9,9-6,33-27,11-6
521,1629628,RJ Barrett,RJ,1610612752,NYK,22.0,70,39,31,0.557,...,,,,,,3-3,7-9,9-6,33-27,11-6


### Feature engineering
The two engineered features below, `AssistRatio` and `WinStreak`, are added so that the machine learning model can capture information about player and team performance that is not expressed directly by other features such as points per game or rebounds per game.

In [9]:
# AssistRatio = AST / (AST + TOV): the share of a player's assists-plus-turnovers
# that are assists, i.e. how well they create for teammates without giving the
# ball away. Rows where AST + TOV == 0 divide 0 by 0 and come out as NaN.
df['AssistRatio'] = df['AST'] / (df['AST'] + df['TOV'])

# WinStreak = W - L: wins minus losses from the W and L columns.
# NOTE: despite the column name this is a win/loss differential (positive means
# more wins than losses), not a count of consecutive wins or losses.
df['WinStreak'] = df['W'] - df['L']


Feature selection 

In [10]:
# Model inputs: per-game box-score stats, shooting percentages, win percentage
# and the two engineered columns.
feature_columns = [
    'PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV',
    'FG_PCT', 'FT_PCT', 'FG3_PCT', 'W_PCT',
    'AssistRatio', 'WinStreak',
]
X = df.loc[:, feature_columns]

# Target: the PlayoffRank value carried on each merged row.
y = df['PlayoffRank']

In [11]:
# Render the selected feature columns (before imputation).
X

Unnamed: 0,PTS,REB,AST,STL,BLK,TOV,FG_PCT,FT_PCT,FG3_PCT,W_PCT,AssistRatio,WinStreak
0,2.6,0.8,0.1,0.1,0.0,0.1,0.500,0.333,0.438,0.385,0.500000,-3
1,2.0,0.0,0.0,0.0,0.0,1.0,1.000,0.000,0.000,1.000,0.000000,1
2,16.8,7.4,1.8,0.5,1.0,1.8,0.518,0.778,0.368,0.500,0.500000,0
3,4.2,1.0,0.5,0.2,0.1,0.2,0.428,0.833,0.393,0.476,0.714286,-2
4,6.9,4.2,0.9,0.7,0.3,0.9,0.727,0.669,0.000,0.500,0.500000,0
...,...,...,...,...,...,...,...,...,...,...,...,...
518,7.2,9.0,0.8,0.9,1.7,0.6,0.690,0.491,0.000,0.582,0.571429,9
519,6.3,2.8,0.8,0.3,0.2,0.5,0.418,0.784,0.327,0.548,0.615385,6
520,10.4,3.3,2.0,0.6,0.4,1.0,0.457,0.791,0.376,0.576,0.666667,10
521,19.6,5.0,2.7,0.4,0.2,2.3,0.435,0.743,0.318,0.557,0.540000,8


In [12]:
# Impute missing values with mean value
# fit_transform returns a bare NumPy array, so the result is wrapped back into
# a DataFrame. Passing the original index (not only the columns) keeps X's row
# labels identical to y's, so the two stay aligned even if df is ever filtered
# or re-indexed before this cell.
# NOTE: the imputer is fitted on the full dataset here, before the train/test
# split, so held-out rows contribute to the imputed column means.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)

In [13]:
# Render the feature frame again, now after mean imputation.
X

Unnamed: 0,PTS,REB,AST,STL,BLK,TOV,FG_PCT,FT_PCT,FG3_PCT,W_PCT,AssistRatio,WinStreak
0,2.6,0.8,0.1,0.1,0.0,0.1,0.500,0.333,0.438,0.385,0.500000,-3.0
1,2.0,0.0,0.0,0.0,0.0,1.0,1.000,0.000,0.000,1.000,0.000000,1.0
2,16.8,7.4,1.8,0.5,1.0,1.8,0.518,0.778,0.368,0.500,0.500000,0.0
3,4.2,1.0,0.5,0.2,0.1,0.2,0.428,0.833,0.393,0.476,0.714286,-2.0
4,6.9,4.2,0.9,0.7,0.3,0.9,0.727,0.669,0.000,0.500,0.500000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
518,7.2,9.0,0.8,0.9,1.7,0.6,0.690,0.491,0.000,0.582,0.571429,9.0
519,6.3,2.8,0.8,0.3,0.2,0.5,0.418,0.784,0.327,0.548,0.615385,6.0
520,10.4,3.3,2.0,0.6,0.4,1.0,0.457,0.791,0.376,0.576,0.666667,10.0
521,19.6,5.0,2.7,0.4,0.2,2.3,0.435,0.743,0.318,0.557,0.540000,8.0


In [14]:
# Render the target series (PlayoffRank per row).
y

0      11
1      11
2      11
3      11
4      11
       ..
518     5
519     5
520     5
521     5
522     5
Name: PlayoffRank, Length: 523, dtype: int64

 Split dataset into training and testing sets

In [15]:
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Train a machine learning model

### Instantiate and train the RandomForestClassifier

In [16]:
# Random forest of 100 trees, each limited to depth 10, with a fixed seed;
# fitted on the training split.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
clf.fit(X_train, y_train)

# Test the model to find the most accurate prediction

Predict the test set results

In [17]:
# Predicted PlayoffRank label for every row of the held-out split.
y_pred = clf.predict(X_test)

In [18]:
# Fraction of held-out rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.2366412213740458


In [19]:
# Predict the champion
# Score every row with the fitted classifier, then report the team named on
# the first row that received the lowest predicted rank.
df['predicted_rank'] = clf.predict(X)
best_row = df['predicted_rank'].idxmin()
predicted_champion = df.loc[best_row, 'TeamName']
print(f"Predicted NBA Champion: {predicted_champion}")

Predicted NBA Champion: Bucks


In [1]:
from nba_api.stats.endpoints import leaguedashplayerstats, leaguestandings
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np

In [2]:
# Fetch 2022-23 standings and per-game regular-season player stats (first frame
# of each endpoint), join them on team id, and add the two engineered columns.
standings = leaguestandings.LeagueStandings(
    season='2022-23',
).get_data_frames()[0]
player_stats = leaguedashplayerstats.LeagueDashPlayerStats(
    season='2022-23',
    season_type_all_star='Regular Season',
    per_mode_detailed='PerGame',
).get_data_frames()[0]
df = player_stats.merge(
    standings,
    how='outer',
    left_on='TEAM_ID',
    right_on='TeamID',
)
# Assists as a share of assists plus turnovers.
df['AssistRatio'] = df['AST'].div(df['AST'] + df['TOV'])
# Wins minus losses.
df['WinStreak'] = df['W'].sub(df['L'])

In [3]:
# Regression inputs: box-score stats, shooting percentages and the two
# engineered columns; the target is the W_PCT column.
feature_columns = [
    'PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV',
    'FG_PCT', 'FT_PCT', 'FG3_PCT',
    'AssistRatio', 'WinStreak',
]
X = df.loc[:, feature_columns]
y = df['W_PCT']

# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [8]:
from sklearn.impute import SimpleImputer

# Mean imputer whose column means are learned from the training split only.
imputer = SimpleImputer(strategy='mean').fit(X_train)

# Fill missing values in both splits using those training-set means.
X_train_imputed, X_test_imputed = (
    imputer.transform(split) for split in (X_train, X_test)
)




In [9]:
# Linear regression fitted on the imputed training features.
reg = LinearRegression()
reg.fit(X=X_train_imputed, y=y_train)



In [10]:


# Predict on the imputed held-out features, then score those predictions
# against the true targets with MSE and R^2.
y_pred = reg.predict(X_test_imputed)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)


In [11]:
# Report both metrics, one per line, rounded to four decimals.
print(
    f"Mean Squared Error: {mse:.4f}",
    f"R2 Score: {r2:.4f}",
    sep="\n",
)



Mean Squared Error: 0.0154
R2 Score: 0.4643


In [12]:
# Impute the missing values in the whole dataset, reusing the imputer that was
# fitted on the training split (transform only, no re-fit).
X_imputed = imputer.transform(X)

In [13]:
# Predict the champion
# Score the imputed feature matrix rather than the raw X: the regressor was
# fitted on imputed arrays, and raw X can still hold NaNs, which
# LinearRegression cannot score.
df['predicted_w_pct'] = reg.predict(X_imputed)

# Report the team named on the first row with the highest predicted win percentage.
predicted_champion = df.loc[df['predicted_w_pct'].idxmax(), 'TeamName']
print(f"Predicted NBA Champion: {predicted_champion}")



Predicted NBA Champion: Bucks


