In [2]:
import pandas as pd
import numpy as np

Import NBA statistic from NBA API (player statistics and teams standings)

In [3]:
from nba_api.stats.endpoints import leaguedashplayerstats, leaguestandings

This script uses the RandomForestClassifier from the scikit-learn library to train the model.

In [4]:
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

# Collect data from NBA API

Get team standings dataset 

In [135]:
standings = leaguestandings.LeagueStandings(season='2022-23').get_data_frames()[0]

In [136]:
standings

Unnamed: 0,LeagueID,SeasonID,TeamID,TeamCity,TeamName,Conference,ConferenceRecord,PlayoffRank,ClinchIndicator,Division,...,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,PreAS,PostAS
0,0,22022,1610612743,Denver,Nuggets,West,32-13,1,- nw,Northwest,...,,,,,,4-3,10-4,9-5,41-18,10-6
1,0,22022,1610612749,Milwaukee,Bucks,East,31-15,1,- x,Central,...,,,,,,6-0,9-5,8-7,41-17,13-4
2,0,22022,1610612738,Boston,Celtics,East,30-16,2,- x,Atlantic,...,,,,,,4-2,14-2,8-6,42-17,10-6
3,0,22022,1610612763,Memphis,Grizzlies,West,28-19,2,- sw,Southwest,...,,,,,,4-3,8-6,10-4,35-22,12-5
4,0,22022,1610612755,Philadelphia,76ers,East,30-16,3,- x,Atlantic,...,,,,,,4-4,8-6,9-4,38-19,11-7
5,0,22022,1610612758,Sacramento,Kings,West,29-16,3,,Pacific,...,,,,,,2-4,9-5,8-6,32-25,13-5
6,0,22022,1610612739,Cleveland,Cavaliers,East,31-15,4,- x,Central,...,,,,,,5-1,9-7,9-6,38-23,10-5
7,0,22022,1610612756,Phoenix,Suns,West,25-20,4,,Pacific,...,,,,,,5-1,10-5,5-11,32-28,8-7
8,0,22022,1610612746,LA,Clippers,West,23-23,5,,Pacific,...,,,,,,3-4,10-6,8-7,33-28,7-8
9,0,22022,1610612752,New York,Knicks,East,28-19,5,,Atlantic,...,,,,,,3-3,7-9,9-6,33-27,10-6


Get player stats data

In [12]:
player_stats = leaguedashplayerstats.LeagueDashPlayerStats(season='2022-23', season_type_all_star='Regular Season', per_mode_detailed='PerGame').get_data_frames()[0]
player_stats

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK
0,1630639,A.J. Lawson,A.J.,1610612742,DAL,22.0,13,5,8,0.385,...,472,393,468,471,443,338,481,230,35,468
1,1631260,AJ Green,AJ,1610612749,MIL,23.0,32,25,7,0.781,...,472,474,432,491,366,317,428,230,35,404
2,1631100,AJ Griffin,AJ,1610612737,ATL,19.0,67,31,36,0.463,...,341,283,375,405,211,162,271,230,35,259
3,203932,Aaron Gordon,Aaron,1610612743,DEN,27.0,62,43,19,0.694,...,67,36,212,59,79,2,78,58,35,87
4,1628988,Aaron Holiday,Aaron,1610612737,ATL,26.0,58,30,28,0.517,...,322,290,352,332,386,222,388,230,35,388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,1628380,Zach Collins,Zach,1610612759,SAS,25.0,60,18,42,0.300,...,69,135,22,112,140,486,132,71,35,134
518,203897,Zach LaVine,Zach,1610612741,CHI,28.0,70,35,35,0.500,...,282,17,149,46,21,199,43,149,35,31
519,1630192,Zeke Nnaji,Zeke,1610612743,DEN,22.0,47,32,15,0.681,...,147,329,211,250,339,424,363,180,35,369
520,1630533,Ziaire Williams,Ziaire,1610612763,MEM,21.0,36,21,15,0.583,...,386,407,283,320,332,404,383,230,35,366


# Prepare the dataset for training and testing 

Merge team standings with player stats by Team ID 

In [13]:
df = pd.merge(player_stats, standings, left_on='TEAM_ID', right_on='TeamID', how='outer')

In [14]:
df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,PreAS,PostAS
0,1630639,A.J. Lawson,A.J.,1610612742,DAL,22.0,13,5,8,0.385,...,,,,,,3-3,7-7,11-6,31-29,6-10
1,1629735,Chris Silva,Chris,1610612742,DAL,26.0,1,1,0,1.000,...,,,,,,3-3,7-7,11-6,31-29,6-10
2,1626174,Christian Wood,Christian,1610612742,DAL,27.0,63,32,31,0.508,...,,,,,,3-3,7-7,11-6,31-29,6-10
3,202722,Davis Bertans,Davis,1610612742,DAL,30.0,42,20,22,0.476,...,,,,,,3-3,7-7,11-6,31-29,6-10
4,203939,Dwight Powell,Dwight,1610612742,DAL,31.0,71,36,35,0.507,...,,,,,,3-3,7-7,11-6,31-29,6-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517,1629011,Mitchell Robinson,Mitchell,1610612752,NYK,24.0,54,31,23,0.574,...,,,,,,3-3,7-9,9-6,33-27,10-6
518,1630167,Obi Toppin,Obi,1610612752,NYK,25.0,61,33,28,0.541,...,,,,,,3-3,7-9,9-6,33-27,10-6
519,1629656,Quentin Grimes,Quentin,1610612752,NYK,22.0,65,37,28,0.569,...,,,,,,3-3,7-9,9-6,33-27,10-6
520,1629628,RJ Barrett,RJ,1610612752,NYK,22.0,69,38,31,0.551,...,,,,,,3-3,7-9,9-6,33-27,10-6


Feature engineering. Including these two features in the machine learning model can help to capture important information about a team's performance that is not captured by other features such as points per game or rebounds per game.

In [15]:
df['AssistRatio'] = df['AST'] / (df['AST'] + df['TOV'])
#Indication of a player's ability to assist their teamates without turning the ball over

df['WinStreak'] = df['W'] - df['L']
#Indication of how many consecutive game has won (if value +) or lost (if value -)


Feature selection 

In [17]:
feature_columns = ['PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'FG_PCT', 'FT_PCT', 'FG3_PCT', 'W_PCT', 'AssistRatio', 'WinStreak']
X = df[feature_columns]
y = df['PlayoffRank']

In [18]:
X

Unnamed: 0,PTS,REB,AST,STL,BLK,TOV,FG_PCT,FT_PCT,FG3_PCT,W_PCT,AssistRatio,WinStreak
0,2.6,0.8,0.1,0.1,0.0,0.1,0.500,0.333,0.438,0.385,0.500000,-3
1,2.0,0.0,0.0,0.0,0.0,1.0,1.000,0.000,0.000,1.000,0.000000,1
2,16.9,7.5,1.8,0.4,1.1,1.8,0.516,0.779,0.366,0.508,0.500000,1
3,4.2,1.0,0.5,0.2,0.1,0.2,0.428,0.833,0.393,0.476,0.714286,-2
4,6.9,4.2,0.9,0.7,0.4,0.9,0.726,0.679,0.000,0.507,0.500000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
517,7.3,9.0,0.8,0.9,1.7,0.6,0.694,0.491,0.000,0.574,0.571429,8
518,6.3,2.8,0.7,0.3,0.2,0.5,0.417,0.784,0.324,0.541,0.583333,5
519,10.2,3.2,2.0,0.6,0.4,1.0,0.454,0.795,0.373,0.569,0.666667,9
520,19.7,5.0,2.8,0.4,0.2,2.3,0.436,0.746,0.318,0.551,0.549020,7


In [14]:
# Impute missing values with mean value
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

In [15]:
X

Unnamed: 0,PTS,REB,AST,STL,BLK,TOV,FG_PCT,FT_PCT,FG3_PCT,W_PCT,AssistRatio,WinStreak
0,2.6,0.8,0.1,0.1,0.0,0.1,0.500,0.333,0.438,0.385,0.500000,-3.0
1,2.0,0.0,0.0,0.0,0.0,1.0,1.000,0.000,0.000,1.000,0.000000,1.0
2,17.1,7.6,1.8,0.5,1.1,1.9,0.516,0.779,0.367,0.500,0.486486,0.0
3,4.1,1.0,0.5,0.2,0.1,0.2,0.419,0.833,0.383,0.463,0.714286,-3.0
4,7.0,4.3,0.9,0.7,0.4,0.9,0.727,0.679,0.000,0.500,0.500000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
517,7.3,9.1,0.8,0.9,1.7,0.6,0.693,0.490,0.000,0.566,0.571429,7.0
518,6.1,2.8,0.8,0.3,0.2,0.5,0.414,0.784,0.322,0.533,0.615385,4.0
519,10.1,3.1,1.9,0.6,0.4,1.0,0.455,0.795,0.371,0.563,0.655172,8.0
520,19.7,5.1,2.7,0.4,0.2,2.3,0.435,0.748,0.318,0.544,0.540000,6.0


In [16]:
y

0      11
1      11
2      11
3      11
4      11
       ..
517     5
518     5
519     5
520     5
521     5
Name: PlayoffRank, Length: 522, dtype: int64

 Split dataset into training and testing sets

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Train a machine learning model

Instantiate and train the RandomForestClassifier

In [18]:
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
clf.fit(X_train, y_train)

# Test the model to find the most accurate prediction

Predict the test set results

In [19]:
y_pred = clf.predict(X_test)

In [20]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 0.22900763358778625


In [21]:
# Predict the champion
df['predicted_rank'] = clf.predict(X)
predicted_champion = df.loc[df['predicted_rank'].idxmin()]['TeamName']
print(f"Predicted NBA Champion: {predicted_champion}")

Predicted NBA Champion: Bucks


In [128]:
# Assign weights to each feature based on their importance
feature_weights = {
    'PTS': 0.1,
    'REB': 0.15,
    'AST': 0.15,
    'STL': 0.05,
    'BLK': 0.05,
    'TOV': -0.15,  # Negative weight, as fewer turnovers are better
    'FG_PCT': 0.1,
    'FT_PCT': 0.05,
    'FG3_PCT': 0.05,
    'W_PCT': 0.2,
    'AssistRatio': 0.15,
    'WinStreak': 0.1
}

In [129]:
# Calculate the weighted score for each team
normalized_team_averages['WeightedScore'] = normalized_team_averages[feature_weights.keys()].dot(pd.Series(feature_weights))
normalized_team_averages['WeightedScore']

TeamName
76ers            0.626709
Bucks            0.606548
Bulls            0.435354
Cavaliers        0.542555
Celtics          0.721461
Clippers         0.627124
Grizzlies        0.725278
Hawks            0.524912
Heat             0.483171
Hornets          0.478999
Jazz             0.383186
Kings            0.293739
Knicks           0.721884
Lakers           0.615156
Magic            0.551450
Mavericks        0.399017
Nets             0.447600
Nuggets          0.768332
Pacers           0.526739
Pelicans         0.655852
Pistons          0.311653
Raptors          0.645331
Rockets          0.295546
Spurs            0.278747
Suns             0.737504
Thunder          0.642012
Timberwolves     0.592538
Trail Blazers    0.378831
Warriors         0.633128
Wizards          0.312252
Name: WeightedScore, dtype: float64

In [137]:
standings = standings.set_index('TeamName')

In [139]:
# Add the 'Conference' column to the 'normalized_team_averages' dataframe
normalized_team_averages = normalized_team_averages.join(standings['Conference'])
normalized_team_averages

ValueError: columns overlap but no suffix specified: Index(['Conference'], dtype='object')

In [94]:
# Rank the teams within each conference based on their weighted score
normalized_team_averages['ConferenceWeightedRank'] = normalized_team_averages.groupby('Conference')['WeightedScore'].rank(ascending=False)
normalized_team_averages['ConferenceWeightedRank'] 

TeamName
76ers             4.0
Bucks             5.0
Bulls            13.0
Cavaliers         7.0
Celtics           2.0
Clippers          7.0
Grizzlies         3.0
Hawks             9.0
Heat             10.0
Hornets          11.0
Jazz             11.0
Kings            14.0
Knicks            1.0
Lakers            8.0
Magic             6.0
Mavericks        10.0
Nets             12.0
Nuggets           1.0
Pacers            8.0
Pelicans          4.0
Pistons          15.0
Raptors           3.0
Rockets          13.0
Spurs            15.0
Suns              2.0
Thunder           5.0
Timberwolves      9.0
Trail Blazers    12.0
Warriors          6.0
Wizards          14.0
Name: ConferenceWeightedRank, dtype: float64

In [95]:
# Sort the teams by conference and weighted rank
ranked_teams_weighted_conference = normalized_team_averages.sort_values(['Conference', 'ConferenceWeightedRank'], ascending=True)
ranked_teams_weighted_conference

Unnamed: 0_level_0,PTS,REB,AST,STL,BLK,TOV,FG_PCT,FT_PCT,FG3_PCT,W_PCT,AssistRatio,WinStreak,TeamScore,TeamRank,WeightedScore,WeightedRank,Conference,ConferenceWeightedRank
TeamName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Knicks,0.775239,1.0,0.533557,0.507834,0.497076,0.423423,0.355236,0.530857,0.266736,0.739494,0.865482,0.744702,1.274496,21.0,0.721884,4.0,East,1.0
Celtics,0.497099,0.793112,0.418674,0.334779,0.727096,0.279279,0.47864,0.364501,0.392142,0.909404,0.754917,0.97967,1.321911,20.0,0.721461,5.0,East,2.0
Raptors,0.453398,0.575798,0.253797,1.0,0.612188,0.108582,0.596432,0.193074,0.258121,0.610096,1.0,0.570078,1.69649,14.0,0.645331,7.0,East,3.0
76ers,0.677355,0.543536,0.475671,0.694988,0.671053,0.436937,0.241304,0.700602,0.233362,0.905821,0.394704,0.921323,1.390813,19.0,0.626709,11.0,East,4.0
Bucks,0.56132,0.744302,0.451902,0.321045,0.463938,0.489489,0.185506,0.903925,0.485484,1.0,0.114257,1.0,1.52009,17.0,0.606548,13.0,East,5.0
Magic,0.668834,0.867521,0.432886,0.928111,0.607537,0.687688,0.353878,0.843238,0.529703,0.46585,0.489429,0.452573,1.138418,23.0,0.55145,15.0,East,6.0
Cavaliers,0.282919,0.140774,0.223253,0.535647,0.434698,0.173291,0.528361,9.4e-05,0.53361,0.947922,0.530298,0.884846,1.903597,10.0,0.542555,16.0,East,7.0
Pacers,0.573382,0.532931,0.614094,0.751965,1.0,0.533651,0.27077,0.736236,0.312234,0.417919,0.520178,0.486852,1.450909,18.0,0.526739,17.0,East,8.0
Hawks,0.646731,0.721468,0.323924,0.628355,0.688109,0.279279,0.401962,0.872642,0.12075,0.518945,0.227246,0.51757,1.745897,13.0,0.524912,18.0,East,9.0
Heat,0.586585,0.7818,0.394986,0.891027,0.259259,0.491256,0.06949,0.914353,0.0,0.614249,0.15782,0.649789,1.806774,12.0,0.483171,19.0,East,10.0


In [105]:
# Extract the team names and playoff ranks
playoff_ranks = standings[['TeamName', 'PlayoffRank']]
playoff_ranks

Unnamed: 0,TeamName,PlayoffRank
0,Nuggets,1
1,Bucks,1
2,Celtics,2
3,Grizzlies,2
4,Kings,3
5,76ers,3
6,Cavaliers,4
7,Suns,4
8,Clippers,5
9,Knicks,5


In [116]:
# Merge the weighted ranks and playoff ranks dataframes using indexes
comparison_df = ranked_teams_weighted_conference[['ConferenceWeightedRank']].merge(playoff_ranks, left_index=True, right_index=True)
comparison_df

Unnamed: 0,ConferenceWeightedRank,TeamName,PlayoffRank


In [117]:
# Reset the index to have 'TeamName' as a column
comparison_df.reset_index(inplace=True)

In [120]:
# Calculate the difference between the WeightedRank and PlayoffRank
comparison_df['RankDifference'] = comparison_df['ConferenceWeightedRank'] - comparison_df['PlayoffRank']
comparison_df['RankDifference']

Series([], Name: RankDifference, dtype: float64)

In [119]:
# Calculate the percentage difference
comparison_df['PercentageDifference'] = (comparison_df['RankDifference'] / comparison_df['PlayoffRank']) * 100

# Display the comparison dataframe
comparison_df

Unnamed: 0,index,ConferenceWeightedRank,TeamName,PlayoffRank,RankDifference,PercentageDifference


In [122]:
# Merge the weighted ranks and playoff ranks dataframes using indexes
comparison_df = ranked_teams_weighted_conference[['ConferenceWeightedRank']].merge(playoff_ranks, left_index=True, right_index=True)
comparison_df


Unnamed: 0,ConferenceWeightedRank,TeamName,PlayoffRank


In [123]:
# Reset the index to have 'TeamName' as a column
comparison_df.reset_index(inplace=True)

In [124]:
# Calculate the difference between the WeightedRank and PlayoffRank
comparison_df['RankDifference'] = comparison_df['ConferenceWeightedRank'] - comparison_df['PlayoffRank']

In [125]:
comparison_df['RankDifference']

Series([], Name: RankDifference, dtype: float64)

In [None]:
# Reset the index to have 'TeamName' as a column
comparison_df.reset_index(inplace=True)

# Calculate the difference between the WeightedRank and PlayoffRank
comparison_df['RankDifference'] = comparison_df['WeightedRank'] - comparison_df['PlayoffRank']

# Calculate the percentage difference
comparison_df['PercentageDifference'] = (comparison_df['RankDifference'] / comparison_df['PlayoffRank']) * 100

# Display the comparison dataframe
comparison_df

