In [1]:
import pandas as pd
import numpy as np

Import NBA statistic from NBA API (player statistics and teams standings)

In [2]:
from nba_api.stats.endpoints import leaguedashplayerstats, leaguestandings
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

Get team standings and player statistics dataset

In [3]:
standings = leaguestandings.LeagueStandings(season='2022-23').get_data_frames()[0]
standings

Unnamed: 0,LeagueID,SeasonID,TeamID,TeamCity,TeamName,Conference,ConferenceRecord,PlayoffRank,ClinchIndicator,Division,...,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,PreAS,PostAS
0,0,22022,1610612743,Denver,Nuggets,West,34-18,1,- w,Northwest,...,,,,,,4-3,10-4,9-5,41-18,12-11
1,0,22022,1610612749,Milwaukee,Bucks,East,35-17,1,- e,Central,...,,,,,,6-0,9-5,8-7,41-17,17-7
2,0,22022,1610612738,Boston,Celtics,East,34-18,2,- a,Atlantic,...,,,,,,4-2,14-2,8-6,42-17,15-8
3,0,22022,1610612763,Memphis,Grizzlies,West,30-22,2,- sw,Southwest,...,,,,,,4-3,8-6,10-4,35-22,16-9
4,0,22022,1610612758,Sacramento,Kings,West,32-20,3,- p,Pacific,...,,,,,,2-4,9-5,8-6,32-25,16-9
5,0,22022,1610612755,Philadelphia,76ers,East,34-18,3,- x,Atlantic,...,,,,,,4-4,8-6,9-4,38-19,16-9
6,0,22022,1610612739,Cleveland,Cavaliers,East,34-18,4,- x,Central,...,,,,,,5-1,9-7,9-6,38-23,13-8
7,0,22022,1610612756,Phoenix,Suns,West,30-22,4,- x,Pacific,...,,,,,,5-1,10-5,5-11,32-28,13-9
8,0,22022,1610612746,LA,Clippers,West,27-25,5,- x,Pacific,...,,,,,,3-4,10-6,8-7,33-28,11-10
9,0,22022,1610612752,New York,Knicks,East,32-20,5,- x,Atlantic,...,,,,,,3-3,7-9,9-6,33-27,14-8


In [4]:
player_stats = leaguedashplayerstats.LeagueDashPlayerStats(season='2022-23', season_type_all_star='Regular Season', per_mode_detailed='PerGame').get_data_frames()[0]
player_stats

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,BLK_RANK,BLKA_RANK,PF_RANK,PFD_RANK,PTS_RANK,PLUS_MINUS_RANK,NBA_FANTASY_PTS_RANK,DD2_RANK,TD3_RANK,WNBA_FANTASY_PTS_RANK
0,1630639,A.J. Lawson,A.J.,1610612742,DAL,22.0,15,5,10,0.333,...,491,358,475,457,429,462,482,253,39,466
1,1631260,AJ Green,AJ,1610612749,MIL,23.0,35,27,8,0.771,...,491,498,454,518,392,309,455,253,39,436
2,1631100,AJ Griffin,AJ,1610612737,ATL,19.0,72,34,38,0.472,...,372,268,394,422,217,169,287,253,39,273
3,203932,Aaron Gordon,Aaron,1610612743,DEN,27.0,68,45,23,0.662,...,65,32,205,58,82,4,77,56,39,89
4,1628988,Aaron Holiday,Aaron,1610612737,ATL,26.0,63,32,31,0.508,...,345,315,377,356,415,212,413,253,39,416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,1628380,Zach Collins,Zach,1610612759,SAS,25.0,63,20,43,0.317,...,63,123,19,113,142,486,130,70,39,131
535,203897,Zach LaVine,Zach,1610612741,CHI,28.0,77,38,39,0.494,...,305,27,167,47,21,216,44,155,39,33
536,1630192,Zeke Nnaji,Zeke,1610612743,DEN,22.0,53,34,19,0.642,...,162,339,181,282,351,391,380,192,39,389
537,1630533,Ziaire Williams,Ziaire,1610612763,MEM,21.0,37,21,16,0.568,...,385,432,298,312,332,420,387,253,39,367


Merge team standings with player stats by Team ID

In [5]:
df = pd.merge(player_stats, standings, left_on='TEAM_ID', right_on='TeamID', how='outer')
df

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,PreAS,PostAS
0,1630639,A.J. Lawson,A.J.,1610612742,DAL,22.0,15,5,10,0.333,...,,,,,,3-3,7-7,11-6,31-29,7-15
1,1629735,Chris Silva,Chris,1610612742,DAL,26.0,1,1,0,1.000,...,,,,,,3-3,7-7,11-6,31-29,7-15
2,1626174,Christian Wood,Christian,1610612742,DAL,27.0,67,33,34,0.493,...,,,,,,3-3,7-7,11-6,31-29,7-15
3,202722,Davis Bertans,Davis,1610612742,DAL,30.0,45,20,25,0.444,...,,,,,,3-3,7-7,11-6,31-29,7-15
4,203939,Dwight Powell,Dwight,1610612742,DAL,31.0,76,37,39,0.487,...,,,,,,3-3,7-7,11-6,31-29,7-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,1629011,Mitchell Robinson,Mitchell,1610612752,NYK,25.0,59,35,24,0.593,...,,,,,,3-3,7-9,9-6,33-27,14-8
535,1630167,Obi Toppin,Obi,1610612752,NYK,25.0,67,37,30,0.552,...,,,,,,3-3,7-9,9-6,33-27,14-8
536,1629656,Quentin Grimes,Quentin,1610612752,NYK,22.0,71,41,30,0.577,...,,,,,,3-3,7-9,9-6,33-27,14-8
537,1629628,RJ Barrett,RJ,1610612752,NYK,22.0,73,40,33,0.548,...,,,,,,3-3,7-9,9-6,33-27,14-8


# Feature engineering
Including these two features can help to capture important information about a team's performance that is not captured by other features such as points per game or rebounds per game.

In [6]:
df['AssistRatio'] = df['AST'] / (df['AST'] + df['TOV'])
#Indication of a player's ability to assist their teamates without turning the ball over

df['WinStreak'] = df['W'] - df['L']
#Indication of how many consecutive game has won (if value +) or lost (if value -)

In [7]:
df['AssistRatio']

0      0.333333
1      0.000000
2      0.500000
3      0.714286
4      0.500000
         ...   
534    0.562500
535    0.625000
536    0.677419
537    0.560000
538         NaN
Name: AssistRatio, Length: 539, dtype: float64

In [8]:
df['WinStreak']

0      -5
1       1
2      -1
3      -5
4      -2
       ..
534    11
535     7
536    11
537     7
538     3
Name: WinStreak, Length: 539, dtype: int64

Feature selections and split dataset into training and testing sets

In [9]:
feature_columns = ['PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'FG_PCT', 'FT_PCT', 'FG3_PCT', 'AssistRatio', 'WinStreak']

X = df[feature_columns]
y = df['W_PCT']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [11]:
X

Unnamed: 0,PTS,REB,AST,STL,BLK,TOV,FG_PCT,FT_PCT,FG3_PCT,AssistRatio,WinStreak
0,3.7,1.4,0.1,0.1,0.0,0.2,0.500,0.250,0.400,0.333333,-5
1,2.0,0.0,0.0,0.0,0.0,1.0,1.000,0.000,0.000,0.000000,1
2,16.6,7.3,1.8,0.4,1.1,1.8,0.515,0.772,0.376,0.500000,-1
3,4.6,1.2,0.5,0.2,0.2,0.2,0.431,0.867,0.390,0.714286,-5
4,6.7,4.1,0.9,0.6,0.3,0.9,0.732,0.667,0.000,0.500000,-2
...,...,...,...,...,...,...,...,...,...,...,...
534,7.4,9.4,0.9,0.9,1.8,0.7,0.671,0.484,0.000,0.562500,11
535,7.4,2.8,1.0,0.3,0.2,0.6,0.446,0.809,0.344,0.625000,7
536,11.3,3.2,2.1,0.7,0.4,1.0,0.468,0.796,0.386,0.677419,11
537,19.6,5.0,2.8,0.4,0.2,2.2,0.434,0.740,0.310,0.560000,7


In [12]:
y

0      0.333
1      1.000
2      0.493
3      0.444
4      0.487
       ...  
534    0.593
535    0.552
536    0.577
537    0.548
538    1.000
Name: W_PCT, Length: 539, dtype: float64

# Simple Imputer
Create a SimpleImputer with strategy set to 'mean'

In [13]:
imputer = SimpleImputer(strategy='mean')

Fit the imputer on the training data

In [14]:
imputer.fit(X_train)

Transform both the training and test data using the imputer

In [15]:
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [16]:
lr = LinearRegression()
lr.fit(X_train_imputed, y_train)

Calculate predictions and evaluate the model with the imputed data

In [17]:
y_pred = lr.predict(X_test_imputed)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [18]:
y_pred

array([0.39392769, 0.36386777, 0.32630712, 0.40605849, 0.59969102,
       0.4096735 , 0.67405746, 0.56004154, 0.42754869, 0.50644603,
       0.40310898, 0.13770782, 0.60676826, 0.43989233, 0.15252192,
       0.50803301, 0.39557464, 0.42466888, 0.30309746, 0.53972032,
       0.31295795, 0.63917832, 0.53125049, 0.13946104, 0.27111327,
       0.49010491, 0.57824547, 0.43597691, 0.37394245, 0.4082085 ,
       0.53940648, 0.49626947, 0.36149854, 0.58739308, 0.64297681,
       0.48941625, 0.54530778, 0.70906002, 0.5449362 , 0.57922494,
       0.23593588, 0.40854576, 0.63859565, 0.61324009, 0.19487843,
       0.37260121, 0.49033409, 0.45026085, 0.53964806, 0.79828909,
       0.57373243, 0.58757438, 0.18980572, 0.45927979, 0.56714979,
       0.52126899, 0.4913739 , 0.3520081 , 0.5532792 , 0.35304847,
       0.37733994, 0.69604055, 0.40686579, 0.51428122, 0.6149694 ,
       0.39628459, 0.30375753, 0.60464631, 0.56001223, 0.56837649,
       0.3742896 , 0.70909655, 0.47943787, 0.49135044, 0.51476

In [24]:
print(f"Mean Squared Error: {mse:.4f}")
print(f"R2 Score: {r2:.4f}")

Mean Squared Error: 0.0117
R2 Score: 0.5188


Mean Squared Error (MSE): This is the average of the squared differences between the actual values and the predicted values. The MSE is 0.0117, which is quite low. This indicates that on average, the squared difference between the actual and predicted win percentages is small.

R2 Score: This metric represents the proportion of the variance in the dependent variable that is predictable from the independent variables. An R2 score of 0.5188 suggests that the model is able to explain about 51.88% of the variance in the win percentage.


Impute the missing values in the whole dataset

In [20]:
X_imputed = imputer.transform(X)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [21]:
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
X

Unnamed: 0,PTS,REB,AST,STL,BLK,TOV,FG_PCT,FT_PCT,FG3_PCT,AssistRatio,WinStreak
0,3.7,1.4,0.1,0.1,0.0,0.2,0.500,0.250,0.400,0.333333,-5.0
1,2.0,0.0,0.0,0.0,0.0,1.0,1.000,0.000,0.000,0.000000,1.0
2,16.6,7.3,1.8,0.4,1.1,1.8,0.515,0.772,0.376,0.500000,-1.0
3,4.6,1.2,0.5,0.2,0.2,0.2,0.431,0.867,0.390,0.714286,-5.0
4,6.7,4.1,0.9,0.6,0.3,0.9,0.732,0.667,0.000,0.500000,-2.0
...,...,...,...,...,...,...,...,...,...,...,...
534,7.4,9.4,0.9,0.9,1.8,0.7,0.671,0.484,0.000,0.562500,11.0
535,7.4,2.8,1.0,0.3,0.2,0.6,0.446,0.809,0.344,0.625000,7.0
536,11.3,3.2,2.1,0.7,0.4,1.0,0.468,0.796,0.386,0.677419,11.0
537,19.6,5.0,2.8,0.4,0.2,2.2,0.434,0.740,0.310,0.560000,7.0


# Predict the champion

In [22]:
df['predicted_w_pct'] = lr.predict(X)
predicted_champion = df.loc[df['predicted_w_pct'].idxmax()]['TeamName']
print(f"Predicted NBA Champion: {predicted_champion}")

Predicted NBA Champion: Bucks


