In [15]:
import pandas as pd
import plotly.express as px

# Load the game log and tag every game with the calendar year it was played in.
data_path = './games.csv'
games_data = pd.read_csv(data_path)
games_data = games_data.assign(GAME_DATE_EST=pd.to_datetime(games_data['GAME_DATE_EST']))
games_data = games_data.assign(YEAR=games_data['GAME_DATE_EST'].dt.year)

# Per calendar year: number of rows (games) and the sum of the HOME_TEAM_WINS flag.
yearly_stats = games_data.groupby('YEAR')['HOME_TEAM_WINS'].agg(
    Total_Games='size',
    Home_Wins='sum',
)

# Share of games won by the home side, expressed as a percentage.
yearly_stats['Home_Win_Percentage'] = (
    yearly_stats['Home_Wins'].div(yearly_stats['Total_Games']).mul(100)
)

# Interactive line chart; no x column is passed, so the YEAR index is used for x.
fig = px.line(
    yearly_stats,
    y='Home_Win_Percentage',
    title='Percentage of Home Team Wins Over Time',
    labels={'Home_Win_Percentage': 'Percentage of Home Wins', 'YEAR': 'Year'},
    markers=True,
)
fig.update_layout(
    xaxis_title='Year',
    yaxis_title='Percentage of Home Wins (%)',
    xaxis=dict(tickmode='linear'),  # linear tick mode instead of the automatic default
    hovermode='x',
)
fig.show()



#### Annotations
* This graph shows the percentage of NBA games won by the home team in each calendar year from 2003 to 2022.
* We can see that the home team wins more than 50% of the time in every year shown.
* We can also see how, in 2020, COVID-19 and the absence of fans at games affected the home win percentage (home-court advantage).

In [16]:
# Load the player statistics table from the file named nba_2022-23_all_stats.csv.
all_stats = pd.read_csv('./nba_2022-23_all_stats.csv')

In [17]:
# Function to expand and duplicate rows for players with multiple teams
def expand_teams(data):
    """Return one row per (player, team) for rows whose 'Team' lists several teams.

    The 'Team' column is split on '/'. For every resulting team position a
    copy of the matching rows is produced with 'Team' replaced by that single
    team; all other columns are carried over unchanged. The pieces are stacked
    position by position (all first teams, then all second teams, ...) and the
    result gets a fresh 0..n-1 index.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing a string column named 'Team'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Expanded table with the same columns as ``data``. Rows whose 'Team'
        value is missing are dropped. An input with no rows yields an empty
        frame instead of raising.
    """
    # One column per team position; shorter lists are padded with missing values.
    team_parts = data['Team'].str.split('/', expand=True)

    frames = []
    for position in team_parts.columns:
        # Filter first so only the rows that actually have a team at this
        # position are copied, rather than copying the whole table each time.
        has_team = team_parts[position].notna()
        frames.append(
            data.loc[has_team].assign(Team=team_parts.loc[has_team, position])
        )

    # pd.concat raises on an empty list, which happens when `data` has no rows.
    if not frames:
        return data.iloc[0:0].reset_index(drop=True)

    return pd.concat(frames, ignore_index=True)

# Split multi-team players into one row per team, then order the rows by the
# 'Unnamed: 0' column (presumably the original row number from the CSV --
# TODO confirm) and, within it, by team, and renumber the index from zero.
all_stats = (
    expand_teams(all_stats)
    .sort_values(by=['Unnamed: 0', 'Team'])
    .reset_index(drop=True)
)

# Preview the first five rows of the expanded table.
print(all_stats.head())

   Unnamed: 0               Player Name Position  Age Team  GP  GS    MP   FG  \
0           0  Nickeil Alexander-Walker       SG   24  MIN  59   3  15.0  2.2   
1           0  Nickeil Alexander-Walker       SG   24  UTA  59   3  15.0  2.2   
2           1          Ryan Arcidiacono       PG   28  NYK  20   4   8.6  0.5   
3           1          Ryan Arcidiacono       PG   28  POR  20   4   8.6  0.5   
4           2                  Mo Bamba        C   24  LAL  49   7  15.7  2.4   

   FGA  ...  TOV%  USG%  OWS  DWS   WS  WS/48  OBPM  DBPM  BPM  VORP  
0  5.0  ...  14.6  17.9  0.3  0.8  1.1  0.062  -1.4   0.4 -0.9   0.2  
1  5.0  ...  14.6  17.9  0.3  0.8  1.1  0.062  -1.4   0.4 -0.9   0.2  
2  1.9  ...  15.9  11.1 -0.2  0.1 -0.2 -0.043  -7.3  -1.5 -8.8  -0.3  
3  1.9  ...  15.9  11.1 -0.2  0.1 -0.2 -0.043  -7.3  -1.5 -8.8  -0.3  
4  4.9  ...  10.1  16.6  1.1  1.1  2.2  0.139  -0.2   0.7  0.5   0.5  

[5 rows x 51 columns]


In [18]:
# Mean of the 'Age' column for each team (Series indexed by 'Team').
mean_ages = all_stats.groupby('Team')['Age'].agg('mean')

In [19]:
# Median of the 'Age' column for each team (Series indexed by 'Team').
median_ages = all_stats.groupby('Team')['Age'].agg('median')

In [20]:
# Mean player age per team as a two-column table. The columns are renamed to
# 'abbreviation' and 'average_age' in the same expression (no inplace
# mutation), so re-running the cell always rebuilds the frame from all_stats.
average_age_per_team = (
    all_stats.groupby('Team')['Age']
    .mean()
    .reset_index()
    .rename(columns={'Team': 'abbreviation', 'Age': 'average_age'})
)


In [21]:
# Load the team summary table from 'Team Summaries.csv'.
summary = pd.read_csv('./Team Summaries.csv')

In [22]:
# Work on a copy so the loaded `summary` table is left as read from disk.
summary_copy = summary.copy()

# Keep only the rows whose 'season' value is 2022.
# NOTE(review): the player statistics file loaded earlier is named 2022-23;
# confirm that season 2022 in this table refers to the same season before
# comparing ages with wins.
summary_2022 = summary_copy.loc[summary_copy['season'] == 2022]

# Display only: the re-indexed frame is shown but not assigned back.
summary_2022.reset_index()

Unnamed: 0,index,season,lg,team,abbreviation,playoffs,age,w,l,pw,...,tov_percent,orb_percent,ft_fga,opp_e_fg_percent,opp_tov_percent,opp_drb_percent,opp_ft_fga,arena,attend,attend_g
0,62,2022,NBA,Atlanta Hawks,ATL,True,26.1,43.0,39.0,45.0,...,10.8,23.0,0.205,0.543,11.5,76.9,0.177,State Farm Arena,672742.0,16408.0
1,63,2022,NBA,Boston Celtics,BOS,True,26.1,51.0,31.0,59.0,...,12.4,24.0,0.195,0.502,12.5,77.3,0.183,TD Garden,727928.0,17754.0
2,64,2022,NBA,Brooklyn Nets,BRK,True,29.1,44.0,38.0,43.0,...,12.5,23.9,0.198,0.521,11.7,75.1,0.201,Barclays Center,711539.0,17355.0
3,65,2022,NBA,Chicago Bulls,CHI,True,26.3,46.0,36.0,40.0,...,11.8,20.4,0.201,0.541,11.9,78.3,0.199,United Center,856148.0,20882.0
4,66,2022,NBA,Charlotte Hornets,CHO,False,25.5,43.0,39.0,42.0,...,11.6,23.3,0.173,0.544,13.1,74.8,0.187,Spectrum Center,700755.0,17092.0
5,67,2022,NBA,Cleveland Cavaliers,CLE,False,24.7,44.0,38.0,47.0,...,13.2,24.0,0.198,0.52,12.3,76.5,0.172,Rocket Mortgage Fieldhouse,758228.0,18493.0
6,68,2022,NBA,Dallas Mavericks,DAL,True,26.7,52.0,30.0,50.0,...,11.7,21.3,0.192,0.521,12.2,78.0,0.185,American Airlines Center,808037.0,19708.0
7,69,2022,NBA,Denver Nuggets,DEN,True,27.7,48.0,34.0,47.0,...,13.2,21.9,0.194,0.537,11.7,78.3,0.188,Ball Arena,695262.0,16958.0
8,70,2022,NBA,Detroit Pistons,DET,False,23.6,23.0,59.0,22.0,...,12.6,23.4,0.194,0.541,13.1,75.6,0.226,Little Caesars Arena,663556.0,16184.0
9,71,2022,NBA,Golden State Warriors,GSW,True,27.6,53.0,29.0,55.0,...,13.5,22.8,0.181,0.509,13.0,78.7,0.201,Chase Center,740624.0,18064.0


In [23]:
# Keep just the team abbreviation and the 'w' (wins) column for the merge.
wins_per_team = summary_2022.loc[:, ['abbreviation', 'w']]


In [24]:
# Inner join of average age and wins on the shared 'abbreviation' column;
# teams present in only one of the two tables are dropped.
merged_data = average_age_per_team.merge(wins_per_team, on='abbreviation', how='inner')

In [25]:
# Count missing values in every column of the merged table.
print(merged_data.isna().sum())

# Discard any row that lacks an average age or a win total.
merged_data_clean = merged_data.dropna(subset=['average_age', 'w'])

# Spread (standard deviation) of the two variables used in the regression.
print(merged_data_clean[['average_age', 'w']].std())


abbreviation    0
average_age     0
w               0
dtype: int64
average_age     1.525837
w              11.566897
dtype: float64


In [26]:
import statsmodels.api as sm
import plotly.express as px

# Drop teams missing either variable before fitting.
merged_data_clean = merged_data.dropna(subset=['average_age', 'w'])

# Response: wins. Predictors: average age plus an added constant (intercept) column.
y = merged_data_clean['w']
X = sm.add_constant(merged_data_clean['average_age'])

# Ordinary least squares fit of wins on average age, followed by its summary table.
model = sm.OLS(y, X).fit()
print(model.summary())

# One point per team, coloured by team abbreviation.
fig = px.scatter(
    merged_data_clean,
    x='average_age',
    y='w',
    color='abbreviation',
    labels={'average_age': 'Average Age', 'w': 'Wins'},
    title='Comparison of Average Team Age to Wins with Manual Regression Line',
)

# Overlay the fitted values from the model as a line trace.
fig.add_scatter(
    x=merged_data_clean['average_age'],
    y=model.predict(),
    mode='lines',
    name='Regression Line',
)
fig.show()


                            OLS Regression Results                            
Dep. Variable:                      w   R-squared:                       0.408
Model:                            OLS   Adj. R-squared:                  0.387
Method:                 Least Squares   F-statistic:                     19.31
Date:                Sun, 21 Apr 2024   Prob (F-statistic):           0.000145
Time:                        12:31:49   Log-Likelihood:                -107.64
No. Observations:                  30   AIC:                             219.3
Df Residuals:                      28   BIC:                             222.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const         -84.3282     28.568     -2.952      

#### Annotations
* This graph shows each team's average player age against its total number of wins for the 2022 season.
* We can see from the linear regression line that average age is positively correlated with the total number of wins.
* There are some outliers, which we believe come from teams having all-star players.

In [27]:
import pandas as pd
import plotly.express as px

# Load the game log and tag every game with the calendar year it was played in.
data_path = './games.csv'
games_data = pd.read_csv(data_path)
games_data['GAME_DATE_EST'] = pd.to_datetime(games_data['GAME_DATE_EST'])
games_data['YEAR'] = games_data['GAME_DATE_EST'].dt.year

# Aggregate per year: number of rows (games) and the sum of the HOME_TEAM_WINS
# flag. reset_index() turns YEAR back into a regular column so it can be used
# both as the y value and as the animation frame below.
yearly_stats = games_data.groupby('YEAR').agg(
    Total_Games=('HOME_TEAM_WINS', 'size'),
    Home_Wins=('HOME_TEAM_WINS', 'sum')
).reset_index()

# Share of games won by the home side, expressed as a percentage.
yearly_stats['Home_Win_Percentage'] = (yearly_stats['Home_Wins'] / yearly_stats['Total_Games']) * 100

# Animated horizontal bar chart: one frame per year, each frame holding that
# year's single bar. The x range is pinned to 0-100 so bars are comparable
# across frames.
fig = px.bar(
    yearly_stats,
    y='YEAR',
    x='Home_Win_Percentage',
    labels={'Home_Win_Percentage': 'Percentage of Home Wins', 'YEAR': 'Year'},
    title='Percentage of Home Team Wins Over Time',
    animation_frame='YEAR',
    orientation='h',
    range_x=[0, 100]
)

fig.update_layout(
    height=800,
    width=1200,
    yaxis_title='Year',
    xaxis_title='Percentage of Home Wins (%)',
    yaxis=dict(
        # Bars are centred on their year, so the axis is padded by half a
        # year on each side; with a range of exactly [min, max] the first and
        # last bars would be cut in half by the axis limits.
        range=[yearly_stats['YEAR'].min() - 0.5, yearly_stats['YEAR'].max() + 0.5],
        tickmode='linear'
    ),
    hovermode='y'
)


fig.show()