# TITLE: NBA Wins Analysis  

## Name1: Alec
## Name2: Enrique
## Name3: Jackson

In [1]:
## Libaries
import pandas as pd
import numpy as np
import plotly.express as px
import pytz # get timezone inf
import matplotlib.pyplot as plt
import datetime
import plotly.graph_objects as go
import altair as alt


In [2]:
# Files
API_GameLogs = 'gameLogs.csv'
data = pd.read_csv(API_GameLogs)

data.tail()


# Enrique's Datasets
game_data = pd.read_csv('Datasets/NBA Database Datasets/game.csv')
players = pd.read_csv('Datasets/NBA Database Datasets/player.csv')
team_data = pd.read_csv('Datasets/NBA Database Datasets/team.csv')


In [3]:
# Jackson's Datasets
merged = pd.read_csv('Datasets/Jackson Data/merged.csv')
teamsum_df = pd.read_csv('Datasets/Jackson Data/teamsum.csv')

In [4]:
# Alec's Datasets
games_data = pd.read_csv('./ALEC FOLDER/games.csv' )
all_stats = pd.read_csv('.//ALEC FOLDER/nba_2022-23_all_stats.csv')
summary = pd.read_csv('./ALEC FOLDER/Team Summaries.csv')

# Basics Graphs

### Alec's

In [5]:
games_data['GAME_DATE_EST'] = pd.to_datetime(games_data['GAME_DATE_EST'])
games_data['YEAR'] = games_data['GAME_DATE_EST'].dt.year
yearly_stats = games_data.groupby('YEAR').agg(
    Total_Games=('HOME_TEAM_WINS', 'size'),
    Home_Wins=('HOME_TEAM_WINS', 'sum')
)

yearly_stats['Home_Win_Percentage'] = (yearly_stats['Home_Wins'] / yearly_stats['Total_Games']) * 100

#  interactive plot of home win percentage over the years
win_over_time_chart = px.line(yearly_stats, y='Home_Win_Percentage', title='Percentage of Home Team Wins Over Time',
              labels={'Home_Win_Percentage': 'Percentage of Home Wins', 'YEAR': 'Year'},
              markers=True)  
win_over_time_chart.update_layout(xaxis_title='Year',
                  yaxis_title='Percentage of Home Wins (%)',
                  xaxis=dict(tickmode='linear'),
                  hovermode='x')

win_over_time_chart


In [6]:
import statsmodels.api as sm
import plotly.express as px

# Function to expand and duplicate rows for players with multiple teams
def expand_teams(data):
    # Split the 'Team' column where there are multiple teams (using '/')
    expanded_teams = data['Team'].str.split('/', expand=True)

    # Create a new DataFrame for each team and concatenate them into a single DataFrame
    rows = []
    for i in range(expanded_teams.shape[1]):
        temp_df = data.copy()
        temp_df['Team'] = expanded_teams[i]
        temp_df = temp_df[temp_df['Team'].notna()]
        rows.append(temp_df)

    # Concatenate all the dataframes
    return pd.concat(rows, ignore_index=True)

# Apply the function to the all_stats DataFrame
all_stats = expand_teams(all_stats)

# Sort the DataFrame by 'Unnamed: 0' to maintain the original order and then by 'Team'
all_stats = all_stats.sort_values(by=['Unnamed: 0', 'Team']).reset_index(drop=True)

mean_ages = all_stats.groupby('Team')['Age'].mean()

median_ages = all_stats.groupby('Team')['Age'].median()

average_age_per_team = all_stats.groupby('Team')['Age'].mean().reset_index()
average_age_per_team.rename(columns={'Team': 'abbreviation', 'Age': 'average_age'}, inplace=True)

summary_copy = summary.copy()
summary_2022 = summary_copy[(summary_copy['season']==2022)]

wins_per_team = summary_2022[['abbreviation', 'w']]

merged_data = pd.merge(average_age_per_team, wins_per_team, on='abbreviation', how='inner')

merged_data_clean = merged_data.dropna(subset=['average_age', 'w'])


merged_data_clean = merged_data.dropna(subset=['average_age', 'w'])
X = merged_data_clean['average_age']
y = merged_data_clean['w']
X = sm.add_constant(X)

# regression model
model = sm.OLS(y, X).fit()

# regression results
print(model.summary())

# scatter plot
fig = px.scatter(merged_data_clean, x='average_age', y='w', color='abbreviation',
                 labels={'average_age': 'Average Age', 'w': 'Wins'},
                 title='Comparison of Average Team Age to Wins with Manual Regression Line')

# regression line
fig.add_scatter(x=merged_data_clean['average_age'], y=model.predict(), mode='lines', name='Regression Line')
fig.show()



                            OLS Regression Results                            
Dep. Variable:                      w   R-squared:                       0.408
Model:                            OLS   Adj. R-squared:                  0.387
Method:                 Least Squares   F-statistic:                     19.31
Date:                Sun, 21 Apr 2024   Prob (F-statistic):           0.000145
Time:                        13:45:01   Log-Likelihood:                -107.64
No. Observations:                  30   AIC:                             219.3
Df Residuals:                      28   BIC:                             222.1
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const         -84.3282     28.568     -2.952      

In [7]:

games_data = pd.read_csv('./ALEC FOLDER/games.csv' )
games_data['GAME_DATE_EST'] = pd.to_datetime(games_data['GAME_DATE_EST'])
games_data['YEAR'] = games_data['GAME_DATE_EST'].dt.year

# Aggregate the data
yearly_stats = games_data.groupby('YEAR').agg(
    Total_Games=('HOME_TEAM_WINS', 'size'),
    Home_Wins=('HOME_TEAM_WINS', 'sum')
).reset_index()  

yearly_stats['Home_Win_Percentage'] = (yearly_stats['Home_Wins'] / yearly_stats['Total_Games']) * 100

# Create an animated horizontal bar chart
fig = px.bar(
    yearly_stats,
    y='YEAR',  
    x='Home_Win_Percentage',
    labels={'Home_Win_Percentage': 'Percentage of Home Wins', 'YEAR': 'Year'},
    title='Percentage of Home Team Wins Over Time',
    animation_frame='YEAR',  
    orientation='h',  
    range_x=[0, 100]  
)

fig.update_layout(
    height=800,  
    width=1200,  
    yaxis_title='Year',
    xaxis_title='Percentage of Home Wins (%)',
    yaxis=dict(
        range=[yearly_stats['YEAR'].min(), yearly_stats['YEAR'].max()],
        tickmode='linear'
    ),
    hovermode='y'  
)


fig.show()

In [8]:
# Load the CSV file
data = pd.read_csv('./ALEC FOLDER/game 2.csv')

# Convert the 'game_date' column to datetime format
data['game_date'] = pd.to_datetime(data['game_date'])

# Filter the data for games from 2012 onward
df = data[data['game_date'].dt.year >= 2012]



# Define a function to map season_id to the format "YYYY-YYYY"
def map_season_id(season_id):
    season_str = str(season_id)
    start_year = int(season_str[-4:])  # Extract the last four digits
    return f"{start_year-1}-{start_year}"  # Format it to "YYYY-YYYY"

# Apply this function to create a 'season' column
df['season'] = df['season_id'].apply(map_season_id)

# Display the transformed data to check the new 'season' column
print(df[['season_id', 'season']].drop_duplicates())




# Define the list of 30 NBA teams
nba_teams = ['ATL', 'BOS', 'BKN', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW',
             'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOP', 'NYK',
             'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

# Filter the teams
df = df[(df['team_abbreviation_home'].isin(nba_teams)) & (df['team_abbreviation_away'].isin(nba_teams))]

# Calculate points allowed
df['pts_allowed_home'] = df['pts_away']
df['pts_allowed_away'] = df['pts_home']

# Calculate possessions
df['poss_home'] = df['fga_home'] + df['tov_home'] + (0.44 * df['fta_home'])
df['poss_away'] = df['fga_away'] + df['tov_away'] + (0.44 * df['fta_away'])

# Calculate defensive efficiency
df['def_eff_home'] = (df['pts_allowed_home'] / df['poss_away']) * 100
df['def_eff_away'] = (df['pts_allowed_away'] / df['poss_home']) * 100

# Calculate personal fouls per game
df['pf_per_game_home'] = df['pf_home']
df['pf_per_game_away'] = df['pf_away']

# Calculate seasonal averages for defensive efficiency and personal fouls per game
seasonal_stats = df.groupby(['season_id', 'team_name_home']).agg({
    'def_eff_home': 'mean',
    'pf_per_game_home': 'mean'
}).reset_index()




# Aggregating data by season and team abbreviation
seasonal_team_stats = df.groupby(['season', 'team_abbreviation_home']).agg({
    'pf_per_game_home': 'sum',  # Total personal fouls per season
    'def_eff_home': 'mean'  # Average defensive efficiency per season
}).reset_index()

seasonal_team_stats.rename(columns={'team_abbreviation_home': 'team_abbreviation', 'pf_per_game_home': 'total_pf', 'def_eff_home': 'avg_def_eff'}, inplace=True)



# Create a scatter plot to visualize the relationship
fig = px.scatter(seasonal_team_stats, x='total_pf', y='avg_def_eff',
                 color='team_abbreviation', hover_data=['season'],
                 labels={'total_pf': 'Total Personal Fouls', 'avg_def_eff': 'Average Defensive Efficiency'},
                 title='Comparison of Total Personal Fouls and Defensive Efficiency by Season and Team',
                 animation_frame='season')  # Use 'season' for the slider

# Update axes ranges
# Assuming you know the data ranges, you can set them like this:
fig.update_xaxes(range=[0, seasonal_team_stats['total_pf'].max() + 50])  # Add some padding to the maximum value
fig.update_yaxes(range=[seasonal_team_stats['avg_def_eff'].min() - 5, seasonal_team_stats['avg_def_eff'].max() + 5])

# Show the figure
fig.show()





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



       season_id     season
50906      22011  2010-2011
51870      42011  2010-2011
51954      32011  2010-2011
51955      12012  2011-2012
52071      42012  2011-2012
52156      32012  2011-2012
52158      22013  2012-2013
53388      12013  2012-2013
53503      42013  2012-2013
53592      32013  2012-2013
53593      22014  2013-2014
54823      12014  2013-2014
54941      42014  2013-2014
55022      32014  2013-2014
55024      22015  2014-2015
56254      12015  2014-2015
56363      42015  2014-2015
56449      32015  2014-2015
56451      22016  2015-2016
57681      12016  2015-2016
57782      42016  2015-2016
57861      32016  2015-2016
57863      22017  2016-2017
59093      12017  2016-2017
59171      42017  2016-2017
59253      32017  2016-2017
59255      22018  2017-2018
60485      42018  2017-2018
60567      32018  2017-2018
60569      22019  2018-2019
61628      42019  2018-2019
61711      32019  2018-2019
61712      22020  2019-2020
62792      12020  2019-2020
62841      42020  20

### Jackson's 

In [9]:
slider = alt.binding_range(min = int(teamsum_df['season'].min()), max = int(teamsum_df['season'].max()), step = 1)
select_season = alt.selection(
    type = 'single',  
    fields = ['season'],
    bind = slider,
    name = "Select"
)

attendance_chart = alt.Chart(teamsum_df).mark_circle(size = 80).encode(
    x = alt.X('attend_g:Q', title = 'Average Fan Attendace at Home Stadium', scale = alt.Scale(domain = [10000, 24000])), 
    y = alt.Y('e_fg_percent:Q', title = 'Effective FG Percent', scale = alt.Scale(domain = [0.43, 0.575])),
    tooltip = ['team', 'arena'], 
    color = alt.Color('season:O', scale = alt.Scale(scheme = 'viridis'), legend = alt.Legend(title = 'Season'))
).properties(
    width = 800,  
    height = 400,  
    title = 'Relationship Between Fan Attendance and Effective Feild Goal Percentage'
).add_selection(
    select_season
).transform_filter(
    select_season
)

attendance_chart.display()



'selection' is deprecated.
   Use 'selection_point()' or 'selection_interval()' instead; these functions also include more helpful docstrings.


The types 'single' and 'multi' are now
        combined and should be specified using "selection_point()".


'add_selection' is deprecated. Use 'add_params' instead.



In [10]:
height_chart = alt.Chart(merged).mark_circle(size = 80).encode(
    x = alt.X('avg_player_height:Q', title = 'Average Height (cm)', scale = alt.Scale(domain = [195, 207])),
    y = alt.Y('w:Q', title = 'Number of Wins', scale = alt.Scale(domain = [6, 80])),
    tooltip = ['team'], 
    color = alt.Color('season:O', scale = alt.Scale(scheme = 'viridis'), legend = alt.Legend(title = 'Season'))
).properties(
    width = 800,  
    height = 400,  
    title = 'Relationship Between Team Average Height and Wins'
).add_selection(
    select_season
).transform_filter(
    select_season
)

height_chart.display()

### Enrique's

In [11]:
import pandas as pd
import altair as alt


# Sample code to prepare the game_data_filtered if you've not already:
game_data_filtered = pd.read_csv('Datasets/NBA Database Datasets/game.csv')


# use season year and not game date, season_id is the year of the season. It also has an extra digit in the front we need to remove 
game_data_filtered['year'] = game_data_filtered['season_id'].astype(str).apply(lambda x: int(x[1:5]))

# Group by stats for home and away teams combined
tov_data = game_data_filtered.groupby('year').agg({
    'tov_home': 'mean',
    'tov_away': 'mean'
}).reset_index().melt('year', var_name='stat', value_name='tov_value')

fg_pct_data = game_data_filtered.groupby('year').agg({
    'fg_pct_home': 'mean',
    'fg_pct_away': 'mean'
}).reset_index().melt('year', var_name='stat', value_name='fg_pct_value')

reb_home_data = game_data_filtered.groupby('year').agg({
    'reb_home': 'mean',
    'reb_away': 'mean'
}).reset_index().melt('year', var_name='stat', value_name='reb_value')

oreb_data = game_data_filtered.groupby('year').agg({
    'oreb_home': 'mean',
    'oreb_away': 'mean'
}).reset_index().melt('year', var_name='stat', value_name='oreb_value')


# Create a slider for year selection
# slider = alt.binding_range(min=1990, max=tov_data['year'].max(), step=1, name='Year:')
# slider_selection = alt.selection_point(bind=slider, fields=['year'], name='Selector')

# Create a 'team_type' column for coloring the bars
tov_data['team_type'] = tov_data['stat'].apply(lambda x: 'Home' if 'home' in x else 'Away')
fg_pct_data['team_type'] = fg_pct_data['stat'].apply(lambda x: 'Home' if 'home' in x else 'Away')
reb_home_data['team_type'] = reb_home_data['stat'].apply(lambda x: 'Home' if 'home' in x else 'Away')
oreb_data['team_type'] = oreb_data['stat'].apply(lambda x: 'Home' if 'home' in x else 'Away')

# Create the charts with color encoding
def create_chart(data, x_title, y_title, tooltip_fields):
    return alt.Chart(data).mark_bar().encode(
        x=alt.X('stat:N', axis=alt.Axis(title=x_title)),
        y=alt.Y(f'{y_title}:Q', axis=alt.Axis(title='Value')),
        color=alt.Color('team_type:N', scale=alt.Scale(domain=['Home', 'Away'], range=['blue', 'red'])),
        tooltip=tooltip_fields
    ).properties(
        width=200
    )
# Creating each chart
tov_Chart = create_chart(tov_data, 'Turnovers', 'tov_value', ['year', 'stat', 'tov_value'])
fg_pct_Chart = create_chart(fg_pct_data, 'Field Goal Percentage', 'fg_pct_value', ['year', 'stat', 'fg_pct_value'])
reb_Chart = create_chart(reb_home_data, 'Rebounds', 'reb_value', ['year', 'stat', 'reb_value'])
oreb_Chart = create_chart(oreb_data, 'Offesive Rebounds', 'oreb_value', ['year', 'stat', 'oreb_value'])

# Combine the charts
combined_charts_stats = alt.hconcat(tov_Chart, fg_pct_Chart, reb_Chart, oreb_Chart).resolve_scale(y='independent').properties(title='Average Game Statistics per Year')

# Display the chart
combined_charts_stats



# Model
Only the machine learning single nuetral net

In [12]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import preprocessing 
from sklearn.metrics import classification_report
import plotly.express as px

data = pd.read_csv('nbaHomeWinLossModelDataset.csv').drop(['Unnamed: 0'],axis=1)
data = data.dropna()

data_new = pd.read_csv('gameLogs.csv')

In [13]:
validation = data[data['SEASON'] == '2022-23']
modelData = data[data['SEASON'] != '2022-23'].sample(frac=1)

X = modelData.drop(['HOME_W','SEASON'],axis=1)
y = modelData['HOME_W']

# Splitting the data into training and testing sets
# training set is 67% of the data and testing set is 33% of the data
# X_train is the training data set
# X_test is the testing data set 
# y_train is the set of labels to all the data in X_train
# y_test is the set of labels to all the data in X_test

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.33)

# Standard Scaling Prediction Variables
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
scaled_data_train = scaler.transform(X_train)

scaler.fit(X_test)
scaled_data_test = scaler.transform(X_test)

#Logistic Regression
model = LogisticRegression()
model.fit(scaled_data_train,y_train)
model.score(scaled_data_test,y_test)



0.6226158038147139

In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential # The Sequential model is a linear stack of layers.
from tensorflow.keras.layers import Dense # The Dense layer is a fully connected layer.

# Define the model architecture, loss function, and optimizer. Uses binary crossentropy as the loss function and the Adam optimizer.
model = Sequential()
model.add(Dense(1, activation='sigmoid', input_shape=(X_train.shape[1],)))

# Compile the model, specifying the Adam optimizer, binary crossentropy loss function, and accuracy as the evaluation metric.
# The Adam optimizer is a variant of gradient descent that is widely used in machine learning because it automatically adapts the learning rate during training.
# Binary crossentropy is a loss function used for binary classification problems, and accuracy is a common evaluation metric for classification models.
# The compile method configures the model for training by specifying the optimizer, loss function, and evaluation metric.
# The optimizer is responsible for updating the model's weights during training, the loss function is used to measure the model's performance, and the evaluation metric is used to monitor the model's performance during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model, specifying the training data, number of epochs, and batch size.
# The fit method trains the model on the training data for a fixed number of epochs (iterations over the entire dataset).
# The batch size specifies the number of samples that are used to compute the gradient update during training.
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

# Evaluate the model, calculating the loss and accuracy on the test set.
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

# Plot training and validation loss over epochs, to visualize the model's performance during training.

Epoch 1/10
 1/47 [..............................] - ETA: 7s - loss: 2.6038 - accuracy: 0.4062

2024-04-21 13:45:04.462101: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.9001
Test Accuracy: 0.4591


In [15]:
# Convert the history object into a pandas DataFrame
history_df = pd.DataFrame(history.history)
history_df['epoch'] = range(1, len(history_df) + 1)

# Plot for Accuracy
accuracy_chart = alt.Chart(history_df).mark_line(point=True).encode(
    x='epoch:N',
    y=alt.Y('accuracy', title='Accuracy', scale=alt.Scale(zero=False)),
    color=alt.value('blue')
).properties(
    title='Training Accuracy over Epochs'
) + alt.Chart(history_df).mark_line(point=True).encode(
    x='epoch:N',
    y=alt.Y('val_accuracy', title='Validation Accuracy', scale=alt.Scale(zero=False)),
    color=alt.value('green')
).properties(
    title='Validation Accuracy over Epochs'
)

# Plot for Loss
loss_chart = alt.Chart(history_df).mark_line(point=True).encode(
    x='epoch:N',
    y=alt.Y('loss', title='Loss', scale=alt.Scale(zero=False)),
    color=alt.value('red')
).properties(
    title='Training Loss over Epochs'
) + alt.Chart(history_df).mark_line(point=True).encode(
    x='epoch:N',
    y=alt.Y('val_loss', title='Validation Loss', scale=alt.Scale(zero=False)),
    color=alt.value('orange')
).properties(
    title='Validation Loss over Epochs'
)

# Combine the charts
model_performance = alt.vconcat(accuracy_chart, loss_chart).properties(
    spacing=30
).configure_title(
    fontSize=20,
    anchor='start',
    color='gray'
)

# DASH
### link: http://127.0.0.1:8050/

In [16]:
from dash import Dash, html, dcc, callback, Output, Input
import plotly.express as px
import pandas as pd
import altair as alt
import dash_vega_components as dvc


app = Dash(__name__)

app.layout = html.Div([
    html.H1(children='NBA Wins Analysis', style={'textAlign':'center'}),
    
    html.Div([
        html.Label('Select Year For all Charts:'),
        dcc.Slider(
            id='year-slider',
            min=1990,
            max=2022,
            step=1,
            value=1990,
            marks={i: str(i) for i in range(1990, 2023)}
        )
    ]),

    dvc.Vega(
        id='graph-content-combined_charts_stats',
        opt={},
        spec=combined_charts_stats.to_dict()
    ),

    dvc.Vega(
        id='graph-content-height_chart',
        opt={},
        spec=height_chart.to_dict()
    ),
    dvc.Vega(
        id='graph-content-attendance_chart',
        opt={},
        spec=attendance_chart.to_dict()
    ),

    dvc.Vega(
        id='graph-content-model_performance',
        opt={},
        spec=model_performance.to_dict()
    )


])

@callback(
    Output('graph-content-combined_charts_stats', 'spec'),
    Input('year-slider', 'value')
)
def update_graph(value):
    return slider_selection.update(value=value)

if __name__ == '__main__':
    app.run(debug=True)