In [1]:
from nba_api.stats.endpoints import shotchartdetail
import json
import pandas as pd
import time

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
def shot_chart_data(year):
    response = shotchartdetail.ShotChartDetail(
        team_id=0,
        player_id=0,
        season_nullable=f'{year}-{str(year + 1)[2:]}',
        context_measure_simple = 'FGA',
        season_type_all_star='Regular Season'
    )
    
    content = json.loads(response.get_json())
    
    results = content['resultSets'][0]
    headers = results['headers']
    rows = results['rowSet']
    df = pd.DataFrame(rows, columns=headers)

    relevant_cols = ['PLAYER_ID', 'PLAYER_NAME', 'TEAM_NAME', 'PERIOD', 'EVENT_TYPE', 'ACTION_TYPE', 'SHOT_TYPE',
                    'SHOT_ZONE_BASIC', 'SHOT_ZONE_AREA', 'SHOT_ZONE_RANGE', 'SHOT_DISTANCE',
                    'LOC_X', 'LOC_Y', 'SHOT_ATTEMPTED_FLAG', 'SHOT_MADE_FLAG', 'GAME_DATE']
    # Dates
    df["SEASON"] = f'{year}-{str(year + 1)[2:]}'
    df['GAME_DATE'] = pd.to_datetime(df['GAME_DATE'], format='%Y%m%d')

    # Format the datetime column to the desired output format
    df['GAME_DATE'] = df['GAME_DATE'].dt.strftime('%Y-%m-%d')
    df.to_csv(f"shotcharts/NBA_{year}-{str(year + 1)[2:]}_reg_season.csv", index=False)
    return df

In [3]:
for year in range(1996, 2024):
    print(year)
    df = shot_chart_data(year)
    print(f"year: {df.shape[0]} rows")
    time.sleep(2)

1996
year: 188589 rows
1997
year: 189536 rows
1998
year: 113377 rows
1999
year: 195220 rows
2000
year: 191662 rows
2001
year: 193251 rows
2002
year: 192109 rows
2003
year: 189803 rows
2004
year: 197626 rows
2005
year: 194314 rows
2006
year: 196072 rows
2007
year: 200501 rows
2008
year: 199030 rows
2009
year: 200966 rows
2010
year: 199761 rows
2011
year: 161205 rows
2012
year: 201579 rows
2013
year: 204126 rows
2014
year: 205550 rows
2015
year: 207893 rows
2016
year: 209929 rows
2017
year: 211707 rows
2018
year: 219458 rows
2019
year: 188116 rows
2020
year: 190983 rows
2021
year: 216722 rows
2022
year: 217220 rows
2023
year: 218701 rows


In [None]:
import os
import polars as pl

In [None]:
directory = 'shotcharts/'

# List all files and directories in the specified directory
all_files = os.listdir(directory)

# Filter out directories and keep only CSV files
csv_files = [f"{directory}/{f}" for f in all_files if os.path.isfile(os.path.join(directory, f)) and f.endswith('.csv')]

print(csv_files)

In [None]:
# Create dataframes for each CSV file
dfs = [pl.read_csv(csv_file) for csv_file in csv_files]

# Concatenate
combined_df = pl.concat(dfs)

combined_df.write_parquet(
    "data/shotcharts_historical.parquet.gz",
    compression="gzip"
)

# Flow of the PPT
* Game has changed
* What are the positions
* TDA uncovers specific positions
* Frequent Itemset Analysis for of winning team compositions

# Data vis ideas
* Players become better at shooting threes (ratio )
* Players are shooting more threes over time
* Histogram of shot distance