# Preprocessing

## Master Data

In [1]:
import pandas as pd

df_players_raw = pd.read_csv("../../data/raw/players/player_master.csv")
df_players_raw.head()


Unnamed: 0,PERSON_ID,DISPLAY_LAST_COMMA_FIRST,DISPLAY_FIRST_LAST,ROSTERSTATUS,FROM_YEAR,TO_YEAR,PLAYERCODE,PLAYER_SLUG,TEAM_ID,TEAM_CITY,TEAM_NAME,TEAM_ABBREVIATION,TEAM_CODE,TEAM_SLUG,GAMES_PLAYED_FLAG,OTHERLEAGUE_EXPERIENCE_CH
0,76001,"Abdelnaby, Alaa",Alaa Abdelnaby,0,1990,1994,HISTADD_alaa_abdelnaby,alaa_abdelnaby,0,,,,,,Y,0
1,76002,"Abdul-Aziz, Zaid",Zaid Abdul-Aziz,0,1968,1977,HISTADD_zaid_abdul-aziz,zaid_abdul-aziz,0,,,,,,Y,0
2,76003,"Abdul-Jabbar, Kareem",Kareem Abdul-Jabbar,0,1969,1988,HISTADD_kareem_abdul-jabbar,kareem_abdul-jabbar,0,,,,,,Y,0
3,51,"Abdul-Rauf, Mahmoud",Mahmoud Abdul-Rauf,0,1990,2000,mahmoud_abdul-rauf,mahmoud_abdul-rauf,0,,,,,,Y,0
4,1505,"Abdul-Wahad, Tariq",Tariq Abdul-Wahad,0,1997,2003,tariq_abdul-wahad,tariq_abdul-wahad,0,,,,,,Y,0


In [2]:
# Clean column names for consistency
df_players = df_players_raw.copy()
df_players.columns = (
    df_players.columns
        .str.lower()
        .str.strip()
        .str.replace(" ", "_")
        .str.replace("-", "_")
        .str.replace("/", "_")
)

# Rename key columns to clearer names
df_players = df_players.rename(columns={
    "person_id": "player_id",
    "display_first_last": "player_name",
    "display_last_comma_first": "player_name_last_first",
    "from_year": "from_season",
    "to_year": "to_season",
})

# Convert important columns to integers
df_players["player_id"] = df_players["player_id"].astype(int)
df_players["from_season"] = df_players["from_season"].astype(int)
df_players["to_season"] = df_players["to_season"].astype(int)

# Remove duplicate players based on player_id
df_players = df_players.drop_duplicates(subset=["player_id"])

# Show missing values for each column
missing_values = df_players.isnull().sum()
print("Missing values per column:")
print(missing_values)

df_players.head()



Missing values per column:
player_id                       0
player_name_last_first          0
player_name                     0
rosterstatus                    0
from_season                     0
to_season                       0
playercode                      0
player_slug                     0
team_id                         0
team_city                    4586
team_name                    4586
team_abbreviation            4586
team_code                    4586
team_slug                    4586
games_played_flag               0
otherleague_experience_ch       0
dtype: int64


Unnamed: 0,player_id,player_name_last_first,player_name,rosterstatus,from_season,to_season,playercode,player_slug,team_id,team_city,team_name,team_abbreviation,team_code,team_slug,games_played_flag,otherleague_experience_ch
0,76001,"Abdelnaby, Alaa",Alaa Abdelnaby,0,1990,1994,HISTADD_alaa_abdelnaby,alaa_abdelnaby,0,,,,,,Y,0
1,76002,"Abdul-Aziz, Zaid",Zaid Abdul-Aziz,0,1968,1977,HISTADD_zaid_abdul-aziz,zaid_abdul-aziz,0,,,,,,Y,0
2,76003,"Abdul-Jabbar, Kareem",Kareem Abdul-Jabbar,0,1969,1988,HISTADD_kareem_abdul-jabbar,kareem_abdul-jabbar,0,,,,,,Y,0
3,51,"Abdul-Rauf, Mahmoud",Mahmoud Abdul-Rauf,0,1990,2000,mahmoud_abdul-rauf,mahmoud_abdul-rauf,0,,,,,,Y,0
4,1505,"Abdul-Wahad, Tariq",Tariq Abdul-Wahad,0,1997,2003,tariq_abdul-wahad,tariq_abdul-wahad,0,,,,,,Y,0


## Player Stats

In [3]:
import pandas as pd
import glob
import os

# 1. Locate all player stats CSV files in the raw directory
# The folder contains one CSV per season
# glob() finds every file matching the pattern so we can load them all.
path = "../../data/raw/player_stats/*.csv"
files = glob.glob(path)

df_player_stats_raw_list = []

# 2. Loop through each CSV file and load it
# - Extract the season from the filename 
# - Add a new column called "season" so we know which season each row belongs to
# - Append the DataFrame to a list for combining later
for file in files:
    filename = os.path.basename(file)  
    season = filename.replace("player_stats_", "").replace(".csv", "")
    
    temp_df = pd.read_csv(file)
    temp_df["season"] = season  # Add season column
    
    df_player_stats_raw_list.append(temp_df)

# 3. Combine all seasons into one DataFrame
df_player_stats_raw = pd.concat(df_player_stats_raw_list, ignore_index=True)


In [4]:
# Make a copy of the raw combined DataFrame
df_player_stats = df_player_stats_raw.copy()


# Clean column names for consistency
df_player_stats.columns = (
    df_player_stats.columns
        .str.lower()
        .str.strip()
        .str.replace(" ", "_")
        .str.replace("-", "_")
        .str.replace("/", "_")
)

# Rename key columns to clearer names
df_player_stats = df_player_stats.rename(columns={
    "player_id": "player_id",
    "team_id": "team_id",
    "team_abbreviation": "team_abbreviation"
    # player_name already renamed above
})

# Convert important numeric columns
df_player_stats["player_id"] = df_player_stats["player_id"].astype(int)
df_player_stats["team_id"] = df_player_stats["team_id"].astype(int)
df_player_stats["age"] = df_player_stats["age"].astype(float)
df_player_stats["gp"] = df_player_stats["gp"].astype(int)
df_player_stats["w"] = df_player_stats["w"].astype(int)
df_player_stats["l"] = df_player_stats["l"].astype(int)

# Drop duplicate rows
df_player_stats = df_player_stats.drop_duplicates()

# Check missing values to confirm name column survived
missing_values = df_player_stats.isnull().sum()
print("Missing values per column:")
print(missing_values)

df_player_stats.head()


Missing values per column:
player_id                0
player_name              0
nickname                 0
team_id                  0
team_abbreviation        0
                        ..
dd2_rank                 0
td3_rank                 0
wnba_fantasy_pts_rank    0
team_count               0
season                   0
Length: 68, dtype: int64


Unnamed: 0,player_id,player_name,nickname,team_id,team_abbreviation,age,gp,w,l,w_pct,...,pf_rank,pfd_rank,pts_rank,plus_minus_rank,nba_fantasy_pts_rank,dd2_rank,td3_rank,wnba_fantasy_pts_rank,team_count,season
0,920,A.C. Green,A.C.,1610612748,MIA,37.0,82,50,32,0.61,...,220,39,212,95,218,126,26,223,1,200001
1,2062,A.J. Guyton,A.J.,1610612741,CHI,23.0,33,6,27,0.182,...,92,112,292,378,317,224,26,314,1,200001
2,243,Aaron McKie,Aaron,1610612755,PHI,28.0,76,51,25,0.671,...,316,112,87,20,84,77,4,81,1,200001
3,1425,Aaron Williams,Aaron,1610612751,NJN,29.0,82,26,56,0.317,...,441,112,91,434,77,53,26,79,1,200001
4,228,Adam Keefe,Adam,1610612744,GSW,31.0,67,14,53,0.209,...,200,112,307,385,274,224,26,280,1,200001


## Team Stats

In [5]:
import pandas as pd
import glob
import os

# 1. Locate all team stats CSV files in the raw directory
# The folder contains one CSV per season
# glob() finds every file matching the pattern so we can load them all.
path = "../../data/raw/team_stats/*.csv"
files = glob.glob(path)

df_team_stats_raw_list = []

# 2. Loop through each CSV file and load it
# - Extract the season from the filename
# - Add a new column called "season" so we know which season each row belongs to
# - Append the DataFrame to a list for combining later
for file in files:
    filename = os.path.basename(file)
    season = filename.replace("team_stats_", "").replace(".csv", "")
    
    temp_df = pd.read_csv(file)
    temp_df["season"] = season
    
    df_team_stats_raw_list.append(temp_df)

# 3. Combine all seasons into one DataFrame
df_team_stats_raw = pd.concat(df_team_stats_raw_list, ignore_index=True)



In [6]:
# Make a copy of the raw combined DataFrame
df_team_stats = df_team_stats_raw.copy()

# Clean column names for consistency
df_team_stats.columns = (
    df_team_stats.columns
        .str.lower()
        .str.strip()
        .str.replace(" ", "_")
        .str.replace("-", "_")
        .str.replace("/", "_")
)

# Convert important numeric columns
# Only converting what actually needs to be numeric so I don't wipe out text fields
df_team_stats["team_id"] = df_team_stats["team_id"].astype(int)
df_team_stats["gp"] = df_team_stats["gp"].astype(int)
df_team_stats["w"] = df_team_stats["w"].astype(int)
df_team_stats["l"] = df_team_stats["l"].astype(int)

# Drop duplicate rows
df_team_stats = df_team_stats.drop_duplicates()

# Check missing values to confirm everything looks good
missing_values = df_team_stats.isnull().sum()
print("Missing values per column:")
print(missing_values)

df_team_stats.head()


Missing values per column:
team_id            0
team_name          0
gp                 0
w                  0
l                  0
w_pct              0
min                0
fgm                0
fga                0
fg_pct             0
fg3m               0
fg3a               0
fg3_pct            0
ftm                0
fta                0
ft_pct             0
oreb               0
dreb               0
reb                0
ast                0
tov                0
stl                0
blk                0
blka               0
pf                 0
pfd                0
pts                0
plus_minus         0
gp_rank            0
w_rank             0
l_rank             0
w_pct_rank         0
min_rank           0
fgm_rank           0
fga_rank           0
fg_pct_rank        0
fg3m_rank          0
fg3a_rank          0
fg3_pct_rank       0
ftm_rank           0
fta_rank           0
ft_pct_rank        0
oreb_rank          0
dreb_rank          0
reb_rank           0
ast_rank           0
tov_ran

Unnamed: 0,team_id,team_name,gp,w,l,w_pct,min,fgm,fga,fg_pct,...,ast_rank,tov_rank,stl_rank,blk_rank,blka_rank,pf_rank,pfd_rank,pts_rank,plus_minus_rank,season
0,1610612737,Atlanta Hawks,82,25,57,0.305,3946.0,2876,6668,0.431,...,28,28,16,21,27,14,8,26,25,200001
1,1610612738,Boston Celtics,82,36,46,0.439,3966.0,2773,6485,0.428,...,19,19,3,28,17,25,22,16,20,200001
2,1610612766,Charlotte Hornets,82,46,36,0.561,3976.0,2800,6501,0.431,...,7,8,11,11,6,11,18,24,14,200001
3,1610612741,Chicago Bulls,82,15,67,0.183,3971.0,2721,6411,0.424,...,12,21,7,24,14,19,22,29,29,200001
4,1610612739,Cleveland Cavaliers,82,30,52,0.366,3965.613333,2890,6532,0.442,...,19,27,15,13,29,20,6,22,23,200001


## Team Game Log

In [7]:
import pandas as pd 
import glob 
import os 

# 1. Locate all team game log CSV files in the raw directory
path = "../../data/raw/team_game_logs/*.csv"
files = glob.glob(path)
df_team_game_logs_raw_list = []

# 2. Loop through each CSV file and load it
# - Extract the season from the filename
# - Add a new column called "season" so I know which season each row belongs to
for file in files:
    filename = os.path.basename(file)
    season = filename.replace("team_game_logs_", "").replace(".csv", "")
    temp_df = pd.read_csv(file)
    temp_df["season"] = season
    df_team_game_logs_raw_list.append(temp_df)

# 3. Combine all seasons into one DataFrame
df_team_game_logs_raw = pd.concat(df_team_game_logs_raw_list, ignore_index=True)

In [8]:
# Make a copy of the raw combined DataFrame
df_team_game_logs = df_team_game_logs_raw.copy()

# Clean column names for consistency
df_team_game_logs.columns = (
    df_team_game_logs.columns
        .str.lower()
        .str.strip()
        .str.replace(" ", "_")
        .str.replace("-", "_")
        .str.replace("/", "_")
)

# Convert important numeric columns
# Only converting what actually needs to be numeric so I don't wipe out text fields
df_team_game_logs["team_id"] = df_team_game_logs["team_id"].astype(int)
df_team_game_logs["game_id"] = df_team_game_logs["game_id"].astype(int)

# Convert game date to datetime
df_team_game_logs["game_date"] = pd.to_datetime(df_team_game_logs["game_date"])

# Drop duplicate rows
df_team_game_logs = df_team_game_logs.drop_duplicates()

# Check missing values to confirm everything looks good
missing_values = df_team_game_logs.isnull().sum()
print("Missing values per column:")
print(missing_values)

df_team_game_logs.head()


Missing values per column:
season_id            0
team_id              0
team_abbreviation    0
team_name            0
game_id              0
game_date            0
matchup              0
wl                   2
min                  0
fgm                  0
fga                  0
fg_pct               2
fg3m                 0
fg3a                 0
fg3_pct              2
ftm                  0
fta                  0
ft_pct               3
oreb                 0
dreb                 0
reb                  0
ast                  0
stl                  0
blk                  0
tov                  0
pf                   0
pts                  0
plus_minus           0
video_available      0
season               0
dtype: int64


Unnamed: 0,season_id,team_id,team_abbreviation,team_name,game_id,game_date,matchup,wl,min,fgm,...,reb,ast,stl,blk,tov,pf,pts,plus_minus,video_available,season
0,22000,1610612737,ATL,Atlanta Hawks,20000004,2000-10-31,ATL vs. CHH,L,240,30,...,29,14,9,2,13,31,82,-24,0,200001
1,22000,1610612741,CHI,Chicago Bulls,20000006,2000-10-31,CHI vs. SAC,L,240,26,...,29,19,8,4,20,20,81,-19,0,200001
2,22000,1610612744,GSW,Golden State Warriors,20000011,2000-10-31,GSW vs. PHX,W,240,32,...,55,18,11,5,21,22,96,2,0,200001
3,22000,1610612746,LAC,Los Angeles Clippers,20000010,2000-10-31,LAC @ UTA,L,240,34,...,44,18,3,6,24,32,94,-13,0,200001
4,22000,1610612749,MIL,Milwaukee Bucks,20000007,2000-10-31,MIL @ DAL,L,240,33,...,51,16,6,7,20,27,93,-4,0,200001


## Player Game Logs

In [9]:
import pandas as pd
import glob
import os

# 1. Locate all player game log CSV files in the raw directory
path = "../../data/raw/player_game_logs/*.csv"
files = glob.glob(path)

if not files:
    raise FileNotFoundError("No files found at data/raw/player_game_logs/. Run src/data/get_player_game_logs.py first.")

df_player_game_logs_raw_list = []

# 2. Loop through each CSV file and load it
# - Extract the season from the filename
# - Add a new column called "season" so we know which season each row belongs to
for file in files:
    filename = os.path.basename(file)
    season = filename.replace("player_game_logs_", "").replace(".csv", "")

    temp_df = pd.read_csv(file)
    temp_df["season"] = season
    df_player_game_logs_raw_list.append(temp_df)

# 3. Combine all seasons into one DataFrame
df_player_game_logs_raw = pd.concat(df_player_game_logs_raw_list, ignore_index=True)


In [10]:
# Make a copy of the raw combined DataFrame
df_player_game_logs = df_player_game_logs_raw.copy()

# Clean column names for consistency
df_player_game_logs.columns = (
    df_player_game_logs.columns
        .str.lower()
        .str.strip()
        .str.replace(" ", "_")
        .str.replace("-", "_")
        .str.replace("/", "_")
)

# Convert important numeric columns
df_player_game_logs["player_id"] = df_player_game_logs["player_id"].astype(int)
df_player_game_logs["team_id"] = df_player_game_logs["team_id"].astype(int)
df_player_game_logs["game_id"] = df_player_game_logs["game_id"].astype(int)

# Convert game date to datetime
df_player_game_logs["game_date"] = pd.to_datetime(df_player_game_logs["game_date"])

# Drop duplicate rows
df_player_game_logs = df_player_game_logs.drop_duplicates()

# Check missing values
missing_values = df_player_game_logs.isnull().sum()
print("Missing values per column:")
print(missing_values)

df_player_game_logs.head()


Missing values per column:
season_id                 0
player_id                 0
player_name               0
team_id                   0
team_abbreviation         0
team_name                 0
game_id                   0
game_date                 0
matchup                   0
wl                        0
min                       0
fgm                       0
fga                       0
fg_pct                31590
fg3m                      0
fg3a                      0
fg3_pct              234063
ftm                       0
fta                       0
ft_pct               260183
oreb                      0
dreb                      0
reb                       0
ast                       0
stl                       0
blk                       0
tov                       0
pf                        0
pts                       0
plus_minus                0
fantasy_pts               0
video_available           0
season                    0
dtype: int64


Unnamed: 0,season_id,player_id,player_name,team_id,team_abbreviation,team_name,game_id,game_date,matchup,wl,...,ast,stl,blk,tov,pf,pts,plus_minus,fantasy_pts,video_available,season
0,22000,1074,Matt Maloney,1610612737,ATL,Atlanta Hawks,20000004,2000-10-31,ATL vs. CHH,L,...,0,0,0,1,1,3,-6,2.0,0,200001
1,22000,1533,Anthony Johnson,1610612737,ATL,Atlanta Hawks,20000004,2000-10-31,ATL vs. CHH,L,...,2,0,0,1,4,0,-8,3.2,0,200001
2,22000,1891,Jason Terry,1610612737,ATL,Atlanta Hawks,20000004,2000-10-31,ATL vs. CHH,L,...,2,1,0,3,3,4,-21,8.2,0,200001
3,22000,1538,Cedric Henderson,1610612739,CLE,Cleveland Cavaliers,20000002,2000-10-31,CLE @ NJN,W,...,1,0,0,0,0,0,-8,1.5,0,200001
4,22000,1802,Brad Miller,1610612741,CHI,Chicago Bulls,20000006,2000-10-31,CHI vs. SAC,L,...,1,0,1,2,2,0,-4,6.1,0,200001


## Player Stats Advanced

In [11]:
import pandas as pd
import glob
import os

# 1. Locate all advanced player stats CSV files in the raw directory
path = "../../data/raw/player_stats_advanced/*.csv"
files = glob.glob(path)

if not files:
    raise FileNotFoundError("No files found at data/raw/player_stats_advanced/. Run src/data/get_player_stats_advanced.py first.")

df_player_stats_advanced_raw_list = []

# 2. Loop through each CSV file and load it
# - Extract the season from the filename
# - Add a new column called "season" so we know which season each row belongs to
for file in files:
    filename = os.path.basename(file)
    season = filename.replace("player_stats_advanced_", "").replace(".csv", "")

    temp_df = pd.read_csv(file)
    temp_df["season"] = season
    df_player_stats_advanced_raw_list.append(temp_df)

# 3. Combine all seasons into one DataFrame
df_player_stats_advanced_raw = pd.concat(df_player_stats_advanced_raw_list, ignore_index=True)


In [12]:
# Make a copy of the raw combined DataFrame
df_player_stats_advanced = df_player_stats_advanced_raw.copy()

# Clean column names for consistency
df_player_stats_advanced.columns = (
    df_player_stats_advanced.columns
        .str.lower()
        .str.strip()
        .str.replace(" ", "_")
        .str.replace("-", "_")
        .str.replace("/", "_")
)

# Convert important numeric columns
df_player_stats_advanced["player_id"] = df_player_stats_advanced["player_id"].astype(int)
df_player_stats_advanced["team_id"] = df_player_stats_advanced["team_id"].astype(int)
df_player_stats_advanced["age"] = df_player_stats_advanced["age"].astype(float)
df_player_stats_advanced["gp"] = df_player_stats_advanced["gp"].astype(int)

# Drop duplicate rows
df_player_stats_advanced = df_player_stats_advanced.drop_duplicates()

# Check missing values
missing_values = df_player_stats_advanced.isnull().sum()
print("Missing values per column:")
print(missing_values)

df_player_stats_advanced.head()


Missing values per column:
player_id            0
player_name          0
nickname             0
team_id              0
team_abbreviation    0
                    ..
fgm_pg_rank          0
fga_pg_rank          0
fg_pct_rank          0
team_count           0
season               0
Length: 80, dtype: int64


Unnamed: 0,player_id,player_name,nickname,team_id,team_abbreviation,age,gp,w,l,w_pct,...,pace_rank,sp_work_pace_rank,pie_rank,fgm_rank,fga_rank,fgm_pg_rank,fga_pg_rank,fg_pct_rank,team_count,season
0,920,A.C. Green,A.C.,1610612748,MIA,37.0,82,50,32,0.61,...,412,412,158,213,216,281,286,171,1,200001
1,2062,A.J. Guyton,A.J.,1610612741,CHI,23.0,33,6,27,0.182,...,416,416,282,291,285,222,208,293,1,200001
2,243,Aaron McKie,Aaron,1610612755,PHI,28.0,76,51,25,0.671,...,289,289,68,80,92,93,102,106,1,200001
3,1425,Aaron Williams,Aaron,1610612751,NJN,29.0,82,26,56,0.317,...,274,274,151,105,109,133,140,141,1,200001
4,228,Adam Keefe,Adam,1610612744,GSW,31.0,67,14,53,0.209,...,160,160,336,306,300,369,374,300,1,200001


## Team Stats Advanced

In [13]:
import pandas as pd
import glob
import os

# 1. Locate all advanced team stats CSV files in the raw directory
path = "../../data/raw/team_stats_advanced/*.csv"
files = glob.glob(path)

if not files:
    raise FileNotFoundError("No files found at data/raw/team_stats_advanced/. Run src/data/get_team_stats_advanced.py first.")

df_team_stats_advanced_raw_list = []

# 2. Loop through each CSV file and load it
# - Extract the season from the filename
# - Add a new column called "season" so we know which season each row belongs to
for file in files:
    filename = os.path.basename(file)
    season = filename.replace("team_stats_advanced_", "").replace(".csv", "")

    temp_df = pd.read_csv(file)
    temp_df["season"] = season
    df_team_stats_advanced_raw_list.append(temp_df)

# 3. Combine all seasons into one DataFrame
df_team_stats_advanced_raw = pd.concat(df_team_stats_advanced_raw_list, ignore_index=True)


In [14]:
# Make a copy of the raw combined DataFrame
df_team_stats_advanced = df_team_stats_advanced_raw.copy()

# Clean column names for consistency
df_team_stats_advanced.columns = (
    df_team_stats_advanced.columns
        .str.lower()
        .str.strip()
        .str.replace(" ", "_")
        .str.replace("-", "_")
        .str.replace("/", "_")
)

# Convert important numeric columns
df_team_stats_advanced["team_id"] = df_team_stats_advanced["team_id"].astype(int)
df_team_stats_advanced["gp"] = df_team_stats_advanced["gp"].astype(int)
df_team_stats_advanced["w"] = df_team_stats_advanced["w"].astype(int)
df_team_stats_advanced["l"] = df_team_stats_advanced["l"].astype(int)

# Drop duplicate rows
df_team_stats_advanced = df_team_stats_advanced.drop_duplicates()

# Check missing values
missing_values = df_team_stats_advanced.isnull().sum()
print("Missing values per column:")
print(missing_values)

df_team_stats_advanced.head()


Missing values per column:
team_id            0
team_name          0
gp                 0
w                  0
l                  0
w_pct              0
min                0
e_off_rating       0
off_rating         0
e_def_rating       0
def_rating         0
e_net_rating       0
net_rating         0
ast_pct            0
ast_to             0
ast_ratio          0
oreb_pct           0
dreb_pct           0
reb_pct            0
tm_tov_pct         0
efg_pct            0
ts_pct             0
e_pace             0
pace               0
pace_per40         0
poss               0
pie                0
gp_rank            0
w_rank             0
l_rank             0
w_pct_rank         0
min_rank           0
off_rating_rank    0
def_rating_rank    0
net_rating_rank    0
ast_pct_rank       0
ast_to_rank        0
ast_ratio_rank     0
oreb_pct_rank      0
dreb_pct_rank      0
reb_pct_rank       0
tm_tov_pct_rank    0
efg_pct_rank       0
ts_pct_rank        0
pace_rank          0
pie_rank           0
season 

Unnamed: 0,team_id,team_name,gp,w,l,w_pct,min,e_off_rating,off_rating,e_def_rating,...,ast_ratio_rank,oreb_pct_rank,dreb_pct_rank,reb_pct_rank,tm_tov_pct_rank,efg_pct_rank,ts_pct_rank,pace_rank,pie_rank,season
0,1610612737,Atlanta Hawks,82,25,57,0.305,3946.0,95.6,96.9,101.9,...,29,15,12,14,28,25,26,11,26,200001
1,1610612738,Boston Celtics,82,36,46,0.439,3966.0,99.0,100.1,100.9,...,19,26,15,28,20,14,12,10,23,200001
2,1610612766,Charlotte Hornets,82,46,36,0.561,3976.0,99.3,99.7,96.3,...,7,14,1,6,10,24,23,21,7,200001
3,1610612741,Chicago Bulls,82,15,67,0.183,3971.0,94.3,95.9,104.9,...,9,20,27,27,25,28,28,27,28,200001
4,1610612739,Cleveland Cavaliers,82,30,52,0.366,3966.0,97.4,99.3,102.0,...,20,7,23,10,27,23,22,17,22,200001


## Standings

In [15]:
import pandas as pd
import glob
import os

# 1. Locate all standings CSV files in the raw directory
path = "../../data/raw/standings/*.csv"
files = glob.glob(path)

if not files:
    raise FileNotFoundError("No files found at data/raw/standings/. Run src/data/get_standings.py first.")

df_standings_raw_list = []

# 2. Loop through each CSV file and load it
# - Extract the season from the filename
# - Add a new column called "season" so we know which season each row belongs to
for file in files:
    filename = os.path.basename(file)
    season = filename.replace("standings_", "").replace(".csv", "")

    temp_df = pd.read_csv(file)
    temp_df["season"] = season
    df_standings_raw_list.append(temp_df)

# 3. Combine all seasons into one DataFrame
df_standings_raw = pd.concat(df_standings_raw_list, ignore_index=True)


In [16]:
# Make a copy of the raw combined DataFrame
df_standings = df_standings_raw.copy()

# Clean column names for consistency
df_standings.columns = (
    df_standings.columns
        .str.lower()
        .str.strip()
        .str.replace(" ", "_")
        .str.replace("-", "_")
        .str.replace("/", "_")
)

# Rename key columns to clearer names
df_standings = df_standings.rename(columns={
    "teamid": "team_id",
    "teamcity": "team_city",
    "teamname": "team_name",
    "teamslug": "team_slug",
    "wins": "w",
    "losses": "l",
    "winpct": "w_pct",
})

# Convert important numeric columns
df_standings["team_id"] = df_standings["team_id"].astype(int)
df_standings["w"] = df_standings["w"].astype(int)
df_standings["l"] = df_standings["l"].astype(int)

# Drop duplicate rows
df_standings = df_standings.drop_duplicates()

# Check missing values
missing_values = df_standings.isnull().sum()
print("Missing values per column:")
print(missing_values)

df_standings.head()


Missing values per column:
leagueid                  0
seasonid                  0
team_id                   0
team_city                 0
team_name                 0
                       ... 
seeding_game_4_label    724
seeding_game_5_label    724
seeding_game_6_label    724
seeding_game_7_label    724
seeding_game_8_label    724
Length: 129, dtype: int64


Unnamed: 0,leagueid,seasonid,team_id,team_city,team_name,team_slug,conference,conferencerecord,playoffrank,clinchindicator,...,seeding_game_7_opponent,seeding_game_8_opponent,seeding_game_1_label,seeding_game_2_label,seeding_game_3_label,seeding_game_4_label,seeding_game_5_label,seeding_game_6_label,seeding_game_7_label,seeding_game_8_label
0,0,22000,1610612755,Philadelphia,76ers,sixers,East,40-14,1,,...,,,,,,,,,,
1,0,22000,1610612759,San Antonio,Spurs,spurs,West,39-13,1,,...,,,,,,,,,,
2,0,22000,1610612747,Los Angeles,Lakers,lakers,West,34-18,2,,...,,,,,,,,,,
3,0,22000,1610612749,Milwaukee,Bucks,bucks,East,38-16,2,,...,,,,,,,,,,
4,0,22000,1610612748,Miami,Heat,heat,East,34-20,3,,...,,,,,,,,,,


## Playoff Data

In [17]:
import pandas as pd
import glob
import os

def load_season_folder(path, prefix):
    files = glob.glob(path)
    if not files:
        raise FileNotFoundError(f"No files found at {path}. Run the corresponding ingestion script first.")
    rows = []
    for file in files:
        season = os.path.basename(file).replace(prefix, "").replace(".csv", "")
        temp_df = pd.read_csv(file)
        temp_df["season"] = season
        rows.append(temp_df)
    return pd.concat(rows, ignore_index=True)

def clean_columns(df):
    df.columns = (
        df.columns
            .str.lower()
            .str.strip()
            .str.replace(" ", "_")
            .str.replace("-", "_")
            .str.replace("/", "_")
    )
    return df

# --- Player game logs playoffs ---
df_player_game_logs_playoffs_raw = load_season_folder(
    "../../data/raw/player_game_logs_playoffs/*.csv", "player_game_logs_playoffs_"
)
df_player_game_logs_playoffs = clean_columns(df_player_game_logs_playoffs_raw.copy())
df_player_game_logs_playoffs["player_id"] = df_player_game_logs_playoffs["player_id"].astype(int)
df_player_game_logs_playoffs["team_id"] = df_player_game_logs_playoffs["team_id"].astype(int)
df_player_game_logs_playoffs["game_id"] = df_player_game_logs_playoffs["game_id"].astype(int)
df_player_game_logs_playoffs["game_date"] = pd.to_datetime(df_player_game_logs_playoffs["game_date"])
df_player_game_logs_playoffs = df_player_game_logs_playoffs.drop_duplicates()
print(f"player_game_logs_playoffs: {len(df_player_game_logs_playoffs):,} rows, {df_player_game_logs_playoffs.isnull().sum().sum()} nulls")

# --- Team game logs playoffs ---
df_team_game_logs_playoffs_raw = load_season_folder(
    "../../data/raw/team_game_logs_playoffs/*.csv", "team_game_logs_playoffs_"
)
df_team_game_logs_playoffs = clean_columns(df_team_game_logs_playoffs_raw.copy())
df_team_game_logs_playoffs["team_id"] = df_team_game_logs_playoffs["team_id"].astype(int)
df_team_game_logs_playoffs["game_id"] = df_team_game_logs_playoffs["game_id"].astype(int)
df_team_game_logs_playoffs["game_date"] = pd.to_datetime(df_team_game_logs_playoffs["game_date"])
df_team_game_logs_playoffs = df_team_game_logs_playoffs.drop_duplicates()
print(f"team_game_logs_playoffs: {len(df_team_game_logs_playoffs):,} rows, {df_team_game_logs_playoffs.isnull().sum().sum()} nulls")

# --- Player stats playoffs (base) ---
df_player_stats_playoffs_raw = load_season_folder(
    "../../data/raw/player_stats_playoffs/*.csv", "player_stats_playoffs_"
)
df_player_stats_playoffs = clean_columns(df_player_stats_playoffs_raw.copy())
df_player_stats_playoffs["player_id"] = df_player_stats_playoffs["player_id"].astype(int)
df_player_stats_playoffs["team_id"] = df_player_stats_playoffs["team_id"].astype(int)
df_player_stats_playoffs["age"] = df_player_stats_playoffs["age"].astype(float)
df_player_stats_playoffs = df_player_stats_playoffs.drop_duplicates()
print(f"player_stats_playoffs: {len(df_player_stats_playoffs):,} rows, {df_player_stats_playoffs.isnull().sum().sum()} nulls")

# --- Player stats advanced playoffs ---
df_player_stats_advanced_playoffs_raw = load_season_folder(
    "../../data/raw/player_stats_advanced_playoffs/*.csv", "player_stats_advanced_playoffs_"
)
df_player_stats_advanced_playoffs = clean_columns(df_player_stats_advanced_playoffs_raw.copy())
df_player_stats_advanced_playoffs["player_id"] = df_player_stats_advanced_playoffs["player_id"].astype(int)
df_player_stats_advanced_playoffs["team_id"] = df_player_stats_advanced_playoffs["team_id"].astype(int)
df_player_stats_advanced_playoffs["age"] = df_player_stats_advanced_playoffs["age"].astype(float)
df_player_stats_advanced_playoffs = df_player_stats_advanced_playoffs.drop_duplicates()
print(f"player_stats_advanced_playoffs: {len(df_player_stats_advanced_playoffs):,} rows, {df_player_stats_advanced_playoffs.isnull().sum().sum()} nulls")

# --- Team stats playoffs (base) ---
df_team_stats_playoffs_raw = load_season_folder(
    "../../data/raw/team_stats_playoffs/*.csv", "team_stats_playoffs_"
)
df_team_stats_playoffs = clean_columns(df_team_stats_playoffs_raw.copy())
df_team_stats_playoffs["team_id"] = df_team_stats_playoffs["team_id"].astype(int)
df_team_stats_playoffs["gp"] = df_team_stats_playoffs["gp"].astype(int)
df_team_stats_playoffs["w"] = df_team_stats_playoffs["w"].astype(int)
df_team_stats_playoffs["l"] = df_team_stats_playoffs["l"].astype(int)
df_team_stats_playoffs = df_team_stats_playoffs.drop_duplicates()
print(f"team_stats_playoffs: {len(df_team_stats_playoffs):,} rows, {df_team_stats_playoffs.isnull().sum().sum()} nulls")

# --- Team stats advanced playoffs ---
df_team_stats_advanced_playoffs_raw = load_season_folder(
    "../../data/raw/team_stats_advanced_playoffs/*.csv", "team_stats_advanced_playoffs_"
)
df_team_stats_advanced_playoffs = clean_columns(df_team_stats_advanced_playoffs_raw.copy())
df_team_stats_advanced_playoffs["team_id"] = df_team_stats_advanced_playoffs["team_id"].astype(int)
df_team_stats_advanced_playoffs["gp"] = df_team_stats_advanced_playoffs["gp"].astype(int)
df_team_stats_advanced_playoffs["w"] = df_team_stats_advanced_playoffs["w"].astype(int)
df_team_stats_advanced_playoffs["l"] = df_team_stats_advanced_playoffs["l"].astype(int)
df_team_stats_advanced_playoffs = df_team_stats_advanced_playoffs.drop_duplicates()
print(f"team_stats_advanced_playoffs: {len(df_team_stats_advanced_playoffs):,} rows, {df_team_stats_advanced_playoffs.isnull().sum().sum()} nulls")


player_game_logs_playoffs: 42,785 rows, 38782 nulls
team_game_logs_playoffs: 4,142 rows, 0 nulls
player_stats_playoffs: 5,120 rows, 0 nulls


player_stats_advanced_playoffs: 5,120 rows, 0 nulls
team_stats_playoffs: 400 rows, 0 nulls
team_stats_advanced_playoffs: 400 rows, 0 nulls


## Clutch Stats

In [18]:
import pandas as pd
import glob
import os

path = "../../data/raw/player_stats_clutch/*.csv"
files = glob.glob(path)

if not files:
    raise FileNotFoundError("No files found. Run src/data/get_player_stats_clutch.py first.")

df_player_stats_clutch_raw_list = []
for file in files:
    season = os.path.basename(file).replace("player_stats_clutch_", "").replace(".csv", "")
    temp_df = pd.read_csv(file)
    temp_df["season"] = season
    df_player_stats_clutch_raw_list.append(temp_df)

df_player_stats_clutch_raw = pd.concat(df_player_stats_clutch_raw_list, ignore_index=True)

df_player_stats_clutch = df_player_stats_clutch_raw.copy()
df_player_stats_clutch.columns = (
    df_player_stats_clutch.columns
        .str.lower().str.strip()
        .str.replace(" ", "_").str.replace("-", "_").str.replace("/", "_")
)
df_player_stats_clutch["player_id"] = df_player_stats_clutch["player_id"].astype(int)
df_player_stats_clutch["team_id"] = df_player_stats_clutch["team_id"].astype(int)
df_player_stats_clutch["age"] = df_player_stats_clutch["age"].astype(float)
df_player_stats_clutch = df_player_stats_clutch.drop_duplicates()

missing_values = df_player_stats_clutch.isnull().sum()
print("Missing values per column:")
print(missing_values)
df_player_stats_clutch.head()


Missing values per column:
group_set                0
player_id                0
player_name              0
nickname                 0
team_id                  0
                        ..
dd2_rank                 0
td3_rank                 0
wnba_fantasy_pts_rank    0
team_count               0
season                   0
Length: 69, dtype: int64


Unnamed: 0,group_set,player_id,player_name,nickname,team_id,team_abbreviation,age,gp,w,l,...,pf_rank,pfd_rank,pts_rank,plus_minus_rank,nba_fantasy_pts_rank,dd2_rank,td3_rank,wnba_fantasy_pts_rank,team_count,season
0,Players,920,A.C. Green,A.C.,1610612748,MIA,37.0,13,9,4,...,90,13,278,187,312,140,13,307,1,200001
1,Players,2062,A.J. Guyton,A.J.,1610612741,CHI,23.0,10,3,7,...,90,13,304,215,291,184,13,301,1,200001
2,Players,243,Aaron McKie,Aaron,1610612755,PHI,28.0,32,20,12,...,356,13,56,47,96,99,6,76,1,200001
3,Players,1425,Aaron Williams,Aaron,1610612751,NJN,29.0,36,13,23,...,300,13,76,373,57,58,13,64,1,200001
4,Players,1502,Adonal Foyle,Adonal,1610612744,GSW,26.0,19,6,13,...,140,13,339,363,176,140,13,198,1,200001


## Scoring Breakdown Stats

In [19]:
import pandas as pd
import glob
import os

path = "../../data/raw/player_stats_scoring/*.csv"
files = glob.glob(path)

if not files:
    raise FileNotFoundError("No files found. Run src/data/get_player_stats_scoring.py first.")

df_player_stats_scoring_raw_list = []
for file in files:
    season = os.path.basename(file).replace("player_stats_scoring_", "").replace(".csv", "")
    temp_df = pd.read_csv(file)
    temp_df["season"] = season
    df_player_stats_scoring_raw_list.append(temp_df)

df_player_stats_scoring_raw = pd.concat(df_player_stats_scoring_raw_list, ignore_index=True)

df_player_stats_scoring = df_player_stats_scoring_raw.copy()
df_player_stats_scoring.columns = (
    df_player_stats_scoring.columns
        .str.lower().str.strip()
        .str.replace(" ", "_").str.replace("-", "_").str.replace("/", "_")
)
df_player_stats_scoring["player_id"] = df_player_stats_scoring["player_id"].astype(int)
df_player_stats_scoring["team_id"] = df_player_stats_scoring["team_id"].astype(int)
df_player_stats_scoring["age"] = df_player_stats_scoring["age"].astype(float)
df_player_stats_scoring = df_player_stats_scoring.drop_duplicates()

missing_values = df_player_stats_scoring.isnull().sum()
print("Missing values per column:")
print(missing_values)
df_player_stats_scoring.head()


Missing values per column:
player_id               0
player_name             0
nickname                0
team_id                 0
team_abbreviation       0
age                     0
gp                      0
w                       0
l                       0
w_pct                   0
min                     0
pct_fga_2pt             0
pct_fga_3pt             0
pct_pts_2pt             0
pct_pts_2pt_mr          0
pct_pts_3pt             0
pct_pts_fb              0
pct_pts_ft              0
pct_pts_off_tov         0
pct_pts_paint           0
pct_ast_2pm             0
pct_uast_2pm            0
pct_ast_3pm             0
pct_uast_3pm            0
pct_ast_fgm             0
pct_uast_fgm            0
fgm                     0
fga                     0
fg_pct                  0
gp_rank                 0
w_rank                  0
l_rank                  0
w_pct_rank              0
min_rank                0
pct_fga_2pt_rank        0
pct_fga_3pt_rank        0
pct_pts_2pt_rank        0
pct_pts_2pt

Unnamed: 0,player_id,player_name,nickname,team_id,team_abbreviation,age,gp,w,l,w_pct,...,pct_uast_2pm_rank,pct_ast_3pm_rank,pct_uast_3pm_rank,pct_ast_fgm_rank,pct_uast_fgm_rank,fgm_rank,fga_rank,fg_pct_rank,team_count,season
0,920,A.C. Green,A.C.,1610612748,MIA,37.0,82,50,32,0.61,...,383,279,208,74,361,213,216,171,1,200001
1,2062,A.J. Guyton,A.J.,1610612741,CHI,23.0,33,6,27,0.182,...,203,189,105,151,283,291,285,293,1,200001
2,243,Aaron McKie,Aaron,1610612755,PHI,28.0,76,51,25,0.671,...,109,124,171,316,119,80,92,106,1,200001
3,1425,Aaron Williams,Aaron,1610612751,NJN,29.0,82,26,56,0.317,...,223,279,208,258,177,105,109,141,1,200001
4,228,Adam Keefe,Adam,1610612744,GSW,31.0,67,14,53,0.209,...,307,1,208,159,275,306,300,300,1,200001


## Hustle Stats

In [20]:
import pandas as pd
import glob
import os

# --- Player hustle stats (available from 2015-16) ---
player_files = glob.glob("../../data/raw/player_hustle_stats/*.csv")
if not player_files:
    raise FileNotFoundError("No files found. Run src/data/get_hustle_stats.py first.")

df_player_hustle_raw_list = []
for file in player_files:
    season = os.path.basename(file).replace("player_hustle_stats_", "").replace(".csv", "")
    temp_df = pd.read_csv(file)
    temp_df["season"] = season
    df_player_hustle_raw_list.append(temp_df)

df_player_hustle_stats_raw = pd.concat(df_player_hustle_raw_list, ignore_index=True)

df_player_hustle_stats = df_player_hustle_stats_raw.copy()
df_player_hustle_stats.columns = (
    df_player_hustle_stats.columns
        .str.lower().str.strip()
        .str.replace(" ", "_").str.replace("-", "_").str.replace("/", "_")
)
df_player_hustle_stats["player_id"] = df_player_hustle_stats["player_id"].astype(int)
df_player_hustle_stats["team_id"] = df_player_hustle_stats["team_id"].astype(int)
df_player_hustle_stats = df_player_hustle_stats.drop_duplicates()
print(f"player_hustle_stats: {len(df_player_hustle_stats):,} rows, {df_player_hustle_stats.isnull().sum().sum()} nulls")

# --- Team hustle stats ---
team_files = glob.glob("../../data/raw/team_hustle_stats/*.csv")
if not team_files:
    raise FileNotFoundError("No team hustle files found. Run src/data/get_hustle_stats.py first.")

df_team_hustle_raw_list = []
for file in team_files:
    season = os.path.basename(file).replace("team_hustle_stats_", "").replace(".csv", "")
    temp_df = pd.read_csv(file)
    temp_df["season"] = season
    df_team_hustle_raw_list.append(temp_df)

df_team_hustle_stats_raw = pd.concat(df_team_hustle_raw_list, ignore_index=True)

df_team_hustle_stats = df_team_hustle_stats_raw.copy()
df_team_hustle_stats.columns = (
    df_team_hustle_stats.columns
        .str.lower().str.strip()
        .str.replace(" ", "_").str.replace("-", "_").str.replace("/", "_")
)
df_team_hustle_stats["team_id"] = df_team_hustle_stats["team_id"].astype(int)
df_team_hustle_stats = df_team_hustle_stats.drop_duplicates()
print(f"team_hustle_stats: {len(df_team_hustle_stats):,} rows, {df_team_hustle_stats.isnull().sum().sum()} nulls")

df_player_hustle_stats.head()


player_hustle_stats: 5,021 rows, 0 nulls
team_hustle_stats: 285 rows, 0 nulls


Unnamed: 0,player_id,player_name,team_id,team_abbreviation,age,g,min,contested_shots,contested_shots_2pt,contested_shots_3pt,...,off_boxouts,def_boxouts,box_out_player_team_rebs,box_out_player_rebs,box_outs,pct_box_outs_off,pct_box_outs_def,pct_box_outs_team_reb,pct_box_outs_reb,season
0,201166,Aaron Brooks,1610612741,CHI,31.0,1,7.0,1,1,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,201516
1,203932,Aaron Gordon,1610612753,ORL,20.0,1,34.0,2,2,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,201516
2,201143,Al Horford,1610612737,ATL,30.0,2,64.0,25,22,3,...,0,0,0,0,0,0.0,0.0,0.0,0.0,201516
3,201582,Alexis Ajinca,1610612740,NOP,28.0,1,17.0,7,5,2,...,0,0,0,0,0,0.0,0.0,0.0,0.0,201516
4,202087,Alonzo Gee,1610612740,NOP,29.0,1,27.0,2,2,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,201516


## Shot Chart (run get_shot_chart.py first — takes 3-4 hours)

In [21]:
import pandas as pd
import glob
import os

path = "../../data/raw/shot_chart/*.csv"
files = glob.glob(path)

df_shot_chart = None  # Default to None — only populated if data has been downloaded

if not files:
    print("Shot chart data not found. Skipping. Run src/data/get_shot_chart.py when ready (takes 3-4 hours).")
else:
    df_shot_chart_raw_list = []
    for file in files:
        season = os.path.basename(file).replace("shot_chart_", "").replace(".csv", "")
        temp_df = pd.read_csv(file)
        temp_df["season"] = season
        df_shot_chart_raw_list.append(temp_df)

    df_shot_chart_raw = pd.concat(df_shot_chart_raw_list, ignore_index=True)

    df_shot_chart = df_shot_chart_raw.copy()
    df_shot_chart.columns = (
        df_shot_chart.columns
            .str.lower().str.strip()
            .str.replace(" ", "_").str.replace("-", "_").str.replace("/", "_")
    )
    df_shot_chart["player_id"] = df_shot_chart["player_id"].astype(int)
    df_shot_chart["team_id"] = df_shot_chart["team_id"].astype(int)
    df_shot_chart["game_id"] = df_shot_chart["game_id"].astype(str)
    df_shot_chart["game_date"] = pd.to_datetime(df_shot_chart["game_date"])
    df_shot_chart = df_shot_chart.drop_duplicates()

    print(f"shot_chart: {len(df_shot_chart):,} rows, {df_shot_chart.isnull().sum().sum()} nulls")
    display(df_shot_chart.head())


Shot chart data not found. Skipping. Run src/data/get_shot_chart.py when ready (takes 3-4 hours).


### Check Missing ID Columns

In [22]:
df_players["player_id"].isnull().sum()
df_player_stats["player_id"].isnull().sum()
df_team_stats["team_id"].isnull().sum()
df_team_game_logs["team_id"].isnull().sum()
df_team_game_logs["game_id"].isnull().sum()
df_player_game_logs["player_id"].isnull().sum()
df_player_game_logs["team_id"].isnull().sum()
df_player_game_logs["game_id"].isnull().sum()
df_player_stats_advanced["player_id"].isnull().sum()
df_team_stats_advanced["team_id"].isnull().sum()
df_standings["team_id"].isnull().sum()


np.int64(0)

## Save Processed Data

In [23]:
# Save existing tables
df_players.to_csv("../../data/processed/players.csv", index=False)
df_player_stats.to_csv("../../data/processed/player_stats.csv", index=False)
df_player_stats_advanced.to_csv("../../data/processed/player_stats_advanced.csv", index=False)
df_player_game_logs.to_csv("../../data/processed/player_game_logs.csv", index=False)
df_team_stats.to_csv("../../data/processed/team_stats.csv", index=False)
df_team_stats_advanced.to_csv("../../data/processed/team_stats_advanced.csv", index=False)
df_team_game_logs.to_csv("../../data/processed/team_game_logs.csv", index=False)
df_standings.to_csv("../../data/processed/standings.csv", index=False)

# Save playoff tables
df_player_game_logs_playoffs.to_csv("../../data/processed/player_game_logs_playoffs.csv", index=False)
df_team_game_logs_playoffs.to_csv("../../data/processed/team_game_logs_playoffs.csv", index=False)
df_player_stats_playoffs.to_csv("../../data/processed/player_stats_playoffs.csv", index=False)
df_player_stats_advanced_playoffs.to_csv("../../data/processed/player_stats_advanced_playoffs.csv", index=False)
df_team_stats_playoffs.to_csv("../../data/processed/team_stats_playoffs.csv", index=False)
df_team_stats_advanced_playoffs.to_csv("../../data/processed/team_stats_advanced_playoffs.csv", index=False)

# Save clutch and scoring tables
df_player_stats_clutch.to_csv("../../data/processed/player_stats_clutch.csv", index=False)
df_player_stats_scoring.to_csv("../../data/processed/player_stats_scoring.csv", index=False)

# Save hustle tables
df_player_hustle_stats.to_csv("../../data/processed/player_hustle_stats.csv", index=False)
df_team_hustle_stats.to_csv("../../data/processed/team_hustle_stats.csv", index=False)

# Save shot chart (only if it has been downloaded)
import glob
if glob.glob("../../data/raw/shot_chart/*.csv"):
    df_shot_chart.to_csv("../../data/processed/shot_chart.csv", index=False)
    print("shot_chart saved.")

print("All cleaned DataFrames saved to data/processed/")


All cleaned DataFrames saved to data/processed/
