In [74]:
import os
from dotenv import load_dotenv

load_dotenv()

os.environ['KAGGLE_USERNAME'] = os.getenv('KAGGLE_USERNAME')
os.environ['KAGGLE_KEY'] = os.getenv('KAGGLE_KEY')

In [75]:
%pip install --quiet kaggle

from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

dataset_name = "eoinamoore/historical-nba-data-and-player-box-scores"
download_path = "./data"
os.makedirs(download_path, exist_ok=True)

api.dataset_download_files(dataset_name, path=download_path, unzip=True)
print(f"Dataset downloaded to {download_path}")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Dataset URL: https://www.kaggle.com/datasets/eoinamoore/historical-nba-data-and-player-box-scores
Note: you may need to restart the kernel to use updated packages.
Dataset URL: https://www.kaggle.com/datasets/eoinamoore/historical-nba-data-and-player-box-scores
Dataset downloaded to ./data
Dataset downloaded to ./data


In [76]:
import glob

files = glob.glob(f"{download_path}/*")
print("Downloaded files:")
for f in files:
    print(f"  - {f}")

Downloaded files:
  - ./data/LeagueSchedule25_26.csv
  - ./data/TeamStatistics.csv
  - ./data/LeagueSchedule24_25.csv
  - ./data/PlayerStatistics.csv
  - ./data/Players.csv
  - ./data/TeamHistories.csv
  - ./data/Games.csv
  - ./data/processed


In [77]:
import pandas as pd

df = pd.read_csv(f"{download_path}/TeamStatistics.csv")
team_histories = pd.read_csv(f"{download_path}/TeamHistories.csv")

print(f"Team Statistics Shape: {df.shape}")
print(f"Team Histories Shape: {team_histories.shape}")
print(f"\nTeam Statistics Columns:\n{df.columns.tolist()}")
print(f"\nDate Range: {df['gameDateTimeEst'].min()} to {df['gameDateTimeEst'].max()}")

Team Statistics Shape: (144580, 48)
Team Histories Shape: (140, 7)

Team Statistics Columns:
['gameId', 'gameDateTimeEst', 'teamCity', 'teamName', 'teamId', 'opponentTeamCity', 'opponentTeamName', 'opponentTeamId', 'home', 'win', 'teamScore', 'opponentScore', 'assists', 'blocks', 'steals', 'fieldGoalsAttempted', 'fieldGoalsMade', 'fieldGoalsPercentage', 'threePointersAttempted', 'threePointersMade', 'threePointersPercentage', 'freeThrowsAttempted', 'freeThrowsMade', 'freeThrowsPercentage', 'reboundsDefensive', 'reboundsOffensive', 'reboundsTotal', 'foulsPersonal', 'turnovers', 'plusMinusPoints', 'numMinutes', 'q1Points', 'q2Points', 'q3Points', 'q4Points', 'benchPoints', 'biggestLead', 'biggestScoringRun', 'leadChanges', 'pointsFastBreak', 'pointsFromTurnovers', 'pointsInThePaint', 'pointsSecondChance', 'timesTied', 'timeoutsRemaining', 'seasonWins', 'seasonLosses', 'coachId']

Date Range: 1946-11-26 19:00:00 to 2025-12-05 16:30:00


In [78]:
df['gameDateTimeEst'] = pd.to_datetime(df['gameDateTimeEst'], utc=True, errors='coerce', infer_datetime_format=True)

if df['gameDateTimeEst'].isna().any():
	import dateutil.parser as _dp
	def _safe_parse(x):
		try:
			return pd.to_datetime(_dp.parse(str(x)), utc=True)
		except Exception:
			return pd.NaT
	mask = df['gameDateTimeEst'].isna()
	df.loc[mask, 'gameDateTimeEst'] = df.loc[mask, 'gameDateTimeEst'].apply(_safe_parse)

df['gameDateTimeEst'] = pd.to_datetime(df['gameDateTimeEst'], utc=True, errors='coerce')

if pd.api.types.is_datetime64tz_dtype(df['gameDateTimeEst'].dtype):
	df['gameDateTimeEst'] = df['gameDateTimeEst'].dt.tz_convert(None)

if not pd.api.types.is_datetime64_any_dtype(df['gameDateTimeEst'].dtype):
	print("Warning: gameDateTimeEst could not be converted to datetimelike dtype; some values may be strings or NaT")
	df['year'] = pd.to_datetime(df['gameDateTimeEst'], errors='coerce').dt.year
	df['month'] = pd.to_datetime(df['gameDateTimeEst'], errors='coerce').dt.month
else:
	df['year'] = df['gameDateTimeEst'].dt.year
	df['month'] = df['gameDateTimeEst'].dt.month

df['season'] = df.apply(lambda x: x['year'] if pd.notna(x['year']) and x['month'] >= 10 else (x['year'] - 1 if pd.notna(x['year']) else pd.NA), axis=1)

n_unparsed = df['gameDateTimeEst'].isna().sum()
if n_unparsed:
	print(f"Warning: {n_unparsed} gameDateTimeEst values could not be parsed and are NaT")

print(f"Seasons available: {sorted([s for s in df['season'].dropna().unique()])}")
print(f"\nGames per season:")
print(df.groupby('season').size().sort_index())

  df['gameDateTimeEst'] = pd.to_datetime(df['gameDateTimeEst'], utc=True, errors='coerce', infer_datetime_format=True)
['NaT', 'NaT']
Length: 2, dtype: datetime64[ns]' has dtype incompatible with datetime64[ns, UTC], please explicitly cast to a compatible dtype first.
  df.loc[mask, 'gameDateTimeEst'] = df.loc[mask, 'gameDateTimeEst'].apply(_safe_parse)
  if pd.api.types.is_datetime64tz_dtype(df['gameDateTimeEst'].dtype):


Seasons available: [1946.0, 1947.0, 1948.0, 1949.0, 1950.0, 1951.0, 1952.0, 1953.0, 1954.0, 1955.0, 1956.0, 1957.0, 1958.0, 1959.0, 1960.0, 1961.0, 1962.0, 1963.0, 1964.0, 1965.0, 1966.0, 1967.0, 1968.0, 1969.0, 1970.0, 1971.0, 1972.0, 1973.0, 1974.0, 1975.0, 1976.0, 1977.0, 1978.0, 1979.0, 1980.0, 1981.0, 1982.0, 1983.0, 1984.0, 1985.0, 1986.0, 1987.0, 1988.0, 1989.0, 1990.0, 1991.0, 1992.0, 1993.0, 1994.0, 1995.0, 1996.0, 1997.0, 1998.0, 1999.0, 2000.0, 2001.0, 2002.0, 2003.0, 2004.0, 2005.0, 2006.0, 2007.0, 2008.0, 2009.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.0, 2018.0, 2019.0, 2020.0, 2021.0, 2022.0, 2023.0, 2024.0, 2025.0]

Games per season:
season
1946.0      40
1947.0      48
1948.0     166
1949.0     274
1950.0     442
          ... 
2021.0    2780
2022.0    2768
2023.0    2766
2024.0    2770
2025.0     820
Length: 80, dtype: int64


In [79]:
print("Team Histories:")
print(team_histories.to_string())

team_histories['seasonActiveTill'] = team_histories['seasonActiveTill'].astype(int)
team_histories['seasonFounded'] = team_histories['seasonFounded'].astype(int)

current_teams = team_histories[team_histories['seasonActiveTill'] >= 2024][['teamId', 'teamCity', 'teamName', 'teamAbbrev']].drop_duplicates()
print(f"\nCurrent Active Teams ({len(current_teams)}):")
print(current_teams.sort_values('teamName'))

Team Histories:
         teamId           teamCity                   teamName teamAbbrev  seasonFounded  seasonActiveTill                  league
0    1610612737         Tri-Cities                 Blackhawks      TRI             1946              1948                     BAA
1    1610612737          Milwaukee                      Hawks      MIL             1951              1954                     NBA
2    1610612737          St. Louis                      Hawks      STL             1955              1967                     NBA
3    1610612737            Atlanta                      Hawks      ATL             1968              2100                     NBA
4    1610612738             Boston                    Celtics      BOS             1946              1948                     BAA
5    1610612739          Cleveland                  Cavaliers      CLE             1970              2100                     NBA
6    1610612740        New Orleans                    Hornets      NOH    

In [80]:
current_team_names = team_histories[team_histories['seasonActiveTill'] >= 2024].set_index('teamId')[['teamCity', 'teamName', 'teamAbbrev']].to_dict('index')

df['currentTeamCity'] = df['teamId'].map(lambda x: current_team_names.get(x, {}).get('teamCity', 'Unknown'))
df['currentTeamName'] = df['teamId'].map(lambda x: current_team_names.get(x, {}).get('teamName', 'Unknown'))
df['currentTeamAbbrev'] = df['teamId'].map(lambda x: current_team_names.get(x, {}).get('teamAbbrev', 'UNK'))

df['currentOppTeamCity'] = df['opponentTeamId'].map(lambda x: current_team_names.get(x, {}).get('teamCity', 'Unknown'))
df['currentOppTeamName'] = df['opponentTeamId'].map(lambda x: current_team_names.get(x, {}).get('teamName', 'Unknown'))

unmapped = df[df['currentTeamName'] == 'Unknown'][['teamId', 'teamCity', 'teamName', 'season']].drop_duplicates()
if len(unmapped) > 0:
    print("Unmapped teams found:")
    print(unmapped)
else:
    print("All teams mapped successfully")
    
print(f"\nSample of mapped data:")
df[['teamCity', 'teamName', 'currentTeamCity', 'currentTeamName', 'season']].drop_duplicates().head(30)

Unmapped teams found:
     teamId              teamCity        teamName  season
728   15018             Guangzhou     Loong-Lions  2025.0
807   50013  South East Melbourne         Phoenix  2025.0
813   50014                Hapoel  Jerusalem B.C.  2025.0
818   15016             Melbourne          United  2025.0

Sample of mapped data:


Unnamed: 0,teamCity,teamName,currentTeamCity,currentTeamName,season
0,Dallas,Mavericks,Dallas,Mavericks,2025.0
1,Oklahoma City,Thunder,Oklahoma City,Thunder,2025.0
2,Chicago,Bulls,Chicago,Bulls,2025.0
3,Houston,Rockets,Houston,Rockets,2025.0
4,LA,Clippers,Los Angeles,Clippers,2025.0
5,Milwaukee,Bucks,Milwaukee,Bucks,2025.0
6,Indiana,Pacers,Indiana,Pacers,2025.0
7,Philadelphia,76ers,Philadelphia,76ers,2025.0
8,Phoenix,Suns,Phoenix,Suns,2025.0
9,Memphis,Grizzlies,Memphis,Grizzlies,2025.0


In [81]:
available_seasons = sorted(df['season'].dropna().unique())
print(f"Available seasons: {available_seasons}")

season_game_counts = df.groupby('season').size()
season_team_counts = df.groupby('season')['teamId'].nunique()

complete_season_list = [s for s in available_seasons 
                       if season_team_counts.get(s, 0) >= 25 and season_game_counts.get(s, 0) >= 1000]
print(f"Complete seasons: {complete_season_list}")

train_seasons = [s for s in range(2000, 2015) if s in complete_season_list]
val_seasons = [s for s in range(2015, 2019) if s in complete_season_list]
test_seasons = [s for s in range(2021, 2025) if s in complete_season_list]

print(f"\nTraining seasons: {train_seasons}")
print(f"Validation seasons: {val_seasons}")
print(f"Test seasons: {test_seasons}")


Available seasons: [1946.0, 1947.0, 1948.0, 1949.0, 1950.0, 1951.0, 1952.0, 1953.0, 1954.0, 1955.0, 1956.0, 1957.0, 1958.0, 1959.0, 1960.0, 1961.0, 1962.0, 1963.0, 1964.0, 1965.0, 1966.0, 1967.0, 1968.0, 1969.0, 1970.0, 1971.0, 1972.0, 1973.0, 1974.0, 1975.0, 1976.0, 1977.0, 1978.0, 1979.0, 1980.0, 1981.0, 1982.0, 1983.0, 1984.0, 1985.0, 1986.0, 1987.0, 1988.0, 1989.0, 1990.0, 1991.0, 1992.0, 1993.0, 1994.0, 1995.0, 1996.0, 1997.0, 1998.0, 1999.0, 2000.0, 2001.0, 2002.0, 2003.0, 2004.0, 2005.0, 2006.0, 2007.0, 2008.0, 2009.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.0, 2018.0, 2019.0, 2020.0, 2021.0, 2022.0, 2023.0, 2024.0, 2025.0]
Complete seasons: [1988.0, 1989.0, 1990.0, 1991.0, 1992.0, 1993.0, 1994.0, 1995.0, 1996.0, 1997.0, 1998.0, 1999.0, 2000.0, 2001.0, 2002.0, 2003.0, 2004.0, 2005.0, 2006.0, 2007.0, 2008.0, 2009.0, 2010.0, 2011.0, 2012.0, 2013.0, 2014.0, 2015.0, 2016.0, 2017.0, 2018.0, 2019.0, 2020.0, 2021.0, 2022.0, 2023.0, 2024.0]

Training seasons: [2000,

In [82]:
allstar_break_dates = {
    1999: pd.Timestamp('2000-02-13'),
    2000: pd.Timestamp('2001-02-11'),
    2001: pd.Timestamp('2002-02-10'),
    2002: pd.Timestamp('2003-02-09'),
    2003: pd.Timestamp('2004-02-15'),
    2004: pd.Timestamp('2005-02-20'),
    2005: pd.Timestamp('2006-02-19'),
    2006: pd.Timestamp('2007-02-18'),
    2007: pd.Timestamp('2008-02-17'),
    2008: pd.Timestamp('2009-02-15'),
    2009: pd.Timestamp('2010-02-14'),
    2010: pd.Timestamp('2011-02-20'),
    2011: pd.Timestamp('2012-02-26'),
    2012: pd.Timestamp('2013-02-17'),
    2013: pd.Timestamp('2014-02-16'),
    2014: pd.Timestamp('2015-02-15'),
    2015: pd.Timestamp('2016-02-14'),
    2016: pd.Timestamp('2017-02-19'),
    2017: pd.Timestamp('2018-02-18'),
    2018: pd.Timestamp('2019-02-17'),
    2019: pd.Timestamp('2020-02-16'),
    2020: pd.Timestamp('2021-03-07'),
    2021: pd.Timestamp('2022-02-20'),
    2022: pd.Timestamp('2023-02-19'),
    2023: pd.Timestamp('2024-02-18'),
    2024: pd.Timestamp('2025-02-16'),
}

# Create a copy of training data filtered for All-Star break
df_train_filtered = df[df['season'].isin(train_seasons)].copy()

def filter_before_allstar(df_season):
    season = df_season['season'].iloc[0]
    if season in allstar_break_dates:
        cutoff_date = allstar_break_dates[season]
        df_season = df_season[df_season['gameDateTimeEst'] < cutoff_date]
    return df_season

df_train_filtered = df_train_filtered.groupby('season', group_keys=False).apply(filter_before_allstar)
df_train_filtered = df_train_filtered.groupby('season', group_keys=False).apply(filter_before_allstar)
print(f"Original training games: {len(df[df['season'].isin(train_seasons)])}")
print(f"Training games before All-Star break: {len(df_train_filtered)}")
print(f"\nGames removed: {len(df[df['season'].isin(train_seasons)]) - len(df_train_filtered)}")

# Show game count per season before and after filtering
print("\nGames per training season (before vs after All-Star break):")
print("\nGames per season:")
before = df[df['season'].isin(train_seasons)].groupby('season').size()
after = df_train_filtered.groupby('season').size()
comparison = pd.DataFrame({'before': before, 'after': after})
comparison['removed'] = comparison['before'] - comparison['after']
print(comparison)

Original training games: 40682
Training games before All-Star break: 24672

Games removed: 16010

Games per training season (before vs after All-Star break):

Games per season:
        before  after  removed
season                        
2000.0    2520   1418     1102
2001.0    2520   1394     1126
2002.0    2554   1414     1140
2003.0    2572   1548     1024
2004.0    2724   1658     1066
2005.0    2864   1784     1080
2006.0    2838   1796     1042
2007.0    2820   1746     1074
2008.0    2840   1784     1056
2009.0    2848   1782     1066
2010.0    2844   1892      952
2011.0    2208   1064     1144
2012.0    2840   1788     1052
2013.0    2854   1794     1060
2014.0    2836   1810     1026


  df_train_filtered = df_train_filtered.groupby('season', group_keys=False).apply(filter_before_allstar)
  df_train_filtered = df_train_filtered.groupby('season', group_keys=False).apply(filter_before_allstar)


In [83]:
df_train_agg = df_train_filtered.copy()
df_val_test_agg = df[df['season'].isin(val_seasons + test_seasons)].copy()
df_for_agg = pd.concat([df_train_agg, df_val_test_agg])

season_stats = df_for_agg.groupby(['season', 'teamId', 'currentTeamName']).agg({
    'win': 'sum',
    'teamScore': ['mean', 'std', 'sum'],
    'opponentScore': ['mean', 'std', 'sum'],
    
    'assists': 'mean',
    'fieldGoalsMade': 'mean',
    'fieldGoalsAttempted': 'mean',
    'fieldGoalsPercentage': 'mean',
    'threePointersMade': 'mean',
    'threePointersAttempted': 'mean',
    'threePointersPercentage': 'mean',
    'freeThrowsMade': 'mean',
    'freeThrowsAttempted': 'mean',
    'freeThrowsPercentage': 'mean',
    
    'blocks': 'mean',
    'steals': 'mean',
    'reboundsDefensive': 'mean',
    'reboundsOffensive': 'mean',
    'reboundsTotal': 'mean',
    
    'turnovers': 'mean',
    'foulsPersonal': 'mean',
    'plusMinusPoints': 'mean',
    'pointsFromTurnovers': 'mean',
    
    'gameId': 'count'
}).reset_index()

season_stats.columns = ['_'.join(col).strip('_') if isinstance(col, tuple) else col 
                        for col in season_stats.columns]

season_stats = season_stats.rename(columns={
    'win_sum': 'wins',
    'gameId_count': 'games_played',
    'teamScore_mean': 'avg_points_scored',
    'teamScore_std': 'std_points_scored',
    'opponentScore_mean': 'avg_points_allowed',
    'opponentScore_std': 'std_points_allowed'
})

season_stats['losses'] = season_stats['games_played'] - season_stats['wins']
season_stats['losses'] = season_stats['games_played'] - season_stats['wins']
season_stats['win_pct'] = season_stats['wins'] / season_stats['games_played']
season_stats['point_diff'] = season_stats['avg_points_scored'] - season_stats['avg_points_allowed']

USE_OFFENSIVE_STRENGTH = True
USE_DEFENSIVE_STRENGTH = True
USE_ASSISTS_SQUARED = True

if USE_OFFENSIVE_STRENGTH:
    season_stats['efg_pct'] = (
        season_stats['fieldGoalsMade_mean'] + 0.5 * season_stats['threePointersMade_mean']
    ) / (season_stats['fieldGoalsAttempted_mean'] + 1e-6)
    season_stats['offensive_strength'] = (
        season_stats['avg_points_scored'] + 
        season_stats['assists_mean'] + 
        season_stats['efg_pct']
    ) / 3

if USE_DEFENSIVE_STRENGTH:
    season_stats['defensive_strength'] = (
        season_stats['avg_points_allowed'] + 
        season_stats['blocks_mean'] + 
        season_stats['steals_mean']
    ) / 3

if USE_ASSISTS_SQUARED:
    season_stats['assists_mean_squared'] = season_stats['assists_mean'] ** 2

print(f"Season stats shape: {season_stats.shape}")
print(f"\nNote: Training season stats (2000-2014) computed from games before All-Star break")

Season stats shape: (686, 37)

Note: Training season stats (2000-2014) computed from games before All-Star break


In [84]:
def assign_playoff_status(season_df):
    season_df = season_df.sort_values('win_pct', ascending=False)
    season_df['playoff_rank'] = range(1, len(season_df) + 1)
    season_df['made_playoffs'] = (season_df['playoff_rank'] <= 16).astype(int)
    return season_df

season_stats = season_stats.groupby('season', group_keys=False).apply(assign_playoff_status)

# Verify playoff distribution
print("Playoff teams per season:")
print(season_stats.groupby('season')['made_playoffs'].sum())
print(f"\nOverall playoff distribution:\n{season_stats['made_playoffs'].value_counts()}")

Playoff teams per season:
season
2000.0    16
2001.0    16
2002.0    16
2003.0    16
2004.0    16
2005.0    16
2006.0    16
2007.0    16
2008.0    16
2009.0    16
2010.0    16
2011.0    16
2012.0    16
2013.0    16
2014.0    16
2015.0    16
2016.0    16
2017.0    16
2018.0    16
2021.0    16
2022.0    16
2023.0    16
2024.0    16
Name: made_playoffs, dtype: int64

Overall playoff distribution:
made_playoffs
1    368
0    318
Name: count, dtype: int64


  season_stats = season_stats.groupby('season', group_keys=False).apply(assign_playoff_status)


In [85]:
train_df = season_stats[season_stats['season'].isin(train_seasons)].copy()
val_df = season_stats[season_stats['season'].isin(val_seasons)].copy()
test_df = season_stats[season_stats['season'].isin(test_seasons)].copy()

print(f"Training set: {len(train_df)} team-seasons ({len(train_seasons)} seasons)")
print(f"Validation set: {len(val_df)} team-seasons ({len(val_seasons)} seasons)")
print(f"Test set: {len(test_df)} team-seasons ({len(test_seasons)} seasons)")

print(f"\nPlayoff distribution:")
print(f"  Train - Playoffs: {train_df['made_playoffs'].sum()}, Non-playoffs: {(train_df['made_playoffs']==0).sum()}")
print(f"  Val   - Playoffs: {val_df['made_playoffs'].sum()}, Non-playoffs: {(val_df['made_playoffs']==0).sum()}")
print(f"  Test  - Playoffs: {test_df['made_playoffs'].sum()}, Non-playoffs: {(test_df['made_playoffs']==0).sum()}")

Training set: 446 team-seasons (15 seasons)
Validation set: 120 team-seasons (4 seasons)
Test set: 120 team-seasons (4 seasons)

Playoff distribution:
  Train - Playoffs: 240, Non-playoffs: 206
  Val   - Playoffs: 64, Non-playoffs: 56
  Test  - Playoffs: 64, Non-playoffs: 56


In [86]:
exclude_cols = ['season', 'teamId', 'currentTeamName', 'made_playoffs', 'playoff_rank', 
                'wins', 'losses', 'games_played', 'teamScore_sum', 'opponentScore_sum']

feature_cols = [col for col in season_stats.columns if col not in exclude_cols]
print(f"Features ({len(feature_cols)}):")
print(feature_cols)
X_train = train_df[feature_cols].values
y_train = train_df['made_playoffs'].values

X_val = val_df[feature_cols].values
y_val = val_df['made_playoffs'].values

X_test = test_df[feature_cols].values
y_test = test_df['made_playoffs'].values

print(f"\nFeature matrix shapes:")
print(f"  X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"  X_val: {X_val.shape}, y_val: {y_val.shape}")
print(f"  X_test: {X_test.shape}, y_test: {y_test.shape}")

Features (29):
['avg_points_scored', 'std_points_scored', 'avg_points_allowed', 'std_points_allowed', 'assists_mean', 'fieldGoalsMade_mean', 'fieldGoalsAttempted_mean', 'fieldGoalsPercentage_mean', 'threePointersMade_mean', 'threePointersAttempted_mean', 'threePointersPercentage_mean', 'freeThrowsMade_mean', 'freeThrowsAttempted_mean', 'freeThrowsPercentage_mean', 'blocks_mean', 'steals_mean', 'reboundsDefensive_mean', 'reboundsOffensive_mean', 'reboundsTotal_mean', 'turnovers_mean', 'foulsPersonal_mean', 'plusMinusPoints_mean', 'pointsFromTurnovers_mean', 'win_pct', 'point_diff', 'efg_pct', 'offensive_strength', 'defensive_strength', 'assists_mean_squared']

Feature matrix shapes:
  X_train: (446, 29), y_train: (446,)
  X_val: (120, 29), y_val: (120,)
  X_test: (120, 29), y_test: (120,)


In [87]:
from sklearn.preprocessing import StandardScaler
import numpy as np

print(f"NaN values: {np.isnan(X_train).sum()}")

X_train = np.nan_to_num(X_train, nan=0)
X_val = np.nan_to_num(X_val, nan=0)
X_test = np.nan_to_num(X_test, nan=0)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print(f"Scaled - mean: {X_train_scaled.mean():.4f}, std: {X_train_scaled.std():.4f}")

NaN values: 442
Scaled - mean: 0.0000, std: 0.9826


In [88]:
output_path = "./data/processed"
os.makedirs(output_path, exist_ok=True)

season_stats.to_csv(f"{output_path}/season_stats_all.csv", index=False)
train_df.to_csv(f"{output_path}/train_2000_2014.csv", index=False)
val_df.to_csv(f"{output_path}/val_2015_2018.csv", index=False)
test_df.to_csv(f"{output_path}/test_2021_2024.csv", index=False)

np.save(f"{output_path}/X_train_scaled.npy", X_train_scaled)
np.save(f"{output_path}/X_val_scaled.npy", X_val_scaled)
np.save(f"{output_path}/X_test_scaled.npy", X_test_scaled)
np.save(f"{output_path}/y_train.npy", y_train)
np.save(f"{output_path}/y_val.npy", y_val)
np.save(f"{output_path}/y_test.npy", y_test)

import json
with open(f"{output_path}/feature_names.json", 'w') as f:
    json.dump(feature_cols, f)

print(f"Data saved to {output_path}/")
for f in glob.glob(f"{output_path}/*"):
    print(f"  {f}")

Data saved to ./data/processed/
  ./data/processed/train_2000_2014.csv
  ./data/processed/X_val_scaled.npy
  ./data/processed/test_2021_2024.csv
  ./data/processed/feature_names.json
  ./data/processed/X_train_scaled.npy
  ./data/processed/val_2015_2018.csv
  ./data/processed/y_train.npy
  ./data/processed/season_stats_all.csv
  ./data/processed/y_test.npy
  ./data/processed/X_test_scaled.npy
  ./data/processed/y_val.npy


In [89]:
print("=" * 60)
print("DATA PREPARATION SUMMARY")
print("=" * 60)

print(f"""
Dataset:
  - Source: Kaggle NBA Historical Data
  - Games: {len(df):,}
  - Teams: {len(current_team_names)}

Splits:
  - Train (2000-2014): {len(train_df)} team-seasons
  - Val (2015-2018): {len(val_df)} team-seasons
  - Test (2021-2024): {len(test_df)} team-seasons

Target:
  - Binary: made_playoffs (top 16 teams per season)

Features ({len(feature_cols)}):
  - Scoring, shooting, rebounds, defense, advanced stats
  
Notes:
  - Training uses pre-All-Star break games only
  - COVID season (2019-2020) excluded
""")

DATA PREPARATION SUMMARY

Dataset:
  - Source: Kaggle NBA Historical Data
  - Games: 144,580
  - Teams: 70

Splits:
  - Train (2000-2014): 446 team-seasons
  - Val (2015-2018): 120 team-seasons
  - Test (2021-2024): 120 team-seasons

Target:
  - Binary: made_playoffs (top 16 teams per season)

Features (29):
  - Scoring, shooting, rebounds, defense, advanced stats

Notes:
  - Training uses pre-All-Star break games only
  - COVID season (2019-2020) excluded

