In [2]:
import pandas as pd

# 1. Load the game.csv
game_df = pd.read_csv("game.csv", parse_dates=["game_date"])

# 2. Create long-format (one row per team per game)
home_df = game_df[[
    "game_id", "game_date", "team_name_home", "pts_home", "reb_home", "ast_home", "plus_minus_home"
]].rename(columns={
    "team_name_home": "team",
    "pts_home": "pts",
    "reb_home": "reb",
    "ast_home": "ast",
    "plus_minus_home": "plus_minus"
})
home_df["home_away"] = "home"

away_df = game_df[[
    "game_id", "game_date", "team_name_away", "pts_away", "reb_away", "ast_away", "plus_minus_away"
]].rename(columns={
    "team_name_away": "team",
    "pts_away": "pts",
    "reb_away": "reb",
    "ast_away": "ast",
    "plus_minus_away": "plus_minus"
})
away_df["home_away"] = "away"

# 3. Combine both sides into one dataset
long_df = pd.concat([home_df, away_df], ignore_index=True)

# 4. Sort and compute rolling stats
long_df = long_df.sort_values(["team", "game_date"])
for stat in ["pts", "reb", "ast", "plus_minus"]:
    long_df[f"{stat}_avg_last10"] = (
        long_df.groupby("team")[stat]
        .transform(lambda x: x.shift(1).rolling(window=10, min_periods=1).mean())
    )

# 5. Pivot back: merge rolling stats into original game_df
home_features = long_df[long_df["home_away"] == "home"][[
    "game_id", "pts_avg_last10", "reb_avg_last10", "ast_avg_last10", "plus_minus_avg_last10"
]].rename(columns=lambda x: f"home_{x}" if x != "game_id" else x)

away_features = long_df[long_df["home_away"] == "away"][[
    "game_id", "pts_avg_last10", "reb_avg_last10", "ast_avg_last10", "plus_minus_avg_last10"
]].rename(columns=lambda x: f"away_{x}" if x != "game_id" else x)

# 6. Merge new features into original game_df
final_df = game_df.merge(home_features, on="game_id", how="left")
final_df = final_df.merge(away_features, on="game_id", how="left")

# 7. Save if needed
final_df.to_csv("game_with_rolling_features.csv", index=False)

In [4]:
game = pd.read_csv("game_with_rolling_features.csv")
game.sample(5)
game.columns

Index(['season_id', 'team_id_home', 'team_abbreviation_home', 'team_name_home',
       'game_id', 'game_date', 'matchup_home', 'wl_home', 'min', 'fgm_home',
       'fga_home', 'fg_pct_home', 'fg3m_home', 'fg3a_home', 'fg3_pct_home',
       'ftm_home', 'fta_home', 'ft_pct_home', 'oreb_home', 'dreb_home',
       'reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home', 'pf_home',
       'pts_home', 'plus_minus_home', 'video_available_home', 'team_id_away',
       'team_abbreviation_away', 'team_name_away', 'matchup_away', 'wl_away',
       'fgm_away', 'fga_away', 'fg_pct_away', 'fg3m_away', 'fg3a_away',
       'fg3_pct_away', 'ftm_away', 'fta_away', 'ft_pct_away', 'oreb_away',
       'dreb_away', 'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away',
       'pf_away', 'pts_away', 'plus_minus_away', 'video_available_away',
       'season_type', 'home_pts_avg_last10', 'home_reb_avg_last10',
       'home_ast_avg_last10', 'home_plus_minus_avg_last10',
       'away_pts_avg_last10', 'aw

In [None]:
rolling_df = pd.read_csv("game_with_rolling_features.csv", parse_dates=["game_date"])
selected_df = pd.read_csv("selected_game_features.csv", parse_dates=["Game Date"])
print(rolling_df.columns)
print(selected_df.columns)

Index(['season_id', 'team_id_home', 'team_abbreviation_home', 'team_name_home',
       'game_id', 'game_date', 'matchup_home', 'wl_home', 'min', 'fgm_home',
       'fga_home', 'fg_pct_home', 'fg3m_home', 'fg3a_home', 'fg3_pct_home',
       'ftm_home', 'fta_home', 'ft_pct_home', 'oreb_home', 'dreb_home',
       'reb_home', 'ast_home', 'stl_home', 'blk_home', 'tov_home', 'pf_home',
       'pts_home', 'plus_minus_home', 'video_available_home', 'team_id_away',
       'team_abbreviation_away', 'team_name_away', 'matchup_away', 'wl_away',
       'fgm_away', 'fga_away', 'fg_pct_away', 'fg3m_away', 'fg3a_away',
       'fg3_pct_away', 'ftm_away', 'fta_away', 'ft_pct_away', 'oreb_away',
       'dreb_away', 'reb_away', 'ast_away', 'stl_away', 'blk_away', 'tov_away',
       'pf_away', 'pts_away', 'plus_minus_away', 'video_available_away',
       'season_type', 'home_pts_avg_last10', 'home_reb_avg_last10',
       'home_ast_avg_last10', 'home_plus_minus_avg_last10',
       'away_pts_avg_last10', 'aw

In [8]:
selected_df.rename(columns={"Game ID": "game_id"}, inplace=True)

rolling_df["target"] = rolling_df["wl_home"].map({"W": 1, "L": 0})

rolling_features = [
    'game_id',
    'home_pts_avg_last10', 'home_reb_avg_last10', 'home_ast_avg_last10', 'home_plus_minus_avg_last10',
    'away_pts_avg_last10', 'away_reb_avg_last10', 'away_ast_avg_last10', 'away_plus_minus_avg_last10',
    'target'
]

rolling_subset = rolling_df[rolling_features]
merged_df = pd.merge(selected_df, rolling_subset, on="game_id", how="inner")

merged_df.sample(10)


Unnamed: 0,game_id,Game Date,Home Days Since Last Game,Home Games Last 7 Days,Home Is Back-to-Back,Away Days Since Last Game,Away Games Last 7 Days,Away Is Back-to-Back,Away Team Recent 10 Win Rate,Home Team Recent 10 Win Rate,...,Away Team Recent 3 Months Win Rate,home_pts_avg_last10,home_reb_avg_last10,home_ast_avg_last10,home_plus_minus_avg_last10,away_pts_avg_last10,away_reb_avg_last10,away_ast_avg_last10,away_plus_minus_avg_last10,target
22554,21600623,2017-01-17,16.0,0,False,2.0,2,False,0.2,0.3,...,0.307692,97.4,39.7,23.1,-9.6,118.5,41.8,26.1,7.0,1.0
26849,21900750,2020-02-03,2.0,2,False,7.0,1,False,0.5,0.6,...,0.636364,114.7,48.2,23.8,5.2,112.3,42.4,24.1,2.6,1.0
11799,20800108,2008-11-12,2.0,2,False,2.0,2,False,0.7,0.6,...,0.571429,97.6,38.1,20.3,4.1,96.0,38.7,19.9,-0.9,0.0
13168,30800001,2009-02-15,364.0,0,False,364.0,0,False,0.4,0.4,...,0.0,131.4,59.5,38.1,3.7,127.7,49.6,31.2,-3.7,1.0
29960,22200123,2022-11-04,4.0,2,False,9.0,0,False,0.6,0.5,...,0.5,109.1,42.3,24.6,-1.2,111.5,39.8,25.4,-3.4,0.0
7237,40400224,2005-05-15,2.0,1,False,2.0,1,False,0.3,0.7,...,0.225,101.4,43.3,17.4,-2.1,115.8,44.2,23.8,9.1,1.0
27883,22000661,2021-03-23,4.0,1,False,11.0,0,False,0.4,0.4,...,0.434783,102.4,44.4,22.6,-11.0,118.0,45.4,28.9,10.4,0.0
11106,20700890,2008-03-04,3.0,1,False,2.0,1,False,0.5,0.7,...,0.516129,108.2,41.5,19.9,6.0,106.1,40.7,24.4,7.5,1.0
14130,20900955,2010-03-10,3.0,1,False,1.0,2,True,0.2,0.4,...,0.444444,98.2,44.4,20.9,-3.7,108.6,44.2,27.9,8.8,0.0
12499,20800803,2009-02-18,7.0,1,False,7.0,1,False,0.3,0.7,...,0.387097,95.0,41.7,21.8,2.1,100.1,40.2,21.0,4.5,0.0


In [9]:
merged_df.to_csv("NBA_cleaned.csv", index=False)