In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from nba_api.stats.endpoints import playbyplayv2
from sklearn.linear_model import LogisticRegression
import random


In [14]:
nba_seasons = {
    "1996-97": {"games": 1189, "first_id": "0029600001"},
    "1997-98": {"games": 1189, "first_id": "0029700001"},
    "1998-99": {"games": 864,  "first_id": "0029800001"},  # lockout
    "1999-00": {"games": 1160, "first_id": "0029900001"},
    "2000-01": {"games": 1160, "first_id": "0020000001"},
    "2001-02": {"games": 1160, "first_id": "0020100001"},
    "2002-03": {"games": 1160, "first_id": "0020200001"},
    "2003-04": {"games": 1160, "first_id": "0020300001"},
    "2004-05": {"games": 1230, "first_id": "0020400001"},
    "2005-06": {"games": 1230, "first_id": "0020500001"},
    "2006-07": {"games": 1230, "first_id": "0020600001"},
    "2007-08": {"games": 1230, "first_id": "0020700001"},
    "2008-09": {"games": 1230, "first_id": "0020800001"},
    "2009-10": {"games": 1230, "first_id": "0020900001"},
    "2010-11": {"games": 1230, "first_id": "0021000001"},
    "2011-12": {"games": 990,  "first_id": "0021100001"},  # lockout
    "2012-13": {"games": 1230, "first_id": "0021200001"},
    "2013-14": {"games": 1230, "first_id": "0021300001"},
    "2014-15": {"games": 1230, "first_id": "0021400001"},
    "2015-16": {"games": 1230, "first_id": "0021500001"},
    "2016-17": {"games": 1230, "first_id": "0021600001"},
    "2017-18": {"games": 1230, "first_id": "0021700001"},
    "2018-19": {"games": 1230, "first_id": "0021800001"},
    "2019-20": {"games": 1059, "first_id": "0021900001"},  # covid
    "2020-21": {"games": 1080, "first_id": "0022000001"},  # covid
    "2021-22": {"games": 1230, "first_id": "0022100001"},
    "2022-23": {"games": 1230, "first_id": "0022200001"},
}
keys = list(nba_seasons.keys())


In [7]:

def makeDf(game_id):
    pbp = playbyplayv2.PlayByPlayV2(game_id=game_id)
    df = pbp.get_data_frames()[0]

    # Clean
    df = df.dropna(subset=["SCORE"])
    df[["h_pts", "a_pts"]] = df["SCORE"].str.split(" - ", expand=True).astype(int)

    # Convert clock → game minute
    def to_minute(period, clock):
        mins, secs = map(int, clock.split(":"))
        elapsed = (period-1)*12 + (12 - mins - secs/60)
        return int(elapsed) + 1

    df["minute"] = df.apply(lambda r: to_minute(r["PERIOD"], r["PCTIMESTRING"]), axis=1)

    return df




In [10]:
def makeMinuteDf(df):
    if df.empty or "h_pts" not in df.columns or df["minute"].max() is None:
        return pd.DataFrame()

    minutes = []

    max_minute = int(df["minute"].max())

    for m in range(1, max_minute+1):
        snapshot = df[df["minute"] <= m]
        if snapshot.empty:
            # If it's the very first minute, pad with 0–0
            if m == 1:
                minutes.append([1, 0, 0, 0])
            continue
        h_pts = snapshot["h_pts"].max()
        a_pts = snapshot["a_pts"].max()
        score_diff = h_pts - a_pts
        minutes.append([m, h_pts, a_pts, score_diff])

    minute_df = pd.DataFrame(minutes, columns=["minute","h_pts","a_pts","score_diff"])

    # Target = final outcome
    final_h = df["h_pts"].max()
    final_a = df["a_pts"].max()
    winner = 1 if final_h > final_a else 0
    minute_df["home_win"] = winner

    return minute_df

pbp_df=pd.DataFrame()
for i in range (100):
    year = random.choice(keys)
    game = random.randint(1, nba_seasons[year]['games'])
    first_id = nba_seasons[year]['first_id']
    prefix = first_id[:-4]
    game_id = f"{prefix}{game:04d}"
    try:
        tempMinDf = makeMinuteDf(makeDf(game_id))
        tempMinDf["game_id"] = game_id
        tempMinDf["season"] = year 
        pbp_df = pd.concat([pbp_df, tempMinDf], ignore_index=True)
    except Exception as e:
        print(f"Skipping {game_id} ({year}): {e}")

pbp_df.to_csv("game_prob.csv", index=False)


Skipping 0020400459 (2004-05): HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Skipping 0020401100 (2004-05): HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Skipping 0029800741 (1998-99): Columns must be same length as key
Skipping 0029900289 (1999-00): HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)
Skipping 0020700924 (2007-08): HTTPSConnectionPool(host='stats.nba.com', port=443): Read timed out. (read timeout=30)


In [13]:
import pandas as pd

# Load your file
df = pd.read_csv("game_prob.csv")

# Count rows per game
counts = df["game_id"].value_counts()

# Games with too many rows (duplicates/append errors)
dupes = counts[counts > 80]

print(f"Total games: {df['game_id'].nunique()}")
print(f"Games with >80 rows: {len(dupes)}")

if not dupes.empty:
    print("\nProblematic game_ids:")
    for game_id, cnt in dupes.items():
        print(f"{game_id} → {cnt} rows")
else:
    print("No game_id exceeds 80 rows ✅")


Total games: 95
Games with >80 rows: 0
No game_id exceeds 80 rows ✅
