In [None]:
# --- Cell 1: setup & utilities ---
import os, json, time, datetime as dt
from pathlib import Path
from typing import List, Tuple, Dict, Optional
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Root directory of the on-disk cache of raw game JSON; created as soon as this cell runs.
CACHE_ROOT = Path("data/raw")
CACHE_ROOT.mkdir(parents=True, exist_ok=True)

# Settings shared by the download helpers defined below.
HEADERS = {"User-Agent": "ift6758-course/1.0"}  # headers sent with every GET
TIMEOUT = 15  # per-request timeout handed to http_get_with_retries
THROTTLE_SEC = 0.10  # pause (time.sleep) inserted between requests
MAX_WORKERS = 8  # thread-pool size used by fetch_season_fast
MAX_RETRIES = 3  # number of attempts made by http_get_with_retries
BACKOFF_BASE = 0.5  # base backoff delay; multiplied by 2**attempt

GAME_TYPE_MAP = {
    "preseason": "01",
    "regular": "02",
    "playoffs": "03",
    "allstar": "04",
}


def build_game_id(season_start_year: int, game_type: str, game_number: int) -> str:
    """
    Assemble an NHL GAME_ID of the form ``YYYYTTNNNN``.

    The three parts are concatenated without separators:
        YYYY : season start year (2016 for the 2016-17 season)
        TT   : two-character game type code taken from GAME_TYPE_MAP
               ("01" preseason, "02" regular, "03" playoffs, "04" all-star)
        NNNN : game number, zero-padded to at least four digits

    Args:
        season_start_year (int): The starting year of the season.
        game_type (str): A key of GAME_TYPE_MAP.
        game_number (int): Sequential number of the game.

    Returns:
        str: The constructed GAME_ID.

    Raises:
        ValueError: If `game_type` is not a key of GAME_TYPE_MAP.
    """
    try:
        type_code = GAME_TYPE_MAP[game_type]
    except KeyError:
        raise ValueError(f"Unknown game_type '{game_type}'") from None
    return "{}{}{:04d}".format(season_start_year, type_code, game_number)


def cache_path(cache_root: Path, game_id: str) -> Path:
    """
    Map a GAME_ID to the JSON file that caches it.

    The first four characters of the id name the season folder and the next
    two name the ``type-XX`` folder, e.g.
    ``data/raw/2016/type-02/2016020001.json``.

    Args:
        cache_root (Path): Root directory of the cache.
        game_id (str): The NHL GAME_ID.

    Returns:
        Path: Location of the cached JSON file (the file may not exist yet).
    """
    type_dir = cache_root / game_id[:4] / ("type-" + game_id[4:6])
    return type_dir / (game_id + ".json")


def http_get_with_retries(url: str, timeout: float = 15.0) -> Optional[requests.Response]:
    """
    Perform an HTTP GET request with retry and exponential backoff.

    Up to MAX_RETRIES attempts are made. An attempt is retried after a
    network-level error (any ``requests.RequestException``), an HTTP 5xx
    status, or HTTP 429. Any other 4xx status (including 404) is treated as
    final and returns None immediately, because repeating the same request
    cannot succeed. No backoff sleep is performed after the last attempt.

    Args:
        url (str): The URL to fetch.
        timeout (float, optional): Timeout (seconds) for each request attempt.

    Returns:
        Optional[requests.Response]: The response when its status is below
            400, or None when the resource is unavailable (4xx other than
            429) or every attempt failed.
    """
    for attempt in range(MAX_RETRIES):
        try:
            r = requests.get(url, headers=HEADERS, timeout=timeout)
        except requests.RequestException:
            # Connection problems, timeouts, etc. are worth another attempt.
            pass
        else:
            status = r.status_code
            if status < 400:
                return r
            if status < 500 and status != 429:
                # Client errors such as 404 will not change on retry.
                return None
        # Back off only when another attempt will actually follow.
        if attempt < MAX_RETRIES - 1:
            time.sleep(BACKOFF_BASE * (2 ** attempt))
    return None


In [None]:
# --- Cell 2: enumerate valid game_ids ---
def list_game_ids_for_season(season_start_year: int, game_type: str, max_games_hint: int = 1500,
                             max_consecutive_misses: int = 120) -> List[str]:
    """
    Probe the play-by-play endpoint to discover which GAME_IDs exist.

    Game numbers 1..max_games_hint are tried in order. A number counts as a
    miss when `http_get_with_retries` returns None, which covers both a
    missing game and a request that failed on every attempt. The scan stops
    early once `max_consecutive_misses` misses occur in a row.

    Args:
        season_start_year (int): Starting year of the season.
        game_type (str): A key of GAME_TYPE_MAP.
        max_games_hint (int): Highest game number to probe.
        max_consecutive_misses (int): Number of back-to-back misses after
            which the scan gives up. Defaults to 120, the previously
            hard-coded limit.

    Returns:
        List[str]: The GAME_IDs that answered successfully, in probe order.
    """
    valid_ids = []
    miss = 0
    for n in tqdm(range(1, max_games_hint + 1), desc=f"{season_start_year} {game_type}"):
        gid = build_game_id(season_start_year, game_type, n)
        url = f"https://api-web.nhle.com/v1/gamecenter/{gid}/play-by-play"
        r = http_get_with_retries(url, timeout=TIMEOUT)
        if r is None:
            miss += 1
            if miss >= max_consecutive_misses:
                break
        else:
            valid_ids.append(gid)
            miss = 0  # any hit resets the run of misses
        time.sleep(THROTTLE_SEC)
    print(f" {season_start_year} {game_type}: found {len(valid_ids)} valid games.")
    return valid_ids


In [None]:
# --- Cell 3: fetch & parallel download ---
def fetch_one(game_id: str) -> bool:
    """
    Make sure the play-by-play JSON of one game is present in the cache.

    An existing cache file is trusted and no request is made. Otherwise the
    game is downloaded and written atomically: the payload goes to a sibling
    ``<name>.json.tmp`` file that is then renamed over the final path, so an
    interrupted write can never leave a truncated ``.json`` file that later
    runs would mistake for a valid cache entry.

    Args:
        game_id (str): The NHL GAME_ID to fetch.

    Returns:
        bool: True if the game is cached (already present or just
            downloaded), False if the download returned nothing.
    """
    p = cache_path(CACHE_ROOT, game_id)
    if p.exists():
        return True
    url = f"https://api-web.nhle.com/v1/gamecenter/{game_id}/play-by-play"
    r = http_get_with_retries(url, timeout=TIMEOUT)
    if r is None:
        return False
    # Decode before touching the disk so an invalid body creates no file.
    payload = json.dumps(r.json())
    p.parent.mkdir(parents=True, exist_ok=True)
    # The ".tmp" suffix keeps the partial file out of "*.json" globs.
    tmp = p.with_name(p.name + ".tmp")
    tmp.write_text(payload, encoding="utf-8")
    tmp.replace(p)
    time.sleep(THROTTLE_SEC)
    return True

def fetch_season_fast(season_start_year: int, game_type: str):
    """
    Download every discoverable game of one season/type using a thread pool.

    Game ids come from `list_game_ids_for_season`; each one is handed to
    `fetch_one` on a pool of MAX_WORKERS threads. An exception raised by a
    worker is printed and that game is counted as not downloaded.

    Args:
        season_start_year (int): Starting year of the season.
        game_type (str): A key of GAME_TYPE_MAP.

    Returns:
        int: Number of games for which `fetch_one` returned a truthy value.
    """
    game_ids = list_game_ids_for_season(season_start_year, game_type)
    progress_label = f"Downloading {season_start_year} {game_type}"
    ok = 0
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = [pool.submit(fetch_one, gid) for gid in game_ids]
        for done in tqdm(as_completed(pending), total=len(pending), desc=progress_label):
            try:
                succeeded = done.result()
            except Exception as e:
                print("[ERR]", e)
                continue
            if succeeded:
                ok += 1
    print(f" {season_start_year} {game_type}: downloaded {ok}/{len(game_ids)} games.")
    return ok




In [None]:
def fetch_year_range_fast(start_year: int, end_year: int,
                          types=("regular", "playoffs")) -> dict:
    """
    Batch-download play-by-play data over a range of seasons and game types.

    For every season from `start_year` to `end_year` (inclusive) and every
    entry of `types`, `fetch_season_fast()` is called. A failure of one
    (season, type) pair is printed and does not stop the remaining pairs;
    the failed pair is simply absent from the result.

    Args:
        start_year (int): First season to include (e.g., 2016 for the 2016-17 season).
        end_year (int): Last season to include (inclusive).
        types (tuple[str]): Game types to fetch, usually
            ("regular", "playoffs").

    Returns:
        dict[tuple[int, str], int]:
            Maps (season_start_year, game_type) to the count returned by
            `fetch_season_fast()` for that pair.

    Notes:
        - Downloads may take significant time; consider testing first
          with a short range (e.g. 2016-2017).
    """
    results = {}
    seasons = range(start_year, end_year + 1)
    for year in seasons:
        for gtype in types:
            key = (year, gtype)
            print(f"\n=== {year} {gtype} ===")
            try:
                results[key] = fetch_season_fast(year, gtype)
            except Exception as e:
                print(f"[ERR] {year} {gtype}: {e}")
    return results

# Run the download for seasons 2016 through 2024, regular season and playoffs.
# NOTE: executing this line issues network requests and writes under CACHE_ROOT.
fetch_year_range_fast(2016, 2024, types=("regular", "playoffs"))

In [None]:
# --- Cell 2: extraction to DataFrame ---
import pandas as pd

def extract_events(json_obj: dict) -> pd.DataFrame:
    """
    Extract play-by-play events from a single NHL game JSON object.

    Supports the newer API structure (api-web.nhle.com), where events
    are stored in the top-level key `"plays"`.

    The team of an event is taken from the play's own `"team"` object when
    present. Otherwise it is resolved from `details["eventOwnerTeamId"]`
    through the `"id"` / `"abbrev"` fields of the top-level `"homeTeam"` and
    `"awayTeam"` objects. If neither source yields a value the team is None.

    Args:
        json_obj (dict): Parsed JSON object representing a single game.

    Returns:
        pd.DataFrame: A table of events with the following columns:
            ['event', 'secondaryType', 'period', 'periodTime',
             'dateTime', 'team', 'x', 'y']

        The DataFrame is empty (but keeps these columns) when `"plays"` is
        missing, not a list, or an empty list.
    """
    columns = [
        "event", "secondaryType", "period", "periodTime",
        "dateTime", "team", "x", "y",
    ]

    plays = json_obj.get("plays")
    if not isinstance(plays, list) or not plays:
        return pd.DataFrame(columns=columns)

    # Team id -> abbreviation lookup for plays that only carry an owner id.
    team_by_id = {}
    for side in ("homeTeam", "awayTeam"):
        info = json_obj.get(side) or {}
        if info.get("id") is not None and info.get("abbrev"):
            team_by_id[info["id"]] = info["abbrev"]

    rows = []
    for p in plays:
        # "or {}" also covers keys that are present but explicitly null.
        details = p.get("details", {}) or {}
        team = p.get("team", {}) or {}
        periodD = p.get("periodDescriptor", {}) or {}

        team_label = (
            team.get("name")
            or team.get("triCode")
            or team_by_id.get(details.get("eventOwnerTeamId"))
        )

        rows.append({
            "event":         p.get("typeDescKey") or p.get("typeCode"),
            "secondaryType": details.get("shotType") or details.get("eventCode"),
            "period":        periodD.get("number"),
            "periodTime":    p.get("timeInPeriod") or details.get("timeInPeriod"),
            "dateTime":      p.get("timeUTC") or details.get("eventOwnerTeamTime") or None,
            "team":          team_label,
            "x":             details.get("xCoord"),
            "y":             details.get("yCoord"),
        })
    return pd.DataFrame(rows, columns=columns)


# --- replace your build_dataset with this version ---
def build_dataset(start: int = 2016, end: int = 2024, gtype: str = "regular") -> pd.DataFrame:
    """
    Build a combined play-by-play dataset by concatenating multiple cached game files.

    Every ``*.json`` file under ``CACHE_ROOT/<year>/type-<code>`` is read for
    each year from `start` to `end` (inclusive). Files are processed in
    sorted order so the row order of the result is reproducible. A file that
    cannot be read or parsed is reported with a ``[WARN]`` line and skipped,
    as are games that yield no events.

    Adds two columns for per-game navigation:
      - game_id   (the file name without extension)
      - game_type (the `gtype` whose folder the file was read from)

    Args:
        start (int): First season start year to include.
        end (int): Last season start year to include (inclusive).
        gtype (str): A key of GAME_TYPE_MAP.

    Returns:
        pd.DataFrame: All events of all games, or an empty frame with the
            expected columns when nothing was found.

    Raises:
        KeyError: If `gtype` is not a key of GAME_TYPE_MAP.
    """
    tcode = GAME_TYPE_MAP[gtype]
    all_dfs = []

    for year in range(start, end + 1):
        d = CACHE_ROOT / str(year) / f"type-{tcode}"
        if not d.exists():
            continue
        # glob order depends on the filesystem; sort for a stable result.
        for f in sorted(d.glob("*.json")):
            try:
                # The cache is written as UTF-8, so read it the same way.
                data = json.loads(f.read_text(encoding="utf-8"))
                df = extract_events(data)
                if df.empty:
                    continue
                df["game_id"] = f.stem
                # Only the folder of `gtype` is scanned, so the label is `gtype`.
                df["game_type"] = gtype
                all_dfs.append(df)
            except Exception as e:
                # Best effort: one bad file must not abort the whole build.
                print("[WARN]", f, e)

    if not all_dfs:
        return pd.DataFrame(columns=[
            "event","secondaryType","period","periodTime","dateTime","team","x","y",
            "game_id","game_type"
        ])
    return pd.concat(all_dfs, ignore_index=True)


In [None]:

from pathlib import Path
import json

# Smoke test: run extract_events on one cached 2016 regular-season game.
# NOTE(review): cands[0] raises IndexError when the folder holds no JSON files,
# so this cell presumably has to run after the download cell — confirm.
cands = list((CACHE_ROOT / "2016" / "type-02").glob("*.json"))
print("# Files：", len(cands))
sample = json.loads(cands[0].read_text())

df_test = extract_events(sample)
print("Colums：", df_test.columns.tolist())
print("Shape：", df_test.shape)
# Last expression of the cell: the notebook displays the first three rows.
df_test.head(3)


In [None]:
# --- Cell 3: save clean dataset ---
from pathlib import Path
import pandas as pd

def save_clean_dataset(df: pd.DataFrame, out_path: Path):
    """
    Write a play-by-play dataset to `out_path` as CSV, without the index.

    Missing parent directories are created first. The last suffix of the
    path decides how the file is written:
        - ".gz" (which includes ".csv.gz"): gzip compression is requested
          explicitly.
        - anything else: `DataFrame.to_csv` is called without a
          `compression` argument, so pandas' own default applies.

    Args:
        df (pd.DataFrame): The DataFrame to save.
        out_path (Path): Destination file path.

    Returns:
        None
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # A path ending in ".csv.gz" necessarily has ".gz" as its last suffix,
    # so one suffix test covers both spellings.
    options = {"index": False}
    if out_path.suffix == ".gz":
        options["compression"] = "gzip"
    df.to_csv(out_path, **options)


In [None]:
# Build the regular-season event table for 2016-2023 from the local cache.
df_small = build_dataset(2016, 2023, "regular")
print("df_small shape:", df_small.shape)
assert not df_small.empty, "Dataset is empty, please double check."

# NOTE(review): the ".zip" suffix is not matched by save_clean_dataset's ".gz"
# test, so the write relies on pandas' default handling of the path in to_csv;
# the read-back below expects a ZIP archive — confirm against the pandas docs.
outp = Path("data/clean/events_regular_2016-2023.csv.zip")
save_clean_dataset(df_small, outp)

print("Output:", outp.resolve(), "size(bytes)=", outp.stat().st_size)
assert outp.exists() and outp.stat().st_size > 0, "Output is empty or does not exist"

# Read back and compare
df_back = pd.read_csv(outp, compression="zip")
print("Read-back shape:", df_back.shape)
print("Columns:", list(df_back.columns))
# Only the row count is compared, not the cell values.
assert df_back.shape[0] == df_small.shape[0], "Data does not match"


In [None]:
# --- Cell 6: Quick QA + Plot ---
# (Run only after building a dataset, e.g. df_small from build_dataset())
import matplotlib.pyplot as plt


def quick_event_counts(df: 'pd.DataFrame', topn: int = 10):
    """
    Print and plot the most frequent values of the "event" column.

    The `topn` largest counts are printed to the console and drawn as a
    matplotlib bar chart.

    Args:
        df (pd.DataFrame): The dataset; it must contain an "event" column.
        topn (int): Number of top event categories to display and plot.

    Returns:
        None

    Notes:
        - An empty DataFrame, or one without an 'event' column, only
          produces a printed message; nothing is plotted.
        - Uses matplotlib (no seaborn, per project guidelines).
    """
    problem = None
    if df.empty:
        problem = "Empty DataFrame — build or load the dataset first."
    elif 'event' not in df.columns:
        problem = "Missing 'event' column — please check your extraction step."
    if problem is not None:
        print(problem)
        return

    top_events = df['event'].value_counts().head(topn)
    print("Top event counts:\n", top_events)

    chart_title = f"Top {topn} play-by-play events"
    plt.figure(figsize=(8, 4))
    top_events.plot(kind='bar')
    plt.title(chart_title)
    plt.xlabel("Event type")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()

In [None]:
# Frame to inspect; df_small is the regular-season table built in an earlier cell.
target_df = df_small  

# Print and plot the 12 most frequent event types.
quick_event_counts(target_df, topn=12)


In [None]:
# --- Per-game interactive browser  ---
# Browse events per game with season-type toggle and on-rink plotting.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ipywidgets import VBox, HBox, Dropdown, IntSlider, Output
from IPython.display import display
from pathlib import Path

# ========= 1) Load cleaned dataset (auto-detect .gz / .zip) =========
# Candidate cleaned-dataset files, in order of preference.
# NOTE(review): this list is not referenced anywhere in the visible code; the
# load below passes the separate `path` list instead — confirm which is intended.
DATA_CANDIDATES = [
    
    "data/clean/events_regular_2016-2023.csv.zip",
     "data/clean/events_regular_2020-2021.csv.zip",
    
]

def _load_any(paths):
    """
    Load the first path in `paths` that exists on disk as a DataFrame.

    The compression passed to `pd.read_csv` follows the file's last suffix:
    ".gz" -> "gzip", ".zip" -> "zip", anything else -> None. A ``[LOAD]``
    line naming the chosen file is printed before reading.

    Args:
        paths: Iterable of path strings, tried in order.

    Returns:
        pd.DataFrame: Contents of the first existing file.

    Raises:
        FileNotFoundError: If none of the paths exists.
    """
    for candidate in map(Path, paths):
        if not candidate.exists():
            continue
        # ".csv.gz" / ".csv.zip" end in ".gz" / ".zip", so the last suffix suffices.
        if candidate.suffix == ".gz":
            comp = "gzip"
        elif candidate.suffix == ".zip":
            comp = "zip"
        else:
            comp = None
        print(f"[LOAD] {candidate} (compression={comp})")
        return pd.read_csv(candidate, compression=comp)
    raise FileNotFoundError("No cleaned dataset found. Checked:\n  - " + "\n  - ".join(paths))

# Files actually offered to _load_any; the first one that exists is loaded.
path = [
    
    "data/clean/events_regular_2016-2023.csv.zip",
    
]
# df0 is the module-level frame every widget callback below reads from.
df0 = _load_any(path)

# ========= 2) Sanity checks & minimal enrich =========
# Columns the browser relies on; a set difference lists whatever is absent.
required_cols = {
    "event","secondaryType","period","periodTime","dateTime","team","x","y","game_id","game_type"
}
missing = required_cols - set(df0.columns)
if missing:
    # Fail early with the exact missing names instead of a KeyError later on.
    raise ValueError(f"Dataset missing columns {sorted(missing)}. "
                     f"Please rebuild with the updated build_dataset() that adds game_id and game_type.")

# Derive season_start_year from dateTime (Aug–Jul season logic)
def _season_from_dt(dt_str):
    """
    Return the season start year for a timestamp-like value.

    The value is converted to text; characters 0-3 are read as the year and
    characters 5-6 as the month. Months 8 and later belong to the season
    starting that year, earlier months to the season that started the year
    before. Anything that cannot be parsed yields NaN.
    """
    try:
        text = str(dt_str)
        year, month = int(text[:4]), int(text[5:7])
    except Exception:
        return np.nan
    if month >= 8:
        return year
    return year - 1

# Add the season column only when the loaded file does not already carry one.
# NOTE(review): rows whose dateTime cannot be parsed get NaN here and are then
# dropped from the season options below — confirm dateTime is populated.
if "season_start_year" not in df0.columns:
    df0["season_start_year"] = df0["dateTime"].astype(str).map(_season_from_dt)

# ========= 3) Widgets =========
# Season options from derived season_start_year
season_vals = sorted([int(s) for s in df0["season_start_year"].dropna().unique().tolist()])
season_dd   = Dropdown(options=["(all)"] + season_vals, value="(all)", description="Season:")
gtype_dd    = Dropdown(options=["regular","playoffs"], value="regular", description="Type:")
# Options and slider range start empty; _refresh_games fills them in.
game_dd     = Dropdown(options=[], description="Game ID:")
event_idx   = IntSlider(value=0, min=0, max=0, step=1, description="Event #")

# Separate output areas for the rink figure and the textual event summary.
out_plot = Output()
out_text = Output()

# ========= 4) Helpers =========
def _rink(ax):
    """Prepare `ax` as a bare rink canvas: centre line, fixed limits, equal aspect (not to scale)."""
    ax.axhline(0, lw=0.5, alpha=0.3)
    ax.set(xlim=(-100, 100), ylim=(-42, 42))
    ax.set_aspect("equal", adjustable="box")
    ax.set(xlabel="x", ylabel="y", title="Rink")

def _refresh_games(*_):
    """
    Repopulate the game dropdown from the current type/season selection.

    Rows of `df0` are selected with a boolean mask rather than by copying
    the whole frame, which keeps every widget event cheap on a large
    dataset. The sorted unique game ids become the dropdown options, or the
    single placeholder "(none)" when nothing matches. The first game is
    selected and the event slider is refreshed afterwards.

    Accepts and ignores any positional arguments so it can be used both as
    a widget observer and as a plain call.
    """
    mask = pd.Series(True, index=df0.index)

    # filter by game_type
    if gtype_dd.value:
        mask &= df0["game_type"] == gtype_dd.value

    # filter by season_start_year if selected
    if season_dd.value != "(all)":
        mask &= df0["season_start_year"] == season_dd.value

    games = sorted(df0.loc[mask, "game_id"].dropna().unique().tolist())

    game_dd.options = games or ["(none)"]
    game_dd.value = games[0] if games else "(none)"
    _refresh_event_slider()

def _refresh_event_slider(*_):
    """
    Resize the event slider to the selected game and rewind it to event 0.

    With no game selected (None or the "(none)" placeholder) the slider
    range collapses to the single position 0. The view is re-rendered in
    either case. Positional arguments are accepted and ignored.
    """
    selected = game_dd.value
    if selected in (None, "(none)"):
        last_index = 0
    else:
        n_events = len(df0[(df0["game_id"] == selected)])
        last_index = max(n_events - 1, 0)
    event_idx.max = last_index
    event_idx.value = 0
    _render()

def _render(*_):
    """
    Redraw the text summary and the rink plot for the selected event.

    Both output areas are cleared first. If no game is selected, or the
    game has no rows, only a short message is printed. Otherwise the slider
    value is clamped to the game's event range, the event is described in
    `out_text`, and a rink is drawn in `out_plot` with a marker when the
    event has both coordinates. Positional arguments are accepted and ignored.
    """
    for panel in (out_plot, out_text):
        panel.clear_output()

    selected = game_dd.value
    if selected in (None, "(none)"):
        with out_text:
            print("No game selected.")
        return

    events = df0[df0["game_id"] == selected].reset_index(drop=True)
    if events.empty:
        with out_text:
            print("No events for this game.")
        return

    # Keep the index inside [0, len(events) - 1] even if the slider is stale.
    i = min(max(int(event_idx.value), 0), len(events) - 1)

    row = events.iloc[i]

    with out_text:
        # Header info
        print(f"Game: {row['game_id']} | Type: {row.get('game_type','?')} | "
              f"Season: {int(row['season_start_year']) if pd.notna(row['season_start_year']) else 'n/a'}-"
              f"{str(int(row['season_start_year'])+1)[-2:] if pd.notna(row['season_start_year']) else 'n/a'}")
        print(f"Event #{i+1}/{len(events)}: {row['event']}  | Period {row['period']} @ {row['periodTime']}")
        print(f"Team: {row['team']}  | dateTime: {row['dateTime']}")

        # An event is plottable only when both coordinates are present.
        has_xy = not (pd.isna(row["x"]) or pd.isna(row["y"]))
        if not has_xy:
            print("(No coordinates for this event)")

    with out_plot:
        fig, ax = plt.subplots(figsize=(6,5))
        _rink(ax)
        if has_xy:
            ax.scatter([row["x"]], [row["y"]], s=70, alpha=0.9)
            ax.set_title(f"{row['event']} ({row['team']})  P{int(row['period'])} {row['periodTime']}")
        plt.tight_layout()
        plt.show()

# ========= 5) Wiring & initial display =========
# Changing the season or the type rebuilds the list of games.
season_dd.observe(_refresh_games, "value")
gtype_dd.observe(_refresh_games, "value")
# Changing the game must resize the event slider to that game's event count
# before rendering; _refresh_event_slider does both (it ends by calling
# _render). Observing _render alone left the slider range of the previous game.
game_dd.observe(_refresh_event_slider, "value")
event_idx.observe(_render, "value")

_refresh_games()  # populate game list & render

# Layout: selectors on top, then game/event controls, then text and plot.
ui_game = VBox([
    HBox([season_dd, gtype_dd]),
    HBox([game_dd, event_idx]),
    out_text,
    out_plot
])

display(ui_game)

# ========= 6)  Export this widget as a standalone HTML to embed in blog =========
from ipywidgets.embed import embed_minimal_html
from pathlib import Path
# Create the output folder if needed, then write the widget view to an HTML file.
Path("_includes").mkdir(exist_ok=True)
embed_minimal_html("_includes/interactive_widget.html", views=[ui_game], title="Explorateur LNH")
print("Exported -> _includes/interactive_widget.html")


In [None]:
from pathlib import Path
from ipywidgets.embed import embed_minimal_html


# Same export as at the end of the widget cell, repeated as its own cell; it
# overwrites the same file and needs ui_game to have been built already.
Path("_includes").mkdir(exist_ok=True)


embed_minimal_html("_includes/interactive_widget.html", views=[ui_game], title="Explorateur LNH")

print("OK → _includes/interactive_widget.html")


In [None]:
import pandas as pd

def extract_shots_and_goals(json_obj: dict, game_id: str) -> pd.DataFrame:
    """
    Extract the shots on goal and goals of one game as a DataFrame.

    Written for the new NHL API format: the shooting team is resolved from
    `details["eventOwnerTeamId"]` instead of a per-play `team` object. The
    id is mapped to an abbreviation through the top-level `homeTeam` /
    `awayTeam` objects; an id missing from that mapping is rendered as
    ``TEAM_<id>``.

    Only plays whose `typeDescKey` is "shot-on-goal" or "goal" are kept, and
    plays lacking either coordinate are dropped.

    Args:
        json_obj (dict): Parsed play-by-play JSON of a single game. A missing
            or null `plays`, `homeTeam` or `awayTeam` entry is tolerated.
        game_id (str): Identifier stored in the `game_id` column.

    Returns:
        pd.DataFrame: One row per kept event. The frame has no columns at
            all when no event qualifies.
    """
    # "or" also covers keys that are present but explicitly null.
    plays = json_obj.get("plays") or []

    # Map team ids to abbreviations using the home/away team objects.
    team_id_map = {}
    for side in ("homeTeam", "awayTeam"):
        info = json_obj.get(side) or {}
        side_id, side_abbrev = info.get("id"), info.get("abbrev")
        if side_id and side_abbrev:
            team_id_map[side_id] = side_abbrev

    # NOTE(review): the payload appears to name the shooter differently for
    # shots and goals (shootingPlayerId / scoringPlayerId) — confirm against
    # a real game file. The original key is still tried first.
    shooter_keys = ("shooterPlayerId", "shootingPlayerId", "scoringPlayerId")

    rows = []
    for p in plays:
        event = p.get("typeDescKey")
        if event not in ("shot-on-goal", "goal"):
            continue

        details = p.get("details", {}) or {}
        periodD = p.get("periodDescriptor", {}) or {}

        # Resolve the team through eventOwnerTeamId.
        team_id = details.get("eventOwnerTeamId")
        team_name = team_id_map.get(team_id) if team_id else None

        # Fall back to the raw id when it is not in the mapping.
        if team_name is None and team_id:
            team_name = f"TEAM_{team_id}"

        x_coord = details.get("xCoord")
        y_coord = details.get("yCoord")

        # Skip events without coordinates.
        if x_coord is None or y_coord is None:
            continue

        shooter = next(
            (details[k] for k in shooter_keys if details.get(k) is not None),
            None,
        )

        rows.append({
            "game_id": game_id,
            "period": periodD.get("number"),
            "periodTime": p.get("timeInPeriod"),
            "team": team_name,
            "eventType": "GOAL" if event == "goal" else "SHOT",
            "shooter": shooter,
            "goalie": details.get("goalieInNetId"),
            "x": x_coord,
            "y": y_coord,
            "shotType": details.get("shotType"),
            "emptyNet": details.get("emptyNet", False),
            "strength": details.get("strength"),
            # NOTE(review): this repeats timeInPeriod (same value as
            # periodTime), not a wall-clock timestamp — confirm intent.
            "dateTime": p.get("timeInPeriod"),
            "eventOwnerTeamId": team_id  # keep the original id as well
        })

    return pd.DataFrame(rows)

In [None]:
from pathlib import Path
import json
from tqdm import tqdm

def build_clean_shots_dataset(start=2016, end=2024, gtype="regular") -> pd.DataFrame:
    """
    Combine all shots and goals from cached JSONs into one clean DataFrame.

    Every ``*.json`` file under ``CACHE_ROOT/<year>/type-<code>`` is read for
    each year from `start` to `end` (inclusive), in sorted order so the row
    order of the result is reproducible. Each game's rows are tagged with
    `season_start_year` (the folder year) and `game_type` (`gtype`). A file
    that fails to load or parse is reported with ``[WARN]`` and skipped.

    Args:
        start (int): First season start year to include.
        end (int): Last season start year to include (inclusive).
        gtype (str): A key of GAME_TYPE_MAP.

    Returns:
        pd.DataFrame: All shot/goal rows, or a completely empty DataFrame
            when no game produced any row.

    Raises:
        KeyError: If `gtype` is not a key of GAME_TYPE_MAP.
    """
    tcode = GAME_TYPE_MAP[gtype]
    dfs = []

    for year in range(start, end + 1):
        d = CACHE_ROOT / str(year) / f"type-{tcode}"
        if not d.exists():
            continue
        # glob order depends on the filesystem; sort for a stable result.
        for f in tqdm(sorted(d.glob("*.json")), desc=f"{year} {gtype}"):
            try:
                # The cache is written as UTF-8, so read it the same way.
                js = json.loads(f.read_text(encoding="utf-8"))
                df = extract_shots_and_goals(js, f.stem)
                if not df.empty:
                    df["season_start_year"] = year
                    df["game_type"] = gtype
                    dfs.append(df)
            except Exception as e:
                # Best effort: one bad file must not abort the whole build.
                print("[WARN]", f, e)

    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

# Build the 2016-2024 regular-season shots/goals table and store it as a zipped CSV.
df_clean = build_clean_shots_dataset(2016, 2024, "regular")
_shots_out = Path("data/clean/shots_goals_regular_2016_2024.csv.zip")
# to_csv does not create directories; make sure the target folder exists.
_shots_out.parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(_shots_out, index=False, compression="zip")
print(df_clean.head(10))
print("Fichier sauvegardé : data/clean/shots_goals_regular_2016_2024.csv.zip")

In [None]:
# --- Build cleaned shots/goals CSVs from cached JSONs for selected seasons ---
# Input  : data/raw/<YEAR>/type-02/*.json (regular), data/raw/<YEAR>/type-03/*.json (playoffs)
# Output : data/clean/shots_goals_<type>_<YYYY_YYYY>.csv.zip  (SHOT & GOAL rows only)

import json
from pathlib import Path
from typing import Dict, Iterable, List, Tuple
import pandas as pd
from tqdm import tqdm

# -------------------------------------------------------------------
# Config: adjust YEARS and TYPES to what you need
# -------------------------------------------------------------------
# Season start years and game types processed by the run section of this cell.
YEARS: List[int] = [2018, 2019, 2020]                 # e.g. [2020] for season 2020-21 only; or [2018, 2019, 2020]
TYPES: Tuple[str, ...] = ("regular", "playoffs")     # ("regular",) or ("playoffs",) or ("regular","playoffs")

# Input (raw JSON cache) and output (cleaned CSV) roots; the output folder is
# created immediately. CACHE_ROOT is re-bound here to the same path as in the setup cell.
CACHE_ROOT = Path("data/raw")
CLEAN_ROOT = Path("data/clean")
CLEAN_ROOT.mkdir(parents=True, exist_ok=True)

# Game type name <-> two-digit folder code (same pairs as GAME_TYPE_MAP).
TYPE2CODE: Dict[str, str] = {"preseason":"01","regular":"02","playoffs":"03","allstar":"04"}
CODE2TYPE: Dict[str, str] = {v:k for k,v in TYPE2CODE.items()}

# -------------------------------------------------------------------
# Core extractors
# -------------------------------------------------------------------
def extract_shots_and_goals(json_obj: dict, game_id: str, game_type_label: str, season_start_year: int) -> pd.DataFrame:
    """
    Convert ONE game's play-by-play JSON (new gamecenter) into a tidy table
    containing ONLY 'shot-on-goal' and 'goal' events.

    Team, shooter and goalie are read from the per-play fields first
    (`team.triCode` / `team.name`, `shooterName`, `goalieName`). When those
    are absent the function falls back to the id-based fields: the team is
    resolved from `details.eventOwnerTeamId` through the top-level
    `homeTeam` / `awayTeam` objects, and shooter/goalie fall back to
    `shooterPlayerId` / `goalieInNetId`.

    Returned columns include (minimum per assignment):
      - game_id, season_start_year, game_type
      - period, periodTime
      - team (shooting team)
      - eventType: 'SHOT' | 'GOAL'
      - x, y (ice coordinates; may be NaN)
      - shooter, goalie (a name when available, otherwise a player id)
      - shotType
      - emptyNet (bool)
      - strength
      - dateTime

    Args:
        json_obj (dict): Parsed play-by-play JSON of a single game.
        game_id (str): Identifier stored in the `game_id` column.
        game_type_label (str): Label stored in the `game_type` column.
        season_start_year (int): Value stored in `season_start_year`.

    Returns:
        pd.DataFrame: One row per shot/goal; no columns when nothing qualifies.
    """
    plays = json_obj.get("plays", []) or []

    # Team id -> abbreviation, for plays that only carry an owner id.
    team_by_id = {}
    for side in ("homeTeam", "awayTeam"):
        info = json_obj.get(side) or {}
        if info.get("id") is not None and info.get("abbrev"):
            team_by_id[info["id"]] = info["abbrev"]

    rows = []
    for p in plays:
        # str() guards against a non-string typeCode, which has no .lower().
        evt = str(p.get("typeDescKey") or p.get("typeCode") or "").lower()
        if evt not in ("shot-on-goal", "goal"):
            continue

        det   = p.get("details", {}) or {}
        team  = p.get("team", {}) or {}
        perD  = p.get("periodDescriptor", {}) or {}

        team_label = (
            team.get("triCode")
            or team.get("name")
            or team_by_id.get(det.get("eventOwnerTeamId"))
        )
        shooter = det.get("shooterName")
        if shooter is None:
            shooter = det.get("shooterPlayerId")
        goalie = det.get("goalieName")
        if goalie is None:
            goalie = det.get("goalieInNetId")

        rows.append({
            "game_id":            game_id,
            "season_start_year":  season_start_year,
            "game_type":          game_type_label,     # 'regular' or 'playoffs'
            "period":             perD.get("number"),
            "periodTime":         p.get("timeInPeriod") or det.get("timeInPeriod"),
            "team":               team_label,
            "eventType":          "GOAL" if evt == "goal" else "SHOT",
            "x":                  det.get("xCoord"),
            "y":                  det.get("yCoord"),
            "shooter":            shooter,
            "goalie":             goalie,
            "shotType":           det.get("shotType"),
            "emptyNet":           bool(det.get("emptyNet", False)),
            "strength":           det.get("strength"),
            "dateTime":           p.get("timeUTC") or det.get("eventOwnerTeamTime"),
        })
    return pd.DataFrame(rows)

def _cache_dir_for(year: int, game_type: str) -> Path:
    """Build ``CACHE_ROOT/<year>/type-<code>``, the folder holding the raw JSON of one season and game type."""
    type_folder = "type-" + TYPE2CODE[game_type]
    return CACHE_ROOT / str(year) / type_folder

def _iter_game_files(years: Iterable[int], game_type: str) -> List[Path]:
    """
    Collect the cached JSON files of the given seasons for one game type.

    Seasons are visited in the order given; within a season the files are
    sorted. Seasons whose cache folder does not exist contribute nothing.
    Only the local cache is scanned; nothing is fetched from the web.
    """
    season_dirs = (_cache_dir_for(y, game_type) for y in years)
    return [
        json_file
        for folder in season_dirs
        if folder.exists()
        for json_file in sorted(folder.glob("*.json"))
    ]

def build_clean_for(years: Iterable[int], game_type: str) -> pd.DataFrame:
    """
    Build a cleaned DataFrame (shots & goals only) for the selected seasons and one game type.
    Reads ONLY from local cache under data/raw.

    `years` is materialised into a list up front because it is used several
    times (file listing, progress label, error message); a one-shot iterable
    such as a generator would otherwise be exhausted after the first use.

    Args:
        years (Iterable[int]): Season start years to include.
        game_type (str): A key of TYPE2CODE.

    Returns:
        pd.DataFrame: Concatenated shot/goal rows of every readable game, or
            an empty DataFrame when no game produced any row.

    Raises:
        FileNotFoundError: If no cached JSON file exists for the selection.
    """
    years = list(years)
    files = _iter_game_files(years, game_type)
    if not files:
        raise FileNotFoundError(f"No cached JSON found for {years} {game_type} under {CACHE_ROOT}/<year>/type-*/")

    dfs = []
    for f in tqdm(files, desc=f"Extract {game_type} ({min(years)}..{max(years)})"):
        try:
            js = json.loads(f.read_text(encoding="utf-8"))
            game_id = f.stem
            year = int(f.parent.parent.name)   # the 'year' folder name
            df = extract_shots_and_goals(js, game_id, game_type, year)
            if not df.empty:
                dfs.append(df)
        except Exception as e:
            # Best effort: one bad file must not abort the whole build.
            print("[WARN]", f, e)
    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

def _season_suffix_range(years: Iterable[int]) -> str:
    """
    Turn a collection of season start years into a ``first_last`` filename suffix.

    Duplicates are ignored and order does not matter. Several distinct
    years give ``<min>_<max>`` (e.g. [2018, 2019, 2020] -> '2018_2020'); a
    single year gives the hockey span of that season (2020 -> '2020_2021').
    """
    distinct = sorted(set(years))
    if len(distinct) == 1:
        (only,) = distinct
        return f"{only}_{only + 1}"
    first, last = distinct[0], distinct[-1]
    return f"{first}_{last}"

def save_clean_df(df: pd.DataFrame, out_path: Path) -> Path:
    """
    Save a DataFrame as CSV, choosing the compression from the file suffix.

    Parent directories are created when missing. The last suffix of
    `out_path` selects the compression: ".gz" -> gzip, ".zip" -> zip; any
    other suffix is left to pandas' own inference. A ``[OK]`` line with the
    path and row count is printed after writing.

    Args:
        df (pd.DataFrame): The table to save (written without its index).
        out_path (Path): Destination file path.

    Returns:
        Path: `out_path`, for convenient chaining.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Match the container format to the suffix; a ".gz" file must be gzip,
    # not a ZIP archive, or readers expecting gzip cannot open it.
    compression = {".gz": "gzip", ".zip": "zip"}.get(out_path.suffix, "infer")
    df.to_csv(out_path, index=False, compression=compression)
    print(f"[OK] Saved {out_path}  rows={len(df)}")
    return out_path

# -------------------------------------------------------------------
# Run: build per requested type, then (optionally) a merged file
# -------------------------------------------------------------------
# Paths of every file written by this cell, in write order.
saved_paths: List[Path] = []

# One output file per requested game type.
# Note: build_clean_for raises FileNotFoundError when a type has no cached
# files at all, which stops the loop; the empty check below only covers the
# case where files exist but yield no shot/goal rows.
for gtype in TYPES:
    df_clean = build_clean_for(YEARS, gtype)
    if df_clean.empty:
        print(f"[WARN] Empty result for {YEARS} {gtype}")
        continue
    suffix = _season_suffix_range(YEARS)
    outp = CLEAN_ROOT / f"shots_goals_{gtype}_{suffix}.csv.zip"
    save_clean_df(df_clean, outp)
    saved_paths.append(outp)

# Optional: merge all requested types (if more than one) into one file
# The merge re-reads the files just written rather than reusing the frames.
if len(saved_paths) >= 2:
    merged = pd.concat([pd.read_csv(p, compression="zip") for p in saved_paths], ignore_index=True)
    outm = CLEAN_ROOT / f"shots_goals_{_season_suffix_range(YEARS)}.csv.zip"
    save_clean_df(merged, outm)
    saved_paths.append(outm)

# Final listing of everything that was written.
print("\n[SUMMARY]")
for p in saved_paths:
    print(" -", p)


In [None]:
# --- Cell: Simple visualizations for IFT6758 Étape 1 (Q4) ---
# This cell produces:
#   - Q1: Bar chart of total shots vs goals by shot type (single season)
#   - Q1a: Goal probability vs distance curves for seasons 2018-19 to 2020-21
#   - Q2: Heatmap of goal% by distance bin and shot type (single season)
#
# Requirements: pandas, numpy, matplotlib (no seaborn needed)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Tuple, List

# ---------------------------
# Config: input files & season
# ---------------------------
# Point this to your cleaned shots/goals dataset generated in Step 3.
# Accepts .csv, .csv.gz or .csv.zip
# Candidate cleaned datasets, in order of preference (consumed by
# load_first_available, which loads the first path that exists).
CLEAN_PATHS = [
    "data/clean/shots_goals_regular_2018_2020.csv.zip",
    "data/clean/shots_goals_playoffs_2018_2020.csv.zip",
    "data/clean/shots_goals_2018_2020.csv.zip",
   
]

# Values below are season start years (season_start_year column).
SEASON_FOR_SINGLE = 2020  # 2020-21 season (season_start_year=2020)
SEASONS_FOR_CURVES = [2018, 2019, 2020]  # 2018-19, 2019-20, 2020-21

# ---------------------------
# Utilities
# ---------------------------
def load_first_available(paths: List[str]) -> pd.DataFrame:
    """
    Return the first candidate dataset that exists on disk, read as CSV.

    The compression handed to `pd.read_csv` is chosen from the file's last
    suffix: ".gz" -> "gzip", ".zip" -> "zip", otherwise None. A ``[LOAD]``
    line naming the chosen file is printed before reading.

    Raises:
        FileNotFoundError: If none of the candidate paths exists.
    """
    # ".csv.gz" / ".csv.zip" end in ".gz" / ".zip", so the last suffix suffices.
    by_suffix = {".gz": "gzip", ".zip": "zip"}
    for candidate in map(Path, paths):
        if not candidate.exists():
            continue
        comp = by_suffix.get(candidate.suffix)
        print(f"[LOAD] {candidate} (compression={comp})")
        return pd.read_csv(candidate, compression=comp)
    checked = "\n  - ".join(paths)
    raise FileNotFoundError(
        "No cleaned dataset found. Please generate your cleaned shots/goals file first.\n"
        "Checked:\n  - " + checked
    )

def ensure_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize the columns expected by this visualization cell.

    The input frame is never modified: every derived column is added with
    ``DataFrame.assign``, so a new frame is returned whenever something had
    to be filled in (the input itself is returned when nothing was missing).

    Expected semantics:
      - secondaryType: shot type name (if missing, copied from 'shotType')
      - eventType: 'SHOT' or 'GOAL' (if missing, mapped from 'event';
        unrecognised events become NaN)
      - season_start_year: inferred from 'dateTime' if missing
      - x, y present (may contain NaN)

    Args:
        df: Cleaned shots/goals table.

    Returns:
        pd.DataFrame: Frame containing 'secondaryType', 'eventType', 'x', 'y'
        and 'season_start_year'.

    Raises:
        ValueError: If a required column is missing and cannot be derived.
    """
    # --- secondaryType fallback from shotType ---
    if "secondaryType" not in df.columns:
        if "shotType" not in df.columns:
            raise ValueError("Missing required column 'secondaryType' (no 'shotType' to fallback).")
        df = df.assign(secondaryType=df["shotType"])

    # --- eventType fallback from event ---
    if "eventType" not in df.columns:
        if "event" not in df.columns:
            raise ValueError("Neither 'eventType' nor 'event' found to identify SHOT/GOAL.")
        event_to_type = {
            "goal": "GOAL",
            "shot-on-goal": "SHOT",
            "shot": "SHOT",
        }
        df = df.assign(eventType=df["event"].astype(str).str.lower().map(event_to_type))

    # --- coordinates presence check (allow NaN) ---
    for c in ("x", "y"):
        if c not in df.columns:
            raise ValueError(f"Missing required column '{c}'.")

    # --- season inference if absent ---
    if "season_start_year" not in df.columns:
        if "dateTime" not in df.columns:
            raise ValueError("Missing 'season_start_year' and 'dateTime' to infer it.")

        def _season_from_dt(dt_str):
            # Reads "YYYY-MM..." by position; August or later counts as the
            # start of a new season, earlier months belong to the previous one.
            # NOTE(review): seasons with an atypical calendar may be
            # misassigned by this fixed cutoff; verify against the game ids.
            text = str(dt_str)
            try:
                year, month = int(text[:4]), int(text[5:7])
            except ValueError:
                # Unparseable timestamp (e.g. "nan"): leave the season unknown.
                return np.nan
            return year if month >= 8 else year - 1

        # assign() instead of item assignment so the caller's frame is untouched.
        df = df.assign(season_start_year=df["dateTime"].astype(str).map(_season_from_dt))

    return df


def compute_distance_xy(x: pd.Series, y: pd.Series) -> pd.Series:
    """
    Euclidean distance from each (x, y) shot location to the attacking net.

    The net is taken to sit at x=89, y=0 in NHL rink coordinates, so the
    result is expressed in the same 'rink feet' units as the inputs.
    """
    dx = 89 - x.astype(float)
    dy = 0 - y.astype(float)
    return np.sqrt(dx**2 + dy**2)

def season_filter(df: pd.DataFrame, season_start_year: int) -> pd.DataFrame:
    """
    Select the rows of one NHL season, identified by its start year
    (e.g. 2020 for 2020-21). The result is an independent copy.
    """
    in_season = df["season_start_year"].eq(season_start_year)
    return df.loc[in_season].copy()

def summarize_shots_goals_by_type(df: pd.DataFrame) -> pd.DataFrame:
    """
    Count SHOT and GOAL rows per 'secondaryType' and derive a goal rate.

    Rows whose eventType is neither 'SHOT' nor 'GOAL' are ignored. The result
    is indexed by shot type with columns 'SHOT', 'GOAL' and 'goal_rate'
    (goals / (goals + shots)), sorted by SHOT then GOAL, largest first.
    """
    is_attempt = df["eventType"].isin(["SHOT", "GOAL"])
    counts = (
        df[is_attempt]
        .groupby(["secondaryType", "eventType"])
        .size()
        .unstack(fill_value=0)
    )
    # An outcome that never occurs has no column after unstack(); add it as 0.
    for outcome in ("SHOT", "GOAL"):
        if outcome not in counts.columns:
            counts[outcome] = 0
    attempts = counts["GOAL"] + counts["SHOT"]
    counts["goal_rate"] = counts["GOAL"] / attempts.replace({0: np.nan})
    return counts.sort_values(by=["SHOT", "GOAL"], ascending=False)

def binned_goal_rate(df: pd.DataFrame, bin_width: int = 2, max_dist: int = 90) -> Tuple[np.ndarray, np.ndarray]:
    """
    Compute the goal rate per distance bin from the SHOT and GOAL rows of `df`.

    Distances come from ``compute_distance_xy`` and are cut into bins of
    `bin_width` covering 0..`max_dist`; rows whose distance is NaN or falls
    outside the bins are ignored. Every bin is always reported, so the
    returned arrays have one entry per bin regardless of the pandas version;
    a bin without any attempt has a NaN rate.

    Args:
        df: Frame with 'eventType', 'x' and 'y' columns.
        bin_width: Width of each distance bin.
        max_dist: Upper edge of the binned distance range.

    Returns:
        (bin_centers, goal_rate): two float arrays of equal length, where
        goal_rate is goals / (goals + shots) inside each bin.
    """
    sub = df[df["eventType"].isin(["SHOT", "GOAL"])].copy()
    sub["distance"] = compute_distance_xy(sub["x"], sub["y"])
    edges = np.arange(0, max_dist + bin_width, bin_width)
    centers = edges[:-1] + bin_width / 2.0
    # Label each bin by its centre so the group index doubles as the x axis.
    sub["d_bin"] = pd.cut(sub["distance"], bins=edges, include_lowest=True, labels=centers)
    sub["is_goal"] = sub["eventType"].eq("GOAL").astype(int)
    # observed=False is spelled out: the implicit default for categorical
    # groupers is deprecated, and relying on it makes the number of returned
    # bins depend on the installed pandas version.
    per_bin = sub.groupby("d_bin", observed=False)["is_goal"].agg(["sum", "count"])
    attempts = per_bin["count"].replace({0: np.nan})
    rate = (per_bin["sum"] / attempts).astype(float)
    return per_bin.index.astype(float).to_numpy(), rate.to_numpy()

def heatmap_goal_rate_by_distance_and_type(df: pd.DataFrame, bin_width: int = 5, max_dist: int = 90) -> Tuple[np.ndarray, List[str], List[str]]:
    """
    Build a 2D matrix of goal rate indexed by distance bin (rows) and shot
    type (columns), using only the SHOT and GOAL rows of `df`.

    Args:
        df: Frame with 'eventType', 'secondaryType', 'x' and 'y' columns.
        bin_width: Width of each distance bin.
        max_dist: Upper edge of the binned distance range.

    Returns:
        (matrix, col_labels, row_labels):
          - matrix: float array of goals / attempts, one row per distance bin
            kept by the pivot and one column per shot type;
          - col_labels: shot type names, sorted alphabetically;
          - row_labels: string form of each distance interval, in row order.
    """
    sub = df[df["eventType"].isin(["SHOT", "GOAL"])].copy()
    sub["distance"] = compute_distance_xy(sub["x"], sub["y"])
    # distance bins
    bins = np.arange(0, max_dist + bin_width, bin_width)
    sub["d_bin"] = pd.cut(sub["distance"], bins=bins, include_lowest=True)

    def _rate(s):
        # Share of GOAL among the events of one (bin, type) cell; the pivot
        # may hand over an empty group for an unobserved bin, hence the guard.
        s = s.dropna()
        if s.empty:
            return np.nan
        return (s == "GOAL").sum() / s.size

    # observed=False keeps the historical pivot behaviour explicitly instead
    # of relying on the deprecated implicit default for categorical keys.
    pivot = sub.pivot_table(
        index="d_bin",
        columns="secondaryType",
        values="eventType",
        aggfunc=_rate,
        observed=False,
    )
    # Sort columns alphabetically for stable display
    pivot = pivot.reindex(sorted(pivot.columns), axis=1)
    row_labels = [str(idx) for idx in pivot.index]
    return pivot.to_numpy(dtype=float), pivot.columns.tolist(), row_labels

# ---------------------------
# Load & normalize
# ---------------------------
# Read the first cleaned dataset found on disk and normalize its schema in
# one step; ensure_columns() performs the 'shotType' -> 'secondaryType'
# fallback itself.
df_all = ensure_columns(load_first_available(CLEAN_PATHS))

# ---------------------------
# Q1: Shots vs Goals by shot type (single season)
# ---------------------------
df_one = season_filter(df_all, SEASON_FOR_SINGLE)
by_type = summarize_shots_goals_by_type(df_one)

print(f"[INFO] Season {SEASON_FOR_SINGLE}-{str(SEASON_FOR_SINGLE+1)[-2:]} rows:", len(df_one))
display(by_type.head(10))

# Create assets directory if it doesn't exist (later figures are saved there)
Path("assets").mkdir(parents=True, exist_ok=True)
print("Folder ready:", Path('assets').resolve())


fig, ax = plt.subplots(figsize=(10, 5))
by_type[["SHOT", "GOAL"]].plot(kind="bar", ax=ax, width=0.8)
ax.set_title(f"Shots vs Goals by Shot Type — Season {SEASON_FOR_SINGLE}-{str(SEASON_FOR_SINGLE+1)[-2:]}")
ax.set_xlabel("Shot type (secondaryType)")
ax.set_ylabel("Count")
fig.tight_layout()

# Save BEFORE show(): with inline / non-interactive backends show() releases
# the figure, so a savefig() issued afterwards writes an empty image.
# NOTE(review): this file lands in the working directory while the other
# figures go to assets/; confirm whether it should move there too.
fig.savefig("Shots vs Goals by Shot Type.png", dpi=150)
plt.show()


# Optionally print quick takeaways:
most_common = by_type["SHOT"].idxmax() if not by_type.empty else "n/a"
most_dangerous = by_type["goal_rate"].idxmax() if "goal_rate" in by_type.columns and by_type["goal_rate"].notna().any() else "n/a"
print(f"[TAKEAWAY] Most common shot type: {most_common}")
print(f"[TAKEAWAY] Highest goal-rate shot type: {most_dangerous}")

# ---------------------------
# Q1a: Goal probability vs distance for 2018-19, 2019-20, 2020-21
# ---------------------------
plt.figure(figsize=(9,5))
# One curve per season; seasons without any row are reported and skipped.
for sy in SEASONS_FOR_CURVES:
    df_s = season_filter(df_all, sy)
    if df_s.empty:
        print(f"[WARN] No data for season_start_year={sy}")
        continue
    x_mid, y_rate = binned_goal_rate(df_s, bin_width=2, max_dist=90)
    plt.plot(x_mid, y_rate, marker="o", linestyle="-", label=f"{sy}-{str(sy+1)[-2:]}")
plt.title("Goal probability vs distance (shots+goals)\nSeasons 2018-19 to 2020-21")
plt.xlabel("Distance to net (feet)"); plt.ylabel("Goal rate")
plt.ylim(0, 1.0); plt.legend(); plt.tight_layout()
# Save BEFORE show(): with inline / non-interactive backends show() releases
# the figure, so a savefig() issued afterwards writes an empty image.
plt.savefig("assets/Goal probability vs distance.png", dpi=150)
plt.show()


# ---------------------------
# Q2: Heatmap of goal% by distance × shot type (single season)
# ---------------------------
mat, col_labels, row_labels = heatmap_goal_rate_by_distance_and_type(df_one, bin_width=5, max_dist=90)

plt.figure(figsize=(11,6))
# origin="lower" puts the first distance bin at the bottom of the image.
im = plt.imshow(mat, aspect="auto", origin="lower")
plt.colorbar(im, fraction=0.046, pad=0.04, label="Goal rate")
plt.xticks(ticks=np.arange(len(col_labels)), labels=col_labels, rotation=45, ha="right")
plt.yticks(ticks=np.arange(len(row_labels)), labels=row_labels)
plt.title(f"Goal% by distance bin and shot type — Season {SEASON_FOR_SINGLE}-{str(SEASON_FOR_SINGLE+1)[-2:]}")
plt.xlabel("Shot type"); plt.ylabel("Distance bins (feet)")
plt.tight_layout()

# Save BEFORE show(): with inline / non-interactive backends show() releases
# the figure, so a savefig() issued afterwards writes an empty image.
plt.savefig("assets/Goals by distance.png", dpi=150)
plt.show()



In [None]:
# --- Cellule : Calcul de la densité de tir ---
# Cette fonction calcule où les équipes tirent le plus sur la patinoire
# Utilise la méthode KDE (estimation de densité par noyau) pour lisser les positions

import pandas as pd
import numpy as np
from scipy.stats import gaussian_kde
import plotly.express as px
import plotly.graph_objects as go

def calculer_densite_tirs(df, annee_saison, equipe=None):
    """
    Estimate the shot density over the x > 0 half of the rink with a
    Gaussian KDE.

    Rows are restricted to the given season (and team, when `equipe` is
    truthy), to SHOT/GOAL events, to rows with both coordinates present and
    to x > 0. The fitted KDE is then evaluated on a fixed 50 x 50 grid
    spanning x in [0, 100] and y in [-42, 42].

    NOTE(review): shots recorded with x <= 0 are discarded, not mirrored;
    confirm that coordinates are normalised upstream.

    Args:
        df: Frame with 'season_start_year', 'team', 'eventType', 'x', 'y'.
        annee_saison: Season start year to keep.
        equipe: Team to keep; a falsy value means the whole league.

    Returns:
        dict | None: ``None`` when no shot remains or the KDE cannot be
        fitted; otherwise a dict with keys 'x_grille', 'y_grille' (grid
        axes), 'densite' (array shaped (len(y_grille), len(x_grille))),
        'equipe', 'saison' and 'total_tirs' (number of shots used).
    """
    # Season filter, optionally narrowed to one team
    masque = df['season_start_year'] == annee_saison
    if equipe:
        masque = masque & (df['team'] == equipe)

    # Keep shots and goals only, and drop rows without coordinates
    df_tirs = df.loc[masque]
    df_tirs = df_tirs[df_tirs['eventType'].isin(['SHOT', 'GOAL'])]
    df_tirs = df_tirs.dropna(subset=['x', 'y'])

    # Keep only shots on the x > 0 side
    tirs_offensifs = df_tirs[df_tirs['x'] > 0]

    # Nothing to estimate from
    if tirs_offensifs.empty:
        return None

    # gaussian_kde expects one row per dimension: shape (2, n_shots)
    coordonnees = tirs_offensifs[['x', 'y']].to_numpy(dtype=float).T
    try:
        kde = gaussian_kde(coordonnees)
    except (ValueError, np.linalg.LinAlgError) as e:
        # Too few or degenerate points make the covariance unusable;
        # report it and let the caller skip this team.
        print(f"Erreur KDE pour {equipe or 'ligue'} ({len(coordonnees[0])} tirs) → {e}")
        return None

    # Evaluation grid
    x_grille = np.linspace(0, 100, 50)
    y_grille = np.linspace(-42, 42, 50)
    X, Y = np.meshgrid(x_grille, y_grille)
    coord_grille = np.vstack([X.ravel(), Y.ravel()])

    # Density evaluated at every grid node, reshaped back to the grid
    densite = kde(coord_grille).reshape(X.shape)

    return {
        'x_grille': x_grille,
        'y_grille': y_grille,
        'densite': densite,
        'equipe': equipe,
        'saison': annee_saison,
        'total_tirs': len(tirs_offensifs)
    }

In [None]:
# --- Cellule : Graphique interactif avec menu équipes ---
# Cette fonction crée la carte avec menu déroulant

def creer_carte_interactive(donnees_calcul, liste_equipes):
    """
    Build an interactive contour map with a dropdown to pick the team.

    The first trace is the league average (visible by default); one hidden
    trace follows for every team of `liste_equipes` that has an entry in
    `donnees_calcul`. Each dropdown entry shows exactly one trace.

    Args:
        donnees_calcul: dict mapping 'moyenne_ligue' and team names to the
            density dicts read here ('x_grille', 'y_grille', 'densite', and
            'saison' for the league entry).
        liste_equipes: candidate team names, in menu order. Teams without an
            entry in `donnees_calcul` get neither a trace nor a menu entry.

    Returns:
        The configured plotly figure.
    """
    ligue = donnees_calcul['moyenne_ligue']
    # Trace positions must be derived from the teams that are really drawn;
    # indexing by position in liste_equipes would point at the wrong trace
    # as soon as one team has no data.
    equipes_tracees = [equipe for equipe in liste_equipes if equipe in donnees_calcul]
    nb_traces = 1 + len(equipes_tracees)

    fig = go.Figure()

    # Trace 0: league average
    fig.add_trace(go.Contour(
        x=ligue['x_grille'],
        y=ligue['y_grille'],
        z=ligue['densite'],
        colorscale='Blues',
        showscale=False,
        name='Moyenne Ligue',
        visible=True  # shown by default
    ))

    # Traces 1..n: one per team with data
    for equipe in equipes_tracees:
        donnees_equipe = donnees_calcul[equipe]
        fig.add_trace(go.Contour(
            x=donnees_equipe['x_grille'],
            y=donnees_equipe['y_grille'],
            z=donnees_equipe['densite'],
            colorscale='Reds',
            showscale=False,
            name=equipe,
            visible=False  # hidden until selected
        ))

    def _un_seul_visible(position):
        # Visibility mask with a single True at the given trace position.
        return [idx == position for idx in range(nb_traces)]

    # Dropdown: league average first, then each drawn team
    boutons_menu = [dict(
        label='Moyenne Ligue',
        method='update',
        args=[{'visible': _un_seul_visible(0)}]
    )]
    for position, equipe in enumerate(equipes_tracees, start=1):
        boutons_menu.append(dict(
            label=equipe,
            method='update',
            args=[{'visible': _un_seul_visible(position)}]
        ))

    # Layout and menu placement
    fig.update_layout(
        title=f"Carte des Tirs Offensifs - Saison {ligue['saison']}",
        xaxis_title='Distance du filet (pieds)',
        yaxis_title='Position latérale (pieds)',
        updatemenus=[dict(
            type="dropdown",
            direction="down",
            x=0.1,
            y=1.15,
            buttons=boutons_menu
        )]
    )

    return fig

In [None]:
import os

# Folder (under data/) that receives the generated charts
output_dir = "data/graphiques"
os.makedirs(output_dir, exist_ok=True)

print(f"Dossier créé : {output_dir}")


# Build one interactive shot map per season
for annee_saison in saisons:
    print(f"Création graphique saison {annee_saison}-{annee_saison+1}...")

    donnees_calcul = {}

    # League-wide density for this season; without it the season is skipped
    moyenne_ligue = calculer_densite_tirs(df_normalise, annee_saison=annee_saison)
    if moyenne_ligue is None:
        print(f"Aucune donnée moyenne ligue pour {annee_saison}")
        continue
    donnees_calcul['moyenne_ligue'] = moyenne_ligue
    print(f"Moyenne ligue: {moyenne_ligue['total_tirs']} tirs")

    # Teams seen this season, without empty / missing / "UNK" entries
    equipes_brutes = df_normalise.loc[
        df_normalise['season_start_year'] == annee_saison, 'team'
    ].unique()
    liste_equipes_saison = sorted(
        team for team in equipes_brutes
        if team and pd.notna(team) and team != "UNK"
    )

    print(f"Équipes disponibles: {len(liste_equipes_saison)}")

    # Per-team density (limited to the first 5 teams, as a trial run)
    equipes_test = liste_equipes_saison[:5]
    equipes_avec_donnees = 0

    for equipe in equipes_test:
        dens = calculer_densite_tirs(df_normalise, annee_saison=annee_saison, equipe=equipe)
        if not dens:
            continue
        donnees_calcul[equipe] = dens
        equipes_avec_donnees += 1

    print(f"Équipes avec données: {equipes_avec_donnees}/{len(equipes_test)}")

    # At least one team is needed on top of the league average
    if len(donnees_calcul) <= 1:
        print(f"Pas assez de données pour {annee_saison}")
        continue

    fig = creer_carte_interactive(donnees_calcul, equipes_test)

    # Write the chart into the output folder
    nom_fichier = os.path.join(output_dir, f"plan_tir_{annee_saison}_{annee_saison+1}.html")
    fig.write_html(nom_fichier)
    print(f"Sauvegardé: {nom_fichier}")

    # Only the 2020 chart is displayed inline
    if annee_saison == 2020:
        print("Affichage du dernier graphique...")
        fig.show()
