<a href="https://colab.research.google.com/github/RohanCP26/nbaPredictor/blob/main/InitialStatCrossCheck.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import os, glob, warnings
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import roc_auc_score

try:
    from scipy.stats import spearmanr
    HAVE_SCIPY = True
except Exception:
    HAVE_SCIPY = False


# =========================
# Logging
# =========================
def log(msg):
    """Write a progress or warning message to standard output."""
    print(msg)

def safe_listdir(path="."):
    """Return the sorted entry names of *path*, or an empty list if it cannot be listed."""
    try:
        entries = os.listdir(path)
    except Exception:
        # Best effort only: a missing or unreadable directory is reported as empty.
        return []
    return sorted(entries)

# =========================
# File discovery (FIXED)
#   - Prefer 24-25
#   - Exclude 2016-17 and 2016_17
# =========================
def is_bad_file(path: str) -> bool:
    """Return True when the file name (directory part ignored) carries a 2016 marker.

    The comparison is case-insensitive and looks only at the base name, so a
    "2016" directory in the path does not disqualify a file.
    """
    base = os.path.basename(path).lower()
    return any(token in base for token in ("2016-17", "2016_17", "2016"))

def find_best(patterns, prefer_tokens=None, require_tokens=None):
    """
    Pick the single best file matching any of the glob *patterns*.

    - Only regular files are considered; files flagged by is_bad_file are dropped.
    - require_tokens: every token must occur (case-insensitively) in the base name.
    - prefer_tokens: each token found in the base name adds 10 to the score.
    - Shorter paths win ties; the path string itself is the final tie-breaker.

    Returns the chosen path, or None when nothing qualifies.
    """
    candidates = []
    for pattern in patterns:
        for match in glob.glob(pattern, recursive=True):
            if os.path.isfile(match) and not is_bad_file(match):
                candidates.append(match)

    if require_tokens:
        required = [tok.lower() for tok in require_tokens]
        candidates = [
            c for c in candidates
            if all(tok in os.path.basename(c).lower() for tok in required)
        ]

    if not candidates:
        return None

    def rank_key(path):
        base = os.path.basename(path).lower()
        points = 0
        if prefer_tokens:
            for tok in prefer_tokens:
                if tok.lower() in base:
                    points += 10
        # Slight penalty per character so shorter paths rank higher.
        points -= 0.001 * len(path)
        return (-points, len(path), path)

    # The smallest key is the same element a sort on this key would put first.
    return min(candidates, key=rank_key)


# =========================
# Robust column utilities
# =========================
def normalize_cols(df):
    """Return a copy of *df* with tidied column labels.

    Labels are cast to str, newlines become spaces, "%" becomes "PCT",
    runs of whitespace collapse to one space, and the ends are stripped.
    The input frame is left untouched.
    """
    labels = df.columns.astype(str)
    labels = labels.str.replace("\n", " ", regex=False)
    labels = labels.str.replace("%", "PCT", regex=False)
    labels = labels.str.replace(r"\s+", " ", regex=True).str.strip()

    out = df.copy()
    out.columns = labels
    return out

def make_unique_columns(df):
    """Return a copy of *df* whose column labels are guaranteed to be unique.

    The first occurrence of a label is kept as-is; the k-th repeat is renamed
    to ``"<label>__<k>"`` (second occurrence -> ``__2``, third -> ``__3``, ...).
    If a generated name is already taken by another column (for example the
    frame already holds a literal ``"a__2"``), the counter keeps advancing
    until a free name is found, so the result never contains duplicates.
    The input frame is not modified.
    """
    counts = {}      # label -> highest suffix number handed out so far
    used = set()     # every output label emitted so far
    new_cols = []
    for c in df.columns:
        n = counts.get(c, 0) + 1
        candidate = c if n == 1 else f"{c}__{n}"
        # Skip over names that would clash with an already-emitted label.
        while candidate in used:
            n += 1
            candidate = f"{c}__{n}"
        counts[c] = n
        used.add(candidate)
        new_cols.append(candidate)

    df = df.copy()
    df.columns = new_cols
    return df

# =========================
# Player name helpers
# =========================
def clean_player_name(s):
    """Tidy whitespace in a Series of player names.

    Values are cast to str, non-breaking spaces become regular spaces,
    whitespace runs collapse to a single space, and both ends are stripped.
    """
    text = s.astype(str)
    text = text.str.replace("\u00a0", " ", regex=False)
    text = text.str.replace(r"\s+", " ", regex=True)
    return text.str.strip()

def last_first_to_first_last(name):
    """Convert "Last, First" into "First Last"; other names are only stripped.

    Non-string input is converted with str() first. Only the first comma
    is treated as the separator.
    """
    text = name if isinstance(name, str) else str(name)
    if "," not in text:
        return text.strip()
    last, first = (part.strip() for part in text.split(",", 1))
    return f"{first} {last}".strip()

def canonical_name_series(s):
    """Build a lowercase join key from a Series of player names.

    After clean_player_name, the names are lowercased; periods, apostrophes
    and commas are removed; the standalone suffixes jr/sr/ii/iii/iv are
    dropped; and leftover whitespace is collapsed and stripped.
    """
    return (
        clean_player_name(s)
        .str.lower()
        .str.replace(r"[.\']", "", regex=True)
        .str.replace(",", "", regex=False)
        .str.replace(r"\b(jr|sr|ii|iii|iv)\b", "", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )

# =========================
# Safe read
# =========================
def safe_read_csv(path):
    """Read a CSV into a frame with normalized, unique column labels.

    Returns None when *path* is None or when reading/normalizing fails
    (the failure is logged as a warning rather than raised).
    """
    if path is None:
        return None
    try:
        frame = make_unique_columns(normalize_cols(pd.read_csv(path)))
    except Exception as e:
        log(f"[warn] Could not read CSV '{path}': {e}")
        return None
    return frame

# =========================
# Player/team column detection
# =========================
def ensure_player_col(df, filename="(unknown)"):
    """Return a copy of *df* that has a cleaned "Player" column, or None.

    Lookup order: the first column whose stripped, lowercased label is a known
    player alias; then a literal "PLAYER" column; finally the first
    object-dtype column (with a warning). None is returned for a
    missing/empty frame or when no candidate column exists. *filename* is
    used only in log messages.
    """
    if df is None or not isinstance(df, pd.DataFrame) or df.empty:
        log(f"[warn] Empty/missing dataframe for {filename}. Skipping.")
        return None

    df = make_unique_columns(normalize_cols(df))

    aliases = ["player", "player_name", "player name", "name", "playerfullname", "player_full_name", "player full name"]
    match = next((c for c in df.columns if c.strip().lower() in aliases), None)
    if match is not None:
        df = df.rename(columns={match: "Player"})

    if "Player" not in df.columns and "PLAYER" in df.columns:
        df = df.rename(columns={"PLAYER": "Player"})

    if "Player" not in df.columns:
        obj_cols = [c for c in df.columns if pd.api.types.is_object_dtype(df[c])]
        if not obj_cols:
            log(f"[warn] Could not find player column in {filename}. Columns={df.columns.tolist()[:30]}")
            return None
        # Last resort: assume the first text column holds the player names.
        df = df.rename(columns={obj_cols[0]: "Player"})
        log(f"[warn] No explicit player column in {filename}. Using '{obj_cols[0]}' as Player.")

    df["Player"] = clean_player_name(df["Player"])
    return df

def ensure_team_col(df):
    """Return a copy of *df* with its team column named "Team" when one can be found.

    An existing "Team" column is left alone. Otherwise "TEAM" is renamed, and
    failing that the first column whose stripped, lowercased label is one of
    team / tm / team_abbreviation. None is passed through unchanged.
    """
    if df is None:
        return None

    out = df.copy()
    if "Team" in out.columns:
        return out
    if "TEAM" in out.columns:
        return out.rename(columns={"TEAM": "Team"})

    for label in list(out.columns):
        if label.strip().lower() in ["team", "tm", "team_abbreviation"]:
            return out.rename(columns={label: "Team"})
    return out

# =========================
# Collapse traded players (safe)
# =========================
def collapse_traded_players(df, weight_col="GP"):
    """Collapse multiple rows per player into a single row.

    Steps:
      1. If a "Team" column exists and a player has a "TOT" row, only that
         row is kept for the player.
      2. If players are still duplicated, numeric columns are combined with a
         weighted average (weights come from *weight_col*, defaulting to 1.0
         where it is missing or non-numeric) and the remaining columns take
         the first value seen per player.

    Parameters:
        df: frame with a "Player" column; None or a frame without "Player"
            is returned unchanged.
        weight_col: name of the column providing the averaging weights.

    Returns:
        The collapsed frame. On any internal failure a warning is logged and
        the uncollapsed working copy is returned, with the temporary "_w_"
        helper column removed so it cannot leak into downstream features.
    """
    if df is None or "Player" not in df.columns:
        return df
    try:
        df = df.copy()
        df = ensure_team_col(df)
        df = make_unique_columns(df)

        if "Team" in df.columns:
            tot_mask = df["Team"].astype(str).str.upper().eq("TOT")
            players_with_tot = set(df.loc[tot_mask, "Player"].unique())
            if players_with_tot:
                # Keep only the TOT row for players that have one; everyone else is untouched.
                df = pd.concat([
                    df[df["Player"].isin(players_with_tot) & tot_mask],
                    df[~df["Player"].isin(players_with_tot)]
                ], ignore_index=True)

        if df["Player"].nunique() == len(df):
            return df

        if weight_col in df.columns:
            w = pd.to_numeric(df[weight_col], errors="coerce").fillna(1.0).values
        else:
            w = np.ones(len(df))
        df["_w_"] = w

        # Text columns that actually hold numbers are converted so they can be averaged.
        for c in df.columns:
            if c in ["Player", "Team", "_w_"]:
                continue
            if pd.api.types.is_object_dtype(df[c]):
                tmp = pd.to_numeric(df[c], errors="coerce")
                if tmp.notna().sum() > 0:
                    df[c] = tmp

        num_cols = [c for c in df.columns if c not in ["Player", "Team", "_w_"] and pd.api.types.is_numeric_dtype(df[c])]
        other_cols = [c for c in df.columns if c not in num_cols + ["_w_"]]

        def wavg(x):
            x = pd.to_numeric(x, errors="coerce")
            ww = df.loc[x.index, "_w_"].values
            mask = (~x.isna()).values
            if mask.sum() == 0:
                return np.nan
            vals = x.values[mask]
            wts = ww[mask]
            # np.average raises when the weights sum to zero (e.g. every stint
            # has 0 games played); fall back to a plain mean for that player
            # instead of aborting the whole collapse.
            if wts.sum() == 0:
                return float(np.mean(vals))
            return np.average(vals, weights=wts)

        g = df.groupby("Player", as_index=False)
        agg_other = g[other_cols].first() if other_cols else pd.DataFrame({"Player": df["Player"].unique()})
        agg_num = g[num_cols].agg(wavg) if num_cols else pd.DataFrame({"Player": df["Player"].unique()})

        out = agg_other.merge(agg_num, on="Player", how="left")
        out = out.drop(columns=["_w_"], errors="ignore")
        out = make_unique_columns(out)
        return out
    except Exception as e:
        log(f"[warn] collapse_traded_players failed: {e}. Returning uncollapsed.")
        # Never hand the temporary weight column back to the caller.
        if isinstance(df, pd.DataFrame):
            return df.drop(columns=["_w_"], errors="ignore")
        return df

# =========================
# Specialized parsers (use your existing ones)
# =========================
def parse_shot_fg_safe(path):
    """Parse a shot-selection CSV whose first data row holds the real header.

    The first row of the file as read is promoted to column labels, a fixed
    set of labels is renamed (PLAYER/TEAM/AGE and the frequency/attempt
    columns), and every column except Player/Team is coerced to numeric.

    Returns a frame limited to the known columns that are present, or None
    when the file cannot be read, is empty, has no player column, or any
    step fails (failures are logged).
    """
    raw = safe_read_csv(path)
    if raw is None or raw.empty:
        return None
    try:
        # Row 0 carries the column names; everything below it is data.
        data = raw.iloc[1:].copy()
        data.columns = raw.iloc[0].tolist()
        data = make_unique_columns(normalize_cols(data))

        data = data.rename(columns={
            "PLAYER": "Player",
            "TEAM": "Team",
            "AGE": "Age",
        })
        data = data.rename(columns={
            "FREQ": "FG_FREQ",
            "2FG FREQ": "FG2_FREQ",
            "3FG FREQ": "FG3_FREQ",
            "2FGA": "FG2A",
            "3PA": "FG3A",
        })

        data = ensure_player_col(data, os.path.basename(path))
        if data is None:
            return None
        data = make_unique_columns(ensure_team_col(data))

        stat_cols = [c for c in data.columns if c not in ["Player", "Team"]]
        for c in stat_cols:
            data[c] = pd.to_numeric(data[c], errors="coerce")

        wanted = ["Player", "Team", "Age", "GP", "FG_FREQ", "FG2_FREQ", "FG3_FREQ", "FGA", "FG2A", "FG3A"]
        keep = [c for c in wanted if c in data.columns]
        return make_unique_columns(data[keep].copy())
    except Exception as e:
        log(f"[warn] parse_shot_fg_safe failed for {path}: {e}")
        return None

def parse_shot_dist_safe(path):
    """Extract per-distance-bucket attempt columns from a shot-distance CSV.

    Columns are addressed by position, so the file must have at least 21
    columns. The result holds "Player" plus one "<bucket>_FGA" column per
    entry in bucket_positions.

    Returns None when the file cannot be read, is empty, has too few
    columns, has no usable player column, or any step raises (logged).
    """
    raw = safe_read_csv(path)
    if raw is None or raw.empty:
        return None
    try:
        # Skip the first data row — presumably a second header line; TODO confirm against the file.
        df = raw.iloc[1:].copy()
        cols = raw.columns.tolist()
        if len(cols) < 21:
            log(f"[warn] Shot distance file has {len(cols)} columns (expected ~21). Skipping.")
            return None

        # The first three columns are taken to be player / team / age purely by position.
        df = df.rename(columns={cols[0]: "Player", cols[1]: "Team", cols[2]: "Age"})
        df = ensure_player_col(df, os.path.basename(path))
        if df is None:
            return None
        df = ensure_team_col(df)
        df = make_unique_columns(df)

        # (bucket name, column position). NOTE(review): positions presumably point at the
        # attempts column of each distance range — verify against the actual file layout.
        bucket_positions = [
            ("LT5", 4),
            ("5_9", 7),
            ("10_14", 10),
            ("15_19", 13),
            ("20_24", 16),
            ("25_29", 19),
        ]
        for name, pos in bucket_positions:
            # New columns are appended at the end, so the positions read here are not shifted.
            df[f"{name}_FGA"] = pd.to_numeric(df.iloc[:, pos], errors="coerce")

        # Keeps every column ending in "_FGA", including any that already existed in the file.
        keep = ["Player"] + [c for c in df.columns if c.endswith("_FGA")]
        return make_unique_columns(df[keep].copy())
    except Exception as e:
        log(f"[warn] parse_shot_dist_safe failed for {path}: {e}")
        return None

# =========================
# Injury score
# =========================
def load_injury_scores_safe(path):
    """Compute a per-player injury score from an injury-report CSV.

    Rows are kept when the reason mentions injury/illness, concussion,
    surgery or return to competition AND the status is one of out,
    out for season, doubtful or questionable. For each player key:

        INJURY_SCORE = UNIQUE_INJURY_DAYS + 0.25 * INJURY_REPORT_ENTRIES

    Returns a frame with PLAYER_KEY, INJURY_REPORT_ENTRIES,
    UNIQUE_INJURY_DAYS and INJURY_SCORE. An empty frame with those column
    names is returned (and a warning logged) when the file is missing or
    empty, no rows survive the filters, or any step fails.
    """
    def empty_scores():
        return pd.DataFrame(columns=["PLAYER_KEY","INJURY_SCORE","UNIQUE_INJURY_DAYS","INJURY_REPORT_ENTRIES"])

    inj = safe_read_csv(path)
    if inj is None or inj.empty:
        log("[warn] Injury file missing/empty. Injury scores will be all 0.")
        return empty_scores()

    try:
        inj = make_unique_columns(normalize_cols(inj))

        # Case-insensitive lookup from lowercased label to the actual label.
        by_lower = {c.lower(): c for c in inj.columns}

        def pick_col(names):
            return next((by_lower[n.lower()] for n in names if n.lower() in by_lower), None)

        player_col = pick_col(["PLAYER", "Player", "Player Name", "PLAYER NAME", "Name"])
        status_col = pick_col(["STATUS", "Status"])
        reason_col = pick_col(["REASON", "Reason"])
        date_col = pick_col(["DATE", "Date"])

        if player_col is None:
            tmp = ensure_player_col(inj, os.path.basename(path))
            if tmp is None:
                return empty_scores()
            inj = tmp.rename(columns={"Player": "PLAYER"})
            player_col = "PLAYER"

        if date_col is None:
            # No date information: every entry gets NaT, so unique-day counts end up 0.
            inj["DATE"] = pd.NaT
            date_col = "DATE"

        inj["PLAYER_STD"] = inj[player_col].astype(str).apply(last_first_to_first_last)
        inj["PLAYER_STD"] = clean_player_name(inj["PLAYER_STD"])
        inj["PLAYER_KEY"] = canonical_name_series(inj["PLAYER_STD"])

        if status_col:
            inj["STATUS_STD"] = inj[status_col].astype(str).str.lower().str.strip()
        else:
            inj["STATUS_STD"] = ""
        if reason_col:
            inj["REASON_STD"] = inj[reason_col].astype(str).str.lower().str.strip()
        else:
            inj["REASON_STD"] = ""
        inj["DATE_STD"] = pd.to_datetime(inj[date_col], errors="coerce")

        reasons = inj["REASON_STD"]
        injury_reason_mask = reasons.str.contains("injury/illness", na=False)
        for keyword in ("concussion", "surgery", "return to competition"):
            injury_reason_mask = injury_reason_mask | reasons.str.contains(keyword, na=False)

        status_mask = inj["STATUS_STD"].isin(["out", "out for season", "doubtful", "questionable"])

        inj2 = inj[injury_reason_mask & status_mask].copy()
        if inj2.empty:
            log("[warn] Injury filters removed all rows. Injury scores will be 0.")
            return empty_scores()

        scores = inj2.groupby("PLAYER_KEY", as_index=False).agg(
            INJURY_REPORT_ENTRIES=("DATE_STD", "size"),
            UNIQUE_INJURY_DAYS=("DATE_STD", lambda s: s.dt.date.nunique() if s.notna().any() else 0),
        )
        scores["INJURY_SCORE"] = scores["UNIQUE_INJURY_DAYS"] + 0.25 * scores["INJURY_REPORT_ENTRIES"]
        return scores
    except Exception as e:
        log(f"[warn] Injury scoring failed: {e}. Returning empty scores.")
        return empty_scores()


# =========================
# Main
# =========================
def main():
    """Build the merged player table, train/evaluate the injury model, and save results.

    Pipeline:
      1. Locate the injury, box-out, hustle, general and shot-selection CSVs
         with find_best (which skips 2016 files and prefers 24-25 tokens).
      2. Load each stat table, collapse duplicate player rows, and outer-merge
         the tables on "Player".
      3. Left-merge per-player injury scores on PLAYER_KEY; missing scores
         become 0.
      4. Fit a median-impute -> scale -> HistGradientBoostingRegressor pipeline
         on log1p(INJURY_SCORE) and collect out-of-fold predictions from a
         shuffled 5-fold split.
      5. Log Spearman correlation (when scipy is available), top-10% lift and
         AUCs at several score thresholds.
      6. Write player_season_24_25_with_injury_score.csv and
         player_injury_risk_scores_24_25.csv to the working directory.

    Returns None. Note that `model` and `feature_cols` are local to this
    function and are not available to other cells after it returns.
    """
    log("Working dir: " + os.getcwd())
    files = safe_listdir(".")
    log("Files (first 40): " + ", ".join(files[:40]) + (" ..." if len(files) > 40 else ""))

    # ---- Choose ONLY the bottom 5 (24-25) ----
    # Patterns are intentionally broad, but we:
    #   - exclude 2016 automatically
    #   - prefer tokens like "24-25" / "24_25"
    prefer_2425 = ["24-25", "24_25", "2024", "2025"]

    injury_path = find_best(
        ["**/*injury*.csv"],
        prefer_tokens=prefer_2425,
        require_tokens=["injury"]
    )

    box_path = find_best(
        ["**/*box*out*.csv", "**/*Box_Out*.csv", "**/*Box Out*.csv"],
        prefer_tokens=prefer_2425
    )

    hustle_path = find_best(
        ["**/*hustle*.csv", "**/*Hustle*.csv"],
        prefer_tokens=prefer_2425
    )

    general_path = find_best(
        ["**/General*.csv", "**/*general*.csv"],
        prefer_tokens=prefer_2425,
        require_tokens=["general"]
    )

    # Shot selection (prefer 24-25, avoid 2016)
    shot_fg_path = find_best(
        ["**/*Shot*Selection*.csv", "**/*Field*Goals*.csv", "**/*Attempts*.csv"],
        prefer_tokens=prefer_2425
    )

    log("\nUsing files (FOR TRAINING):")
    log(f" injury     = {injury_path}")
    log(f" box_out    = {box_path}")
    log(f" hustle     = {hustle_path}")
    log(f" general    = {general_path}")
    log(f" shot_sel   = {shot_fg_path}")

    # Load tables
    def load_basic(path):
        # Read -> find Player/Team columns -> de-duplicate labels -> one row per player.
        # Returns None when the path is falsy or no player column can be found.
        if not path:
            return None
        df = safe_read_csv(path)
        df = ensure_player_col(df, os.path.basename(path))
        if df is None:
            return None
        df = ensure_team_col(df)
        df = make_unique_columns(df)
        return collapse_traded_players(df, weight_col="GP" if "GP" in df.columns else "G")

    box = load_basic(box_path)
    hustle = load_basic(hustle_path)
    gen = load_basic(general_path)

    # Shot selection can be in either "header row first row" format or normal
    shot_fg = parse_shot_fg_safe(shot_fg_path) if shot_fg_path else None
    if shot_fg is None and shot_fg_path:
        # fallback: treat as a normal csv
        shot_fg = load_basic(shot_fg_path)

    # parse_shot_fg_safe does not collapse duplicate players itself, so do it here.
    if shot_fg is not None:
        shot_fg = collapse_traded_players(shot_fg, weight_col="GP" if "GP" in shot_fg.columns else "G")

    # Merge available tables
    stat_tables = [t for t in [box, hustle, gen, shot_fg] if t is not None]
    if not stat_tables:
        log("[warn] No stat tables loaded. Nothing to do.")
        return

    # Outer merge keeps players that appear in only some tables; columns that clash
    # with ones already merged get a "_DUP" suffix (and are made unique afterwards).
    df = make_unique_columns(stat_tables[0])
    for nxt in stat_tables[1:]:
        nxt = make_unique_columns(nxt)
        df = df.merge(nxt, on="Player", how="outer", suffixes=("", "_DUP"))
        df = make_unique_columns(df)

    # Player key
    df["PLAYER_KEY"] = canonical_name_series(df["Player"])

    # Injury scores
    inj_scores = load_injury_scores_safe(injury_path) if injury_path else pd.DataFrame(
        columns=["PLAYER_KEY","INJURY_SCORE","UNIQUE_INJURY_DAYS","INJURY_REPORT_ENTRIES"]
    )

    # Players without any matching injury rows end up with 0 in all three injury columns.
    df = df.merge(inj_scores, on="PLAYER_KEY", how="left")
    for c in ["INJURY_SCORE","UNIQUE_INJURY_DAYS","INJURY_REPORT_ENTRIES"]:
        if c not in df.columns:
            df[c] = 0
        df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0)

    # Target
    # Trained on log1p(score); predictions are mapped back with expm1 below.
    y = np.log1p(df["INJURY_SCORE"].values.astype(float))

    # Features
    # Every column that is numeric (or becomes numeric after coercion) is a feature,
    # except identifiers and the injury columns the target is derived from.
    exclude = {"INJURY_SCORE","UNIQUE_INJURY_DAYS","INJURY_REPORT_ENTRIES"}
    feature_cols = []
    X = df.copy()

    for c in list(X.columns):
        if c in ["Player","PLAYER_KEY"] or c in exclude:
            continue

        col = X[c]
        # Defensive: a duplicated label would yield a DataFrame; use its first column.
        if isinstance(col, pd.DataFrame):
            col = col.iloc[:, 0]
            X[c] = col

        if not pd.api.types.is_numeric_dtype(col):
            coerced = pd.to_numeric(col, errors="coerce")
            # Only adopt the coerced values if at least one entry parsed as a number.
            if coerced.notna().sum() > 0:
                X[c] = coerced
                col = X[c]

        if pd.api.types.is_numeric_dtype(col):
            feature_cols.append(c)

    if not feature_cols:
        log("[warn] No numeric features found. Saving merged dataset for inspection.")
        df.to_csv("player_season_24_25_with_injury_score.csv", index=False)
        log("Saved: player_season_24_25_with_injury_score.csv")
        return

    X = X[feature_cols]

    # Model
    model = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("reg", HistGradientBoostingRegressor(
            max_depth=3, learning_rate=0.06, max_iter=400, random_state=42
        ))
    ])

    # CV predictions
    # Each row is predicted by a model that did not see it during fitting.
    # After the loop, `model` holds the fit from the last fold only.
    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    preds = np.zeros(len(df))
    for tr, te in cv.split(X):
        model.fit(X.iloc[tr], y[tr])
        preds[te] = model.predict(X.iloc[te])

    pred_score = np.expm1(preds)
    true_score = df["INJURY_SCORE"].values.astype(float)

    log("\nRanking evaluation (predicting injury burden):")
    if HAVE_SCIPY:
        rho, pval = spearmanr(pred_score, true_score)
        log(f" Spearman rho(pred, true injury_score): {rho:.4f} (p={pval:.2e})")

    # Lift: mean true score among the top 10% of predictions vs. the overall mean
    # (the small epsilon avoids division by zero when every score is 0).
    k = max(1, int(0.10 * len(true_score)))
    top_idx = np.argsort(-pred_score)[:k]
    lift = true_score[top_idx].mean() / (true_score.mean() + 1e-9)
    log(f" Top 10% avg injury_score: {true_score[top_idx].mean():.3f}")
    log(f" Overall avg injury_score: {true_score.mean():.3f}")
    log(f" Top-10% lift: {lift:.3f}x")

    for thr in [1, 3, 5, 8]:
        y_bin = (true_score >= thr).astype(int)
        # Skip thresholds that leave fewer than 10 positives or fewer than 10 negatives.
        if y_bin.sum() < 10 or y_bin.sum() > len(y_bin) - 10:
            continue
        auc = roc_auc_score(y_bin, pred_score)
        log(f" AUC for injury_score >= {thr}: {auc:.4f} (positives={int(y_bin.sum())})")

    # Save outputs
    out = df.copy()
    out["PRED_INJURY_SCORE"] = pred_score
    # Rank 1 = highest predicted score; tied predictions share the lowest rank.
    out["PRED_INJURY_RANK"] = out["PRED_INJURY_SCORE"].rank(ascending=False, method="min").astype(int)

    out.to_csv("player_season_24_25_with_injury_score.csv", index=False)
    out[["Player","PRED_INJURY_SCORE","PRED_INJURY_RANK","INJURY_SCORE","UNIQUE_INJURY_DAYS","INJURY_REPORT_ENTRIES"]] \
        .sort_values("PRED_INJURY_RANK") \
        .to_csv("player_injury_risk_scores_24_25.csv", index=False)

    log("\nSaved:")
    log(" player_season_24_25_with_injury_score.csv")
    log(" player_injury_risk_scores_24_25.csv")

# Run the full pipeline only when this code is the entry point, not when it is imported.
if __name__ == "__main__":
    main()


Working dir: /content
Files (first 40): .config, .ipynb_checkpoints, drive, injury_stats_24_25.csv, player_injury_risk_scores_24_25.csv, player_season_24_25_with_injury_score.csv, sample_data, test, train

Using files (FOR TRAINING):
 injury     = injury_stats_24_25.csv
 box_out    = train/Box Out Statistics - Sheet1.csv
 hustle     = train/Hustle Statistics - Sheet1.csv
 general    = train/General + drives - Sheet1 (1).csv
 shot_sel   = train/Shot Selection Stats - Sheet2.csv

Ranking evaluation (predicting injury burden):
 Spearman rho(pred, true injury_score): 0.7294 (p=9.04e-128)
 Top 10% avg injury_score: 37.122
 Overall avg injury_score: 12.413
 Top-10% lift: 2.990x
 AUC for injury_score >= 1: 0.8912 (positives=444)
 AUC for injury_score >= 3: 0.8871 (positives=384)
 AUC for injury_score >= 5: 0.8848 (positives=358)
 AUC for injury_score >= 8: 0.8892 (positives=299)

Saved:
 player_season_24_25_with_injury_score.csv
 player_injury_risk_scores_24_25.csv


In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

# Load the merged per-player table written by the training cell.
df = pd.read_csv("player_season_24_25_with_injury_score.csv")

y = df["INJURY_SCORE"].values.astype(float)

# Feature groups, selected by column-name patterns.
groups = {
    "Exposure": [c for c in df.columns if c.lower() in ["age", "gp", "min", "minutes"]],

    "Shooting Profile": [c for c in df.columns if (
        "FG" in c or "FGA" in c or "_FREQ" in c or "_FGA" in c
    )],

    "Hustle": [c for c in df.columns if any(
        k in c.lower() for k in ["hustle", "deflect", "loose", "charge", "contest"]
    )],

    "Box Outs": [c for c in df.columns if "box" in c.lower()],

    "Drives": [c for c in df.columns if "drive" in c.lower()],
}

rows = []

for gname, cols in groups.items():
    cols = [c for c in cols if c in df.columns]
    if not cols:
        continue

    Xg = df[cols].apply(pd.to_numeric, errors="coerce").fillna(0.0)

    # Key change: aggregate FIRST, then rank
    group_sum = Xg.sum(axis=1)

    # if still constant, skip
    if group_sum.nunique() <= 1:
        continue

    group_signal = group_sum.rank(pct=True)

    rho, _ = spearmanr(group_signal, y)

    rows.append({
        "group": gname,
        "spearman_rho_with_injury": rho,
        "abs_rho": abs(rho),
        "n_features_used": len(cols)
    })

# Passing the column list keeps the frame well-formed even when no group
# qualified, so the sort below cannot fail on a missing "abs_rho" column.
result = (
    pd.DataFrame(rows, columns=["group", "spearman_rho_with_injury", "abs_rho", "n_features_used"])
    .sort_values("abs_rho", ascending=False)
    .reset_index(drop=True)
)

if result.empty:
    print("No feature group produced a usable (non-constant) signal; nothing to rank.")
else:
    print("\nGroups most associated with injury burden (Spearman rho):\n")
    display(result)




Groups most associated with injury burden (Spearman rho):



Unnamed: 0,group,spearman_rho_with_injury,abs_rho,n_features_used
0,Exposure,0.543839,0.543839,4
1,Hustle,0.514442,0.514442,10
2,Box Outs,0.504578,0.504578,9
3,Shooting Profile,0.484031,0.484031,4
4,Drives,0.464235,0.464235,1


In [14]:
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

# Step 1: read the merged per-player table produced by the training cell.
df = pd.read_csv("player_season_24_25_with_injury_score.csv")

# Step 2: the analysis only makes sense when the target column is present.
if "INJURY_SCORE" in df.columns:
    y = pd.to_numeric(df["INJURY_SCORE"], errors="coerce").fillna(0).values

    # Identifiers, the target itself, and model outputs are not candidate stats.
    exclude = {
        "Player", "PLAYER_KEY", "INJURY_SCORE", "UNIQUE_INJURY_DAYS",
        "INJURY_REPORT_ENTRIES", "PRED_INJURY_SCORE", "PRED_INJURY_RANK"
    }

    rows = []

    for col in (c for c in df.columns if c not in exclude):
        # Non-numeric entries become NaN.
        s = pd.to_numeric(df[col], errors="coerce")

        # Nothing to correlate when the column is empty or constant.
        if s.isna().all() or s.nunique(dropna=True) <= 1:
            continue

        # Missing values are treated as 0 for the correlation.
        s_filled = s.fillna(0)

        rho, pval = spearmanr(s_filled, y)
        if np.isnan(rho):
            continue

        rows.append({
            "stat": col,
            "spearman_rho_with_injury": rho,
            "abs_rho": abs(rho),
            "p_value": pval
        })

    # Step 3: rank by absolute correlation, but only when something was found.
    if rows:
        result = (
            pd.DataFrame(rows)
            .sort_values("abs_rho", ascending=False)
            .reset_index(drop=True)
        )

        print("\nIndividual stats most associated with injury burden (Spearman rho):\n")
        # Plain print keeps this usable outside a notebook; only the top 30 are shown.
        print(result.head(30))
    else:
        print("No valid numeric columns found to correlate! Check if your CSV contains numeric data.")
else:
    print("Error: 'INJURY_SCORE' not found in CSV. Check your first script's output.")


Individual stats most associated with injury burden (Spearman rho):

                           stat  spearman_rho_with_injury   abs_rho  \
0                           Min                  0.653165  0.653165   
1                            GP                  0.561377  0.561377   
2                  DEF Box Outs                  0.537712  0.537712   
3          Team Reb On Box Outs                  0.533847  0.533847   
4                      Box Outs                  0.533595  0.533595   
5                       Min_DUP                  0.531561  0.531561   
6           Contested 3PT Shots                  0.492954  0.492954   
7   PCT Player Reb When Box Out                  0.490849  0.490849   
8                        GP_DUP                  0.485480  0.485480   
9     PCT Team Reb When Box Out                  0.485321  0.485321   
10                      FGA_DUP                  0.480278  0.480278   
11             PCT Box Outs Def                  0.478217  0.478217   
12     

In [15]:
import pandas as pd
import numpy as np

# 1. Processing Logic (Same as your training script)
def prepare_new_data(file_paths):
    """Load several per-player stat CSVs and inner-merge them on "Player".

    For each file:
      - if the first column label contains "Unnamed", the first data row is
        promoted to the header (the shot-selection export format);
      - the player column is located case-insensitively, ignoring
        surrounding whitespace (player / player_name), and renamed to
        "Player". An existing "Player" column is kept, and at most one
        column is renamed so duplicate "Player" columns cannot be created.

    Columns that clash during merging get a "_dup" suffix, and every column
    whose label contains "_dup" is dropped from the result. No numeric
    conversion is done here; callers coerce the features they need.

    Parameters:
        file_paths: iterable of CSV paths; the first file is the merge base.

    Returns:
        The merged DataFrame (only players present in every file).

    Raises:
        ValueError: if no paths are given.
        KeyError: if a file has no recognizable player column.
    """
    player_aliases = ("player", "player_name")

    dfs = []
    for path in file_paths:
        df = pd.read_csv(path)

        # Header-in-first-row format: labels come from row 0, data starts at row 1.
        if len(df) > 0 and "Unnamed" in str(df.columns[0]):
            df.columns = df.iloc[0]
            df = df.drop(0)

        if "Player" not in df.columns:
            for col in df.columns:
                # str() guards against non-string labels (e.g. NaN from blank header cells).
                if str(col).strip().lower() in player_aliases:
                    df = df.rename(columns={col: "Player"})
                    break

        if "Player" not in df.columns:
            raise KeyError(
                f"No player column found in '{path}'. Columns={list(df.columns)[:30]}"
            )
        dfs.append(df)

    if not dfs:
        raise ValueError("prepare_new_data needs at least one CSV path.")

    # Merge all stats on Player name
    merged = dfs[0]
    for nxt in dfs[1:]:
        merged = merged.merge(nxt, on="Player", how="inner", suffixes=("", "_dup"))

    # Drop the suffixed duplicates; the first file's version of each stat wins.
    keep = ["_dup" not in str(c) for c in merged.columns]
    return merged.loc[:, keep]

# 2. RUNNING THE PREDICTION
# NOTE(review): this cell reads `model` and `feature_cols` as globals, but in the
# visible source both are only assigned as local variables inside main(), so they
# are not defined here unless another cell creates them — confirm before running.
# NOTE(review): the captured run of this cell stopped with FileNotFoundError on the
# first path below; the files must exist relative to the working directory.

# List your 2016-17 files (these names are relative paths)
current_files = [
    "2016-17_Box Out.csv",
    "2016-17_Hustle_Stats.csv",
    "2016-17_Shot_Selection.csv",
    "2016-17_Tracking_Drives.csv"
]

# Process the data
new_season_data = prepare_new_data(current_files)

# Align features with what your model expects
# (Note: X_new must have the exact same columns as the X you used to train;
# selecting by feature_cols raises KeyError if any training column is missing.)
# Non-numeric values are coerced to NaN and then filled with 0.
X_new = new_season_data[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

# Generate the Risk Scores
# We use expm1 because we trained on log-scaled injury scores
log_predictions = model.predict(X_new)
new_season_data["PREDICTED_INJURY_SCORE"] = np.expm1(log_predictions)

# Create the Leaderboard (highest predicted score first)
leaderboard = new_season_data[["Player", "PREDICTED_INJURY_SCORE"]].sort_values(
    "PREDICTED_INJURY_SCORE", ascending=False
)

print("Top 10 Players at Risk (2016-17 Stats):")
print(leaderboard.head(10))

FileNotFoundError: [Errno 2] No such file or directory: '2016-17_Box Out.csv'