In [None]:
df = pd.read_csv("player_injury_score_2020_2025_cleaned.csv")
df

Unnamed: 0,Name,season,minor_count,moderate_count,severe_count,2020_2021,2021_2022,2022_2023,2023_2024,2024_2025,has_severe_injury,total_days_missed,injury_score,games_missed,games_missed_last_season
0,(James) Mike Scott,2020_2021,1.0,3.0,0.0,39.393939,0.000000,0.0,0.0,0.0,False,27.0,,13.023529,
1,(James) Mike Scott,2021_2022,0.0,0.0,0.0,,,,,,True,170.0,100.0,82.000000,13.023529
2,(James) Mike Scott,2022_2023,0.0,0.0,0.0,,,,,,True,170.0,100.0,82.000000,82.000000
3,(James) Mike Scott,2023_2024,0.0,0.0,0.0,,,,,,True,170.0,100.0,82.000000,82.000000
4,(James) Mike Scott,2024_2025,0.0,0.0,0.0,,,,,,True,170.0,100.0,82.000000,82.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3655,Zylan Cheatham,2020_2021,0.0,0.0,0.0,,,,,,True,170.0,100.0,82.000000,
3656,Zylan Cheatham,2021_2022,1.0,0.0,0.0,0.000000,1.315789,0.0,0.0,0.0,False,3.0,,1.447059,82.000000
3657,Zylan Cheatham,2022_2023,0.0,0.0,0.0,,,,,,True,170.0,100.0,82.000000,1.447059
3658,Zylan Cheatham,2023_2024,0.0,0.0,0.0,,,,,,True,170.0,100.0,82.000000,82.000000


In [None]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score


# ===============================
# LOAD DATA
# ===============================

df = pd.read_csv("player_injury_score_2020_2025_cleaned.csv")


# ===============================
# SORT DATA (important for time features)
# ===============================

df = df.sort_values(["Name", "season"])


# ===============================
# CREATE NEXT SEASON TARGET
# ===============================

# shift games missed forward → next season injury
df["games_missed_next_season"] = (
    df.groupby("Name")["games_missed"].shift(-1)
)

# classification target
df["target"] = (df["games_missed_next_season"] >= 20).astype(int)


# ===============================
# DROP LAST SEASON (no future label)
# ===============================

df = df.dropna(subset=["games_missed_next_season"])


# ===============================
# FEATURES (baseline)
# ===============================

feature_cols = [
    "games_missed",
    "games_missed_last_season",
    "total_days_missed",
    "minor_count",
    "moderate_count",
    "severe_count",
    "has_severe_injury"
]

X = df[feature_cols].fillna(0)
y = df["target"]


# ===============================
# TRAIN / TEST SPLIT
# ===============================

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


# ===============================
# MODEL PIPELINE
# ===============================

model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000))
])


# ===============================
# TRAIN MODEL
# ===============================

model.fit(X_train, y_train)


# ===============================
# EVALUATION
# ===============================

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

print("ROC AUC:", roc_auc_score(y_test, y_prob))


# ===============================
# SAVE MODEL
# ===============================

os.makedirs("models", exist_ok=True)

joblib.dump(model, "models/injury_clf.pkl")

print("\n✅ Model saved → models/injury_clf.pkl")



Classification Report:

              precision    recall  f1-score   support

           0       0.64      0.40      0.49       241
           1       0.67      0.85      0.75       345

    accuracy                           0.66       586
   macro avg       0.66      0.62      0.62       586
weighted avg       0.66      0.66      0.64       586

ROC AUC: 0.6737266221660954

✅ Model saved → models/injury_clf.pkl


Improved model:

In [None]:
!pip install xgboost




In [None]:
# ============================================
# XGBOOST INJURY RISK MODEL (NO DATA LEAKAGE)
# ============================================

import pandas as pd
import numpy as np
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from xgboost import XGBClassifier


# --------------------------------------------
# 1. LOAD YOUR DATASET
# --------------------------------------------
df = pd.read_csv("player_injury_score_2020_2025_cleaned.csv")

# Clean player names (remove leading/trailing spaces)
df["Name"] = df["Name"].astype(str).str.strip()

print("Dataset shape:", df.shape)
print("\nColumns:", df.columns)


# --------------------------------------------
# 2. CLEAN DATA
# --------------------------------------------

# ensure proper datatype
df["season"] = df["season"].astype(str)

# convert severe flag to numeric if needed
if df["has_severe_injury"].dtype == bool:
    df["has_severe_injury"] = df["has_severe_injury"].astype(int)


# --------------------------------------------
# 3. SORT BY PLAYER + TIME
# (important for time-series features)
# --------------------------------------------
df = df.sort_values(["Name", "season"]).reset_index(drop=True)


# --------------------------------------------
# 4. CREATE PAST HISTORY FEATURES (LAG FEATURES)
# --------------------------------------------

# previous season days missed
df["prev_days_missed"] = df.groupby("Name")["total_days_missed"].shift(1)

# previous season severe injury
df["prev_severe"] = df.groupby("Name")["has_severe_injury"].shift(1)

# drop first season rows (no history)
df = df.dropna(subset=["prev_days_missed", "prev_severe"])

print("\nAfter lag feature creation:", df.shape)


# --------------------------------------------
# 5. CREATE TARGET VARIABLE
# --------------------------------------------
# injury = missed significant time (>15 days)
df["injury_label"] = (df["total_days_missed"] > 15).astype(int)


# --------------------------------------------
# 6. SELECT FEATURES (NO LEAKAGE)
# --------------------------------------------
features = [
    "minor_count",
    "moderate_count",
    "severe_count",
    "prev_days_missed",
    "prev_severe"
]

X = df[features]
y = df["injury_label"]

print("\nFeatures used:", features)


# --------------------------------------------
# 7. TRAIN TEST SPLIT
# --------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


# --------------------------------------------
# 8. TRAIN XGBOOST MODEL
# --------------------------------------------
model = XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    eval_metric="logloss",
    random_state=42
)


model.fit(X_train, y_train)


# --------------------------------------------
# 9. EVALUATE MODEL
# --------------------------------------------
pred = model.predict(X_test)
prob = model.predict_proba(X_test)[:, 1]

print("\nClassification Report:\n")
print(classification_report(y_test, pred))

print("ROC AUC:", roc_auc_score(y_test, prob))


# --------------------------------------------
# 10. FEATURE IMPORTANCE
# --------------------------------------------
importance = pd.DataFrame({
    "feature": features,
    "importance": model.feature_importances_
}).sort_values("importance", ascending=False)

print("\nTop Important Features:\n", importance)


Dataset shape: (3660, 15)

Columns: Index(['Name', 'season', 'minor_count', 'moderate_count', 'severe_count',
       '2020_2021', '2021_2022', '2022_2023', '2023_2024', '2024_2025',
       'has_severe_injury', 'total_days_missed', 'injury_score',
       'games_missed', 'games_missed_last_season'],
      dtype='object')

After lag feature creation: (2928, 17)

Features used: ['minor_count', 'moderate_count', 'severe_count', 'prev_days_missed', 'prev_severe']

Classification Report:

              precision    recall  f1-score   support

           0       0.62      0.70      0.65       151
           1       0.89      0.85      0.87       435

    accuracy                           0.81       586
   macro avg       0.75      0.77      0.76       586
weighted avg       0.82      0.81      0.81       586

ROC AUC: 0.8881327548146456

Top Important Features:
             feature  importance
0       minor_count    0.623365
1    moderate_count    0.215913
4       prev_severe    0.068810
2   

In [None]:
# --------------------------------------------
# 11. SAVE MODEL
# --------------------------------------------
os.makedirs("models", exist_ok=True)
joblib.dump(model, "models/injury_xgb.pkl")

print("\n✅ Model saved → models/injury_xgb.pkl")



✅ Model saved → models/injury_xgb.pkl


Predict:

In [None]:
import pandas as pd
import numpy as np

# function to predict next season injury probability
def predict_next_season(model, player_name, df):

    # Use .str.lower() for case-insensitive comparison
    player_data = df[df["Name"].str.lower() == player_name.lower()].sort_values("season")

    if player_data.empty:
        print("Player not found.")
        return None

    latest = player_data.iloc[-1]

    sample = pd.DataFrame([
        {
            "minor_count": latest["minor_count"],
            "moderate_count": latest["moderate_count"],
            "severe_count": latest["severe_count"],
            "prev_days_missed": latest["total_days_missed"],
            "prev_severe": int(latest["has_severe_injury"])
        }
    ])

    prob = model.predict_proba(sample)[0][1]

    return prob

In [None]:
def build_player_table(player_name, df, model):

    player_name = player_name.strip().lower()

    # get all rows for player
    player_df = df[df["Name"].str.lower() == player_name]

    if player_df.empty:
        print("Player not found")
        return None

    # season score columns already in dataset
    season_cols = [
        "2020_2021",
        "2021_2022",
        "2022_2023",
        "2023_2024",
        "2024_2025"
    ]

    # ---- KEY FIX ----
    # aggregate scores across all rows for player
    # use max because dataset is long-format (duplicate rows)
    historical_scores = (
        player_df[season_cols]
        .max()
        .fillna(0)
        .values
    )

    table = pd.DataFrame({
        "season": season_cols,
        "injury_score": historical_scores
    })

    # ---- Predict future season ----
    future_prob = predict_next_season(model, player_name, df)
    future_score = future_prob * 100

    future_row = pd.DataFrame({
        "season": ["2025_2026"],
        "injury_score": [future_score]
    })

    table = pd.concat([table, future_row], ignore_index=True)

    return table


In [None]:
import matplotlib.pyplot as plt

def plot_injury_trend(player_name, table):

    plt.figure(figsize=(8,5))

    past = table[table["season"] != "2025_2026"]
    future = table[table["season"] == "2025_2026"]

    # historical trend (blue line)
    plt.plot(
        past["season"],
        past["injury_score"],
        marker="o",
        label="Historical"
    )

    # future prediction (red marker only)
    plt.scatter(
        future["season"],
        future["injury_score"],
        color="red",
        s=120,
        label="Predicted (2025-26)"
    )

    plt.title(f"Injury Risk Trend — {player_name}")
    plt.xlabel("Season")
    plt.ylabel("Injury Score (0–100)")
    plt.xticks(rotation=45)
    plt.legend()
    plt.show()


In [None]:
player_name = "A.J. Green" # Use an existing player name from the processed df

table = build_player_table(player_name, df, model)

if table is not None:
    print("\nInjury Score Table:")
    print(table)

    plot_injury_trend(player_name, table)
else:
    print(f"Could not build table for player: {player_name}")

Player not found.


TypeError: unsupported operand type(s) for *: 'NoneType' and 'int'