In [1]:
import json
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.signal import savgol_filter
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split


# ======================================================
# Feature engineering (same as V3, but simplified)
# ======================================================

def compute_features(x, y, window=15, poly=3):
    """Smooth + derivatives + physics-based features"""

    # Savitzky-Golay smoothing
    x_s = savgol_filter(x, window, poly)
    y_s = savgol_filter(y, window, poly)

    vx = np.gradient(x_s)
    vy = np.gradient(y_s)
    ax = np.gradient(vx)
    ay = np.gradient(vy)

    speed = np.sqrt(vx**2 + vy**2)
    accel = np.sqrt(ax**2 + ay**2)
    jerk = np.gradient(accel)

    angle = np.degrees(np.arctan2(vy, vx))
    angle_change = np.gradient(angle)

    df = pd.DataFrame({
        "x": x_s,
        "y": y_s,
        "vx": vx,
        "vy": vy,
        "ax": ax,
        "ay": ay,
        "speed": speed,
        "accel": accel,
        "jerk": jerk,
        "angle": angle,
        "angle_change": angle_change,
    })
    return df


# ======================================================
# Load per-point JSON → create ML dataset
# ======================================================

def load_all_points(folder: Path):
    rows = []

    for file in folder.glob("*.json"):
        with open(file, "r") as f:
            data = json.load(f)

        frames = sorted(data.keys(), key=lambda x: int(x))

        x = np.array([data[f]["x"] for f in frames], dtype=float)
        y = np.array([data[f]["y"] for f in frames], dtype=float)
        visible = np.array([data[f]["visible"] for f in frames], dtype=bool)
        action = [data[f]["action"] for f in frames]

        # Only keep visible points (or keep all? choose option A)
        valid = visible  # OPTION A

        # Compute features
        feats = compute_features(x, y)

        for i in range(len(frames)):
            if not valid[i]:
                continue

            rows.append({
                **feats.iloc[i].to_dict(),
                "label": action[i]
            })

    return pd.DataFrame(rows)


# ======================================================
# Baseline ML model
# ======================================================

def train_supervised_baseline(df):

    X = df.drop(columns=["label"])
    y = df["label"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, stratify=y
    )

    clf = RandomForestClassifier(
        n_estimators=300,
        class_weight="balanced",
        max_depth=12,
        random_state=0
    )

    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print("\n=== Supervised ML — RandomForest Baseline ===")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    return clf


# ======================================================
# MAIN
# ======================================================

if __name__ == "__main__":
    folder = Path("/Users/noeamar/Documents/M2DS/Stage M2DS/Quantum Sports Analytics/Data hit & bounce/per_point_v2")

    df = load_all_points(folder)
    print("Dataset shape:", df.shape)
    print(df["label"].value_counts())

    model = train_supervised_baseline(df)

    # Save the model?
    # import joblib
    # joblib.dump(model, "rf_supervised.pkl")

Dataset shape: (113673, 12)
label
air       110627
hit         1600
bounce      1446
Name: count, dtype: int64

=== Supervised ML — RandomForest Baseline ===
              precision    recall  f1-score   support

         air       1.00      0.54      0.70     22126
      bounce       0.29      0.51      0.37       289
         hit       0.03      0.93      0.06       320

    accuracy                           0.54     22735
   macro avg       0.44      0.66      0.38     22735
weighted avg       0.97      0.54      0.69     22735

[[11902   354  9870]
 [   30   148   111]
 [   22     1   297]]


In [2]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.signal import savgol_filter
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier


# ============================================================
# 1. Feature computation (same as before)
# ============================================================

def compute_features(x, y, window=15, poly=3):
    x_s = savgol_filter(x, window, poly)
    y_s = savgol_filter(y, window, poly)

    vx = np.gradient(x_s)
    vy = np.gradient(y_s)
    ax = np.gradient(vx)
    ay = np.gradient(vy)

    speed = np.sqrt(vx**2 + vy**2)
    accel = np.sqrt(ax**2 + ay**2)
    jerk = np.gradient(accel)

    angle = np.degrees(np.arctan2(vy, vx))
    angle_change = np.gradient(angle)

    return pd.DataFrame({
        "x": x_s, "y": y_s,
        "vx": vx, "vy": vy,
        "ax": ax, "ay": ay,
        "speed": speed,
        "accel": accel,
        "jerk": jerk,
        "angle": angle,
        "angle_change": angle_change,
    })


# ============================================================
# 2. Load all files and build sliding window dataset
# ============================================================

def load_dataset_with_windows(folder: Path, W=7):
    rows = []
    half = W

    for file in folder.glob("*.json"):
        with open(file, "r") as f:
            data = json.load(f)

        frames = sorted(data.keys(), key=lambda x: int(x))

        x = np.array([data[f]["x"] for f in frames], dtype=float)
        y = np.array([data[f]["y"] for f in frames], dtype=float)
        visible = np.array([data[f]["visible"] for f in frames], dtype=bool)
        labels = np.array([data[f]["action"] for f in frames])

        # compute physics features
        feats = compute_features(x, y)

        # build sliding windows
        for t in range(half, len(frames) - half):
            if not visible[t]:
                continue

            window_feats = feats.iloc[t-half:t+half+1].values.flatten()

            rows.append({
                "features": window_feats,
                "label": labels[t],
            })

    return rows


# ============================================================
# 3. Extract matrix
# ============================================================

def build_matrix(rows):
    X = np.stack([r["features"] for r in rows])
    y = np.array([r["label"] for r in rows])
    return X, y


# ============================================================
# 4. Training with XGBoost
# ============================================================

def train_xgb(X, y):

    model = XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.8,
        objective="multi:softmax",
        num_class=3,
        tree_method="hist",
        scale_pos_weight=None,
        gamma=0.0,
        reg_lambda=1.2,
        random_state=0
    )

    model.set_params(**{
        "class_weight": "balanced"
    })

    model.fit(X_train, y_train)
    return model


# ============================================================
# 5. MAIN
# ============================================================

if __name__ == "__main__":

    folder = Path("/Users/noeamar/Documents/M2DS/Stage M2DS/Quantum Sports Analytics/Data hit & bounce/per_point_v2")

    print("Loading dataset...")
    rows = load_dataset_with_windows(folder, W=7)   # window size = 15 frames
    print(f"Number of training windows: {len(rows)}")

    X, y = build_matrix(rows)
    print("Shape X:", X.shape)

    # encode labels
    mapping = {"air": 0, "bounce": 1, "hit": 2}
    y = np.array([mapping[z] for z in y])

    global X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, stratify=y
    )

    model = train_xgb(X_train, y_train)

    preds = model.predict(X_test)

    reverse = {0: "air", 1: "bounce", 2: "hit"}
    y_test_str = [reverse[z] for z in y_test]
    preds_str = [reverse[z] for z in preds]

    print("\n=== Sliding Window + XGBoost ===")
    print(classification_report(y_test_str, preds_str))
    print(confusion_matrix(y_test_str, preds_str))

Loading dataset...
Number of training windows: 112091
Shape X: (112091, 165)


Parameters: { "class_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Sliding Window + XGBoost ===
              precision    recall  f1-score   support

         air       0.98      1.00      0.99     21828
      bounce       0.72      0.41      0.52       282
         hit       0.84      0.17      0.28       309

    accuracy                           0.98     22419
   macro avg       0.85      0.53      0.60     22419
weighted avg       0.98      0.98      0.97     22419

[[21775    43    10]
 [  166   116     0]
 [  254     3    52]]


In [3]:
"""
supervised_sliding_xgb_multiwindow.py

Supervised model for hit/bounce detection using:
    • Multi-window temporal embeddings
    • Savitzky–Golay derivative features
    • XGBoost classifier (multi-class)
"""

import json
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.signal import savgol_filter
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier


# ============================================================
# 1. Feature computation (smooth + derivatives)
# ============================================================

def compute_features(x, y, window=15, poly=3):
    """Compute smooth features & derivatives."""

    # Smooth x,y
    x_s = savgol_filter(x, window, poly)
    y_s = savgol_filter(y, window, poly)

    # Velocity & accel
    vx = np.gradient(x_s)
    vy = np.gradient(y_s)
    ax = np.gradient(vx)
    ay = np.gradient(vy)

    speed = np.sqrt(vx**2 + vy**2)
    accel = np.sqrt(ax**2 + ay**2)
    jerk = np.gradient(accel)

    angle = np.degrees(np.arctan2(vy, vx))
    angle_change = np.gradient(angle)

    return pd.DataFrame({
        "x": x_s, "y": y_s,
        "vx": vx, "vy": vy,
        "ax": ax, "ay": ay,
        "speed": speed,
        "accel": accel,
        "jerk": jerk,
        "angle": angle,
        "angle_change": angle_change,
    })


# ============================================================
# 2. Multi Window Construction
# ============================================================

def build_multiwindow_features(feats_df, t, windows=[5,10,20]):
    """Build concatenated temporal embedding for multiple window sizes."""
    vectors = []
    T = len(feats_df)

    for W in windows:
        half = W
        if t-half < 0 or t+half >= T:
            return None

        vec = feats_df.iloc[t-half:t+half+1].values.flatten()
        vectors.append(vec)

    return np.concatenate(vectors)


def load_dataset_multiwindow(folder, windows=[5,10,20]):
    rows = []

    for file in folder.glob("*.json"):
        with open(file, "r") as f:
            data = json.load(f)

        frames = sorted(data.keys(), key=lambda x: int(x))
        x = np.array([data[f]["x"] for f in frames], dtype=float)
        y = np.array([data[f]["y"] for f in frames], dtype=float)
        labels = np.array([data[f]["action"] for f in frames])
        visible = np.array([data[f]["visible"] for f in frames])

        feats = compute_features(x, y)

        T = len(frames)
        max_half = max(windows)

        for t in range(max_half, T-max_half):
            if not visible[t]:
                continue

            fvec = build_multiwindow_features(feats, t, windows)
            if fvec is None:
                continue

            rows.append({
                "features": fvec,
                "label": labels[t],
            })

    return rows


# ============================================================
# 3. Build X, y matrices
# ============================================================

def build_matrix(rows):
    X = np.stack([r["features"] for r in rows])
    y = np.array([r["label"] for r in rows])
    return X, y


# ============================================================
# 4. XGBoost training
# ============================================================

def train_xgb(X_train, y_train):
    """XGBoost for multi-class classification with weighted classes."""

    model = XGBClassifier(
        n_estimators=400,
        max_depth=7,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.8,
        objective="multi:softmax",
        num_class=3,
        tree_method="hist",
        reg_lambda=1.0,
        gamma=0.0,
        random_state=0
    )

    # Class imbalance handled via sample weighting (manual)
    weights = np.ones_like(y_train, dtype=float)
    weights[y_train == 1] *= 15.0   # bounce
    weights[y_train == 2] *= 15.0   # hit

    model.fit(X_train, y_train, sample_weight=weights)
    return model


# ============================================================
# 5. MAIN
# ============================================================

if __name__ == "__main__":

    folder = Path("/Users/noeamar/Documents/M2DS/Stage M2DS/Quantum Sports Analytics/Data hit & bounce/per_point_v2")

    print("Loading multi-window dataset...")
    rows = load_dataset_multiwindow(folder, windows=[5, 10, 20])
    print(f"Number of training windows: {len(rows)}")

    X, y = build_matrix(rows)
    print("Shape X:", X.shape)

    # Encode labels
    mapping = {"air": 0, "bounce": 1, "hit": 2}
    y = np.array([mapping[z] for z in y])

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        shuffle=True,
        stratify=y,
        random_state=0
    )

    model = train_xgb(X_train, y_train)
    preds = model.predict(X_test)

    reverse = {0:"air", 1:"bounce", 2:"hit"}
    y_test_str = [reverse[z] for z in y_test]
    preds_str = [reverse[z] for z in preds]

    print("\n=== Multi-Window XGBoost ===")
    print(classification_report(y_test_str, preds_str))
    print(confusion_matrix(y_test_str, preds_str))

Loading multi-window dataset...
Number of training windows: 108725
Shape X: (108725, 803)

=== Multi-Window XGBoost ===
              precision    recall  f1-score   support

         air       0.99      0.98      0.99     21170
      bounce       0.54      0.64      0.59       275
         hit       0.40      0.60      0.48       300

    accuracy                           0.97     21745
   macro avg       0.65      0.74      0.69     21745
weighted avg       0.98      0.97      0.97     21745

[[20757   144   269]
 [   98   176     1]
 [  116     3   181]]


In [5]:
"""
supervised_lightgbm_multiwindow.py

LightGBM + Multi-windows + RAW + Smooth features.
Best performing supervised baseline for hit/bounce detection.
"""

import json
import numpy as np
import pandas as pd
from pathlib import Path
from scipy.signal import savgol_filter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import lightgbm as lgb


# ============================================================
# 1. RAW + SMOOTH feature extraction
# ============================================================

def compute_raw_smooth_features(x, y, sg_window=15, poly=3):
    """Return BOTH raw and smooth derivative features."""

    # RAW
    vx_raw = np.gradient(x)
    vy_raw = np.gradient(y)
    ax_raw = np.gradient(vx_raw)
    ay_raw = np.gradient(vy_raw)
    speed_raw = np.sqrt(vx_raw**2 + vy_raw**2)
    accel_raw = np.sqrt(ax_raw**2 + ay_raw**2)

    # SMOOTH
    x_s = savgol_filter(x, sg_window, poly)
    y_s = savgol_filter(y, sg_window, poly)

    vx = np.gradient(x_s)
    vy = np.gradient(y_s)
    ax = np.gradient(vx)
    ay = np.gradient(vy)
    speed = np.sqrt(vx**2 + vy**2)
    accel = np.sqrt(ax**2 + ay**2)
    jerk = np.gradient(accel)

    angle = np.degrees(np.arctan2(vy, vx))
    angle_change = np.gradient(angle)

    return pd.DataFrame({
        # RAW
        "x_raw": x, "y_raw": y,
        "vx_raw": vx_raw, "vy_raw": vy_raw,
        "ax_raw": ax_raw, "ay_raw": ay_raw,
        "speed_raw": speed_raw, "accel_raw": accel_raw,

        # SMOOTH
        "x_s": x_s, "y_s": y_s,
        "vx_s": vx, "vy_s": vy,
        "ax_s": ax, "ay_s": ay,
        "speed_s": speed,
        "accel_s": accel,
        "jerk_s": jerk,
        "angle_s": angle,
        "angle_change_s": angle_change,
    })


# ============================================================
# 2. Multi-window temporal embedding
# ============================================================

def build_multiwindow_features(df, t, windows):
    vectors = []
    T = len(df)

    for W in windows:
        half = W
        if t-half < 0 or t+half >= T:
            return None

        vec = df.iloc[t-half:t+half+1].values.flatten()
        vectors.append(vec)

    return np.concatenate(vectors)


def load_dataset_multiwindow(folder, windows=[5,10,20,30]):
    rows = []

    for file in folder.glob("*.json"):
        with open(file) as f:
            data = json.load(f)

        frames = sorted(data.keys(), key=lambda x: int(x))
        x = np.array([data[f]["x"] for f in frames], float)
        y = np.array([data[f]["y"] for f in frames], float)
        labels = np.array([data[f]["action"] for f in frames])
        visible = np.array([data[f]["visible"] for f in frames])

        feats = compute_raw_smooth_features(x, y)

        T = len(frames)
        max_half = max(windows)

        for t in range(max_half, T-max_half):
            if not visible[t]:
                continue

            fv = build_multiwindow_features(feats, t, windows)
            if fv is None:
                continue

            rows.append({
                "features": fv,
                "label": labels[t],
            })

    return rows


# ============================================================
# 3. Build matrices
# ============================================================

def build_matrix(rows):
    X = np.stack([r["features"] for r in rows])
    y = np.array([r["label"] for r in rows])
    return X, y


# ============================================================
# 4. LightGBM training
# ============================================================

def train_lgbm(X_train, y_train):
    """Train LightGBM for multi-class (3 classes)."""

    params = {
        "objective": "multiclass",
        "num_class": 3,
        "metric": "multi_logloss",

        # Strong model
        "learning_rate": 0.05,
        "num_leaves": 64,
        "max_depth": -1,
        "min_data_in_leaf": 50,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 1,
        "lambda_l2": 1.0,
    }

    dtrain = lgb.Dataset(X_train, label=y_train)
    model = lgb.train(params, dtrain, num_boost_round=600)

    return model


# ============================================================
# 5. MAIN
# ============================================================

if __name__ == "__main__":

    folder = Path("/Users/noeamar/Documents/M2DS/Stage M2DS/Quantum Sports Analytics/Data hit & bounce/per_point_v2")

    print("Loading multi-window dataset...")
    rows = load_dataset_multiwindow(folder, windows=[5,10,20,30])
    print(f"Number of windows: {len(rows)}")

    X, y = build_matrix(rows)
    print("Shape X:", X.shape)

    mapping = {"air": 0, "bounce": 1, "hit": 2}
    y = np.array([mapping[z] for z in y])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True, stratify=y, random_state=0
    )

    print("Training LightGBM...")
    model = train_lgbm(X_train, y_train)

    preds = model.predict(X_test)
    preds = np.argmax(preds, axis=1)

    reverse = {0:"air", 1:"bounce", 2:"hit"}
    y_test_str = [reverse[z] for z in y_test]
    preds_str = [reverse[z] for z in preds]

    print("\n=== LightGBM Multi-Window + RAW + Smooth ===")
    print(classification_report(y_test_str, preds_str))
    print(confusion_matrix(y_test_str, preds_str))

Loading multi-window dataset...
Number of windows: 105781
Shape X: (105781, 2546)
Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.583139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 649229
[LightGBM] [Info] Number of data points in the train set: 84624, number of used features: 2546
[LightGBM] [Info] Start training from score -0.027009
[LightGBM] [Info] Start training from score -4.353877
[LightGBM] [Info] Start training from score -4.283782

=== LightGBM Multi-Window + RAW + Smooth ===
              precision    recall  f1-score   support

         air       0.99      1.00      1.00     20593
      bounce       0.93      0.82      0.87       272
         hit       0.85      0.68      0.75       292

    accuracy                           0.99     21157
   macro avg       0.92      0.83      0.87     21157
weighted avg       0.99      0.99      0.99     21157

[[20544    13  