In [1]:
import numpy as np
import pandas as pd
import json
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import joblib
from scipy.ndimage import gaussian_filter1d
from scipy.signal import find_peaks
import warnings

warnings.filterwarnings('ignore')


In [2]:
# ============================================================================
# 1. FEATURE ENGINEERING FUNCTION
# ============================================================================

def _clean_coordinate(values):
    """
    Return one coordinate column with missing readings filled in.

    Zeros and non-numeric entries are treated as missing, then filled by
    interpolation followed by backward/forward fill for the track edges.
    """
    # Force float: gaussian_filter1d returns the dtype of its input, so integer
    # pixel coordinates would otherwise be smoothed with silent truncation.
    cleaned = pd.to_numeric(values, errors='coerce').astype(float).replace(0, np.nan)

    # The scipy-backed cubic interpolation needs at least 4 known samples;
    # very short or mostly-missing tracks fall back to linear instead of raising.
    method = 'cubic' if cleaned.notna().sum() >= 4 else 'linear'
    return cleaned.interpolate(method=method).bfill().ffill()


def calculate_optimized_features(df):
    """
    Calculate optimized features for hit/bounce detection.
    MUST be identical to training feature engineering.

    Parameters
    ----------
    df : pandas.DataFrame
        Frames of a single point, in temporal order. Only the 'x' and 'y'
        columns are read; a value of 0 is treated as a missing reading.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with cleaned 'x'/'y' plus all derived feature columns.
        Remaining NaNs (track edges, undefined diffs) are replaced by 0.

    Notes
    -----
    Several features (`*_future_*`, centered rolling windows, `vy_will_reverse`)
    look ahead in time, so the whole track must be available before calling.
    """
    df = df.copy()

    # === BASIC CLEANING ===
    df['x'] = _clean_coordinate(df['x'])
    df['y'] = _clean_coordinate(df['y'])

    # === SMOOTHING ===
    df['x_smooth'] = gaussian_filter1d(df['x'], sigma=1.5)
    df['y_smooth'] = gaussian_filter1d(df['y'], sigma=1.5)

    # === 1ST ORDER: VELOCITY ===
    df['vx'] = df['x_smooth'].diff()
    df['vy'] = df['y_smooth'].diff()
    df['speed'] = np.sqrt(df['vx']**2 + df['vy']**2)
    df['v_horizontal'] = df['vx'].abs()
    df['v_vertical'] = df['vy'].abs()

    # === 2ND ORDER: ACCELERATION ===
    df['ax'] = df['vx'].diff()
    df['ay'] = df['vy'].diff()
    df['accel_magnitude'] = np.sqrt(df['ax']**2 + df['ay']**2)
    df['accel_horizontal'] = df['ax'].abs()
    df['accel_vertical'] = df['ay'].abs()

    # === 3RD ORDER: JERK ===
    df['jerk_x'] = df['ax'].diff()
    df['jerk_y'] = df['ay'].diff()
    df['jerk_magnitude'] = np.sqrt(df['jerk_x']**2 + df['jerk_y']**2)

    # === ANGLE & DIRECTION ===
    df['angle'] = np.arctan2(df['vy'], df['vx'])
    df['delta_angle'] = df['angle'].diff().abs()
    # Wrap heading changes into [0, pi] so a jump across +/-pi is not counted as a full turn
    df.loc[df['delta_angle'] > np.pi, 'delta_angle'] = \
        2*np.pi - df.loc[df['delta_angle'] > np.pi, 'delta_angle']
    df['angular_velocity'] = df['delta_angle'] / (df['speed'] + 1e-6)

    # === VERTICAL VELOCITY ANALYSIS ===
    df['vy_change'] = df['vy'].diff()
    df['vy_abs_change'] = df['vy_change'].abs()
    df['vy_sign_change'] = ((df['vy'] * df['vy'].shift(1)) < 0).astype(int)
    df['vy_acceleration'] = df['vy'].diff(2)

    # === HEIGHT & POSITION ===
    max_y = df['y'].max()
    min_y = df['y'].min()
    # Guard against a flat track (zero range) to avoid dividing by zero
    y_range = max_y - min_y if (max_y - min_y) > 0 else 1

    df['height_normalized'] = (max_y - df['y']) / y_range
    df['height_raw'] = max_y - df['y']

    center_x = (df['x'].max() + df['x'].min()) / 2
    df['dist_from_center'] = (df['x'] - center_x).abs()

    # === ENERGY & MOMENTUM ===
    df['kinetic_energy'] = df['speed']**2
    df['energy_change'] = df['kinetic_energy'].diff()
    df['energy_change_rate'] = df['energy_change'] / (df['kinetic_energy'].shift(1) + 1e-6)

    # === CURVATURE ===
    df['curvature'] = np.abs(df['ax'] * df['vy'] - df['ay'] * df['vx']) / \
                      (df['speed']**3 + 1e-6)

    # === TEMPORAL CONTEXT ===
    temporal_features = ['speed', 'vy', 'accel_magnitude', 'jerk_magnitude',
                        'height_normalized', 'delta_angle']

    for lag in [1, 2, 3, 5]:
        for feature in temporal_features:
            if feature in df.columns:
                df[f'{feature}_past_{lag}'] = df[feature].shift(lag)
                df[f'{feature}_future_{lag}'] = df[feature].shift(-lag)
                df[f'{feature}_diff_{lag}'] = df[feature] - df[feature].shift(lag)

    for window in [3, 5]:
        for feature in temporal_features:
            if feature in df.columns:
                df[f'{feature}_mean_{window}'] = df[feature].rolling(window, center=True).mean()
                df[f'{feature}_std_{window}'] = df[feature].rolling(window, center=True).std()
                df[f'{feature}_max_{window}'] = df[feature].rolling(window, center=True).max()
                df[f'{feature}_min_{window}'] = df[feature].rolling(window, center=True).min()

    df['speed_change'] = df['speed'].diff()
    df['accel_change'] = df['accel_magnitude'].diff()
    df['height_change'] = df['height_normalized'].diff()

    # === PEAK DETECTION ===
    df['is_speed_peak'] = 0
    df['is_accel_peak'] = 0
    df['is_jerk_peak'] = 0
    df['is_y_local_max'] = 0
    df['is_y_local_min'] = 0

    if len(df) > 5:
        try:
            speed_peaks, _ = find_peaks(df['speed'].values, distance=2, prominence=df['speed'].std()*0.5)
            accel_peaks, _ = find_peaks(df['accel_magnitude'].values, distance=2)
            jerk_peaks, _ = find_peaks(df['jerk_magnitude'].values, distance=2)
            y_maxs, _ = find_peaks(df['y_smooth'].values, distance=3)
            y_mins, _ = find_peaks(-df['y_smooth'].values, distance=3)

            if len(speed_peaks) > 0:
                df.iloc[speed_peaks, df.columns.get_loc('is_speed_peak')] = 1
            if len(accel_peaks) > 0:
                df.iloc[accel_peaks, df.columns.get_loc('is_accel_peak')] = 1
            if len(jerk_peaks) > 0:
                df.iloc[jerk_peaks, df.columns.get_loc('is_jerk_peak')] = 1
            if len(y_maxs) > 0:
                df.iloc[y_maxs, df.columns.get_loc('is_y_local_max')] = 1
            if len(y_mins) > 0:
                df.iloc[y_mins, df.columns.get_loc('is_y_local_min')] = 1
        except Exception as exc:
            # Peak flags stay best-effort (they default to 0), but the failure is
            # reported instead of vanishing, and KeyboardInterrupt/SystemExit
            # are no longer swallowed as they were by the bare `except:`.
            warnings.warn(f"Peak detection skipped: {exc}", RuntimeWarning)

    # === COMPOSITE FEATURES ===
    df['hit_score'] = (
        df['jerk_magnitude'] *
        df['accel_magnitude'] *
        df['delta_angle'] *
        (1 + df['is_jerk_peak'])
    )

    df['bounce_score'] = (
        df['vy_abs_change'] *
        (1 - df['height_normalized']) *
        df['vy_sign_change'] *
        (1 + df['is_y_local_min'])
    )

    df['vertical_impact'] = df['vy_abs_change'] * df['accel_vertical']

    # === VELOCITY REVERSAL ===
    df['vy_reversed_recently'] = (
        (df['vy'] * df['vy_past_1'] < 0) |
        (df['vy_past_1'] * df['vy_past_2'] < 0)
    ).astype(int)

    df['vy_will_reverse'] = (
        (df['vy'] * df['vy_future_1'] < 0) |
        (df['vy_future_1'] * df['vy_future_2'] < 0)
    ).astype(int)

    df['vy_reversal_window'] = (
        df['vy_reversed_recently'] |
        df['vy_sign_change'] |
        df['vy_will_reverse']
    ).astype(int)

    # === SUSTAINED EVENTS ===
    df['jerk_sustained'] = (
        df['jerk_magnitude_past_3'] +
        df['jerk_magnitude'] +
        df['jerk_magnitude_future_3']
    ) / 3

    df['accel_sustained'] = (
        df['accel_magnitude_past_3'] +
        df['accel_magnitude'] +
        df['accel_magnitude_future_3']
    ) / 3

    # === COMPARATIVE ===
    for feature in ['speed', 'vy', 'height_normalized']:
        if feature in df.columns:
            # NOTE: despite its name, `past_mean` is the *centered* 5-frame mean
            # (see the rolling(..., center=True) columns above); `future_mean`
            # is the mean of the 5 frames ending 5 frames ahead.
            past_mean = df[f'{feature}_mean_5']
            future_mean = df[feature].shift(-5).rolling(5).mean()

            df[f'{feature}_past_vs_future'] = past_mean - future_mean

            df[f'{feature}_is_peak'] = (
                (df[feature] > past_mean) &
                (df[feature] > future_mean)
            ).astype(int)

    df['trajectory_phase'] = np.arctan2(df['vy'], df['vx'] + 1e-6)
    df['trajectory_phase_change'] = df['trajectory_phase'].diff().abs()

    df['frame_idx'] = range(len(df))
    df['time_in_point'] = df['frame_idx'] / len(df)

    return df.fillna(0)

# ============================================================================
# Prepare Dataset Function
# ============================================================================
def prepare_dataset(folder_path):
    """
    Load all points and prepare dataset with optimized features.

    Every ``*.json`` file in `folder_path` is read as one point: a mapping from
    frame number (string key convertible to int) to a dict from which the keys
    'x', 'y', 'visible' and 'action' are read. Features are computed per point
    with `calculate_optimized_features`, then all points are concatenated.

    Parameters
    ----------
    folder_path : str
        Directory containing the per-point JSON files.

    Returns
    -------
    pandas.DataFrame
        One row per frame across all points, with 'point_id' set to the file
        name without its extension.

    Raises
    ------
    ValueError
        If the folder contains no ``.json`` files.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row order
    # (and any seeded sampling done on the result) reproducible across machines.
    json_files = sorted(name for name in os.listdir(folder_path) if name.endswith('.json'))
    if not json_files:
        # pd.concat([]) would raise a much less helpful ValueError further down
        raise ValueError(f"No .json point files found in '{folder_path}'")

    all_dfs = []

    print(f"Loading {len(json_files)} points...")

    for i, filename in enumerate(json_files):
        if (i + 1) % 50 == 0:
            print(f"  Progress: {i+1}/{len(json_files)}")

        file_path = os.path.join(folder_path, filename)

        with open(file_path, 'r') as f:
            point_data = json.load(f)

        # splitext strips only the trailing extension, unlike str.replace which
        # would also remove ".json" occurring in the middle of a file name.
        point_id = os.path.splitext(filename)[0]

        # Convert to DataFrame, frames in numeric (not lexicographic) order
        rows = []
        for f_idx in sorted(point_data.keys(), key=int):
            details = point_data[f_idx]
            rows.append({
                "point_id": point_id,
                "frame": int(f_idx),
                "x": details.get("x"),
                "y": details.get("y"),
                "visible": details.get("visible"),
                "action": details.get("action")
            })

        df_point = pd.DataFrame(rows)
        df_point = calculate_optimized_features(df_point)
        all_dfs.append(df_point)

    # Combine all
    full_df = pd.concat(all_dfs, ignore_index=True)

    print(f"\n{'='*60}")
    print(f"Dataset created: {len(full_df)} frames")
    print(f"Features: {len(full_df.columns)}")
    print(f"Class distribution:\n{full_df['action'].value_counts()}")
    print(f"{'='*60}")

    return full_df

# ============================================================================
# Prepare Balanced Dataset Function
# ============================================================================
def prepare_balanced_dataset(df, strategy='hybrid', target_air_ratio=10):
    """
    Reduce class imbalance by downsampling the 'air' class.

    All 'hit' and 'bounce' rows are kept. 'air' rows are randomly downsampled
    (fixed seed) to at most `target_air_ratio` times the combined number of
    hit and bounce rows, and the result is shuffled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame-level dataset with an 'action' column.
    strategy : str, default 'hybrid'
        Accepted for backward compatibility; only the hybrid strategy is
        implemented and the value is currently not inspected.
    target_air_ratio : int, default 10
        Number of air rows to keep per minority (hit + bounce) row.

    Returns
    -------
    pandas.DataFrame
        Shuffled, re-indexed subset of `df`.
    """
    df_hit = df[df['action'] == 'hit'].copy()
    df_bounce = df[df['action'] == 'bounce'].copy()
    df_air = df[df['action'] == 'air'].copy()

    n_hit = len(df_hit)
    n_bounce = len(df_bounce)
    n_minority = n_hit + n_bounce
    # Never request more air rows than exist: DataFrame.sample raises when n
    # exceeds the population and replace=False.
    target_air_samples = min(target_air_ratio * n_minority, len(df_air))

    print(f"\nHybrid Strategy:")
    print(f"  Keeping all hits: {n_hit}")
    print(f"  Keeping all bounces: {n_bounce}")
    print(f"  Downsampling air: {len(df_air)} → {target_air_samples}")
    # The ratio is air : (hit + bounce), not air : hit : bounce
    print(f"  Target ratio - Air:(Hit+Bounce) = {target_air_ratio}:1")

    # Downsample air
    df_air_downsampled = df_air.sample(n=target_air_samples, random_state=42)

    # Combine
    df_balanced = pd.concat([df_hit, df_bounce, df_air_downsampled], ignore_index=True)
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

    # Final distribution
    kept_pct = 100 * len(df_balanced) / len(df) if len(df) else 0.0
    print(f"\n{'='*70}")
    print("Balanced class distribution:")
    print(df_balanced['action'].value_counts())
    print(f"Total samples: {len(df_balanced)}")
    print(f"Reduction: {len(df)} → {len(df_balanced)} ({kept_pct:.1f}%)")
    print(f"{'='*70}\n")

    return df_balanced


In [3]:
# Load and prepare training data.
# `folder_path` is relative to the notebook's working directory and is read
# by prepare_dataset, which builds one feature row per frame of every point.
folder_path = "per_point_v2"

# df_train: full frame-level feature table returned by prepare_dataset.
# df_balanced: the frame returned by prepare_balanced_dataset; this is what the
# training cell below consumes.
df_train = prepare_dataset(folder_path)
df_balanced = prepare_balanced_dataset(df_train, strategy='hybrid')

Loading 313 points...
  Progress: 50/313
  Progress: 100/313
  Progress: 150/313
  Progress: 200/313
  Progress: 250/313
  Progress: 300/313

Dataset created: 177341 frames
Features: 181
Class distribution:
action
air       174295
hit         1600
bounce      1446
Name: count, dtype: int64

Hybrid Strategy:
  Keeping all hits: 1600
  Keeping all bounces: 1446
  Downsampling air: 174295 → 30460
  Target ratio - Air:Hit:Bounce = 10:1:1

Balanced class distribution:
action
air       30460
hit        1600
bounce     1446
Name: count, dtype: int64
Total samples: 33506
Reduction: 177341 → 33506 (18.9%)



In [5]:
# ============================================================================
# Calculate Class Weights Function
# ============================================================================
def calculate_class_weights(y_train):
    """
    Compute 'balanced' class weights for the given training labels.

    The labels are integer-encoded with a fresh LabelEncoder, weights are
    obtained from sklearn's `compute_class_weight('balanced', ...)`, and the
    result is printed and returned keyed by the original label values.

    Returns
    -------
    (dict, LabelEncoder)
        Mapping of original label -> weight, and the fitted encoder.
    """
    from sklearn.utils.class_weight import compute_class_weight

    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(y_train)

    balanced = compute_class_weight(
        'balanced',
        classes=np.unique(encoded_labels),
        y=encoded_labels,
    )

    # encoder.classes_ is in the same order as the encoded class ids
    weight_dict = {label: value for label, value in zip(encoder.classes_, balanced)}

    print("\nClass weights:")
    for label in weight_dict:
        print(f"  {label:8s}: {weight_dict[label]:.4f}")

    return weight_dict, encoder

# ============================================================================
# Train Traditional ML Model Function
# ============================================================================
def train_traditional_ml(X_train, y_train, X_test, y_test, model_type='xgboost',
                         X_val=None, y_val=None):
    """
    Train traditional ML models with optimized hyperparameters.
    Enhanced for imbalanced multi-class hit/bounce detection.

    Parameters
    ----------
    X_train, y_train : training features and (string) labels.
    X_test, y_test : held-out features and labels used for the printed report.
    model_type : {'xgboost', 'lightgbm', 'catboost', 'random_forest'}
    X_val, y_val : optional validation features/labels used for early stopping.
        When omitted, the test set is used for early stopping, as before.
        WARNING: in that case the reported test metrics are optimistic, because
        the number of boosting rounds was selected on the same data.

    Returns
    -------
    (model, f1_macro, le)
        The fitted estimator, macro-F1 on the test set, and the LabelEncoder
        mapping string labels to the integer ids the model was trained on.

    Raises
    ------
    ValueError
        If `model_type` is not one of the supported values.

    Side effects
    ------------
    Prints a report and, for models exposing `feature_importances_`, writes
    '<model_type>_feature_importance.csv' to the current working directory.
    """
    from sklearn.metrics import precision_recall_fscore_support
    from sklearn.utils.class_weight import compute_class_weight

    print(f"\n{'='*70}")
    print(f"Training {model_type.upper()}")
    print(f"{'='*70}")

    # Encode labels
    le = LabelEncoder()
    y_train_encoded = le.fit_transform(y_train)
    y_test_encoded = le.transform(y_test)
    label_ids = np.arange(len(le.classes_))

    # Early-stopping data: dedicated validation set if given, else legacy test set
    if X_val is not None and y_val is not None:
        X_stop, y_stop = X_val, le.transform(y_val)
    else:
        X_stop, y_stop = X_test, y_test_encoded

    # Calculate class weights for imbalanced data.
    # LabelEncoder ids are 0..k-1, so class_weights[i] is the weight of class id i.
    class_weights = compute_class_weight('balanced',
                                         classes=np.unique(y_train_encoded),
                                         y=y_train_encoded)

    print(f"Class distribution:")
    for cls_id, (cls, count) in enumerate(zip(le.classes_, np.bincount(y_train_encoded))):
        print(f"  {cls}: {count} samples (weight: {class_weights[cls_id]:.3f})")

    if model_type == 'xgboost':
        import xgboost as xgb

        # Enhanced XGBoost parameters
        model = xgb.XGBClassifier(
            # Core parameters
            n_estimators=500,
            max_depth=10,
            learning_rate=0.05,

            # Regularization - prevent overfitting
            min_child_weight=3,
            gamma=0.1,
            reg_alpha=0.1,
            reg_lambda=1.0,

            # Sampling parameters
            subsample=0.8,
            colsample_bytree=0.8,
            colsample_bylevel=0.8,
            colsample_bynode=0.8,

            # Class imbalance handling (binary only; multi-class uses sample_weight below)
            scale_pos_weight=class_weights[1]/class_weights[0] if len(class_weights) == 2 else 1,

            # Performance
            tree_method='hist',
            max_bin=256,

            # Training control
            early_stopping_rounds=50,
            eval_metric='mlogloss',
            random_state=42,
            n_jobs=-1
        )

        # scale_pos_weight only applies to binary problems, so for multi-class
        # the balanced class weights are passed as per-row sample weights;
        # previously they were computed and printed but never applied.
        fit_kwargs = {}
        if len(class_weights) > 2:
            fit_kwargs['sample_weight'] = class_weights[y_train_encoded]

        eval_set = [(X_stop, y_stop)]
        model.fit(X_train, y_train_encoded,
                 eval_set=eval_set,
                 verbose=False,
                 **fit_kwargs)

    elif model_type == 'lightgbm':
        import lightgbm as lgb

        # Enhanced LightGBM parameters
        model = lgb.LGBMClassifier(
            # Core parameters
            n_estimators=2500,
            max_depth=16,
            learning_rate=0.05,
            num_leaves=80,

            # Regularization
            min_child_samples=20,
            min_child_weight=0.001,
            reg_alpha=0.1,
            reg_lambda=1.0,

            # Sampling
            subsample=0.8,
            subsample_freq=5,
            colsample_bytree=0.8,

            # Feature interaction
            feature_fraction_bynode=0.8,

            # Class imbalance
            # NOTE(review): class_weight and is_unbalance both target imbalance;
            # confirm against the LightGBM docs whether is_unbalance has any
            # effect for multi-class before relying on it.
            class_weight='balanced',
            is_unbalance=True,

            # Performance
            boosting_type='gbdt',
            n_jobs=-1,

            # Training control
            # NOTE(review): early stopping is configured here and again via the
            # fit callback below; kept as-is to preserve the trained model.
            early_stopping_rounds=50,
            random_state=42,
            verbose=-1
        )

        eval_set = [(X_stop, y_stop)]
        model.fit(X_train, y_train_encoded,
                 eval_set=eval_set,
                 callbacks=[lgb.early_stopping(50, verbose=False)])

    elif model_type == 'catboost':
        from catboost import CatBoostClassifier

        # Enhanced CatBoost parameters
        model = CatBoostClassifier(
            # Core parameters
            iterations=500,
            depth=10,
            learning_rate=0.05,

            # Regularization
            l2_leaf_reg=3.0,
            bagging_temperature=1.0,       # Bayesian bootstrap intensity

            # `subsample` is intentionally not set: incompatible with Bayesian bootstrap

            # Class imbalance (explicit class_weights would conflict with this)
            auto_class_weights='Balanced',

            # CatBoost specific
            border_count=128,              # splits for numerical features
            grow_policy='SymmetricTree',   # Default, but explicit
            bootstrap_type='Bayesian',

            # Training control
            early_stopping_rounds=50,
            use_best_model=True,

            # Performance
            task_type='CPU',
            thread_count=-1,
            random_state=42,
            verbose=False
        )

        eval_set = (X_stop, y_stop)
        model.fit(X_train, y_train_encoded,
                 eval_set=eval_set,
                 verbose=False)

    elif model_type == 'random_forest':
        from sklearn.ensemble import RandomForestClassifier

        # Enhanced Random Forest parameters
        model = RandomForestClassifier(
            # Core parameters
            n_estimators=500,
            max_depth=25,

            # Split parameters
            min_samples_split=5,
            min_samples_leaf=2,
            max_features='sqrt',           # sqrt(n_features) per split

            # Regularization
            min_weight_fraction_leaf=0.0,
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,

            # Sampling
            bootstrap=True,
            oob_score=True,                # out-of-bag score

            # Class imbalance
            class_weight='balanced_subsample', # balanced per bootstrap

            # Performance
            n_jobs=-1,
            random_state=42,
            verbose=0
        )

        model.fit(X_train, y_train_encoded)

        if hasattr(model, 'oob_score_'):
            print(f"\nOut-of-Bag Score: {model.oob_score_:.4f}")

    else:
        raise ValueError(f"Unknown model type: {model_type}")

    # ========================================================================
    # EVALUATION
    # ========================================================================

    # Predict. Flatten because some estimators return a column vector (n, 1)
    # for multi-class predictions.
    y_pred = np.asarray(model.predict(X_test)).ravel()

    # Probabilities (if available)
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test)
        print(f"\nPrediction confidence (mean): {y_pred_proba.max(axis=1).mean():.4f}")

    # Classification report.
    # `labels` pins the rows to the encoder's classes so the report and the
    # per-class arrays below stay aligned with le.classes_ even when a class is
    # absent from the test split or never predicted.
    print("\n" + "="*70)
    print("TEST SET PERFORMANCE")
    print("="*70)
    print(classification_report(y_test_encoded, y_pred,
                                labels=label_ids,
                                zero_division=0,
                                target_names=le.classes_,
                                digits=4))

    # Confusion matrix
    cm = confusion_matrix(y_test_encoded, y_pred, labels=label_ids)
    print("\nConfusion Matrix:")
    class_names = le.classes_
    header = "            " + "  ".join([f"{c:>8s}" for c in class_names])
    print(header)
    for i, label in enumerate(class_names):
        row_str = f"{label:12s}"
        for val in cm[i]:
            row_str += f"{val:8d}  "
        print(row_str)

    # Per-class metrics
    print("\nPer-Class Metrics:")
    precision, recall, f1_per_class, support = precision_recall_fscore_support(
        y_test_encoded, y_pred, labels=label_ids, zero_division=0
    )

    for i, cls in enumerate(le.classes_):
        print(f"  {cls:8s}: Precision={precision[i]:.4f}, Recall={recall[i]:.4f}, "
              f"F1={f1_per_class[i]:.4f}, Support={support[i]}")

    # Overall metrics
    f1_macro = f1_score(y_test_encoded, y_pred, average='macro', zero_division=0)
    f1_weighted = f1_score(y_test_encoded, y_pred, average='weighted', zero_division=0)

    print(f"\nOverall Metrics:")
    print(f"  Macro F1:    {f1_macro:.4f}")
    print(f"  Weighted F1: {f1_weighted:.4f}")

    # ========================================================================
    # FEATURE IMPORTANCE
    # ========================================================================

    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        # Fall back to positional names when X_train is not a DataFrame
        feature_names = getattr(X_train, 'columns', None)
        if feature_names is None:
            feature_names = [f'f{i}' for i in range(len(importances))]

        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)

        print("\n" + "="*70)
        print("TOP 30 MOST IMPORTANT FEATURES")
        print("="*70)
        print(importance_df.head(30).to_string(index=False))

        # Save full importance
        importance_df.to_csv(f'{model_type}_feature_importance.csv', index=False)
        print(f"\n✓ Full feature importance saved to '{model_type}_feature_importance.csv'")

    return model, f1_macro, le

# ============================================================================
# Train and Save Model Function
# ============================================================================
def train_and_save_model(df_train, model_type='xgboost', output_dir='models/',
                         group_col=None):
    """
    Train model and save all necessary files for inference.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame-level feature table containing an 'action' label column.
        The columns 'point_id', 'frame', 'action' and 'visible' (and
        `group_col`, if given) are excluded from the model inputs.
    model_type : str
        Forwarded to `train_traditional_ml`.
    output_dir : str
        Directory for the saved artefacts; created if missing.
    group_col : str or None, default None
        If given, the 80/20 split keeps all rows sharing a value of this column
        (e.g. 'point_id') on the same side. With the default None the split is
        a stratified random split over individual frames; because the features
        contain past/future context of neighbouring frames, that lets frames
        of one point land in both train and test and inflates test scores.

    Returns
    -------
    (model, scaler, le, feature_cols)
        The same four objects that are written to `output_dir` with joblib.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n{'='*70}")
    print("TRAINING AND SAVING MODEL")
    print(f"{'='*70}")

    # Prepare data
    excluded = {'point_id', 'frame', 'action', 'visible'}
    if group_col is not None:
        excluded.add(group_col)
    feature_cols = [col for col in df_train.columns if col not in excluded]

    X = df_train[feature_cols].fillna(0)
    y = df_train['action']

    # Split
    if group_col is not None:
        from sklearn.model_selection import GroupShuffleSplit

        splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        train_idx, test_idx = next(splitter.split(X, y, groups=df_train[group_col]))
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

    # Scale (fit on the training part only, then apply to the test part)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Back to DataFrames so feature names survive into the importance report
    X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_cols)
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_cols)

    # Train (the returned macro-F1 is already printed by the trainer)
    model, _f1_macro, le = train_traditional_ml(
        X_train_scaled, y_train,
        X_test_scaled, y_test,
        model_type=model_type
    )

    # ========================================================================
    # SAVE ALL NECESSARY FILES
    # ========================================================================

    model_path = os.path.join(output_dir, f'trained_{model_type}_model.pkl')
    scaler_path = os.path.join(output_dir, 'scaler.pkl')
    le_path = os.path.join(output_dir, 'label_encoder.pkl')
    feature_path = os.path.join(output_dir, 'feature_columns.pkl')

    joblib.dump(model, model_path)
    joblib.dump(scaler, scaler_path)
    joblib.dump(le, le_path)
    joblib.dump(feature_cols, feature_path)

    print(f"\n{'='*70}")
    print("MODEL SAVED SUCCESSFULLY")
    print(f"{'='*70}")
    print(f"Model:          {model_path}")
    print(f"Scaler:         {scaler_path}")
    print(f"Label Encoder:  {le_path}")
    print(f"Feature Cols:   {feature_path}")
    print(f"{'='*70}\n")

    return model, scaler, le, feature_cols


In [6]:

# NOTE(review): this cell re-defines train_and_save_model, which is already
# defined at the end of the previous cell. The later definition silently
# shadows the earlier one; keep a single copy when tidying the notebook.
def train_and_save_model(df_train, model_type='xgboost', output_dir='models/',
                         group_col=None):
    """
    Train model and save all necessary files for inference.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame-level feature table containing an 'action' label column.
        The columns 'point_id', 'frame', 'action' and 'visible' (and
        `group_col`, if given) are excluded from the model inputs.
    model_type : str
        Forwarded to `train_traditional_ml`.
    output_dir : str
        Directory for the saved artefacts; created if missing.
    group_col : str or None, default None
        If given, the 80/20 split keeps all rows sharing a value of this column
        (e.g. 'point_id') on the same side. With the default None the split is
        a stratified random split over individual frames; because the features
        contain past/future context of neighbouring frames, that lets frames
        of one point land in both train and test and inflates test scores.

    Returns
    -------
    (model, scaler, le, feature_cols)
        The same four objects that are written to `output_dir` with joblib.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs()
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n{'='*70}")
    print("TRAINING AND SAVING MODEL")
    print(f"{'='*70}")

    # Prepare data
    excluded = {'point_id', 'frame', 'action', 'visible'}
    if group_col is not None:
        excluded.add(group_col)
    feature_cols = [col for col in df_train.columns if col not in excluded]

    X = df_train[feature_cols].fillna(0)
    y = df_train['action']

    # Split
    if group_col is not None:
        from sklearn.model_selection import GroupShuffleSplit

        splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
        train_idx, test_idx = next(splitter.split(X, y, groups=df_train[group_col]))
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

    # Scale (fit on the training part only, then apply to the test part)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Back to DataFrames so feature names survive into the importance report
    X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_cols)
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_cols)

    # Train (the returned macro-F1 is already printed by the trainer)
    model, _f1_macro, le = train_traditional_ml(
        X_train_scaled, y_train,
        X_test_scaled, y_test,
        model_type=model_type
    )

    # ========================================================================
    # SAVE ALL NECESSARY FILES
    # ========================================================================

    model_path = os.path.join(output_dir, f'trained_{model_type}_model.pkl')
    scaler_path = os.path.join(output_dir, 'scaler.pkl')
    le_path = os.path.join(output_dir, 'label_encoder.pkl')
    feature_path = os.path.join(output_dir, 'feature_columns.pkl')

    joblib.dump(model, model_path)
    joblib.dump(scaler, scaler_path)
    joblib.dump(le, le_path)
    joblib.dump(feature_cols, feature_path)

    print(f"\n{'='*70}")
    print("MODEL SAVED SUCCESSFULLY")
    print(f"{'='*70}")
    print(f"Model:          {model_path}")
    print(f"Scaler:         {scaler_path}")
    print(f"Label Encoder:  {le_path}")
    print(f"Feature Cols:   {feature_path}")
    print(f"{'='*70}\n")

    return model, scaler, le, feature_cols


In [6]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [7]:
# Train and save.
# Fits a LightGBM classifier on the balanced frame table and unpacks the four
# objects returned by train_and_save_model; output_dir is passed through to it.
model, scaler, le, feature_cols = train_and_save_model(
    df_balanced,
    model_type='lightgbm',
    output_dir='models/'
)


TRAINING AND SAVING MODEL

Training LIGHTGBM
Class distribution:
  air: 24367 samples (weight: 0.367)
  bounce: 1157 samples (weight: 7.722)
  hit: 1280 samples (weight: 6.980)

Prediction confidence (mean): 0.9796

TEST SET PERFORMANCE
              precision    recall  f1-score   support

         air     0.9898    0.9723    0.9810      6093
      bounce     0.8055    0.9170    0.8576       289
         hit     0.7165    0.8688    0.7853       320

    accuracy                         0.9649      6702
   macro avg     0.8373    0.9193    0.8746      6702
weighted avg     0.9688    0.9649    0.9663      6702


Confusion Matrix:
                 air    bounce       hit
air             5924        59       110  
bounce            24       265         0  
hit               37         5       278  

Per-Class Metrics:
  air     : Precision=0.9898, Recall=0.9723, F1=0.9810, Support=6093
  bounce  : Precision=0.8055, Recall=0.9170, F1=0.8576, Support=289
  hit     : Precision=0.7165, Recal

In [8]:
import json

def remove_action_field(input_json_path, output_json_path):
    """
    Copy a JSON file while dropping the "action" key from its entries.

    The input is expected to be a JSON object. For every top-level value that
    is itself an object, the "action" key is removed if present; other values
    are left untouched. The result is written with 2-space indentation.
    """
    with open(input_json_path, 'r') as source:
        records = json.load(source)

    for entry in records.values():
        # Only dict entries can carry an "action" key; skip anything else
        if not isinstance(entry, dict):
            continue
        if "action" in entry:
            del entry["action"]

    with open(output_json_path, 'w') as target:
        json.dump(records, target, indent=2)

    print("✔ 'action' field removed and file saved.")


In [9]:
# Write a copy of one point file with the "action" field removed to
# output.json in the working directory; the source file is only read.
remove_action_field(
    "per_point_v2/ball_data_5.json",
    "output.json"
)


✔ 'action' field removed and file saved.
