In [1]:
import joblib
from sklearn.metrics import roc_curve, auc
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit
import os
import numpy as np
import pandas as pd
import optuna
import xgboost as xgb
from scipy.signal import find_peaks
from sklearn.svm import SVC
from scipy.integrate import simpson as simps

# Directory to save models
os.makedirs("saved_models", exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# === Load Data ===
boning = pd.read_csv('Boning.csv')
slicing = pd.read_csv('Slicing.csv')

# === Step 1: Filter Required Columns ===
# cols = ['Frame', 'Neck x', 'Neck y', 'Neck z', 'Head x', 'Head y', 'Head z']
# boning = boning[cols].copy()
# slicing = slicing[cols].copy()
boning['class'] = 0
slicing['class'] = 1
df = pd.concat([boning, slicing], ignore_index=True)

# === Step 2: Composite Feature Functions ===
import numpy as np
import pandas as pd

def compute_features(df, prefix):
    x, y, z = df[f'{prefix} x'], df[f'{prefix} y'], df[f'{prefix} z']
    return pd.DataFrame({
        f'{prefix}_rms_xy': np.sqrt(x**2 + y**2),
        f'{prefix}_rms_yz': np.sqrt(y**2 + z**2),
        f'{prefix}_rms_zx': np.sqrt(z**2 + x**2),
        f'{prefix}_rms_xyz': np.sqrt(x**2 + y**2 + z**2),
        f'{prefix}_roll': np.degrees(np.arctan2(y, np.sqrt(x**2 + z**2))),
        f'{prefix}_pitch': np.degrees(np.arctan2(x, np.sqrt(y**2 + z**2)))
    })

prefixes = [
    'Neck', 'Head',
    'Right Shoulder', 'Left Shoulder',
    'Right Upper Arm', 'Left Upper Arm',
    'Right Forearm', 'Left Forearm',
    'Right Hand', 'Left Hand',
    'Right Upper Leg', 'Left Upper Leg',
    'Right Lower Leg', 'Left Lower Leg',
    'Right Foot', 'Left Foot',
    'Right Toe', 'Left Toe',
    'L5', 'T12'
]

# Compute and concatenate features for all prefixes
feature_frames = [df]  # start with the original dataframe

for prefix in prefixes:
    feats = compute_features(df, prefix)
    feature_frames.append(feats)

# Final combined DataFrame
df_combined = pd.concat(feature_frames, axis=1)

In [3]:
# === Step 3: Feature Computation per Minute (60 Frames) ===
def extract_features(block):
    feats = {}
    for col in block.columns:
        print(col)
        values = block[col].values
        feats[f'{col}_mean'] = np.mean(values)
        feats[f'{col}_std'] = np.std(values)
        feats[f'{col}_min'] = np.min(values)
        feats[f'{col}_max'] = np.max(values)
        feats[f'{col}_auc'] = simps(values)
        feats[f'{col}_peaks'] = len(find_peaks(values)[0])
    return feats

# Exclude frame and class
feature_cols = df.columns.difference(['Frame', 'class'])
samples = []

# loop over 20-frame blocks instead of 60
for start in range(0, len(df), 20):
    end = start + 20
    if end > len(df):
        break
    block = df.iloc[start:end]
    features = extract_features(block[feature_cols])
    features['class'] = block['class'].iloc[0]
    samples.append(features)

features_df = pd.DataFrame(samples)
features_df.reset_index(drop=True, inplace=True)

# === Save final features ===
features_df.to_csv("final_feature_dataset.csv", index=False)
print("Final dataset with 108 features + class label saved as 'final_feature_dataset.csv'")

Head x
Head y
Head z
L3 x
L3 y
L3 z
L5 x
L5 y
L5 z
Left Foot x
Left Foot y
Left Foot z
Left Forearm x
Left Forearm y
Left Forearm z
Left Hand x
Left Hand y
Left Hand z
Left Lower Leg x
Left Lower Leg y
Left Lower Leg z
Left Shoulder x
Left Shoulder y
Left Shoulder z
Left Toe x
Left Toe y
Left Toe z
Left Upper Arm x
Left Upper Arm y
Left Upper Arm z
Left Upper Leg x
Left Upper Leg y
Left Upper Leg z
Neck x
Neck y
Neck z
Right Foot x
Right Foot y
Right Foot z
Right Forearm x
Right Forearm y
Right Forearm z
Right Hand x
Right Hand y
Right Hand z
Right Lower Leg x
Right Lower Leg y
Right Lower Leg z
Right Shoulder x
Right Shoulder y
Right Shoulder z
Right Toe x
Right Toe y
Right Toe z
Right Upper Arm x
Right Upper Arm y
Right Upper Arm z
Right Upper Leg x
Right Upper Leg y
Right Upper Leg z
T12 x
T12 y
T12 z
T8 x
T8 y
T8 z
Head x
Head y
Head z
L3 x
L3 y
L3 z
L5 x
L5 y
L5 z
Left Foot x
Left Foot y
Left Foot z
Left Forearm x
Left Forearm y
Left Forearm z
Left Hand x
Left Hand y
Left Hand z
L

In [4]:
X = df.drop(columns=["class"])
y = df["class"]
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
import joblib
import optuna
import xgboost as xgb
import numpy as np
import os

# Create model save directory
os.makedirs("saved_models", exist_ok=True)


# === Cross-validation using StratifiedKFold ===
def stratified_kfold_cv(X, y, model, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    roc_auc_scores = []

    for train_idx, test_idx in skf.split(X, y):
        X_train_fold, X_test_fold = X.iloc[train_idx], X.iloc[test_idx]
        y_train_fold, y_test_fold = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train_fold, y_train_fold)
        y_pred_proba = model.predict_proba(X_test_fold)[:, 1]
        score = roc_auc_score(y_test_fold, y_pred_proba)
        roc_auc_scores.append(score)

    return np.mean(roc_auc_scores)

# === Optuna Objective ===
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 300)
    max_depth = trial.suggest_int('max_depth', 3, 10)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 6)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    n_components = trial.suggest_int('n_components', 10, min(30, X.shape[1]))

    pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=n_components)),
        ('classifier', xgb.XGBClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_child_weight=min_child_weight,
            learning_rate=learning_rate,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            eval_metric='logloss',
            tree_method='hist',
            device='cuda'  # or 'cpu'
        ))
    ])

    score = stratified_kfold_cv(X, y, pipeline, n_splits=5)

    model_name = f"saved_models/XGB_n{n_estimators}_d{max_depth}_w{min_child_weight}_lr{learning_rate:.2f}_ss{subsample:.2f}_cs{colsample_bytree:.2f}_comp{n_components}_trial{trial.number}.joblib"
    joblib.dump(pipeline, model_name)

    return score

# === Start Optuna Study ===
storage_name = 'sqlite:///optuna_study.db'
study_name = 'xgboost_optimization_study'

try:
    study = optuna.create_study(direction='maximize', study_name=study_name, storage=storage_name, load_if_exists=True)
    study.optimize(objective, n_trials=100, n_jobs=20)  # adjust n_jobs if you want parallel processing
except KeyboardInterrupt:
    print("Optimization interrupted manually. Gracefully exiting...")

# === Print the best trial ===
if study.best_trial:
    print("✅ Best Parameters:", study.best_params)
    print("📈 Best ROC-AUC Score:", round(study.best_value, 4))


[I 2025-03-24 15:43:41,720] Using an existing study with name 'xgboost_optimization_study' instead of creating a new one.
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


[I 2025-03-24 15:44:25,330] Trial 136 finished with value: 0.9566460563553587 and parameters: {'n_estimators': 191, 'max_depth': 7, 'min_child_weight': 6, 'learning_rate': 0.14527038677247092, 'subsample': 0.7664891581319784, 'colsample_bytree': 0.9389146628483227, 'n_components': 27}. Best is trial 136 with value: 0.9566460563553587.
[I 2025-03-24 15:44:25,850] Trial 142 finished with value: 0.956690903343978 and parameters: {'n_estimators': 190, 'max_depth': 7, 'min_child_weight': 6, 'learning_rate': 0.14179103713716343, 'subsample': 0.7634091971855761, 'colsample_bytree': 0.9388490357030632, 'n_components': 27}. Best is trial 142 with value: 0.956690903343978.
[I 2025-03-24 15:44:26,026] Trial 140 finished with

✅ Best Parameters: {'n_estimators': 154, 'max_depth': 9, 'min_child_weight': 6, 'learning_rate': 0.19338105702063518, 'subsample': 0.8603071664940043, 'colsample_bytree': 0.9015530774036971, 'n_components': 30}
📈 Best ROC-AUC Score: 0.964
