In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pickle
import os

# Baseline classifiers to compare, keyed by the display name that is also used
# to look up the matching search space in `param_grids`.
# Every estimator that draws random numbers is seeded with the same value so
# that the baseline ranking and the saved models are reproducible across runs.
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    # liblinear shuffles the data internally; seed it like the tree models.
    'Logistic Regression': LogisticRegression(max_iter=1000, solver='liblinear', random_state=42),
    # probability=True fits an internal cross-validated calibration whose
    # shuffling is controlled by random_state.
    'Support Vector Machine': SVC(probability=True, random_state=42)
}

# Search space for each classifier, keyed by the same display names as
# `classifiers`. Every entry is a finite list of candidate values.
param_grids = {
    'Random Forest': dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
    ),
    'Gradient Boosting': dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.1, 0.05],
        max_depth=[3, 5],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
    ),
    'Logistic Regression': dict(
        penalty=['l1', 'l2'],
        C=[0.1, 1, 10],
    ),
    'Support Vector Machine': dict(
        kernel=['linear', 'rbf'],
        C=[0.1, 1, 10],
        gamma=['scale', 'auto'],
    ),
}

# The randomized search reuses the grid definitions (same object, not a copy);
# swap in real distributions here if a wider search is wanted.
param_distributions = param_grids

def _save_model_bundle(path, model, scaler):
    """Pickle the fitted model together with the scaler it was trained with."""
    with open(path, 'wb') as f:
        pickle.dump({'model': model, 'scaler': scaler}, f)


def _randomized_n_iter(param_dist, requested=20):
    """Return how many candidates the randomized search should sample.

    When every entry is a finite list the search space is a grid; asking for
    more samples than the grid holds only makes scikit-learn warn and fall
    back to the grid size, so the request is capped up front. Spaces that are
    not a single dict, or that contain objects exposing ``rvs`` (scipy-style
    distributions), keep the requested count.
    """
    if not isinstance(param_dist, dict):
        return requested
    if any(hasattr(values, 'rvs') for values in param_dist.values()):
        return requested
    grid_size = 1
    for values in param_dist.values():
        grid_size *= len(values)
    return min(requested, grid_size)


def train_and_optimize(exercise_name):
    """Train, select and tune a classifier for one exercise, then save it.

    Reads ``{exercise_name}_features.csv`` from the working directory, fits
    every estimator in ``classifiers`` on a scaled 80/20 stratified split,
    picks the one with the highest test accuracy, tunes it with both
    GridSearchCV and RandomizedSearchCV (5-fold, accuracy), and pickles each
    tuned model with its scaler under ``saved_models/``.

    Args:
        exercise_name: Prefix of the feature CSV and of the saved model files.

    Raises:
        ValueError: If the ``label`` column contains anything other than
            ``'correct'`` or ``'incorrect'``.

    NOTE(review): the baseline is chosen by accuracy on the test split, so the
    printed baseline accuracy is an optimistic estimate for the chosen model.
    The scaler is also fitted on the whole training split before
    cross-validation, so the CV folds are not fully independent.
    """
    print(f"Loading data for {exercise_name}...")
    csv_file = f"{exercise_name}_features.csv"
    data = pd.read_csv(csv_file)

    # Separate features and label
    X = data.drop(columns=['label'])
    y = data['label'].map({'correct': 1, 'incorrect': 0})  # Encode labels

    # Unmapped labels become NaN and would otherwise fail much later with an
    # unrelated-looking error inside the split or the estimators.
    unmapped = y.isna()
    if unmapped.any():
        unexpected = sorted(data.loc[unmapped, 'label'].astype(str).unique())
        raise ValueError(
            f"Unexpected label values in {csv_file}: {unexpected}; "
            "expected 'correct' or 'incorrect'"
        )

    # Split train/test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Scale features (statistics come from the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    best_clf_name = None
    # Start below any possible accuracy so a winner is always selected, even
    # if every baseline scores 0.
    best_acc = float('-inf')

    # Train all classifiers and pick best on test set
    print(f"Training baseline classifiers for {exercise_name}...")
    for name, clf in classifiers.items():
        clf.fit(X_train_scaled, y_train)
        preds = clf.predict(X_test_scaled)
        acc = accuracy_score(y_test, preds)
        print(f"\t{name} accuracy: {acc:.4f}")
        if acc > best_acc:
            best_acc = acc
            best_clf_name = name

    print(f"Best baseline classifier for {exercise_name}: {best_clf_name} with accuracy {best_acc:.4f}")

    # Hyperparameter optimization with GridSearchCV
    print(f"Optimizing {best_clf_name} with GridSearchCV for {exercise_name}...")
    param_grid = param_grids[best_clf_name]
    grid_search = GridSearchCV(classifiers[best_clf_name], param_grid, scoring='accuracy', cv=5, n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    print(f"Best params (GridSearch) for {exercise_name}: {grid_search.best_params_}")
    print(f"Best CV accuracy (GridSearch) for {exercise_name}: {grid_search.best_score_:.4f}")

    # Save GridSearch optimized model + scaler
    os.makedirs('saved_models', exist_ok=True)
    gs_model_path = f"saved_models/{exercise_name}_{best_clf_name}_grid.pkl"
    _save_model_bundle(gs_model_path, grid_search.best_estimator_, scaler)

    # Hyperparameter optimization with RandomizedSearchCV for additional tuning
    print(f"Optimizing {best_clf_name} with RandomizedSearchCV for {exercise_name}...")
    param_dist = param_distributions[best_clf_name]
    random_search = RandomizedSearchCV(
        classifiers[best_clf_name],
        param_dist,
        scoring='accuracy',
        cv=5,
        n_iter=_randomized_n_iter(param_dist, 20),
        n_jobs=-1,
        random_state=42,
    )
    random_search.fit(X_train_scaled, y_train)
    print(f"Best params (RandomizedSearch) for {exercise_name}: {random_search.best_params_}")
    print(f"Best CV accuracy (RandomizedSearch) for {exercise_name}: {random_search.best_score_:.4f}")

    # Save RandomizedSearch optimized model + scaler
    rs_model_path = f"saved_models/{exercise_name}_{best_clf_name}_random.pkl"
    _save_model_bundle(rs_model_path, random_search.best_estimator_, scaler)

    print(f"Models saved for {exercise_name} in saved_models/")

if __name__ == "__main__":
    # One training/tuning run per exercise; each name is the prefix of the
    # "<name>_features.csv" file that train_and_optimize reads.
    exercises = ('squat', 'deadlift', 'overhead_press')
    for exercise in exercises:
        train_and_optimize(exercise)


Loading data for squat...
Training baseline classifiers for squat...
	Random Forest accuracy: 0.9985
	Gradient Boosting accuracy: 0.9829
	Logistic Regression accuracy: 0.8480
	Support Vector Machine accuracy: 0.9863
Best baseline classifier for squat: Random Forest with accuracy 0.9985
Optimizing Random Forest with GridSearchCV for squat...
Best params (GridSearch) for squat: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best CV accuracy (GridSearch) for squat: 0.9980
Optimizing Random Forest with RandomizedSearchCV for squat...
Best params (RandomizedSearch) for squat: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}
Best CV accuracy (RandomizedSearch) for squat: 0.9979
Models saved for squat in saved_models/
Loading data for deadlift...
Training baseline classifiers for deadlift...
	Random Forest accuracy: 0.9997
	Gradient Boosting accuracy: 0.9790
	Logistic Regression accuracy: 0.8578
	Support Vector Ma