In [None]:
import pandas as pd
import numpy as np
import os
import joblib
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Seed NumPy's legacy global RNG so code that draws from it is repeatable.
np.random.seed(42)

In [None]:
# Configuration: input data location and output artifact paths.

DATA_PATH = "data/samples.csv"
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, "random_forest_model.joblib")
LABEL_ENCODER_PATH = os.path.join(MODEL_DIR, "label_encoder_rf.pkl")

# Create the model directory if it doesn't exist. exist_ok=True replaces the
# separate exists() check, which could race with another process.
os.makedirs(MODEL_DIR, exist_ok=True)

# Load Data
print(f"Loading data from {DATA_PATH}...")
if not os.path.exists(DATA_PATH):
    # Fail fast: merely printing here would leave `df` undefined and make the
    # following cells crash with an unrelated NameError on "Run All".
    raise FileNotFoundError(f"Data file not found at {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
print(f"Loaded {len(df)} frames for {len(df['label'].unique())} letters.")


In [None]:
# Feature layout: 21 landmarks x (x, y, z) = 63 landmark columns, ordered
# x0, y0, z0, x1, y1, z1, ...
landmark_columns = [
    f"{axis}{idx}"
    for idx in range(21)
    for axis in ("x", "y", "z")
]

# Three absolute wrist-position columns (to help with dynamic gestures).
wrist_columns = ["wrist_abs_x", "wrist_abs_y", "wrist_abs_z"]

# Full feature list: 63 landmark columns followed by the 3 wrist columns.
feature_columns = [*landmark_columns, *wrist_columns]

# Every row (frame) of the table is one training sample.
y = df['label'].values
X = df[feature_columns].values

# Map the labels to integer class ids; the fitted encoder is kept so the
# ids can be mapped back to the original labels.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

print(f"Features shape: {X.shape} (Expected 66 features per frame)")
print(f"Labels shape: {y_encoded.shape}")
print(f"Target classes: {label_encoder.classes_}")


In [None]:
# Split Data: hold out 20% of the frames for testing, stratified on the
# encoded label, with a fixed seed so the split is repeatable.
# NOTE(review): the split is per frame; if consecutive frames come from the
# same recording, near-duplicates may land on both sides -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

In [None]:
def objective(trial):
    """Score one Random Forest configuration proposed by an Optuna trial.

    Four integer hyperparameters are drawn from the trial, a forest is built
    from them, and it is evaluated with 5-fold cross-validation on the
    module-level ``X_train`` / ``y_train``.

    Args:
        trial: The Optuna trial used to suggest hyperparameter values.

    Returns:
        The mean of the five cross-validation fold scores.
    """
    # Suggestions are made in a fixed order: n_estimators, max_depth,
    # min_samples_split, min_samples_leaf.
    suggested = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }

    # NOTE(review): both the forest and cross_val_score request n_jobs=-1;
    # check whether this nested parallelism oversubscribes the CPUs.
    candidate = RandomForestClassifier(**suggested, random_state=42, n_jobs=-1)

    # Only the training split is used here; the test split stays untouched.
    fold_scores = cross_val_score(candidate, X_train, y_train, n_jobs=-1, cv=5)
    return fold_scores.mean()


In [None]:
# Tune the model with Optuna.
# The sampler is seeded explicitly: it keeps its own random state, so the
# np.random.seed() call does not make the search repeatable. Without a seed,
# each run would pick different hyperparameters and a different final model.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

print("\nBest trial:")
trial = study.best_trial
print(f"  Value (Mean CV Accuracy): {trial.value:.4f}")
# Also report the winning hyperparameters so the result can be reproduced
# without re-running the search.
print(f"  Params: {trial.params}")

In [None]:
# Train Final Model with Best Parameters
print("\nTraining final model with best parameters...")

# Same fixed seed and parallelism settings as during tuning, combined with
# the hyperparameters of the best Optuna trial.
best_rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    **study.best_params,
)
best_rf.fit(X_train, y_train)

# Final evaluation on the held-out test split.
y_pred = best_rf.predict(X_test)
final_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Final Test Accuracy: {final_accuracy * 100:.2f}%")


In [None]:
# Save Model and Label Encoder
# Both artifacts are written with joblib: the fitted forest first, then the
# label encoder that was fitted on the labels.
print(f"Saving Random Forest model to {MODEL_PATH}")
joblib.dump(value=best_rf, filename=MODEL_PATH)

print(f"Saving label encoder to {LABEL_ENCODER_PATH}")
joblib.dump(value=label_encoder, filename=LABEL_ENCODER_PATH)