In [1]:
import pandas as pd
import numpy as np
import os
import joblib
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Seed NumPy's legacy global RNG so anything drawing from np.random is
# repeatable across runs. The split and estimators in this notebook also pass
# random_state=42 explicitly.
np.random.seed(42)


In [2]:
# --- Configuration ---
# Input CSV and the output locations for the trained artifacts.
DATA_PATH = "data/samples.csv"
MODEL_DIR = "models"
MODEL_PATH = os.path.join(MODEL_DIR, "random_forest_model.joblib")
LABEL_ENCODER_PATH = os.path.join(MODEL_DIR, "label_encoder_rf.pkl")

# Letters to EXCLUDE (dynamic gestures)
EXCLUDE_LETTERS = ['H', 'J', 'Z']

# --- Main Training Script ---
print("Starting Random Forest model training (Static letters only)...")

# Create model directory if it doesn't exist. exist_ok=True makes this a single
# idempotent call instead of a separate exists-check followed by a create.
os.makedirs(MODEL_DIR, exist_ok=True)

# 1. Load Data
print(f"Loading data from {DATA_PATH}...")
if not os.path.exists(DATA_PATH):
    # Fail fast: every later cell needs `df`, so merely printing here would
    # only defer the failure to a confusing NameError further down.
    raise FileNotFoundError(f"Error: Data file not found at {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
# Filter out dynamic letters
df = df[~df['label'].isin(EXCLUDE_LETTERS)]
if df.empty:
    # Nothing left to train on; stop here rather than inside the split/fit.
    raise ValueError(
        f"No rows left in {DATA_PATH} after excluding letters {EXCLUDE_LETTERS}"
    )
print(f"Loaded {len(df)} frames for {len(df['label'].unique())} static letters.")


Starting Random Forest model training (Static letters only)...
Loading data from data/samples.csv...
Loaded 55085 frames for 23 static letters.


In [3]:
# 2. Preprocess Data
print("Preprocessing data...")

# Landmark feature column names, grouped per landmark index:
# x0, y0, z0, x1, y1, z1, ..., x20, y20, z20
feature_columns = []
for landmark_idx in range(21):
    for axis_name in ("x", "y", "z"):
        feature_columns.append(f"{axis_name}{landmark_idx}")

# Static gestures are trained frame by frame: each row is one sample.
X = df[feature_columns].values
y = df['label'].values

# Fit the encoder on the letter labels, then map them to integer class ids.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Summarise what will be fed to the model.
summary_lines = (
    f"Features shape: {X.shape}",
    f"Labels shape: {y_encoded.shape}",
    f"Target classes: {label_encoder.classes_}",
)
for summary_line in summary_lines:
    print(summary_line)


Preprocessing data...
Features shape: (55085, 63)
Labels shape: (55085,)
Target classes: ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'I' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R' 'S' 'T'
 'U' 'V' 'W' 'X' 'Y']


In [4]:
# 3. Split Data
# Hold out 20% of the frames for testing; stratifying on the encoded labels
# keeps the class proportions the same in both partitions.
split_parts = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [5]:
def objective(trial):
    """Score one Optuna trial of Random Forest hyperparameters.

    Draws `n_estimators`, `max_depth`, `min_samples_split` and
    `min_samples_leaf` from the trial, builds a classifier with them and
    returns its mean 5-fold cross-validation score on the training split
    (`X_train`, `y_train`).
    """
    # Dict literals evaluate in order, so the parameters are suggested in the
    # same sequence on every trial.
    sampled_params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    }

    candidate_forest = RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
        **sampled_params,
    )

    # Only the training data is used here; the test split is never touched
    # during tuning.
    fold_scores = cross_val_score(
        candidate_forest, X_train, y_train, cv=5, n_jobs=-1
    )
    return fold_scores.mean()


In [7]:
# 4. Hyperparameter Tuning with Optuna
# One constant drives both the log line and the optimisation call, so the
# reported trial count always matches the number of trials actually run.
N_TRIALS = 5
# Optuna's sampler has its own random state; seed it explicitly so the
# sequence of suggested hyperparameters is repeatable across runs.
SAMPLER_SEED = 42

print(f"Starting hyperparameter tuning ({N_TRIALS} trials with 5-fold CV)...")
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)
study.optimize(objective, n_trials=N_TRIALS)

# Report the best-scoring trial and the hyperparameters that produced it.
print("\nBest trial:")
trial = study.best_trial
print(f"  Value (Mean CV Accuracy): {trial.value:.4f}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


[I 2026-01-30 13:24:39,910] A new study created in memory with name: no-name-91a42b4a-3f61-4395-ae12-54c0f6fddc0e


Starting hyperparameter tuning (20 trials with 5-fold CV)...


[I 2026-01-30 13:26:13,673] Trial 0 finished with value: 0.9982300061039334 and parameters: {'n_estimators': 148, 'max_depth': 22, 'min_samples_split': 5, 'min_samples_leaf': 9}. Best is trial 0 with value: 0.9982300061039334.
[I 2026-01-30 13:26:43,595] Trial 1 finished with value: 0.9987292428097392 and parameters: {'n_estimators': 80, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 7}. Best is trial 1 with value: 0.9987292428097392.
[I 2026-01-30 13:27:52,966] Trial 2 finished with value: 0.9926250288660405 and parameters: {'n_estimators': 218, 'max_depth': 9, 'min_samples_split': 14, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.9987292428097392.
[I 2026-01-30 13:28:47,474] Trial 3 finished with value: 0.9859988896181408 and parameters: {'n_estimators': 185, 'max_depth': 8, 'min_samples_split': 20, 'min_samples_leaf': 7}. Best is trial 1 with value: 0.9987292428097392.
[I 2026-01-30 13:30:20,302] Trial 4 finished with value: 0.9987973111796625 and parameters: 


Best trial:
  Value (Mean CV Accuracy): 0.9988
  Params: 
    n_estimators: 260
    max_depth: 25
    min_samples_split: 9
    min_samples_leaf: 6


In [8]:
# 5. Train Final Model with Best Parameters
print("\nTraining final model with best parameters...")

# Rebuild the forest from the winning trial's hyperparameters and fit it on
# the full training split (fit returns the estimator itself).
best_rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    **study.best_params,
).fit(X_train, y_train)

# Final evaluation on the held-out test split
y_pred = best_rf.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred)
print(f"Final Test Accuracy: {final_accuracy * 100:.2f}%")



Training final model with best parameters...
Final Test Accuracy: 99.95%


In [9]:
# 6. Save Model and Label Encoder
# Each entry is (log message, object to persist, destination path); the
# entries are announced and written in order.
artifacts_to_save = (
    (f"Saving Random Forest model to {MODEL_PATH}", best_rf, MODEL_PATH),
    (f"Saving label encoder to {LABEL_ENCODER_PATH}", label_encoder, LABEL_ENCODER_PATH),
)
for announcement, artifact, destination in artifacts_to_save:
    print(announcement)
    joblib.dump(artifact, destination)

print("\n✅ Static Random Forest model and label encoder saved successfully!")


Saving Random Forest model to models\random_forest_model.joblib
Saving label encoder to models\label_encoder_rf.pkl

✅ Static Random Forest model and label encoder saved successfully!
