In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import optuna
from optuna.samplers import TPESampler
import joblib
import os

# Set random seeds
np.random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_data(csv_path='data/samples.csv', letters=['A', 'B', 'C', 'L']):
    """
    Load CSV and organize into sequences grouped by sample_id
    Returns: sequences (list of arrays), labels (list)
    """
    df = pd.read_csv(csv_path)
    df = df[df['label'].isin(letters)]
    
    # Extract feature columns
    feature_cols = [col for col in df.columns 
                    if col not in ['sample_id', 'frame_id', 'label', 'x0', 'y0', 'z0']] # last 3 are coordinates with constant values
    
    # Group by sample_id to create sequences
    sequences = []
    labels = []
    
    for sample_id in df['sample_id'].unique():
        sample_data = df[df['sample_id'] == sample_id]
        sequence = sample_data[feature_cols].values 
        label = sample_data['label'].iloc[0]
        
        sequences.append(sequence)
        labels.append(label)

    return sequences, labels, feature_cols

# Load data
sequences, labels, feature_cols = load_data('data/samples.csv', letters=['A', 'B', 'C', 'L'])

In [3]:
def extract_features(sequence):
    """
    Extract statistical features from a sequence
    """
    features = []
    
    # Start from index 3 to skip x0, y0, z0 (wrist is always 0, 0, 0)
    for coord_idx in range(0, sequence.shape[1]): 
        coord_values = sequence[:, coord_idx]  
        
        # Extract statistics
        features.append(np.mean(coord_values))  # Mean position
        features.append(np.std(coord_values))   # Variability (low for static)
        features.append(np.min(coord_values))   # Minimum
        features.append(np.max(coord_values))   # Maximum
    
    return np.array(features)

In [4]:
def create_feature_dataset(sequences, labels):
    """
    Convert all sequences to feature vectors
    """
   
    X = []
    for seq in sequences:
        features = extract_features(seq)
        X.append(features)
    
    X = np.array(X)
    y = np.array(labels)
    
    return X, y

# Extract features
X, y = create_feature_dataset(sequences, labels)

In [5]:
# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f"Label encoding: {dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))}")

# Split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

Label encoding: {np.str_('A'): 0, np.str_('B'): 1, np.str_('C'): 2, np.str_('L'): 3}


In [6]:
def objective(trial):
    """
    Optuna objective function for LightGBM hyperparameter tuning
    Optimizes: num_leaves, learning_rate, max_depth, regularization
    """
    
    params = {
        # Tree structure
        'num_leaves': trial.suggest_int('num_leaves', 8, 50),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        
        # Learning
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        
        # Regularization
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
        
        # Fixed params
        'objective': 'multiclass',
        'num_class': len(label_encoder.classes_),
        'metric': 'multi_logloss',
        'verbosity': -1,
        'random_state': 42
    }
    
    # Train model
    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train)
    
    # Evaluate on test set
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    return accuracy

In [7]:
# Suppress Optuna output
optuna.logging.set_verbosity(optuna.logging.WARNING)

sampler = TPESampler(seed=42)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=100, show_progress_bar=True)

Best trial: 17. Best value: 1: 100%|██████████| 100/100 [00:13<00:00,  7.64it/s]


In [8]:
best_params = study.best_params
best_accuracy = study.best_value

# Create final model with best params
final_model = lgb.LGBMClassifier(
    **best_params,
    objective='multiclass',
    num_class=len(label_encoder.classes_),
    metric='multi_logloss',
    verbosity=-1,
    random_state=42
)

# Train on full training set
final_model.fit(X_train, y_train)

0,1,2
,boosting_type,'gbdt'
,num_leaves,19
,max_depth,8
,learning_rate,0.014118764460848992
,n_estimators,220
,subsample_for_bin,200000
,objective,'multiclass'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [9]:
# Predict
y_pred = final_model.predict(X_test)
y_pred_proba = final_model.predict_proba(X_test)

# Calculate accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")


Test Accuracy: 1.0000 (100.00%)




In [10]:
os.makedirs('models', exist_ok=True)

# Save LightGBM model (native format - fast loading)
final_model.booster_.save_model('models/lightgbm_model.txt')
print("✓ Model saved to models/lightgbm_model.txt")

# Save label encoder
joblib.dump(label_encoder, 'models/label_encoder_lgbm.pkl')
print("✓ Label encoder saved to models/label_encoder_lgbm.pkl")

# Save feature extraction info (for inference)
feature_info = {
    'num_features': X.shape[1],
    'feature_names': [
        f"{col}_{stat}" 
        for col in feature_cols 
        for stat in ['mean', 'std', 'min', 'max']
    ]
}
joblib.dump(feature_info, 'models/feature_info.pkl')
print("✓ Feature info saved to models/feature_info.pkl")

✓ Model saved to models/lightgbm_model.txt
✓ Label encoder saved to models/label_encoder_lgbm.pkl
✓ Feature info saved to models/feature_info.pkl
