# Notebook 03: Random Forest Baseline

## Objectives
1. Load the aggregated features (train, validation, and test splits)
2. Train a Random Forest classifier
3. Evaluate performance on the test set
4. Analyze feature importance

In [None]:
import sys
sys.path.append('..')

from src.utils.helpers import load_config, set_random_seeds, save_results
from src.models.random_forest import RandomForestSpeakerClassifier
from src.training.trainer import ModelTrainer
from src.evaluation.metrics import evaluate_model
from src.evaluation.visualization import plot_confusion_matrix, plot_per_speaker_accuracy, plot_feature_importance
from src.data.dataset import FeatureDataset
import matplotlib.pyplot as plt

# Load the project configuration; the path is relative to the working directory.
config = load_config('../config/config.yaml')
# Seed using the value stored under config['seeds']['sklearn'].
# NOTE(review): which RNGs set_random_seeds actually seeds is defined in
# src.utils.helpers — confirm before relying on full reproducibility.
set_random_seeds(config['seeds']['sklearn'])

In [None]:
# Load features
# Read the three pickled splits first, then unpack each dataset into its
# X_* / y_* pair via get_data().
train_dataset = FeatureDataset.load_pickle('../data/processed/train_aggregated.pkl')
val_dataset = FeatureDataset.load_pickle('../data/processed/val_aggregated.pkl')
test_dataset = FeatureDataset.load_pickle('../data/processed/test_aggregated.pkl')

X_train, y_train = train_dataset.get_data()
X_val, y_val = val_dataset.get_data()
X_test, y_test = test_dataset.get_data()

# The speaker count is the number of distinct labels in the training split.
num_speakers = len(set(y_train))

print(f'Training: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}')
print(f'Number of speakers: {num_speakers}')

In [None]:
# Create and train model
# Construct the classifier from the speaker count and the loaded config,
# then build it before handing it to the trainer.
model = RandomForestSpeakerClassifier(num_speakers, config)
model.build()

# Train on the training split, passing the validation split alongside.
# NOTE(review): save_path is forwarded to ModelTrainer.train — confirm when
# (and whether) the trainer writes the pickle there.
trainer = ModelTrainer(model, config)
history = trainer.train(
    X_train,
    y_train,
    X_val,
    y_val,
    save_path='../models/random_forest_best.pkl',
)

In [None]:
# Evaluate on test set
# evaluate_model returns a mapping; this notebook reads its 'metrics',
# 'confusion_matrix_normalized' and 'per_speaker_accuracy' entries.
# NOTE(review): this scores the in-memory `model`, not a reload of
# '../models/random_forest_best.pkl' — confirm the two are equivalent.
test_results = evaluate_model(model, X_test, y_test)

print('\nTest Results:')
# Every metric value is formatted with ':.4f', so the values are assumed
# to be numeric — a non-numeric entry would raise here.
for metric, value in test_results['metrics'].items():
    print(f'  {metric}: {value:.4f}')

In [None]:
# Plot confusion matrix
# NOTE(review): the matrix passed in is already the normalized one and
# normalize=True is also set — confirm plot_confusion_matrix does not
# normalize a second time.
normalized_cm = test_results['confusion_matrix_normalized']
fig = plot_confusion_matrix(
    normalized_cm,
    normalize=True,
    title='Random Forest - Confusion Matrix',
    save_path='../results/random_forest/confusion_matrix.png',
)
plt.show()

In [None]:
# Plot per-speaker accuracy
# Pull the per-speaker accuracy entry out of the evaluation results, then
# hand it to the plotting helper together with the title and output path.
speaker_accuracy = test_results['per_speaker_accuracy']
fig = plot_per_speaker_accuracy(
    speaker_accuracy,
    title='Random Forest - Per-Speaker Accuracy',
    save_path='../results/random_forest/per_speaker_accuracy.png',
)
plt.show()

In [None]:
# Plot feature importance
# Ask the trained model for its feature importances, then plot them with
# top_n=20 to match the "Top 20" in the figure title.
feature_importance = model.get_feature_importance()
fig = plot_feature_importance(
    feature_importance,
    top_n=20,
    title='Random Forest - Top 20 Features',
    save_path='../results/random_forest/feature_importance.png',
)
plt.show()

In [None]:
# Save results
# Only the 'metrics' entry of test_results is passed to save_results; the
# confusion matrix and per-speaker accuracy are not saved by this call.
# NOTE(review): confirm save_results creates '../results/random_forest/'
# when it does not exist yet.
save_results(test_results['metrics'], '../results/random_forest/test_metrics.json')
print('\nResults saved!')