# CNN-RNN Action Recognition: Model Evaluation and Predictions

This notebook evaluates the performance of three different models (Baseline, Optimized, Improved) on the UCF101 action recognition dataset and demonstrates how to make predictions.

## Models Overview
- **Baseline**: Simple LSTM with basic regularization
- **Optimized**: Bidirectional LSTM with batch normalization
- **Improved**: Advanced architecture with attention mechanism and heavy regularization

## Dataset
- **UCF101 Actions**: Basketball, BasketballDunk, ApplyEyeMakeup, ApplyLipstick, Archery
- **Features**: CNN features (2048) + Skeleton features (51) = 2099 total features
- **Sequence Length**: 20 frames per video

## 1. Setup and Imports

In [None]:
import os
import sys

# Make the project root (the notebook's parent directory) importable so that
# `from config import ...` below resolves. `os.path.dirname('..')` evaluates
# to the empty string, which never pointed at the parent directory; resolve
# '..' to an absolute path instead, and skip the append on cell re-runs.
_project_root = os.path.abspath('..')
if _project_root not in sys.path:
    sys.path.append(_project_root)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow import keras

# Custom imports
from config import SELECTED_CLASSES

# Plot appearance used by every figure below: the seaborn-flavoured
# matplotlib theme, the "husl" colour palette and a larger default canvas.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Report the runtime environment so the output records what it ran on.
print("‚úÖ Setup complete!")
print(f"TensorFlow version: {tf.__version__}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"Available GPUs: {len(gpu_devices)}")

## 2. Load Trained Models

In [None]:
def load_model_safely(model_path):
    """Try to load a saved Keras model, reporting the outcome on stdout.

    Args:
        model_path: Path handed unchanged to ``keras.models.load_model``.

    Returns:
        The loaded model, or ``None`` if loading the file or printing the
        model's name / input shape / output shape raised any exception.
    """
    try:
        net = keras.models.load_model(model_path)
        print(f"‚úÖ Loaded {model_path}")
        print(f"   Architecture: {net.name}")
        print(f"   Input shape: {net.input_shape}")
        print(f"   Output shape: {net.output_shape}")
    except Exception as err:
        # Best-effort by design: the notebook keeps going with the models
        # that did load, so the failure is reported rather than raised.
        print(f"‚ùå Error loading {model_path}: {err}")
        net = None
    return net

# Saved checkpoint of each model variant, relative to the notebook directory
model_paths = dict(
    baseline='../best_baseline.h5',
    optimized='../best_optimized.h5',
    improved='../best_improved.h5',
)

# Attempt every checkpoint; a failed load is stored as None so later cells
# can skip that variant instead of crashing.
models = {}
for name, path in model_paths.items():
    models[name] = load_model_safely(path)
    print()

loaded_count = len([m for m in models.values() if m is not None])
print(f"Successfully loaded {loaded_count}/{len(models)} models")

## 3. Load and Preprocess Test Data

In [None]:
def load_and_filter_test_data(data_dir="../data/processed", class_names=None):
    """Load the saved test arrays and keep only samples of the selected classes.

    Reads ``test_features.npy``, ``test_masks.npy`` and ``test_labels.npy``
    from ``data_dir`` and keeps the samples whose label is one of
    ``0 .. len(class_names) - 1``. Shapes and the per-class sample counts are
    printed along the way.

    Args:
        data_dir: Directory holding the three ``.npy`` files. Defaults to the
            location the notebook has always used.
        class_names: Class names indexed by integer label. Defaults to the
            project-wide ``SELECTED_CLASSES``.

    Returns:
        Tuple ``(features, masks, labels)`` restricted to the kept samples.
    """
    if class_names is None:
        class_names = SELECTED_CLASSES

    # Load raw data
    X_test = np.load(os.path.join(data_dir, "test_features.npy"))
    M_test = np.load(os.path.join(data_dir, "test_masks.npy"))
    Y_test = np.load(os.path.join(data_dir, "test_labels.npy"))

    raw_labels = np.unique(Y_test)
    print(f"Raw test data shape: {X_test.shape}")
    if raw_labels.size:
        print(f"Unique labels in raw data: {len(raw_labels)} (0-{raw_labels.max()})")
    else:
        # .max() raises on an empty array; report the empty case explicitly.
        print("Unique labels in raw data: 0")

    # Keep only samples whose label indexes into class_names
    keep = np.isin(Y_test, np.arange(len(class_names)))
    X_test_filtered = X_test[keep]
    M_test_filtered = M_test[keep]
    Y_test_filtered = Y_test[keep]

    print(f"\nFiltered test data shape: {X_test_filtered.shape}")
    print(f"Selected classes: {class_names}")

    # Show class distribution
    kept_labels, kept_counts = np.unique(Y_test_filtered, return_counts=True)
    for class_idx, count in zip(kept_labels, kept_counts):
        print(f"  {class_names[class_idx]}: {count} samples")

    return X_test_filtered, M_test_filtered, Y_test_filtered

# Load test data
X_test, M_test, Y_test = load_and_filter_test_data()

# Draw up to five distinct test samples for the prediction demo further down.
# The size is clamped because np.random.choice(..., replace=False) raises
# ValueError when asked for more samples than the population holds.
# NOTE(review): no random seed is set, so the chosen samples differ per run.
n_samples = min(5, len(X_test))
sample_indices = np.random.choice(len(X_test), n_samples, replace=False)
X_sample = X_test[sample_indices]
M_sample = M_test[sample_indices]
Y_sample_true = Y_test[sample_indices]

print(f"\nSample data shape: {X_sample.shape}")
print(f"Sample true labels: {[SELECTED_CLASSES[i] for i in Y_sample_true]}")

## 4. Evaluate Models on Test Data

In [None]:
def evaluate_model(model, name, X_test, M_test, Y_test):
    """Score one model on the test set and print overall and per-class results.

    Args:
        model: Object exposing ``predict`` and ``evaluate``; both are called
            with the two-element input list ``[X_test, M_test]``.
        name: Label used in the printed output and stored in the result.
        X_test: Test features.
        M_test: Test masks.
        Y_test: Integer class labels, compared against indices into
            ``SELECTED_CLASSES``.

    Returns:
        Dict with keys ``name``, ``loss``, ``accuracy``, ``per_class``
        (class name -> accuracy, only for classes present in ``Y_test``),
        ``predictions`` (argmax class per sample) and ``probabilities``
        (raw ``predict`` output).
    """
    print(f"\nüîç Evaluating {name} model...")

    # Raw model output and the arg-max class per sample
    probabilities = model.predict([X_test, M_test], verbose=0)
    predicted = np.argmax(probabilities, axis=1)

    # NOTE(review): unpacking two values assumes evaluate() returns exactly
    # (loss, accuracy), i.e. a single compiled metric - confirm.
    loss, accuracy = model.evaluate([X_test, M_test], Y_test, verbose=0)
    print(f"  Loss: {loss:.4f}")
    print(f"  Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

    # Accuracy restricted to the samples of each class; classes with no
    # test samples are left out of the result.
    per_class_acc = {}
    for class_idx, class_name in enumerate(SELECTED_CLASSES):
        members = (Y_test == class_idx)
        if np.sum(members) <= 0:
            continue
        hit_rate = np.mean(predicted[members] == class_idx)
        per_class_acc[class_name] = hit_rate
        print(f"  {class_name}: {hit_rate:.4f} ({hit_rate*100:.2f}%)")

    return dict(
        name=name,
        loss=loss,
        accuracy=accuracy,
        per_class=per_class_acc,
        predictions=predicted,
        probabilities=probabilities,
    )

# Score every model that loaded; variants that failed to load are reported
# and left out of `results`.
results = {}
for name, model in models.items():
    if model is None:
        print(f"‚ö†Ô∏è  Skipping {name} model (not loaded)")
        continue
    results[name] = evaluate_model(model, name, X_test, M_test, Y_test)

print("\n‚úÖ Model evaluation complete!")

## 5. Comparative Analysis

In [None]:
# One row per evaluated model: headline metrics followed by one column per
# action class (taken from that model's per-class accuracies).
comparison_data = [
    {
        'Model': name.title(),
        'Overall Accuracy': result['accuracy'],
        'Test Loss': result['loss'],
        **result['per_class'],
    }
    for name, result in results.items()
]

df_comparison = pd.DataFrame(comparison_data).round(4)

if df_comparison.empty:
    # With no evaluated model the frame has no 'Model' column, so the table,
    # plots and summary below would fail with a KeyError.
    print("No evaluated models - nothing to compare.")
else:
    print("üìä Model Comparison Table:")
    display(df_comparison)

    # Plot comparison
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Overall accuracy
    axes[0,0].bar(df_comparison['Model'], df_comparison['Overall Accuracy'])
    axes[0,0].set_title('Overall Test Accuracy')
    axes[0,0].set_ylabel('Accuracy')
    axes[0,0].grid(True, alpha=0.3)

    # Test loss
    axes[0,1].bar(df_comparison['Model'], df_comparison['Test Loss'], color='orange')
    axes[0,1].set_title('Test Loss')
    axes[0,1].set_ylabel('Loss')
    axes[0,1].grid(True, alpha=0.3)

    # Per-class accuracy heatmap: rows are action classes, columns are models
    summary_cols = ['Model', 'Overall Accuracy', 'Test Loss']
    per_class_cols = [col for col in df_comparison.columns if col not in summary_cols]
    per_class_data = df_comparison[per_class_cols].T
    per_class_data.columns = df_comparison['Model']

    sns.heatmap(per_class_data, annot=True, fmt='.3f', cmap='YlOrRd', ax=axes[1,0])
    axes[1,0].set_title('Per-Class Accuracy Heatmap')
    axes[1,0].set_ylabel('Action Class')

    # Best model per class: unit-height bars used as slots for text labels
    best_per_class = per_class_data.idxmax(axis=1)
    axes[1,1].bar(range(len(best_per_class)), [1]*len(best_per_class))
    axes[1,1].set_xticks(range(len(best_per_class)))
    axes[1,1].set_xticklabels(best_per_class.index, rotation=45, ha='right')
    axes[1,1].set_yticks([])
    axes[1,1].set_title('Best Model Per Class')
    # The loop variable is deliberately not called `model`, so the
    # notebook-level `model` object is not overwritten with a string.
    for i, winner in enumerate(best_per_class):
        axes[1,1].text(i, 0.5, winner, ha='center', va='center', fontsize=10,
                       bbox=dict(boxstyle="round,pad=0.3", facecolor='lightblue'))

    plt.tight_layout()
    plt.show()

    # Print summary
    print("\nüèÜ Summary:")
    best_overall = df_comparison.loc[df_comparison['Overall Accuracy'].idxmax()]
    print(f"Best overall model: {best_overall['Model']} ({best_overall['Overall Accuracy']:.1%})")

    for class_name in SELECTED_CLASSES:
        if class_name in df_comparison.columns:
            best_for_class = df_comparison.loc[df_comparison[class_name].idxmax()]
            print(f"Best for {class_name}: {best_for_class['Model']} ({best_for_class[class_name]:.1%})")

## 6. Confusion Matrices

In [None]:
# Confusion matrix and classification report for each evaluated model.
# The label set is fixed to every index of SELECTED_CLASSES so the matrix
# stays len(SELECTED_CLASSES) square and lines up with the tick labels /
# target_names even when a class is missing from both the true and the
# predicted labels.
class_labels = list(range(len(SELECTED_CLASSES)))

if not results:
    # plt.subplots cannot create a grid with zero columns.
    print("No evaluated models - skipping confusion matrices.")
else:
    # squeeze=False always returns a 2-D array of axes, so the single-model
    # case needs no special handling.
    fig, axes = plt.subplots(1, len(results), figsize=(18, 6), squeeze=False)

    for ax, (name, result) in zip(axes[0], results.items()):
        cm = confusion_matrix(Y_test, result['predictions'], labels=class_labels)

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=SELECTED_CLASSES, yticklabels=SELECTED_CLASSES, ax=ax)
        ax.set_title(f'{name.title()} Model Confusion Matrix\nAccuracy: {result["accuracy"]:.1%}')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    plt.tight_layout()
    plt.show()

    # Print classification reports
    for name, result in results.items():
        print(f"\nüìã Classification Report - {name.title()} Model:")
        print(classification_report(Y_test, result['predictions'],
                                    labels=class_labels,
                                    target_names=SELECTED_CLASSES, digits=3))

## 7. Make Predictions on Sample Data

In [None]:
def predict_sample(model, name, X_sample, M_sample, Y_sample_true):
    """Run one model on a few samples and print true vs. predicted class.

    Args:
        model: Object exposing ``predict``; it is called with the
            two-element input list ``[X_sample, M_sample]``.
        name: Model label used in the printed header.
        X_sample: Sample features.
        M_sample: Sample masks.
        Y_sample_true: True integer labels, used to index ``SELECTED_CLASSES``.

    Returns:
        List with one dict per sample holding ``sample`` (1-based position),
        ``true``, ``predicted`` (class names), ``confidence`` (model output
        at the predicted class) and ``correct``.
    """
    print(f"\nüéØ {name.title()} Model Predictions:")

    # Raw model output and the arg-max class per sample
    probabilities = model.predict([X_sample, M_sample], verbose=0)
    predicted = np.argmax(probabilities, axis=1)

    records = []
    triples = zip(Y_sample_true, predicted, probabilities)
    for sample_no, (true_label, pred_label, probs) in enumerate(triples, start=1):
        true_class = SELECTED_CLASSES[true_label]
        pred_class = SELECTED_CLASSES[pred_label]
        confidence = probs[pred_label]
        is_correct = true_label == pred_label
        if is_correct:
            correct = "‚úÖ"
        else:
            correct = "‚ùå"

        print(f"  Sample {sample_no}: {true_class} ‚Üí {pred_class} ({confidence:.1%}) {correct}")

        records.append(dict(
            sample=sample_no,
            true=true_class,
            predicted=pred_class,
            confidence=confidence,
            correct=is_correct,
        ))

    return records

# Run the sample predictions through every model that loaded
all_predictions = {
    name: predict_sample(model, name, X_sample, M_sample, Y_sample_true)
    for name, model in models.items()
    if model is not None
}

# Per-model tally: number of hits, hit rate and mean confidence
print("\nüìä Prediction Summary:")
summary_data = []
for name, preds in all_predictions.items():
    total = len(preds)
    correct = len([p for p in preds if p['correct']])
    accuracy = correct / total
    avg_confidence = np.mean([p['confidence'] for p in preds])
    print(f"  {name.title()}: {correct}/{total} correct ({accuracy:.1%}), Avg confidence: {avg_confidence:.1%}")
    summary_data.append(dict([
        ('Model', name.title()),
        ('Correct', correct),
        ('Total', total),
        ('Accuracy', accuracy),
        ('Avg Confidence', avg_confidence),
    ]))

# Side-by-side bar charts of sample accuracy and mean confidence
if summary_data:
    df_preds = pd.DataFrame(summary_data)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

    # (axis, dataframe column, title, y-label, extra bar styling)
    panels = (
        (ax1, 'Accuracy', 'Prediction Accuracy on Samples', 'Accuracy', {}),
        (ax2, 'Avg Confidence', 'Average Prediction Confidence', 'Confidence', {'color': 'green'}),
    )
    for axis, column, title, ylabel, bar_style in panels:
        axis.bar(df_preds['Model'], df_preds[column], **bar_style)
        axis.set_title(title)
        axis.set_ylabel(ylabel)
        axis.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 8. Model Architecture Analysis

In [None]:
# Analyze model architectures: parameter count and layer-type histogram for
# every loaded model, plus a full summary for one of them.
summary_shown = False
for name, model in models.items():
    if model is None:
        continue

    print(f"\nüèóÔ∏è {name.title()} Model Architecture:")
    print(f"Total parameters: {model.count_params():,}")

    # Count layers by type
    layer_counts = {}
    for layer in model.layers:
        layer_type = layer.__class__.__name__
        layer_counts[layer_type] = layer_counts.get(layer_type, 0) + 1

    print("Layer composition:")
    for layer_type, count in layer_counts.items():
        print(f"  {layer_type}: {count}")

    # Show the full summary once only (to avoid clutter). It is tied to the
    # first model that actually loaded rather than the first dictionary key,
    # so a failed load of the first variant does not suppress it entirely.
    if not summary_shown:
        print("\nModel Summary:")
        model.summary()
        summary_shown = True

## 9. Conclusions and Recommendations

In [None]:
print("üéØ EXPERIMENT CONCLUSIONS\n")

# Headline result: the evaluated model with the highest test accuracy
if results:
    best_model = max(results.items(), key=lambda item: item[1]['accuracy'])
    best_name, best_metrics = best_model
    print(f"üèÜ Best Overall Model: {best_name.title()}")
    print(f"   Accuracy: {best_metrics['accuracy']:.1%}")
    print(f"   Loss: {best_metrics['loss']:.4f}\n")

# Fixed description of the three model variants
for line in (
    "üìä MODEL CHARACTERISTICS:",
    "‚Ä¢ Baseline: Simple LSTM with basic regularization",
    "‚Ä¢ Optimized: Bidirectional LSTM with batch normalization",
    "‚Ä¢ Improved: Advanced architecture with attention and heavy regularization\n",
):
    print(line)

# Accuracy range across models; only meaningful with at least two of them
print("üìà PERFORMANCE ANALYSIS:")
if len(results) > 1:
    accuracies = [r['accuracy'] for r in results.values()]
    best_acc, worst_acc = max(accuracies), min(accuracies)
    spread = best_acc - worst_acc
    print(f"‚Ä¢ Best accuracy: {best_acc:.1%}")
    print(f"‚Ä¢ Worst accuracy: {worst_acc:.1%}")
    print(f"‚Ä¢ Performance spread: {spread:.1%}\n")

# Static recommendations followed by the closing banner
for line in (
    "üí° RECOMMENDATIONS:",
    "1. For production use: Choose the model with highest overall accuracy",
    "2. For specific actions: Select model that performs best on target action",
    "3. For robustness: Consider ensemble of all three models",
    "4. For improvement: Try data augmentation or more training data",
    "5. For real-time: Optimize Baseline model for speed\n",
    "‚úÖ Experiment completed successfully!",
    "üìù Notebook created for comprehensive model evaluation and prediction testing.",
):
    print(line)