In [1]:
import os
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from sklearn.model_selection import KFold
# Custom Dataset class for hand gestures
class HandGestureDataset(Dataset):
    """In-memory dataset pairing each gesture sequence with its class label.

    Args:
        sequences: Indexable, sized collection of sequences (for example a
            tensor whose first dimension enumerates the sequences).
        labels: Indexable, sized collection of labels aligned with
            ``sequences``.

    Raises:
        ValueError: If ``sequences`` and ``labels`` differ in length.
    """

    def __init__(self, sequences, labels):
        # Fail fast: a length mismatch would otherwise only show up as an
        # IndexError (or silently unused labels) in the middle of an epoch.
        if len(sequences) != len(labels):
            raise ValueError(
                "sequences and labels must have the same length, "
                f"got {len(sequences)} and {len(labels)}"
            )
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        """Return the number of sequences held by the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``(sequence, label)`` pair stored at ``idx``."""
        return self.sequences[idx], self.labels[idx]


# Function to load and preprocess the dataset
def load_dataset(data_path, target_frames=8, image_size=(64, 64)):
    """
    Load and preprocess the hand gesture dataset.

    The expected layout is ``data_path/<gesture>/<sequence_folder>/<frame>``
    with one folder per gesture ('down', 'left', 'right', 'up'). Only files
    ending in '.jpg' or '.png' are treated as frames; sequence folders without
    any such file are skipped.

    Args:
        data_path: Path to the dataset directory
        target_frames: Number of frames every sequence is padded or truncated to
        image_size: (height, width) each frame is resized to

    Returns:
        X: Sequences of images with shape (num_sequences, channels, frames, height, width)
        y: Labels corresponding to gestures (0: down, 1: left, 2: right, 3: up)
        subject_ids: IDs of subjects, parsed from the sequence folder names

    Raises:
        ValueError: If ``target_frames`` is smaller than 1, or a sequence folder
            name does not start with an integer followed by '_'.
        FileNotFoundError: If ``data_path`` or one of the gesture folders is missing.
        RuntimeError: If no sequence with at least one frame was found.
    """
    if target_frames < 1:
        raise ValueError(f"target_frames must be at least 1, got {target_frames}")

    # Map gesture names to numerical labels
    gesture_map = {'down': 0, 'left': 1, 'right': 2, 'up': 3}

    # Image transformation pipeline
    transform = transforms.Compose([
        transforms.Resize(tuple(image_size)),
        transforms.ToTensor(),  # Converts to [0,1] range and changes to CxHxW format
        transforms.Normalize(mean=[0.5], std=[0.5])  # Normalize to [-1, 1] range
    ])

    sequences = []
    labels = []
    subject_ids = []

    for gesture, label in gesture_map.items():
        gesture_path = os.path.join(data_path, gesture)

        # os.listdir returns entries in arbitrary order; sorting keeps the sample
        # order (and therefore any seeded split built on it) reproducible.
        for seq_folder in sorted(os.listdir(gesture_path)):
            seq_path = os.path.join(gesture_path, seq_folder)
            if not os.path.isdir(seq_path):
                continue

            # Extract subject ID from sequence folder name (assuming format "<id>_...")
            # Modify this according to your actual folder naming convention
            subject_id = int(seq_folder.split('_')[0])

            frame_files = sorted(
                f for f in os.listdir(seq_path) if f.endswith(('.jpg', '.png'))
            )
            if not frame_files:
                continue

            # Only the first target_frames frames are kept, so there is no point
            # in decoding the rest.
            frames = []
            for frame_file in frame_files[:target_frames]:
                # The context manager closes the file handle as soon as the
                # frame has been converted to a tensor.
                with Image.open(os.path.join(seq_path, frame_file)) as img:
                    frames.append(transform(img.convert('L')))  # grayscale

            # Pad short sequences. NOTE: after Normalize a zero tensor is
            # mid-gray, not black.
            missing = target_frames - len(frames)
            frames.extend(torch.zeros_like(frames[0]) for _ in range(missing))

            # [frames, channels, H, W] -> [channels, frames, H, W]
            sequence_tensor = torch.stack(frames).permute(1, 0, 2, 3)

            sequences.append(sequence_tensor)
            labels.append(label)
            subject_ids.append(subject_id)

    if not sequences:
        # torch.stack would fail on an empty list with a far less helpful message.
        raise RuntimeError(
            f"No gesture sequences with .jpg/.png frames found under {data_path!r}"
        )

    # Convert lists to tensors
    X = torch.stack(sequences)
    y = torch.tensor(labels, dtype=torch.long)
    print(y.size())
    subject_ids = np.array(subject_ids)

    return X, y, subject_ids


# Basic 3D convolution block
class Conv3DBlock(nn.Module):
    """Conv3d -> BatchNorm3d -> ReLU building block.

    The convolution is created without a bias term; the ReLU runs in place.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super().__init__()
        self.conv = nn.Conv3d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=False,
        )
        self.bn = nn.BatchNorm3d(out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply convolution, batch normalisation and ReLU to ``x`` in that order."""
        convolved = self.conv(x)
        normalised = self.bn(convolved)
        return self.relu(normalised)


# Dense Block for 3D DenseNet
class DenseBlock3D(nn.Module):
    """Dense block: every layer sees the channel-wise concatenation of the
    block input and the outputs of all earlier layers.

    Args:
        in_channels: Channels of the tensor entering the block.
        growth_rate: Channels each layer contributes.
        num_layers: Number of layers in the block.
    """

    def __init__(self, in_channels, growth_rate, num_layers):
        super().__init__()
        # Layer k receives the block input plus k earlier outputs.
        dense_layers = [
            self._make_dense_layer(in_channels + k * growth_rate, growth_rate)
            for k in range(num_layers)
        ]
        self.layers = nn.ModuleList(dense_layers)

    def _make_dense_layer(self, in_channels, growth_rate):
        """Build one BN-ReLU-Conv(1x1x1)-BN-ReLU-Conv(3x3x3) layer."""
        bottleneck_channels = 4 * growth_rate
        stages = [
            nn.BatchNorm3d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv3d(in_channels, bottleneck_channels, kernel_size=1, bias=False),
            nn.BatchNorm3d(bottleneck_channels),
            nn.ReLU(inplace=True),
            nn.Conv3d(bottleneck_channels, growth_rate, kernel_size=3, padding=1, bias=False),
        ]
        return nn.Sequential(*stages)

    def forward(self, x):
        """Return the input concatenated (dim 1) with every layer's output."""
        collected = [x]
        for dense_layer in self.layers:
            merged = torch.cat(collected, dim=1)
            collected.append(dense_layer(merged))
        return torch.cat(collected, dim=1)


# Transition Layer for 3D DenseNet
class TransitionLayer3D(nn.Module):
    """Transition between dense blocks: BN, ReLU, a 1x1x1 convolution that
    maps ``in_channels`` to ``out_channels``, then 2x average pooling."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        stages = [
            nn.BatchNorm3d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv3d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.AvgPool3d(kernel_size=2, stride=2),
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the transition stages."""
        reduced = self.layers(x)
        return reduced


# 3D DenseNet Model
class DenseNet3D(nn.Module):
    """3D DenseNet classifier for single-channel frame sequences.

    Args:
        growth_rate: Channels added by each dense layer.
        block_config: Number of layers in each dense block.
        num__init__features: Channels produced by the stem convolution.
        compression_rate: Fraction of channels kept by each transition layer.
        num_classes: Size of the classifier output.
    """

    def __init__(self, growth_rate=12, block_config=(2, 4, 6), num__init__features=16,
                 compression_rate=0.5, num_classes=4):
        super().__init__()

        # Stem: convolution, batch norm, ReLU and 2x max pooling.
        stem = [
            nn.Conv3d(1, num__init__features, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm3d(num__init__features),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=2, stride=2),
        ]
        self.features = nn.Sequential(*stem)

        # Dense blocks, each followed by a transition layer except the last.
        channels = num__init__features
        last_position = len(block_config) - 1
        for position, depth in enumerate(block_config):
            stage = position + 1
            dense = DenseBlock3D(in_channels=channels, growth_rate=growth_rate, num_layers=depth)
            self.features.add_module(f'denseblock{stage}', dense)
            channels += depth * growth_rate

            if position == last_position:
                continue
            compressed = int(channels * compression_rate)
            self.features.add_module(f'transition{stage}', TransitionLayer3D(channels, compressed))
            channels = compressed

        # Closing batch norm + ReLU
        self.features.add_module('norm_final', nn.BatchNorm3d(channels))
        self.features.add_module('relu_final', nn.ReLU(inplace=True))

        # Global average pooling and linear classifier
        self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
        self.classifier = nn.Linear(channels, num_classes)

        # Weight initialisation: Kaiming-normal convolutions, unit-scale /
        # zero-shift batch norms, zero classifier bias.
        for module in self.modules():
            if isinstance(module, nn.Conv3d):
                nn.init.kaiming_normal_(module.weight)
                continue
            if isinstance(module, nn.BatchNorm3d):
                nn.init.constant_(module.weight, 1)
            if isinstance(module, (nn.BatchNorm3d, nn.Linear)):
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return class logits, one row per batch element."""
        pooled = self.avgpool(self.features(x))
        flat = pooled.view(pooled.size(0), -1)
        return self.classifier(flat)


# Function to train the model for one epoch
def train_epoch(model, train_loader, criterion, optimizer, device, scheduler=None):
    """
    Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Network to optimise (switched to training mode here)
        train_loader: Iterable yielding ``(inputs, targets)`` batches
        criterion: Loss function called as ``criterion(outputs, targets)``
        optimizer: Optimizer updating the model parameters
        device: Device the batches are moved to
        scheduler: Optional per-batch learning-rate scheduler (for example
            OneCycleLR); stepped once after every optimizer step

    Returns:
        train_loss: Sample-weighted mean loss over the epoch
        train_acc: Accuracy in percent
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            # Must come after optimizer.step(); a scheduler that is never
            # stepped leaves the learning rate frozen at its initial value.
            scheduler.step()

        # Statistics (loss is a batch mean, so weight it by the batch size)
        running_loss += loss.item() * inputs.size(0)
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    train_loss = running_loss / total
    train_acc = 100. * correct / total

    return train_loss, train_acc


# Function to evaluate the model
def evaluate(model, test_loader, criterion, device):
    """
    Evaluate ``model`` on ``test_loader`` without tracking gradients.

    Args:
        model: Network to evaluate (switched to eval mode here)
        test_loader: Iterable yielding ``(inputs, targets)`` batches
        criterion: Loss function called as ``criterion(outputs, targets)``
        device: Device the batches are moved to

    Returns:
        test_loss: Sample-weighted mean loss
        test_acc: Accuracy in percent
        all_preds: Predicted class index for every sample, in loader order
        all_targets: True class index for every sample, in loader order
    """
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0
    all_preds = []
    all_targets = []

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Statistics
            running_loss += loss.item() * inputs.size(0)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

            # Save predictions and targets for confusion matrix
            all_preds.extend(predicted.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    test_loss = running_loss / total
    test_acc = 100. * correct / total

    return test_loss, test_acc, all_preds, all_targets


def _clone_state_dict(model):
    """Return a snapshot of ``model``'s state dict that later training cannot mutate."""
    # state_dict() hands out references to the live parameter tensors, so a
    # plain dict copy would keep tracking the weights as they are updated.
    return {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}


def train_and_evaluate(X, y, subject_ids, model_type='densenet'):
    """
    Train and evaluate the 3D DenseNet with shuffled 5-fold cross-validation.

    NOTE(review): the folds are built with a plain KFold over the samples;
    ``subject_ids`` is accepted but not used, so sequences of one subject can
    land in both the training and the test part of a fold. ``model_type`` is
    likewise unused. The best epoch of each fold is selected on that fold's
    test data, so the reported accuracies are optimistic.

    Args:
        X: Sequences of images tensor
        y: Labels tensor
        subject_ids: Subject IDs (currently unused)
        model_type: Type of model to use (currently unused)

    Returns:
        results: Dictionary with the per-fold accuracies (key
            'subject_accuracies', values in [0, 1]), their mean
            ('overall_accuracy'), the 4x4 'confusion_matrix' and the pooled
            'y_true' / 'y_pred' lists
        best_model_state: State dict of the best performing model across folds
    """
    # Check for CUDA availability
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Initialize KFold cross-validator
    n_splits = 5  # 5-fold cross-validation
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize lists to store results
    subject_accuracies = []
    all_y_true = []
    all_y_pred = []

    # Cross-validation parameters
    num_epochs = 10
    batch_size = 16

    # Track the best model across all folds. Starting below zero guarantees a
    # state is recorded even if every fold scores 0% accuracy.
    best_overall_acc = -1.0
    final_best_model_state = None

    # KFold only needs the number of samples, so split an index array instead
    # of converting the whole tensor.
    sample_indices = np.arange(len(X))

    # Iterate through folds
    for fold, (train_idx, test_idx) in enumerate(kf.split(sample_indices), 1):
        print(f"\n--- Fold {fold}/{n_splits} ---")

        # Create PyTorch datasets and dataloaders
        train_dataset = HandGestureDataset(X[train_idx], y[train_idx])
        test_dataset = HandGestureDataset(X[test_idx], y[test_idx])

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        # Create the model
        model = DenseNet3D(growth_rate=12, block_config=(2, 4, 4), num__init__features=16).to(device)

        # Define loss function and optimizer. The lr given here is replaced by
        # the OneCycleLR schedule below as soon as the scheduler is built.
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)

        # Learning rate scheduler, stepped once per batch inside train_epoch
        scheduler = optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=0.01,
            steps_per_epoch=len(train_loader),
            epochs=num_epochs
        )

        # Variables for early stopping. With patience larger than num_epochs
        # the early-stopping branch cannot trigger at the current settings.
        best_test_acc = -1.0
        best_model_state = None
        patience = 15
        patience_counter = 0

        # Training loop
        for epoch in range(num_epochs):
            # Train for one epoch
            train_loss, train_acc = train_epoch(
                model, train_loader, criterion, optimizer, device, scheduler=scheduler
            )

            # Evaluate on test set
            test_loss, test_acc, _, _ = evaluate(model, test_loader, criterion, device)

            # Print progress
            print(f'Epoch {epoch+1}/{num_epochs}: '
                  f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, '
                  f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')

            # Check for improvement
            if test_acc > best_test_acc:
                best_test_acc = test_acc
                best_model_state = _clone_state_dict(model)
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"Early stopping at epoch {epoch+1}")
                    break

        # Load best model for final evaluation
        model.load_state_dict(best_model_state)

        # Final evaluation
        _, final_test_acc, y_pred, y_true = evaluate(model, test_loader, criterion, device)

        # Store results
        subject_accuracies.append(final_test_acc / 100.0)
        all_y_true.extend(y_true)
        all_y_pred.extend(y_pred)

        print(f"Fold {fold} Final Test Accuracy: {final_test_acc:.2f}%")

        # Keep track of the best model across all folds. The snapshot holds
        # cloned tensors, so it can be shared without another copy.
        if final_test_acc > best_overall_acc:
            best_overall_acc = final_test_acc
            final_best_model_state = best_model_state

    # Calculate overall metrics. Fixing the label set keeps the matrix 4x4
    # even when a class never shows up in the pooled predictions or targets.
    overall_accuracy = np.mean(subject_accuracies)
    conf_matrix = confusion_matrix(all_y_true, all_y_pred, labels=[0, 1, 2, 3])

    # Store results
    results = {
        'subject_accuracies': subject_accuracies,
        'overall_accuracy': overall_accuracy,
        'confusion_matrix': conf_matrix,
        'y_true': all_y_true,
        'y_pred': all_y_pred
    }

    return results, final_best_model_state
# Function to visualize results
def visualize_results(results, model_name='densenet'):
    """
    Visualize the cross-validation results.

    Writes two PNG files into the working directory:
    ``subject_accuracies_<model_name>.png`` (bar chart of the per-fold
    accuracies with their mean) and ``confusion_matrix_<model_name>.png``
    (row-normalised confusion matrix).

    Args:
        results: Dictionary containing evaluation results; the keys
            'subject_accuracies', 'overall_accuracy' and 'confusion_matrix'
            are read
        model_name: Name of the model, used in titles and file names
    """
    fold_accuracies = results['subject_accuracies']
    mean_accuracy = results['overall_accuracy']

    # Plot per-fold accuracies. The values come from k-fold cross-validation,
    # one bar per fold, so the axis and title say "fold" rather than "subject".
    fold_numbers = list(range(1, len(fold_accuracies) + 1))
    plt.figure(figsize=(10, 6))
    plt.bar(fold_numbers, fold_accuracies)
    plt.xticks(fold_numbers)
    plt.axhline(y=mean_accuracy, color='r', linestyle='-',
                label=f"Overall: {mean_accuracy:.4f}")
    plt.xlabel('Fold')
    plt.ylabel('Accuracy')
    plt.title(f'K-Fold Cross-Validation Results ({model_name})')
    plt.legend()
    plt.ylim(0, 1)
    plt.savefig(f'subject_accuracies_{model_name}.png')
    plt.close()

    # Plot confusion matrix
    plt.figure(figsize=(8, 6))
    gesture_names = ['Down', 'Left', 'Right', 'Up']
    conf_matrix = np.asarray(results['confusion_matrix'])

    # Row-normalise; a class without any true sample keeps an all-zero row
    # instead of turning into NaNs through a division by zero.
    row_totals = conf_matrix.sum(axis=1, keepdims=True)
    conf_matrix_norm = np.divide(
        conf_matrix,
        row_totals,
        out=np.zeros(conf_matrix.shape, dtype=float),
        where=row_totals != 0,
    )

    # Only attach the gesture names when the matrix really has one row per
    # gesture; otherwise the names would be attached to the wrong classes.
    tick_labels = gesture_names if conf_matrix.shape[0] == len(gesture_names) else 'auto'

    sns.heatmap(conf_matrix_norm, annot=True, fmt='.2f', cmap='Blues',
                xticklabels=tick_labels, yticklabels=tick_labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(f'Confusion Matrix ({model_name})')
    plt.savefig(f'confusion_matrix_{model_name}.png')
    plt.close()


# Main execution: load the dataset, cross-validate the 3D DenseNet, plot the
# results and save the best weights. Runs as soon as the cell executes.

# Set random seed for reproducibility (torch CPU, NumPy and, if present, all CUDA devices)
torch.manual_seed(42)
np.random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# Path to your dataset. It is relative to the current working directory;
# load_dataset lists <data_path>/<gesture>, so a wrong path raises FileNotFoundError.
data_path = r"../Dataset/STMM/image"

# Load and preprocess the dataset
X, y, subject_ids = load_dataset(data_path)

# Print dataset info
print(f"Dataset shape: {X.shape}")
print(f"Number of sequences: {len(X)}")
print(f"Number of subjects: {len(np.unique(subject_ids))}")

# Run with DenseNet model
print("\n--- Training with 3D DenseNet ---")
densenet_results, best_model_state = train_and_evaluate(X, y, subject_ids, model_type='densenet')

# Print overall results ('overall_accuracy' is a fraction in [0, 1], not a percentage)
print("\n--- Overall Results ---")
print(f"DenseNet Overall Accuracy: {densenet_results['overall_accuracy']:.4f}")

# Visualize results (writes PNG files into the working directory)
print("\nGenerating visualization plots...")
visualize_results(densenet_results, 'densenet')

# Save the model. The architecture arguments must match the ones used inside
# train_and_evaluate, otherwise load_state_dict rejects the state dict.
print("\nSaving model...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DenseNet3D(growth_rate=12, block_config=(2, 4, 4), num__init__features=16).to(device)
model.load_state_dict(best_model_state)

# Save the model state dictionary
torch.save(model.state_dict(), 'hand_gesture_model.pth')
print("Model saved successfully to 'hand_gesture_model.pth'")

print("\nDone! Results saved as PNG files.")

  warn(


FileNotFoundError: [WinError 3] The system cannot find the path specified: '../Dataset/STMM/image\\down'

In [4]:
#prediction on sample sequences (unseen data)
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

# Define the same model architecture as used during training
class Conv3DBlock(nn.Module):
    """3D convolution (no bias) followed by batch normalisation and an in-place ReLU."""

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super().__init__()
        conv_options = dict(kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
        self.conv = nn.Conv3d(in_channels, out_channels, **conv_options)
        self.bn = nn.BatchNorm3d(out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Pass ``x`` through conv, batch norm and ReLU, in that order."""
        activated = x
        for stage in (self.conv, self.bn, self.relu):
            activated = stage(activated)
        return activated


class DenseBlock3D(nn.Module):
    """Densely connected block: each layer consumes the concatenation of the
    block input with all previously produced feature maps."""

    def __init__(self, in_channels, growth_rate, num_layers):
        super().__init__()
        self.layers = nn.ModuleList()
        layer_inputs = in_channels
        for _ in range(num_layers):
            self.layers.append(self._make_dense_layer(layer_inputs, growth_rate))
            # The next layer additionally sees this layer's output.
            layer_inputs += growth_rate

    def _make_dense_layer(self, in_channels, growth_rate):
        """Build a BN-ReLU-Conv(1) bottleneck followed by BN-ReLU-Conv(3)."""
        width = 4 * growth_rate
        bottleneck = [
            nn.BatchNorm3d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv3d(in_channels, width, kernel_size=1, bias=False),
        ]
        spatial = [
            nn.BatchNorm3d(width),
            nn.ReLU(inplace=True),
            nn.Conv3d(width, growth_rate, kernel_size=3, padding=1, bias=False),
        ]
        return nn.Sequential(*bottleneck, *spatial)

    def forward(self, x):
        """Return ``x`` concatenated along dim 1 with every layer's output."""
        maps = [x]
        for dense_layer in self.layers:
            maps.append(dense_layer(torch.cat(maps, dim=1)))
        return torch.cat(maps, dim=1)


class TransitionLayer3D(nn.Module):
    """BN -> ReLU -> 1x1x1 convolution -> 2x average pooling, used between
    dense blocks to change the channel count and halve every pooled dimension."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        normalise = nn.BatchNorm3d(in_channels)
        activate = nn.ReLU(inplace=True)
        project = nn.Conv3d(in_channels, out_channels, kernel_size=1, bias=False)
        downsample = nn.AvgPool3d(kernel_size=2, stride=2)
        self.layers = nn.Sequential(normalise, activate, project, downsample)

    def forward(self, x):
        """Apply the transition stages to ``x``."""
        return self.layers(x)


class DenseNet3D(nn.Module):
    """3D DenseNet classifier operating on single-channel frame sequences.

    Args:
        growth_rate: Channels added by each dense layer.
        block_config: Number of layers in each dense block.
        num__init__features: Channels produced by the stem convolution.
        compression_rate: Fraction of channels kept by each transition layer.
        num_classes: Size of the classifier output.
    """

    def __init__(self, growth_rate=12, block_config=(2, 4, 4), num__init__features=16,
                 compression_rate=0.5, num_classes=4):
        super().__init__()

        # Stem: convolution, batch norm, ReLU, 2x max pooling
        self.features = nn.Sequential(
            nn.Conv3d(1, num__init__features, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm3d(num__init__features),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=2, stride=2),
        )

        # Dense blocks with a transition layer after all but the final block
        width = num__init__features
        block_total = len(block_config)
        for number, depth in enumerate(block_config, start=1):
            self.features.add_module(
                f'denseblock{number}',
                DenseBlock3D(in_channels=width, growth_rate=growth_rate, num_layers=depth),
            )
            width += depth * growth_rate

            if number < block_total:
                narrowed = int(width * compression_rate)
                self.features.add_module(f'transition{number}', TransitionLayer3D(width, narrowed))
                width = narrowed

        # Closing batch norm + ReLU
        self.features.add_module('norm_final', nn.BatchNorm3d(width))
        self.features.add_module('relu_final', nn.ReLU(inplace=True))

        # Global average pooling and classifier
        self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
        self.classifier = nn.Linear(width, num_classes)

        # Initialise weights: Kaiming-normal convolutions, unit-scale and
        # zero-shift batch norms, zero classifier bias.
        for module in self.modules():
            if isinstance(module, nn.Conv3d):
                nn.init.kaiming_normal_(module.weight)
                continue
            if isinstance(module, nn.BatchNorm3d):
                nn.init.constant_(module.weight, 1)
            if isinstance(module, (nn.BatchNorm3d, nn.Linear)):
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        """Return class logits, one row per batch element."""
        pooled = self.avgpool(self.features(x))
        flattened = pooled.view(pooled.size(0), -1)
        return self.classifier(flattened)


# Function to preprocess and load a single sequence
def preprocess_sequence(sequence_folder_path, target_frames=8, image_size=(64, 64)):
    """
    Preprocesses a single sequence of images from a folder.

    Frames are the '.jpg' / '.png' files of the folder in sorted name order.
    Each one is converted to grayscale, resized and normalised to [-1, 1];
    the sequence is then padded with zero frames or truncated to
    ``target_frames``.

    Args:
        sequence_folder_path: Path to folder containing sequence frames
        target_frames: Number of frames the sequence is padded or truncated to
        image_size: (height, width) each frame is resized to

    Returns:
        Tensor with shape [1, C, F, H, W] (batch, channels, frames, height, width)

    Raises:
        ValueError: If ``target_frames`` is smaller than 1 or the folder holds
            no '.jpg' / '.png' file.
    """
    if target_frames < 1:
        raise ValueError(f"target_frames must be at least 1, got {target_frames}")

    # Same transforms as used during training
    transform = transforms.Compose([
        transforms.Resize(tuple(image_size)),
        transforms.ToTensor(),  # Converts to [0,1] range and changes to CxHxW format
        transforms.Normalize(mean=[0.5], std=[0.5])  # Normalize to [-1, 1] range
    ])

    # Load frames and sort them
    frame_files = sorted(
        f for f in os.listdir(sequence_folder_path) if f.endswith(('.jpg', '.png'))
    )
    if not frame_files:
        # Without this check the padding step below would fail with an
        # IndexError that does not say which folder is at fault.
        raise ValueError(f"No .jpg/.png frames found in {sequence_folder_path!r}")

    # Only the first target_frames frames are kept, so skip decoding the rest.
    frames = []
    for frame_file in frame_files[:target_frames]:
        frame_path = os.path.join(sequence_folder_path, frame_file)
        # The context manager closes the file handle once the frame is a tensor.
        with Image.open(frame_path) as img:
            frames.append(transform(img.convert('L')))  # grayscale

    # Pad short sequences. NOTE: after Normalize a zero tensor is mid-gray,
    # not black.
    missing = target_frames - len(frames)
    frames.extend(torch.zeros_like(frames[0]) for _ in range(missing))

    # [frames, channels, H, W] -> [channels, frames, H, W]
    sequence_tensor = torch.stack(frames).permute(1, 0, 2, 3)

    # Add batch dimension -> [1, channels, frames, H, W]
    return sequence_tensor.unsqueeze(0)


# Function to predict on a single sequence
def predict_sequence(model, sequence_tensor, device):
    """
    Classify one preprocessed sequence.

    The model is put into eval mode and run without gradient tracking; the
    logits are turned into probabilities with a softmax over dim 1.

    Args:
        model: Trained model
        sequence_tensor: Preprocessed sequence tensor holding a single sequence
        device: Device to use for computation

    Returns:
        predicted_class: Predicted class index
        confidence: Confidence score (probability of the predicted class)
    """
    model.eval()
    with torch.no_grad():
        logits = model(sequence_tensor.to(device))
        class_probs = logits.softmax(dim=1)
        top_prob, top_index = class_probs.max(dim=1)
    # .item() requires exactly one element, i.e. a batch of one sequence.
    return top_index.item(), top_prob.item()


# Function to predict all sequences in a directory
def predict_all_sequences(model, sequences_dir, device):
    """
    Predicts classes for all sequences in the given directory.

    Every sub-directory of ``sequences_dir`` is treated as one sequence;
    plain files are ignored. Folders are processed in sorted name order and
    one summary line is printed per sequence.

    Args:
        model: Trained model
        sequences_dir: Directory containing sequence folders
        device: Device to use for computation

    Returns:
        results: Dictionary keyed by sequence folder name; each value holds
            'predicted_class', 'gesture_name' and 'confidence'
    """
    # Map numerical labels to gesture names
    label_map = {0: 'down', 1: 'left', 2: 'right', 3: 'up'}

    folder_names = sorted(
        entry for entry in os.listdir(sequences_dir)
        if os.path.isdir(os.path.join(sequences_dir, entry))
    )

    results = {}
    for folder_name in folder_names:
        # Preprocess, then classify
        tensor = preprocess_sequence(os.path.join(sequences_dir, folder_name))
        predicted_class, confidence = predict_sequence(model, tensor, device)
        gesture_name = label_map[predicted_class]

        results[folder_name] = {
            'predicted_class': predicted_class,
            'gesture_name': gesture_name,
            'confidence': confidence,
        }

        print(f"Sequence: {folder_name}, Predicted: {gesture_name}, Confidence: {confidence:.4f}")

    return results


# Function to visualize predictions
def visualize_predictions(results, save_path=None):
    """
    Visualizes prediction results as a bar chart.

    One bar per sequence shows the confidence, coloured by predicted gesture
    and annotated with the gesture name. The figure is shown with plt.show().

    Args:
        results: Dictionary with prediction results; each value provides
            'confidence' and 'gesture_name'
        save_path: Path to save the visualization; nothing is saved when falsy
    """
    from matplotlib.patches import Patch

    # Colour per gesture
    colors = {'down': 'blue', 'left': 'green', 'right': 'red', 'up': 'purple'}

    names = list(results)
    scores = []
    predicted = []
    for name in names:
        entry = results[name]
        scores.append(entry['confidence'])
        predicted.append(entry['gesture_name'])

    plt.figure(figsize=(12, 6))
    bars = plt.bar(names, scores, color=[colors[g] for g in predicted])

    # Write the gesture name just above each bar
    for bar, gesture in zip(bars, predicted):
        centre_x = bar.get_x() + bar.get_width()/2
        label_y = bar.get_height() + 0.01
        plt.text(centre_x, label_y, gesture, ha='center', fontsize=9)

    plt.ylim(0, 1.1)
    plt.xlabel('Sequence')
    plt.ylabel('Confidence')
    plt.title('Hand Gesture Predictions')
    plt.xticks(rotation=45, ha='right')

    # Legend with one patch per gesture colour
    legend_handles = []
    for gesture, color in colors.items():
        legend_handles.append(Patch(facecolor=color, label=gesture))
    plt.legend(handles=legend_handles)

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path)
        print(f"Visualization saved to {save_path}")
    plt.show()


# Main function to load model and predict
def main():
    """Load the saved model, classify every sample sequence, plot the
    predictions and write them to ``prediction_results.json``.

    Returns early, after printing the error, when the weights cannot be loaded.
    """
    import json

    # Locations of the trained weights and of the sample sequences
    model_path = 'hand_gesture_model.pth'
    sequences_dir = 'D/sample_sequence'

    # Use the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # The architecture has to match the one the weights were trained with
    model = DenseNet3D(block_config=(2, 4, 4), num__init__features=16).to(device)

    # Load the trained model weights
    try:
        state = torch.load(model_path, map_location=device)
        model.load_state_dict(state)
        print(f"Model loaded successfully from {model_path}")
    except Exception as e:
        print(f"Error loading model: {e}")
        return

    # Predict all sequences
    print("\nMaking predictions on sample sequences...")
    results = predict_all_sequences(model, sequences_dir, device)

    # Visualize results
    print("\nVisualizing predictions...")
    visualize_predictions(results, save_path='predictions_visualization.png')

    # Save results to file, coerced to plain int/str/float values
    with open('prediction_results.json', 'w') as f:
        serializable_results = {
            name: {
                'predicted_class': int(entry['predicted_class']),
                'gesture_name': entry['gesture_name'],
                'confidence': float(entry['confidence']),
            }
            for name, entry in results.items()
        }
        json.dump(serializable_results, f, indent=4)

    print("Results saved to prediction_results.json")

# Run the prediction pipeline as soon as this cell executes.
main()

Using device: cpu


TypeError: DenseNet3D.__init__() got an unexpected keyword argument 'block_config'

In [None]:
"""Best live prediction up till now."""
import cv2
import numpy as np
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from collections import deque
import mediapipe as mp
import matplotlib.pyplot as plt
from PIL import Image
import time
import os

# Recreate the DenseNet3D model to match your training architecture
class Conv3DBlock(nn.Module):
    """Conv3d (no bias) -> BatchNorm3d -> in-place ReLU building block."""

    # The constructor must be spelled ``__init__`` (two underscores on each
    # side). With three, Python treats it as an ordinary method, so the
    # arguments go to nn.Module.__init__ and construction raises a TypeError.
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=1):
        super(Conv3DBlock, self).__init__()
        self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride, padding, bias=False)
        self.bn = nn.BatchNorm3d(out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply convolution, batch normalisation and ReLU to ``x``."""
        return self.relu(self.bn(self.conv(x)))


class DenseBlock3D(nn.Module):
    """Dense block: every layer receives the channel-wise concatenation of the
    block input and the outputs of all earlier layers.

    Args:
        in_channels: Channels of the tensor entering the block.
        growth_rate: Channels each layer contributes.
        num_layers: Number of layers in the block.
    """

    # ``__init__`` needs exactly two underscores per side; the triple-underscore
    # spelling is never run as the constructor and makes instantiation fail.
    def __init__(self, in_channels, growth_rate, num_layers):
        super(DenseBlock3D, self).__init__()
        self.layers = nn.ModuleList()
        for i in range(num_layers):
            # Layer i sees the block input plus i earlier outputs.
            self.layers.append(self._make_dense_layer(in_channels + i * growth_rate, growth_rate))

    def _make_dense_layer(self, in_channels, growth_rate):
        """Build one BN-ReLU-Conv(1x1x1)-BN-ReLU-Conv(3x3x3) layer."""
        return nn.Sequential(
            nn.BatchNorm3d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv3d(in_channels, 4 * growth_rate, kernel_size=1, bias=False),
            nn.BatchNorm3d(4 * growth_rate),
            nn.ReLU(inplace=True),
            nn.Conv3d(4 * growth_rate, growth_rate, kernel_size=3, padding=1, bias=False)
        )

    def forward(self, x):
        """Return the input concatenated (dim 1) with every layer's output."""
        features = [x]
        for layer in self.layers:
            new_feature = layer(torch.cat(features, 1))
            features.append(new_feature)
        return torch.cat(features, 1)


class TransitionLayer3D(nn.Module):
    """Transition between dense blocks: BN, ReLU, a 1x1x1 convolution mapping
    ``in_channels`` to ``out_channels``, then 2x average pooling."""

    # ``__init__`` needs exactly two underscores per side; the triple-underscore
    # spelling is never run as the constructor and makes instantiation fail.
    def __init__(self, in_channels, out_channels):
        super(TransitionLayer3D, self).__init__()
        self.layers = nn.Sequential(
            nn.BatchNorm3d(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv3d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.AvgPool3d(kernel_size=2, stride=2)
        )

    def forward(self, x):
        """Run ``x`` through the transition stages."""
        return self.layers(x)


class DenseNet3D(nn.Module):
    """3D DenseNet classifier for single-channel frame sequences.

    The network is a stem (conv -> BN -> ReLU -> max pool), a series of
    dense blocks separated by compressing transition layers, a final
    BN + ReLU, global average pooling and a linear classifier.

    Args:
        growth_rate: Channels added by every dense layer.
        block_config: Number of dense layers in each dense block.
        num__init__features: Number of channels produced by the stem
            convolution. The (unusual) spelling is kept because callers
            pass this argument by keyword.
        compression_rate: Fraction of channels kept by each transition layer.
        num_classes: Size of the classifier output.
    """

    # The constructor must be spelled ``__init__``; any other spelling is
    # never invoked by Python, so no layers would be built.
    def __init__(self, growth_rate=12, block_config=(2, 4, 4), num__init__features=16,
                 compression_rate=0.5, num_classes=4):
        super().__init__()

        # First convolution and pooling (the input is expected to have 1 channel)
        self.features = nn.Sequential(
            nn.Conv3d(1, num__init__features, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm3d(num__init__features),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=2, stride=2)
        )

        # Dense blocks. The sub-module names below become part of the
        # state_dict keys, so they must not change if saved weights are to load.
        num_features = num__init__features
        for i, num_layers in enumerate(block_config):
            # Add a dense block
            block = DenseBlock3D(
                in_channels=num_features,
                growth_rate=growth_rate,
                num_layers=num_layers
            )
            self.features.add_module(f'denseblock{i+1}', block)
            num_features += num_layers * growth_rate

            # Add a transition layer (except after the last block)
            if i != len(block_config) - 1:
                out_features = int(num_features * compression_rate)
                trans = TransitionLayer3D(num_features, out_features)
                self.features.add_module(f'transition{i+1}', trans)
                num_features = out_features

        # Final batch norm
        self.features.add_module('norm_final', nn.BatchNorm3d(num_features))
        self.features.add_module('relu_final', nn.ReLU(inplace=True))

        # Global Average Pooling and classifier
        self.avgpool = nn.AdaptiveAvgPool3d((1, 1, 1))
        self.classifier = nn.Linear(num_features, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for ``x``."""
        features = self.features(x)
        out = self.avgpool(features)
        # Flatten the pooled (batch, C, 1, 1, 1) volume to (batch, C)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out


class GestureRecognition:
    """Interactive webcam hand-gesture recogniser.

    Frames are read from the default camera, a hand is located with
    MediaPipe Hands, cropped, converted to grayscale and transformed into a
    tensor. After a timed collection window, up to ``frame_count`` evenly
    spaced crops are classified by a DenseNet3D model as one of
    ``down`` / ``left`` / ``right`` / ``up``.
    """

    # The constructor must be spelled ``__init__``; any other spelling is
    # never invoked by Python, so ``GestureRecognition(model_path, ...)``
    # would fail and no attribute below would exist.
    def __init__(self, model_path, frame_count=8, collection_time=2.0, display_fps=True):
        """Set up hand detection, load the model and prepare working state.

        Args:
            model_path: File passed to ``torch.load``; its content is loaded
                as the state dict of the DenseNet3D model.
            frame_count: Number of frames fed to the model per prediction.
            collection_time: Length of the collection window in seconds.
            display_fps: Whether ``run`` draws an FPS counter on the preview.

        Side effects:
            Creates the ``processed_frames`` directory if it does not exist.
        """
        # Initialize MediaPipe Hands
        self.mp_hands = mp.solutions.hands
        self.hands = self.mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=1,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )
        self.mp_drawing = mp.solutions.drawing_utils

        # Load the trained model
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = DenseNet3D(growth_rate=12, block_config=(2, 4, 4), num__init__features=16).to(self.device)
        self.model.load_state_dict(torch.load(model_path, map_location=self.device))
        self.model.eval()

        # Gesture classes (index order must match the labels used in training)
        self.gesture_classes = ['down', 'left', 'right', 'up']

        # Initialize variables for frames processing
        self.frame_count = frame_count
        self.collection_time = collection_time  # Time in seconds to collect frames
        self.frame_buffer = deque(maxlen=frame_count)
        self.all_processed_frames = []  # Store all processed frames during collection
        self.is_collecting = False
        self.hand_detected_count = 0
        self.min_hand_detected = 6  # Minimum number of frames that must have a hand

        # Collection timing variables
        self.collection_start_time = 0
        self.collection_end_time = 0

        # Define the same transformation pipeline as in training
        self.transform = transforms.Compose([
            transforms.Resize((64, 64)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.5], std=[0.5])
        ])

        # FPS calculation
        self.prev_frame_time = 0
        self.new_frame_time = 0
        self.display_fps = display_fps

        # Create directory for saving processed frames
        self.frames_dir = "processed_frames"
        os.makedirs(self.frames_dir, exist_ok=True)

    def preprocess_hand_frame(self, frame, results):
        """Crop the detected hand out of ``frame`` and turn it into a tensor.

        Args:
            frame: BGR image (``height x width x channels`` array).
            results: Output of ``self.hands.process`` for this frame.

        Returns:
            ``(img_tensor, marked_frame)`` where ``img_tensor`` is the
            grayscale crop after ``self.transform`` and ``marked_frame`` is a
            copy of ``frame`` with landmarks and the bounding box drawn, or
            ``(None, None)`` when no hand was found or the box is empty.
        """
        h, w, _ = frame.shape

        # If no hand is detected, return None
        if not results.multi_hand_landmarks:
            return None, None

        # Get landmarks for the first hand detected
        hand_landmarks = results.multi_hand_landmarks[0]

        # Calculate bounding box; landmark coordinates are scaled by the
        # frame width/height to obtain pixel positions
        x_min, x_max, y_min, y_max = w, 0, h, 0
        for landmark in hand_landmarks.landmark:
            x, y = int(landmark.x * w), int(landmark.y * h)
            x_min = min(x_min, x)
            x_max = max(x_max, x)
            y_min = min(y_min, y)
            y_max = max(y_max, y)

        # Add padding, clamped to the frame borders
        padding = 20
        x_min = max(0, x_min - padding)
        y_min = max(0, y_min - padding)
        x_max = min(w, x_max + padding)
        y_max = min(h, y_max + padding)

        # Check if bounding box is valid
        if x_min >= x_max or y_min >= y_max:
            return None, None

        # Crop the hand region
        hand_crop = frame[y_min:y_max, x_min:x_max].copy()

        # Create a copy of the original frame with hand landmarks
        marked_frame = frame.copy()
        self.mp_drawing.draw_landmarks(
            marked_frame,
            hand_landmarks,
            self.mp_hands.HAND_CONNECTIONS
        )

        # Draw bounding box
        cv2.rectangle(marked_frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

        # Convert to grayscale and PIL image
        hand_crop_gray = cv2.cvtColor(hand_crop, cv2.COLOR_BGR2GRAY)
        pil_img = Image.fromarray(hand_crop_gray)

        # Apply transforms
        img_tensor = self.transform(pil_img)

        return img_tensor, marked_frame

    def predict_gesture(self):
        """Classify the frames currently held in ``self.frame_buffer``.

        Returns:
            ``(label, confidence)``. ``label`` is one of
            ``self.gesture_classes`` and ``confidence`` the softmax
            probability of that class. When fewer than ``frame_count``
            frames are buffered, or the hand was seen in fewer than
            ``min_hand_detected`` frames, a message string and ``0.0`` are
            returned instead.
        """
        if len(self.frame_buffer) < self.frame_count:
            return "Insufficient frames", 0.0

        if self.hand_detected_count < self.min_hand_detected:
            return "Hand not consistently detected", 0.0

        # Stack frames into a sequence
        sequence = torch.stack(list(self.frame_buffer))  # Shape: [frames, channels, height, width]
        sequence = sequence.permute(1, 0, 2, 3).unsqueeze(0)  # Reshape to [1, channels, frames, height, width]

        # Make prediction
        with torch.no_grad():
            sequence = sequence.to(self.device)
            outputs = self.model(sequence)
            probabilities = torch.nn.functional.softmax(outputs, dim=1)
            confidence, prediction = torch.max(probabilities, 1)

        # Get prediction class and confidence
        predicted_class = self.gesture_classes[prediction.item()]
        confidence_value = confidence.item()

        return predicted_class, confidence_value

    def save_processed_frames(self):
        """Write the buffered frames to ``self.frames_dir`` as JPEG images.

        Files already present in the directory are deleted first, so the
        directory only ever holds the frames of the latest prediction.
        """
        # Clear previous frames (only plain files; os.remove cannot delete directories)
        for file in os.listdir(self.frames_dir):
            file_path = os.path.join(self.frames_dir, file)
            if os.path.isfile(file_path):
                os.remove(file_path)

        # Save current frame buffer
        for i, frame_tensor in enumerate(self.frame_buffer):
            # Convert tensor to numpy array for visualization
            frame_np = frame_tensor.cpu().numpy()[0]  # Get the first channel
            frame_np = (frame_np * 0.5 + 0.5) * 255  # Undo Normalize(mean=0.5, std=0.5)
            frame_np = frame_np.astype(np.uint8)

            # Save the frame
            cv2.imwrite(os.path.join(self.frames_dir, f"frame_{i+1}.jpg"), frame_np)

    def visualize_processed_frames(self, prediction, confidence):
        """Save a grid of the stored frames to ``prediction_frames.png``.

        Args:
            prediction: Text shown (upper-cased) in the figure title.
            confidence: Number shown with two decimals in the figure title.
        """
        # Sorting by (length, name) puts "frame_10.jpg" after "frame_9.jpg";
        # a plain lexicographic sort would place it right after "frame_1.jpg".
        frame_files = sorted(os.listdir(self.frames_dir), key=lambda name: (len(name), name))

        # Four columns and at least two rows (the layout used for 8 frames);
        # more rows are added when more than 8 frames have to be shown.
        cols = 4
        rows = max(2, (len(frame_files) + cols - 1) // cols)

        plt.figure(figsize=(16, 4 * rows))
        for i, frame_path in enumerate(frame_files):
            plt.subplot(rows, cols, i+1)
            frame = cv2.imread(os.path.join(self.frames_dir, frame_path), cv2.IMREAD_GRAYSCALE)
            plt.imshow(frame, cmap='gray')
            plt.title(f"Frame {i+1}")
            plt.axis('off')

        plt.suptitle(f"Prediction: {prediction.upper()} (Confidence: {confidence:.2f})", fontsize=16)
        plt.tight_layout()
        plt.savefig("prediction_frames.png")
        plt.close()

    def select_evenly_spaced_frames(self):
        """Pick ``frame_count`` evenly spaced entries of ``all_processed_frames``.

        Returns:
            The whole list when it holds ``frame_count`` frames or fewer,
            otherwise a new list with ``frame_count`` frames whose indices
            are spread from the first to the last collected frame.
        """
        total_frames = len(self.all_processed_frames)
        if total_frames <= self.frame_count:
            return self.all_processed_frames

        # Calculate indices for evenly spaced frames
        indices = np.linspace(0, total_frames - 1, self.frame_count, dtype=int)
        selected_frames = [self.all_processed_frames[i] for i in indices]
        return selected_frames

    def run(self):
        """Run the interactive loop on camera 0 until 'q' is pressed.

        Pressing 'c' starts a collection window of ``collection_time``
        seconds; when it ends a prediction is made, and the frames used are
        saved and plotted. The loop also stops when a frame cannot be read.
        The camera and all OpenCV windows are released on exit, including
        when an exception is raised inside the loop.
        """
        cap = cv2.VideoCapture(0)

        # Status variables
        collecting_status = "Press 'c' to start collecting frames"
        prediction_result = "No prediction yet"
        confidence = 0.0

        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                # Calculate FPS
                if self.display_fps:
                    self.new_frame_time = time.time()
                    frame_delta = self.new_frame_time - self.prev_frame_time
                    # No FPS for the very first frame; also skip a zero delta,
                    # which would otherwise divide by zero
                    fps = 1 / frame_delta if self.prev_frame_time > 0 and frame_delta > 0 else 0
                    self.prev_frame_time = self.new_frame_time
                    cv2.putText(frame, f"FPS: {int(fps)}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

                # Convert the BGR image to RGB
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                results = self.hands.process(rgb_frame)

                # Draw hand landmarks on the frame
                if results.multi_hand_landmarks:
                    for hand_landmarks in results.multi_hand_landmarks:
                        self.mp_drawing.draw_landmarks(
                            frame, hand_landmarks, self.mp_hands.HAND_CONNECTIONS)

                # Display collection status
                cv2.putText(frame, collecting_status, (10, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

                # Display prediction result
                cv2.putText(frame, f"Prediction: {prediction_result}", (10, 90), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                if confidence > 0:
                    cv2.putText(frame, f"Confidence: {confidence:.2f}", (10, 120), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)

                # If collecting frames
                if self.is_collecting:
                    current_time = time.time()
                    elapsed_time = current_time - self.collection_start_time

                    # Process the current frame.
                    # NOTE(review): `frame` already carries the landmark and text
                    # overlays drawn above, so they end up inside the crop fed to
                    # the model - confirm this matches how the training images
                    # were produced.
                    processed_frame, marked_frame = self.preprocess_hand_frame(frame, results)

                    if processed_frame is not None:
                        self.all_processed_frames.append(processed_frame)
                        self.hand_detected_count += 1
                        if marked_frame is not None:
                            frame = marked_frame

                    # Update collection status with elapsed time
                    collecting_status = f"Collecting: {elapsed_time:.1f}s / {self.collection_time:.1f}s"

                    # Add a progress bar (200 px wide when full); a non-positive
                    # collection_time counts as already complete
                    if self.collection_time > 0:
                        fraction_done = min(elapsed_time / self.collection_time, 1.0)
                    else:
                        fraction_done = 1.0
                    progress = int(fraction_done * 200)
                    cv2.rectangle(frame, (10, 150), (210, 170), (0, 0, 0), -1)
                    cv2.rectangle(frame, (10, 150), (10 + progress, 170), (0, 255, 0), -1)

                    # If collection time is over
                    if elapsed_time >= self.collection_time:
                        self.is_collecting = False

                        # Select evenly spaced frames from all collected
                        selected_frames = self.select_evenly_spaced_frames()

                        # Update frame buffer with selected frames
                        self.frame_buffer = deque(selected_frames, maxlen=self.frame_count)

                        # Make prediction
                        prediction_result, confidence = self.predict_gesture()
                        collecting_status = "Press 'c' to collect new frames"

                        # Save and visualize processed frames
                        self.save_processed_frames()
                        self.visualize_processed_frames(prediction_result, confidence)

                # Display the resulting frame
                cv2.imshow('Hand Gesture Recognition', frame)

                # Key handling
                key = cv2.waitKey(1) & 0xFF
                if key == ord('q'):
                    break
                elif key == ord('c'):
                    # Start collecting frames
                    self.is_collecting = True
                    self.all_processed_frames = []
                    self.frame_buffer.clear()
                    self.hand_detected_count = 0
                    self.collection_start_time = time.time()
                    collecting_status = "Collecting started..."
        finally:
            # Release the capture and close windows even if the loop raised
            cap.release()
            cv2.destroyAllWindows()


if __name__ == "__main__":
    # Replace with the path to your trained model
    model_path = "hand_gesture_model.pth"

    # Initialize and run the gesture recognition system
    # collection_time=1.0 overrides the constructor's 2.0 second default,
    # so frames are gathered for one second after 'c' is pressed
    gesture_recognition = GestureRecognition(model_path, collection_time=1.0)
    gesture_recognition.run()