CNN for video classification

In [4]:
import os
import numpy as np
import cv2
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed the global NumPy and TensorFlow random generators with one shared value
# so that repeated runs draw the same random numbers.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Helper: decode one video file into a list of preprocessed frames
def _read_video_frames(file_path, max_frames, frame_size):
    """Return up to ``max_frames`` resized float32 frames read from ``file_path``.

    The list is empty when the capture yields no frames at all.
    """
    cap = cv2.VideoCapture(file_path)
    frames = []
    try:
        while len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            # Preprocess frame: resize, then divide by 255 (8-bit pixels land in [0, 1])
            frame = cv2.resize(frame, frame_size)
            frames.append(frame.astype("float32") / 255.0)
    finally:
        # Release the capture even if decoding or resizing raised.
        cap.release()
    return frames


# Function to load videos from a directory
def load_videos_from_directory(directory, max_frames=100, frame_size=(64, 64)):
    """Load every ``.mp4`` file in ``directory`` as a fixed-length stack of frames.

    Each video contributes its first ``max_frames`` frames; shorter videos are
    padded at the end with all-zero frames. Files from which no frame can be
    read are skipped (with a printed notice) instead of aborting the whole load.

    Args:
        directory: Folder to scan (not recursive). Its last path component is
            used as the label of every video found in it.
        max_frames: Number of frames kept per video; must be at least 1.
        frame_size: ``(width, height)`` passed to ``cv2.resize`` for every frame.

    Returns:
        A ``(videos, labels)`` tuple of NumPy arrays. ``videos`` has shape
        ``(n_videos, max_frames, height, width, channels)`` and dtype float32;
        ``labels`` holds the directory name once per loaded video. When nothing
        is loaded, ``videos`` is an empty array of shape
        ``(0, max_frames, height, width, 3)`` (3 colour channels assumed, matching
        the model input used elsewhere in this notebook) so that callers can
        still inspect ``shape`` or concatenate it.

    Raises:
        ValueError: If ``max_frames`` is smaller than 1.
    """
    if max_frames < 1:
        raise ValueError("max_frames must be at least 1")

    # normpath drops a trailing separator, so ".../dance/" still yields "dance".
    label = os.path.basename(os.path.normpath(directory))

    video_data = []
    labels = []

    # Sorted so the sample order does not depend on the filesystem's listing order.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.mp4'):
            continue

        file_path = os.path.join(directory, filename)
        frames = _read_video_frames(file_path, max_frames, frame_size)
        if not frames:
            # Nothing to pad from: an unreadable or empty file has no frame shape.
            print(f"Skipping {file_path}: no frames could be read.")
            continue

        # Pad with zero frames up to max_frames (reading already stops at max_frames).
        padding = [np.zeros_like(frames[0])] * (max_frames - len(frames))
        video_data.append(np.stack(frames + padding))
        labels.append(label)

    if not video_data:
        width, height = frame_size
        empty_videos = np.empty((0, max_frames, height, width, 3), dtype="float32")
        return empty_videos, np.array(labels)

    return np.array(video_data), np.array(labels)

# Specify directories
# The dataset root can be redirected with the KINETICS_DATA_DIR environment
# variable; the default keeps the location this notebook was written against.
data_root = os.environ.get(
    "KINETICS_DATA_DIR",
    "/Users/sudachk/PacktPublishing/DataLabelling/ch09/Kinetics",
)
dance = os.path.join(data_root, "dance")
brush = os.path.join(data_root, "brushing")
new_video_data = os.path.join(data_root, "test")

# Every clip is padded/truncated to this many frames by the loader; the model's
# input shape reuses the same value, so it is stated once here.
max_frames = 100

# Load video data. The loader's own labels (directory names) are not used:
# integer class ids are built below instead.
dance_video, _ = load_videos_from_directory(dance, max_frames=max_frames)
brushing_video, _ = load_videos_from_directory(brush, max_frames=max_frames)
test_video, _ = load_videos_from_directory(new_video_data, max_frames=max_frames)

# Both training classes are required. Fail with a message that names the
# directory rather than with an indexing/concatenation error further down.
for class_dir, class_videos in ((dance, dance_video), (brush, brushing_video)):
    if len(class_videos) == 0:
        raise ValueError(f"No .mp4 videos could be loaded from {class_dir}")

# Combine data from both classes
video_data = np.concatenate([dance_video, brushing_video])

# Integer class ids: 0 = dance, 1 = brushing (one-hot encoding is applied later, before training)
labels = np.array([0] * len(dance_video) + [1] * len(brushing_video))

# Check the size of the dataset
print("Total samples:", len(video_data))

# Make sure the dataset has enough samples for splitting
if len(video_data) >= 2:
    # Convert the integer class ids (0 = dance, 1 = brushing) to one-hot vectors
    labels_one_hot = keras.utils.to_categorical(labels, num_classes=2)

    # Split data into training and testing sets (20% held out, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(video_data, labels_one_hot, test_size=0.2, random_state=42)

    # Define a 3D CNN: two Conv3D/MaxPooling3D stages over (frames, height, width),
    # then a dense head ending in a 2-way softmax (one probability per class).
    # The input shape is declared with keras.Input instead of an `input_shape`
    # argument on the first layer, which newer Keras versions warn about.
    model = keras.Sequential(
        [
            keras.Input(shape=(max_frames, 64, 64, 3)),
            layers.Conv3D(32, kernel_size=(3, 3, 3), activation="relu"),
            layers.MaxPooling3D(pool_size=(2, 2, 2)),
            layers.Conv3D(64, kernel_size=(3, 3, 3), activation="relu"),
            layers.MaxPooling3D(pool_size=(2, 2, 2)),
            layers.Flatten(),
            layers.Dense(128, activation="relu"),
            layers.Dense(2, activation="softmax")  # Two output nodes: softmax over the dance/brushing classes
        ]
    )

    # Compile the model (categorical cross-entropy matches the one-hot labels)
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

    # Train the model
    # NOTE: the held-out split is also used as validation data here, so the
    # "test" metrics reported below are not an independent estimate.
    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

    # Evaluate the model on the test set
    loss, accuracy = model.evaluate(X_test, y_test)
    print("Test Loss:", loss)
    print("Test Accuracy:", accuracy)

    # Save the model
    model.save("video_classification_model.h5")

    # Load the model back from disk; later cells predict with this reloaded copy
    loaded_model = keras.models.load_model("video_classification_model.h5")

else:
    print("Insufficient data for training and testing.")


Total samples: 5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.0
Test Accuracy: 1.0


In [10]:

# Predictions on new video data
# Assuming 'test_video' is loaded and preprocessed similarly to the training data
# Define the label mapping (class id -> display name)
label_mapping = {0: 'Dance', 1: 'Brushing'}

if len(test_video) > 0:
    predictions = loaded_model.predict(test_video)
else:
    # Nothing to classify: keep an empty (0, n_classes) array so the code below still runs.
    print("No test videos were loaded; skipping prediction.")
    predictions = np.empty((0, len(label_mapping)), dtype="float32")

# Print class probabilities for each video in the test set
for i, pred in enumerate(predictions):
    print(f"Video {i + 1} - Class Probabilities: Dance={pred[0]:.4f}, Brushing={pred[1]:.4f}")

# Convert predictions to labels using the mapping. A plain comprehension is used
# because np.vectorize raises on zero-length input unless `otypes` is given.
predicted_labels = np.array(
    [label_mapping[int(class_id)] for class_id in np.argmax(predictions, axis=1)]
)
print(predicted_labels)

Video 1 - Class Probabilities: Dance=1.0000, Brushing=0.0000
['Dance']
