In [2]:
pip install tensorflow opencv-python mediapipe scikit-learn matplotlib

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


FUNCTIONS FOR DETECTING OR EXTRACTING HANDS

In [3]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

In [4]:
mp_holistic = mp.solutions.holistic # Holistic model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [5]:
def mediapipe_detection(image, model):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # COLOR CONVERSION BGR 2 RGB
    image.flags.writeable = False                  # Image is no longer writeable
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   # Image is now writeable 
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # COLOR COVERSION RGB 2 BGR
    return image, results

In [6]:
def draw_landmarks(image, results):
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS) # Draw right hand connections

In [7]:
def draw_styled_landmarks(image, results):
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             ) 
    # Draw right hand connections  
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4), 
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             ) 

In [8]:
def extract_keypoints(results):
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21 * 3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21 * 3)
    return np.concatenate([lh, rh])  # Output size will be (126,)

In [9]:
def extract_hand(frame, model):
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
    frame.flags.writeable = False                  
    results = model.process(frame)                 
    frame.flags.writeable = True                   
    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)  # Convert RGB back to BGR

    # Check if any hand is detected
    if results.right_hand_landmarks:
        hand_landmarks = results.right_hand_landmarks.landmark
    elif results.left_hand_landmarks:
        hand_landmarks = results.left_hand_landmarks.landmark
    else:
        return None, results

    # Get bounding box
    x_min, y_min = float('inf'), float('inf')
    x_max, y_max = 0, 0

    h, w, _ = frame.shape
    for lm in hand_landmarks:
        x, y = int(lm.x * w), int(lm.y * h)
        x_min, y_min = min(x, x_min), min(y, y_min)
        x_max, y_max = max(x, x_max), max(y, y_max)

    # Add padding
    padding = 20
    x_min, y_min = max(0, x_min - padding), max(0, y_min - padding)
    x_max, y_max = min(w, x_max + padding), min(h, y_max + padding)

    # Crop and resize
    hand_crop = frame[y_min:y_max, x_min:x_max]

    # Check if the crop is empty
    if hand_crop.size == 0:
        return None, results

    return hand_crop, results

CREATING DIRECTORIES FOR TRAINING DATA

In [None]:
# Path for exported data, numpy arrays
DATA_PATH = os.path.join('Data_30_Frame_Limit_200_Videos') 

# Actions that we try to detect
actions = np.array(['zoom_in', 'reset_zoom', 'next_slide', 'prev_slide', 'annotation', 'pointer'])

# Fifty videos worth of data
no_sequences = 200

# Videos are going to be 50 frames in length
sequence_length = 30

# Folder start
start_folder = 1

In [11]:
for action in actions: 
    for sequence in range(0,no_sequences):
        try: 
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

PROCESSING THE TRAINING DATA

In [None]:
# NEW VIDEOS WITH 30 FRAMES EACH - PROCESSING CODE

VIDEO_PATH = "training_videos"

# Function to process each video
def process_video(video_file, action, sequence_num):
    cap = cv2.VideoCapture(video_file)
    frame_count = 0

    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # Stop when video ends

            # Process frame
            hand_image, results = extract_hand(frame, holistic)

            if hand_image is None:
                print(f"Warning: No hand detected in frame {frame_count} of {video_file}")
                #frame_count += 1
                keypoints = np.zeros((126,)) # saves all the keypoints as 0s if hand is not detected

            else:
                draw_styled_landmarks(hand_image, results)
                # Extract keypoints and save them
                keypoints = extract_keypoints(results)

            npy_path = os.path.join(DATA_PATH, action, str(sequence_num), str(frame_count))
            os.makedirs(os.path.dirname(npy_path), exist_ok=True)
            np.save(npy_path, keypoints)

            frame_count += 1

        cap.release()
        print(f'The total number of frames for {video_file} is {frame_count}')

# Process all video clips
for action in actions:
    action_path = os.path.join(VIDEO_PATH, action) 
    video_files = sorted(os.listdir(action_path)) 
    
    for idx, video in enumerate(video_files):
        video_file = os.path.join(action_path, video)
        print(f"Processing {video_file} for action {action} ({idx + 1}/{len(video_files)})")
        process_video(video_file, action, idx)

print("All videos processed successfully!")
cv2.destroyAllWindows()

Processing videos_8\zoom_in\video_1 (2).avi for action zoom_in (1/200)
The total number of frames for videos_8\zoom_in\video_1 (2).avi is 30
Processing videos_8\zoom_in\video_1 (3).avi for action zoom_in (2/200)
The total number of frames for videos_8\zoom_in\video_1 (3).avi is 30
Processing videos_8\zoom_in\video_1.avi for action zoom_in (3/200)
The total number of frames for videos_8\zoom_in\video_1.avi is 30
Processing videos_8\zoom_in\video_10 (2).avi for action zoom_in (4/200)
The total number of frames for videos_8\zoom_in\video_10 (2).avi is 30
Processing videos_8\zoom_in\video_10 (3).avi for action zoom_in (5/200)
The total number of frames for videos_8\zoom_in\video_10 (3).avi is 30
Processing videos_8\zoom_in\video_10 (4).avi for action zoom_in (6/200)
The total number of frames for videos_8\zoom_in\video_10 (4).avi is 30
Processing videos_8\zoom_in\video_10.avi for action zoom_in (7/200)
The total number of frames for videos_8\zoom_in\video_10.avi is 30
Processing videos_8\z

DATA LABELLING

In [13]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [14]:
label_map = {label:num for num, label in enumerate(actions)} # enumerate just adds index -> [(0, 'zoom_in'), (1, 'zoom_out')]

In [15]:
# loading the frames - only 200 each
sequences, labels = [], []
for action in actions:
    for sequence in np.array(os.listdir(os.path.join(DATA_PATH, action))).astype(int):
        window = []
        for frame_num in range(sequence_length):
            res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
            window.append(res) # adding each frame to the window
        sequences.append(window) # adding each the window with the frames to sequence
        labels.append(label_map[action]) # labelling the sequence with the action

In [None]:
# interpolation of frames with only 0's
import numpy as np
from scipy.interpolate import interp1d

def interpolate_missing_frames(sequence):
    """Interpolates missing frames in a sequence using linear interpolation."""
    sequence = np.array(sequence)  # Convert to NumPy array
    num_frames, num_keypoints = sequence.shape

    # Find indices where frames are missing (all zeros)
    missing_indices = [i for i in range(num_frames) if np.all(sequence[i] == 0)]

    if not missing_indices:  
        return sequence  # No missing frames, return as is

    # Get indices of existing frames
    valid_indices = [i for i in range(num_frames) if i not in missing_indices]
    valid_frames = sequence[valid_indices]

    if len(valid_indices) == 0:
        return sequence  # Avoid error if all frames are missing

    # Apply interpolation
    interp_func = interp1d(valid_indices, valid_frames, axis=0, kind='linear', fill_value="extrapolate")
    interpolated_frames = interp_func(missing_indices)

    # Fill missing frames with interpolated values
    sequence[missing_indices] = interpolated_frames

    return sequence

# Apply interpolation to all sequences
for i in range(len(sequences)):
    if not isinstance(sequences[i], np.ndarray) or sequences[i].size == 0:
        print(f"Skipping empty or invalid sequence {i}")
        continue  # Skip if empty or not a valid NumPy array

    print(f"Processing sequence {i} with shape: {sequences[i].shape}")

    if np.any(np.all(sequences[i] == 0, axis=1)):  # Check for missing frames
        print(f"🔄 Interpolating missing frames in sequence {i}...")
        sequences[i] = interpolate_missing_frames(sequences[i])

LSTM MODEL

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [17]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir) # setting the tensorflow directory as log_dir so that all the logs will be stored there

In [18]:
# CODE FOR MODEL - 1

model = Sequential()

# LSTM layers
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30, 126)))
model.add(Dropout(0.2)) 
model.add(LSTM(128, return_sequences=True, activation='relu'))
model.add(Dropout(0.2))
model.add(LSTM(128, return_sequences=False, activation='relu'))

# Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))

# Output layer
model.add(Dense(actions.shape[0], activation='softmax'))

optimizer = Adam(learning_rate=0.0005)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

  super().__init__(**kwargs)


TRAINING AND SAVING THE MODEL

In [19]:
X = np.array(sequences)
y = to_categorical(labels).astype(int) # represent the array (labels) in binary format ('zoom_in'->[1,0] 'zoom_out'->[0,1])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15) # X -> features , y -> labels

In [20]:
# Callbacks for early stopping and TensorBoard logging
early_stopping = EarlyStopping(monitor='accuracy', mode='max', patience=10, restore_best_weights=True)
tb_callback = TensorBoard(log_dir="logs", histogram_freq=1)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=50,  # 50 epochs seems optimal
    validation_data=(X_test, y_test),
    batch_size=32,
    callbacks=[early_stopping, tb_callback]
)

Epoch 1/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - categorical_accuracy: 0.1422 - loss: 1.7912 - val_categorical_accuracy: 0.2556 - val_loss: 1.7303
Epoch 2/50
[1m 1/32[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 52ms/step - categorical_accuracy: 0.1875 - loss: 1.8151

  current = self.get_monitor_value(logs)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - categorical_accuracy: 0.2619 - loss: 1.7104 - val_categorical_accuracy: 0.2944 - val_loss: 1.4552
Epoch 3/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - categorical_accuracy: 0.3274 - loss: 1.4559 - val_categorical_accuracy: 0.3722 - val_loss: 1.2715
Epoch 4/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - categorical_accuracy: 0.3840 - loss: 1.2855 - val_categorical_accuracy: 0.5611 - val_loss: 0.8598
Epoch 5/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - categorical_accuracy: 0.4761 - loss: 1.1498 - val_categorical_accuracy: 0.7111 - val_loss: 0.8576
Epoch 6/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - categorical_accuracy: 0.5591 - loss: 0.9678 - val_categorical_accuracy: 0.6056 - val_loss: 0.7937
Epoch 7/50
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step -

KeyboardInterrupt: 

In [None]:
model.save('final_model_30_frames_200_videos.h5')
model.load_weights('final_model_30_frames_200_videos.h5')

