In [4]:
import os
import cv2
import mediapipe as mp
import csv
import time
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from collections import Counter
from sklearn.preprocessing import LabelEncoder


# Shared MediaPipe Holistic instance used for every captured frame.
# static_image_mode=False asks MediaPipe to treat input as a video stream.
mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic(
    static_image_mode=False,
    model_complexity=2,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

# Where the per-landmark CSV is written once capture has finished.
output_csv_path = './camera_output_signals.csv'

# Subset of face-mesh landmark indices that is kept; all other face points
# are discarded.
# NOTE(review): presumably nose tip, eye corners, mouth corners and chin --
# confirm against the MediaPipe face-mesh index map.
key_face_landmarks_indices = [1, 33, 263, 61, 291, 199]

# Window length for model input; the helper functions default to the same value.
sequence_length = 30

# Accumulates one dict of landmark groups per captured frame.
landmark_data = []


def _extract_frame_landmarks(results, face_indices):
    """Return the (x, y, z) landmark tuples of one Holistic result, by group.

    Only groups that were detected appear in the returned dict. Keys are
    inserted in the order 'pose', 'face', 'left_hand', 'right_hand'. Face
    landmarks are restricted to the indices in ``face_indices`` and are
    listed in ascending index order.
    """
    frame_landmarks = {}
    if results.pose_landmarks:
        frame_landmarks['pose'] = [
            (lm.x, lm.y, lm.z) for lm in results.pose_landmarks.landmark
        ]
    if results.face_landmarks:
        frame_landmarks['face'] = [
            (lm.x, lm.y, lm.z)
            for idx, lm in enumerate(results.face_landmarks.landmark)
            if idx in face_indices
        ]
    if results.left_hand_landmarks:
        frame_landmarks['left_hand'] = [
            (lm.x, lm.y, lm.z) for lm in results.left_hand_landmarks.landmark
        ]
    if results.right_hand_landmarks:
        frame_landmarks['right_hand'] = [
            (lm.x, lm.y, lm.z) for lm in results.right_hand_landmarks.landmark
        ]
    return frame_landmarks


def process_camera_feed():
    """Capture frames from camera 0 and record Holistic landmarks per frame.

    For every frame read, one dict of landmark groups is appended to the
    module-level ``landmark_data`` list (an empty dict when nothing was
    detected). The frame is shown in a window titled 'Camera Feed' with an
    FPS overlay that is refreshed every 10 frames.

    The loop ends when a frame cannot be read or when 'q' is pressed. The
    capture device and the window are released in all cases, including when
    an exception or a keyboard interrupt stops the loop.
    """
    cap = cv2.VideoCapture(0)

    # Build the membership set once instead of scanning the list per landmark.
    face_indices = frozenset(key_face_landmarks_indices)

    fps_start_time = time.time()
    fps_counter = 0
    fps = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV delivers BGR frames; convert to RGB before processing.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results = holistic.process(rgb_frame)

            landmark_data.append(_extract_frame_landmarks(results, face_indices))

            fps_counter += 1
            if fps_counter >= 10:
                fps_end_time = time.time()
                elapsed = fps_end_time - fps_start_time
                # Keep the previous reading if the clock did not advance,
                # rather than dividing by zero.
                if elapsed > 0:
                    fps = int(fps_counter / elapsed)
                fps_counter = 0
                fps_start_time = fps_end_time

            cv2.putText(frame, f"FPS: {fps}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.imshow('Camera Feed', frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the camera and close the window so an interrupted run
        # does not leave the device locked.
        cap.release()
        cv2.destroyAllWindows()


def save_landmarks_to_csv(landmark_data, csv_path):
    """Write the recorded landmarks to ``csv_path``, one row per landmark.

    Args:
        landmark_data: Sequence of per-frame dicts mapping a group name to a
            list of (x, y, z) tuples.
        csv_path: Destination file; it is created or overwritten.

    The file starts with the header ``frame,type,index,x,y,z``. Frames
    whose dict is empty contribute no rows.
    """
    def _rows():
        # Flatten frame -> group -> landmark into CSV records.
        for frame_no, groups in enumerate(landmark_data):
            for group_name, points in groups.items():
                for point_no, (x, y, z) in enumerate(points):
                    yield [frame_no, group_name, point_no, x, y, z]

    with open(csv_path, 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(['frame', 'type', 'index', 'x', 'y', 'z'])
        writer.writerows(_rows())


def prepare_input_data(csv_file_path, sequence_length=30):
    """Load landmark coordinates from a CSV and cut them into fixed windows.

    The 'x', 'y' and 'z' columns are read and split into consecutive,
    non-overlapping windows of ``sequence_length`` rows. Trailing rows that
    do not fill a whole window are discarded.

    NOTE(review): a window is ``sequence_length`` consecutive CSV rows, not
    ``sequence_length`` frames -- confirm this matches how the model was
    trained.

    Args:
        csv_file_path: Path to a CSV containing 'x', 'y' and 'z' columns.
        sequence_length: Number of rows per window; must be at least 1.

    Returns:
        A float array of shape (num_windows, sequence_length, 3). When the
        file holds fewer than ``sequence_length`` rows, the result has
        shape (0, sequence_length, 3).

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    df = pd.read_csv(csv_file_path)
    landmarks = df[['x', 'y', 'z']].to_numpy(dtype=float)

    # Integer division counts every full window, including the last one when
    # the row count is an exact multiple of sequence_length.
    num_sequences = len(landmarks) // sequence_length
    usable_rows = num_sequences * sequence_length

    return landmarks[:usable_rows].reshape(num_sequences, sequence_length, 3)


def predict_gesture(model, label_encoder, csv_file_path, sequence_length=30):
    """Predict one gesture label for each window of landmarks in a CSV.

    Args:
        model: Object exposing ``predict(sequences)`` that returns one row
            of class scores per input window.
        label_encoder: Fitted encoder exposing ``classes_`` and
            ``inverse_transform``.
        csv_file_path: Path to the landmark CSV to classify.
        sequence_length: Number of CSV rows per input window.

    Returns:
        An array with one decoded label per window. It is empty when the
        CSV does not contain a single full window.
    """
    sequences = prepare_input_data(csv_file_path, sequence_length)

    # Nothing to classify: skip the model instead of passing it an empty
    # batch, and return an empty array with the same dtype as the class labels.
    if len(sequences) == 0:
        return label_encoder.classes_[:0]

    predictions = model.predict(sequences)

    # Highest-scoring class index for every window.
    predicted_class_idx = np.argmax(predictions, axis=1)

    return label_encoder.inverse_transform(predicted_class_idx)


def get_most_frequent_prediction(predicted_labels):
    """Find the label that occurs most often among the predictions.

    Args:
        predicted_labels: Iterable of hashable labels (a list or an array).

    Returns:
        A tuple ``(label, count, counts)`` where ``counts`` is a ``Counter``
        of all labels. Ties go to the label that appears first. For empty
        input the result is ``(None, 0, Counter())``.
    """
    label_counts = Counter(predicted_labels)

    # Test the Counter, not the input: truthiness of an array is ambiguous.
    if not label_counts:
        return None, 0, label_counts

    most_common_label, most_common_count = label_counts.most_common(1)[0]

    return most_common_label, most_common_count, label_counts


# Load the Keras model stored in the working directory.
model = load_model('gesture_to_word_model.h5')

# Gesture names that the encoder maps to and from integer class indices.
# NOTE(review): assumes these names and their sorted order match the labels
# used when the model was trained -- confirm.
labels = ['Achamma','Alchi','Bar','Bhok','Dikka','Dukhi','Eklopan']
label_encoder = LabelEncoder()


label_encoder.fit(labels)


# Persist the fitted class array next to the notebook.
np.save('label_encoder_classes.npy', label_encoder.classes_)



# Record landmarks from the webcam until 'q' is pressed or a read fails.
process_camera_feed()


# Dump everything that was recorded into the CSV consumed by the predictor.
save_landmarks_to_csv(landmark_data, output_csv_path)
print(f"CSV file saved at: {output_csv_path}")


# Classify every window of the recorded CSV.
predicted_labels = predict_gesture(model, label_encoder, output_csv_path)


print("Predicted gesture(s):", predicted_labels)


# Summarise the per-window predictions with a majority vote.
most_common_label, most_common_count, label_counts = get_most_frequent_prediction(predicted_labels)


print(f"The most predicted gesture is: {most_common_label} with {most_common_count} occurrences.")
print(f"Full count of predictions: {label_counts}")




CSV file saved at: ./camera_output_signals.csv
2/2 ━━━━━━━━━━━━━━━━━━━━ 1s 308ms/step
Predicted gesture(s): ['Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar'
 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Dikka'
 'Dikka' 'Dikka' 'Achamma' 'Bar' 'Bar' 'Bar' 'Bhok' 'Bar' 'Bar' 'Bar'
 'Bar' 'Bar' 'Alchi' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar'
 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar' 'Bar']
The most predicted gesture is: Bar with 49 occurrences.
Full count of predictions: Counter({'Bar': 49, 'Dikka': 3, 'Achamma': 1, 'Bhok': 1, 'Alchi': 1})
