# Постановка задачи.

Нужно написать приложение, которое будет считывать и выводить кадры с веб-
камеры. В процессе считывания определять что перед камерой находится человек,
задетектировав его лицо на кадре. После этого, человек показывает жесты руками, а
алгоритм должен считать их и определенным образом реагировать на эти жесты.
На то, как система будет реагировать на определенные жесты - выбор за вами.
Например, на определенный жест (жест пис), система будет здороваться с человеком.
На другой, будет делать скриншот экрана. И т.д.
Для распознавания жестов, вам надо будет скачать датасет для жестов рук.

# Решение

Для решения задачи (после долгих раздумий связанных с выбором датасета) было принято решение самому создать мини-датасет с использованием веб-камеры с которой в дальнейшем и должна будет работать программа. Данные собирались (и размечались) автоматически с использованием - __mediapipe__.

В данном ноутбуке программа обучается на этих данных и в режиме онлайн выводит название жеста а так же ведет журнал жестов. При появлении жеста __"Ok"__ происходит запись картинки на диск.

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch import optim

import glob 
import pickle
import numpy as np
from numpy.random import permutation

import time
import math

import cv2
from facenet_pytorch import MTCNN
import mediapipe as mp


mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands


def save_obj(obj, name ):
    with open('objects/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
        

def load_obj(name ):
    with open('objects/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)


use_cuda = torch.cuda.is_available()
# device = torch.device("cpu")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [2]:
file_name_V = './video/data__0' 
file_name_O = './video/data__1'
file_name_W = './video/data__2'

LABELS = ['Victory', 'Ok', 'WTF']

data_V = load_obj(file_name_V)
data_O = load_obj(file_name_O)
data_W = load_obj(file_name_W)

data = np.vstack((data_V, data_O))
data = np.vstack((data, data_W))


perm = permutation(len(data))
data = data[perm]

data_V.shape, data_O.shape, data_W.shape, data.shape

((937, 64), (576, 64), (540, 64), (2053, 64))

In [3]:
X_data = data[:,:-1]
y_data = data[:,-1].astype('int64')
X_data.shape, y_data.shape

((2053, 63), (2053,))

In [4]:
sep = int(len(X_data)*0.75)
sep

1539

In [5]:
def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
#     print(output.topk(5))
    return LABELS[category_i], category_i

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [6]:
class Skeleton_Dataset(Dataset):
    def __init__(self, X, y=None):
        self.data = X
        self.labels = y

    def __len__(self):
        return len(self.data) 
       
    def __getitem__(self, idx):

        item = self.data[idx]        
        label = self.labels[idx]        

        return (item, label) 

In [7]:
dataset = Skeleton_Dataset(X_data, y_data)

train_dataset, test_dataset = torch.utils.data.random_split(dataset, [sep, len(X_data)-sep])
train_loader = DataLoader(train_dataset, batch_size = 256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size = 1, shuffle=True)

In [8]:
class My_net(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
      
        self.output_dim = output_dim
        self.fc1 = torch.nn.Linear(input_dim,256)
        self.elu1 = torch.nn.ELU(inplace=True)
        self.fc2 = torch.nn.Linear(256,512)        
        self.elu2 = torch.nn.ELU(inplace=True)        
        self.dr1 = torch.nn.Dropout(0.5)
        self.fc3 = torch.nn.Linear(512,output_dim)
        self.sm = torch.nn.Softmax(dim=1)
        
        
    def forward(self, inputs):
        x = inputs
        x = self.fc1(x)
        x = self.elu1(x)
        x = self.fc2(x)
        x = self.elu2(x)        
        x = self.dr1(x)    
        x = self.fc3(x)        
        out = self.sm(x)
        
        return out

In [9]:
input_dim = 63
output_dim = len(LABELS)
Class_net = My_net(input_dim, output_dim)
Class_net.to(device)

My_net(
  (fc1): Linear(in_features=63, out_features=256, bias=True)
  (elu1): ELU(alpha=1.0, inplace=True)
  (fc2): Linear(in_features=256, out_features=512, bias=True)
  (elu2): ELU(alpha=1.0, inplace=True)
  (dr1): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=512, out_features=3, bias=True)
  (sm): Softmax(dim=1)
)

In [10]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = optim.SGD(Class_net.parameters(),lr=learning_rate,momentum=0.9)

all_losses = []
start = time.time()
counter = 0
for epoch in range(2100):  
    current_loss = 0
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
    
        output = Class_net(inputs.float())
        
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step() 


        current_loss += loss.item()
        category = LABELS[int(labels[0])]

        if counter % 500 == 0:
            guess, guess_i = categoryFromOutput(output)
            correct = '✓' if guess == category else '✗ (%s)' % category
            print('epoch : %d iter : %d (%s) %.4f  / %s %s' % (epoch, i, timeSince(start), loss, guess, correct))

        
        counter = counter + 1
    if counter % 100 == 0:
        all_losses.append(current_loss / 25)
        current_loss = 0

epoch : 0 iter : 0 (0m 0s) 1.1047  / Ok ✓
epoch : 71 iter : 3 (0m 3s) 1.0403  / Victory ✓
epoch : 142 iter : 6 (0m 6s) 0.9041  / Victory ✓
epoch : 214 iter : 2 (0m 9s) 0.9876  / Victory ✓
epoch : 285 iter : 5 (0m 12s) 0.9067  / Victory ✓
epoch : 357 iter : 1 (0m 16s) 0.8665  / Victory ✓
epoch : 428 iter : 4 (0m 19s) 0.8753  / Victory ✓
epoch : 500 iter : 0 (0m 22s) 0.8281  / Victory ✓
epoch : 571 iter : 3 (0m 25s) 0.7843  / Victory ✓
epoch : 642 iter : 6 (0m 28s) 0.6321  / Victory ✓
epoch : 714 iter : 2 (0m 31s) 0.7575  / Victory ✗ (WTF)
epoch : 785 iter : 5 (0m 34s) 0.7575  / Victory ✓
epoch : 857 iter : 1 (0m 37s) 0.7156  / WTF ✓
epoch : 928 iter : 4 (0m 40s) 0.6747  / Ok ✓
epoch : 1000 iter : 0 (0m 43s) 0.6794  / Victory ✓
epoch : 1071 iter : 3 (0m 46s) 0.6508  / WTF ✓
epoch : 1142 iter : 6 (0m 49s) 0.6480  / Victory ✓
epoch : 1214 iter : 2 (0m 52s) 0.6237  / Victory ✓
epoch : 1285 iter : 5 (0m 55s) 0.6280  / Ok ✓
epoch : 1357 iter : 1 (0m 58s) 0.6149  / Ok ✓
epoch : 1428 iter : 4 (

In [11]:
# Функция рисования найденных параметров на кадре
def draw(frame, boxes, probs, landmarks):
    try:
        for box, prob, ld in zip(boxes, probs, landmarks):
            # Рисуем обрамляющий прямоугольник лица на кадре
            cv2.rectangle(frame,
                          (int(box[0]), int(box[1])),
                          (int(box[2]), int(box[3])),
                          (255, 0, 255),
                          thickness=3)

            # Рисуем особенные точки
            #cv2.circle(frame, (int(ld[0][0]),int(ld[0][1])), 5, (0, 0, 255), -1)
            #cv2.circle(frame, (int(ld[1][0]),int(ld[1][1])), 5, (0, 0, 255), -1)
            #cv2.circle(frame, (int(ld[2][0]),int(ld[2][1])), 5, (0, 0, 255), -1)
            #cv2.circle(frame, (int(ld[3][0]),int(ld[3][1])), 5, (0, 0, 255), -1)
            #cv2.circle(frame, (int(ld[4][0]),int(ld[4][1])), 5, (0, 0, 255), -1)
    except Exception as e:
        #print('Something wrong im draw function!')
        #print(f'error : {e}')
        pass
    return frame

In [12]:
file_name = "./MAH04286.MP4"
#cap = cv2.VideoCapture(file_name)  

# For webcam input:
drawing_spec = mp_drawing.DrawingSpec(thickness=1, circle_radius=1)
cap = cv2.VideoCapture(0)  


mtcnn = MTCNN()
gesture_text_from_session = []
pre_char=''
current_char=''
Ok_count = 0


with mp_hands.Hands(
    min_detection_confidence=0.75,
    min_tracking_confidence=0.75,
    max_num_hands=1) as hands:
    while cap.isOpened():
        success, image = cap.read()
        if not success:
            print("Ignoring empty camera frame.")
            # If loading a video, use 'break' instead of 'continue'.
            #break
            continue
            
            
        scale_percent = 50    
        width  = int(image.shape[1] * scale_percent / 100)
        height = int(image.shape[0] * scale_percent / 100)

        # dsize
        dsize = (width, height)    

        # resize image
        image = cv2.resize(image, dsize)
        try:        
            boxes, probs, landmarks = mtcnn.detect(image, landmarks=True)                
            image = draw(image, boxes, probs, landmarks)        
        except:
            pass
                
        palm_vector_list = []
        
        # Flip the image horizontally for a later selfie-view display, and convert
        # the BGR image to RGB.
        
        image = cv2.cvtColor(cv2.flip(image, 1), cv2.COLOR_BGR2RGB)
        #image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)        
        # To improve performance, optionally mark the image as not writeable to
        # pass by reference.
        image.flags.writeable = False
        results = hands.process(image)

        # Draw the hand mesh annotations on the image.
        image.flags.writeable = True
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        
        if results.multi_hand_landmarks:
            for hand_landmarks in results.multi_hand_landmarks:
                for n_point in range(len(results.multi_hand_landmarks[0].landmark)):
                        palm_vector_list.append(hand_landmarks.landmark[n_point].x)
                        palm_vector_list.append(hand_landmarks.landmark[n_point].y)                        
                        palm_vector_list.append(hand_landmarks.landmark[n_point].z)                
                mp_drawing.draw_landmarks(image, hand_landmarks, mp_hands.HAND_CONNECTIONS) 

            
            prob=torch.tensor(np.array(palm_vector_list), dtype=torch.float, device=device)
            prob=torch.reshape(prob, (1, X_data.shape[1]))
            
            result = Class_net(prob)
            
            emotion = categoryFromOutput(result)[0]
            
            cv2.putText(image, 
                    emotion, (30, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)

            
            current_char = emotion
            if current_char != pre_char:
                gesture_text_from_session.append(current_char)
                pre_char = current_char
                
                if current_char == 'Ok':
                    Ok_count = Ok_count+1
                    #save_obj(results.multi_hand_landmarks, './video/landmarks_palm__' + str(label) +"___"+ str(count) )   
                    cv2.imwrite('video/palm_' +str(current_char)  +"___" + str(Ok_count) + '.png', image)

                  
            
            cv2.imshow('MediaPipe PalmMesh', image)

        else:
            emotion = 'No hand found'
            cv2.putText(image, 
                    emotion, (30, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, cv2.LINE_AA)

            cv2.imshow('MediaPipe PalmMesh', image)
              
            
        if cv2.waitKey(5) & 0xFF == 27:
                break

cap.release()
cv2.destroyAllWindows()

In [13]:
new_file=open("gesture_text_from_session.txt", mode="w",encoding="utf-8")
for word in gesture_text_from_session:
    new_file.write(word +"\n")    
new_file.close()