## MediaPipe

- holistic 객체의 처리 결과(results)에는 다음 랜드마크 속성이 들어 있다.
- face_landmarks = 얼굴 랜드마크
- left_hand_landmarks = 왼쪽 손 랜드마크
- right_hand_landmarks = 오른쪽 손 랜드마크
- pose_landmarks = pose 랜드마크
- [mediapipe-documentation](https://google.github.io/mediapipe/getting_started/python.html)



## 1. Install dependencies

In [1]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp
# train, test 데이터 분할
from sklearn.model_selection import train_test_split

'''
클래스 벡터(정수)를 이진 클래스 행렬로 변환한다.
ex) 클래스 (0, 1, 2)인 label 데이터가 있다고 가정.
'0' 클래스인 경우 => [1, 0 ,0]
'1' 클래스인 경우 => [0, 1, 0]
'2' 클래스인 경우 => [0, 0, 1]
'''
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard # 로깅

## 2. Keypoints using MP Holistic

In [2]:
# Short aliases for the MediaPipe solution modules used by the helper functions in this file.
mp_holistic = mp.solutions.holistic # Holistic solution module (Holistic class and connection constants)
mp_drawing = mp.solutions.drawing_utils # Drawing helpers (draw_landmarks, DrawingSpec)

# Run a MediaPipe model on a single OpenCV frame.
def mediapipe_detection(image, model):
    """Run ``model.process`` on ``image`` and return the frame with the results.

    Args:
        image: Frame to analyse; it is converted with COLOR_BGR2RGB before
            being handed to the model.
        model: Object exposing ``process(image)`` (callers in this file pass
            a ``mp_holistic.Holistic`` instance).

    Returns:
        A ``(image, results)`` tuple: the frame converted back with
        COLOR_RGB2BGR, and whatever ``model.process`` returned.
    """
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # The frame is flagged read-only only for the duration of the model call.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    return cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR), results


# Draw the detected landmarks with MediaPipe's default drawing style.
def draw_landmarks(image, results):
    """Draw face, pose and both hand landmark sets onto ``image`` in place.

    Args:
        image: Frame that ``mp_drawing.draw_landmarks`` draws on.
        results: Holistic result object providing ``face_landmarks``,
            ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    # (landmark list, connection set) pairs, in drawing order.
    overlays = (
        (results.face_landmarks, mp_holistic.FACEMESH_TESSELATION),
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS),
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS),
    )
    for landmark_list, connections in overlays:
        mp_drawing.draw_landmarks(image, landmark_list, connections)


# Draw the detected landmarks with custom colours, line thickness and point radius.
def draw_styled_landmarks(image, results):
    """Draw face, pose and hand landmarks onto ``image`` in place with custom styles.

    Each body part gets two ``DrawingSpec`` objects: the first styles the
    landmark points, the second styles the connections between them.

    Args:
        image: Frame that ``mp_drawing.draw_landmarks`` draws on.
        results: Holistic result object providing ``face_landmarks``,
            ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    spec = mp_drawing.DrawingSpec

    # (landmark list, connection set, landmark style, connection style), in drawing order.
    styled_overlays = (
        # Face. The connection colour used to be (80, 256, 121); 256 is outside
        # the 0-255 range of an 8-bit channel, so it is now 255.
        (results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
         spec(color=(80, 110, 10), thickness=1, circle_radius=1),
         spec(color=(80, 255, 121), thickness=1, circle_radius=1)),
        # Pose
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         spec(color=(80, 22, 10), thickness=2, circle_radius=4),
         spec(color=(80, 44, 121), thickness=2, circle_radius=2)),
        # Left hand
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(121, 22, 76), thickness=2, circle_radius=4),
         spec(color=(121, 44, 250), thickness=2, circle_radius=2)),
        # Right hand
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(245, 117, 66), thickness=2, circle_radius=4),
         spec(color=(245, 66, 230), thickness=2, circle_radius=2)),
    )

    for landmark_list, connections, landmark_style, connection_style in styled_overlays:
        mp_drawing.draw_landmarks(image, landmark_list, connections,
                                  landmark_style, connection_style)


# Turn one frame's Holistic results into a flat feature vector.
def extract_keypoints(results):
    """Flatten the pose, face and hand landmarks of one frame into a 1-D array.

    The parts are concatenated in the order pose, face, left hand, right hand.
    A part whose landmark list is falsy (e.g. ``None``) is replaced by zeros of
    size 33*4 (pose), 468*3 (face) or 21*3 (each hand).

    Args:
        results: Object providing ``pose_landmarks``, ``face_landmarks``,
            ``left_hand_landmarks`` and ``right_hand_landmarks``; each is
            either falsy or has a ``landmark`` iterable of points.

    Returns:
        1-D NumPy array with all parts concatenated.
    """
    def _flatten(landmark_list, fields, missing_size):
        # Missing body part -> zero vector of the given size.
        if not landmark_list:
            return np.zeros(missing_size)
        rows = [[getattr(point, field) for field in fields]
                for point in landmark_list.landmark]
        return np.array(rows).flatten()

    xyz = ("x", "y", "z")

    pose = _flatten(results.pose_landmarks, xyz + ("visibility",), 33 * 4)  # x, y, z, visibility
    face = _flatten(results.face_landmarks, xyz, 468 * 3)                   # x, y, z
    lh = _flatten(results.left_hand_landmarks, xyz, 21 * 3)                 # x, y, z
    rh = _flatten(results.right_hand_landmarks, xyz, 21 * 3)                # x, y, z

    return np.concatenate([pose, face, lh, rh])

## 3. Setup Folders for Collection

In [3]:
# os.listdir() returns entries in arbitrary order; sorting keeps the
# action -> class-index mapping identical across runs and machines.
actions = sorted(os.listdir("train_data"))

In [4]:
actions

['back', 'belly', 'bleeding', 'burn', 'chest']

In [5]:
# Root folder where the extracted keypoint arrays (.npy) are stored
DATA_PATH = os.path.join("Multi_Pose_Data")

# Action labels to recognise, converted to a NumPy array
# (the names come from the train_data directory listing made in an earlier cell)
actions = np.array(actions)

# Number of videos (sequences) processed per action
no_sequences = 240

# Number of frames per sequence that are loaded when building the training windows
sequence_length = 30

In [6]:
# Create DATA_PATH/<action>/<sequence>/ for every action and sequence index.
for action in actions:
    for sequence in range(no_sequences):
        # exist_ok=True tolerates folders left over from a previous run while
        # still surfacing real failures (permissions, invalid path, ...),
        # which the former bare `except: pass` silently hid.
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)


## 4. Collect Keypoint Sequences

In [7]:
# Folder holding the raw training videos; every entry inside it is treated as one word.
TRAIN_DATA_PATH = os.path.join("train_data")
word_folder = os.listdir(TRAIN_DATA_PATH)

In [8]:
# Maps each word to the list of video paths found for it (filled in by a later cell).
video_file = dict()

In [10]:
# Collect the video paths of every word: video_file[word] -> [path, path, ...]
for word in word_folder:
    word_dir = os.path.join(TRAIN_DATA_PATH, word)
    video_file[word] = [os.path.join(word_dir, name) for name in os.listdir(word_dir)]

In [11]:
video_file

{'back': ['train_data\\back\\0.avi',
  'train_data\\back\\1.avi',
  'train_data\\back\\10.avi',
  'train_data\\back\\100.avi',
  'train_data\\back\\101.avi',
  'train_data\\back\\102.avi',
  'train_data\\back\\103.avi',
  'train_data\\back\\104.avi',
  'train_data\\back\\105.avi',
  'train_data\\back\\106.avi',
  'train_data\\back\\107.avi',
  'train_data\\back\\108.avi',
  'train_data\\back\\109.avi',
  'train_data\\back\\11.avi',
  'train_data\\back\\110.avi',
  'train_data\\back\\111.avi',
  'train_data\\back\\112.avi',
  'train_data\\back\\113.avi',
  'train_data\\back\\114.avi',
  'train_data\\back\\115.avi',
  'train_data\\back\\116.avi',
  'train_data\\back\\117.avi',
  'train_data\\back\\118.avi',
  'train_data\\back\\119.avi',
  'train_data\\back\\12.avi',
  'train_data\\back\\120.avi',
  'train_data\\back\\121.avi',
  'train_data\\back\\122.avi',
  'train_data\\back\\123.avi',
  'train_data\\back\\124.avi',
  'train_data\\back\\125.avi',
  'train_data\\back\\126.avi',
  'trai

In [12]:
# The collection loop pads every stored sequence with zero frames up to this frame count.
video_max = 240

In [None]:
# All-zero keypoint vector used as padding.
# 1662 = 33*4 + 468*3 + 21*3 + 21*3, the vector length produced by extract_keypoints.
fill_zero = np.zeros(1662)
fill_zero.shape

In [None]:
# Extract keypoints from every training video and store each frame as its own
# NumPy file: DATA_PATH/<action>/<sequence>/<frame index>.npy
# Sequences with fewer than video_max frames are padded with fill_zero.

for action in actions:

    for sequence in range(no_sequences):  # one iteration per video
        cap = cv2.VideoCapture(video_file[action][sequence])
        frame_cnt = 0

        try:
            # A new Holistic instance is created for each video.
            with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:

                print(f'sequence : {sequence}')

                while cap.isOpened():

                    ret, frame = cap.read()

                    # Stop at the end of the video (or on a read failure).
                    if not ret:
                        break

                    # Make detections
                    image, results = mediapipe_detection(frame, holistic)

                    # Draw landmarks (only visible if the imshow line below is enabled)
                    draw_styled_landmarks(image, results)

                    # Save this frame's keypoint vector; np.save appends ".npy".
                    keypoints = extract_keypoints(results)
                    npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_cnt))

                    print(f'npy_path : {npy_path}')

                    np.save(npy_path, keypoints)

                    # The former `print(f"saved {saved_frames}")` referenced an
                    # undefined name and raised NameError on the first frame;
                    # the counter printed below already reports progress.
                    frame_cnt += 1
                    print(f'frame_cnt : {frame_cnt}')
                    # Show on screen.
                    # cv2.imshow('OpenCV Feed', image)

                # Pad short videos with zero frames up to video_max frames.
                while frame_cnt < video_max:
                    npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_cnt))
                    print(f'npy_path : {npy_path}')

                    np.save(npy_path, fill_zero)
                    frame_cnt += 1

        finally:
            # Release the capture even if detection or saving fails.
            cap.release()
            cv2.destroyAllWindows()

## 6. Preprocess Data and Create Labels and Features

In [None]:
# train, test 데이터 분할
from sklearn.model_selection import train_test_split

'''
클래스 벡터(정수)를 이진 클래스 행렬로 변환한다.
ex) 클래스 (0, 1, 2)인 label 데이터가 있다고 가정.
'0' 클래스인 경우 => [1, 0 ,0]
'1' 클래스인 경우 => [0, 1, 0]
'2' 클래스인 경우 => [0, 0, 1]
'''
from tensorflow.keras.utils import to_categorical

In [None]:
# Map every action name to an integer class index (its position in `actions`).
label_map = {label:num for num, label in enumerate(actions)}

In [None]:
# The Korean note below says: each word has 30 frames, and each frame is
# described by a 1662-value keypoint vector.
'''
단어마다 30개의 프레임이 존재.
단어 * 30 개의 넘파이 배열에서
keypoints를 나타내는 총 1662개의 특정 값들이 필요함.
'''
# Display the mapping.
label_map

In [None]:
# Build the training set: one window (list of per-frame keypoint arrays) and
# one integer label for every stored sequence.
sequences = []
labels = []

for action in actions:
    for sequence in range(no_sequences):
        sequence_dir = os.path.join(DATA_PATH, action, str(sequence))

        # Load frames 0 .. sequence_length-1 of this sequence, in order,
        # e.g. <DATA_PATH>/<action>/0/0.npy, 1.npy, ...
        window = [
            np.load(os.path.join(sequence_dir, f"{frame_num}.npy"))
            for frame_num in range(sequence_length)
        ]

        # sequences[i] and labels[i] describe the same video; the label is
        # the class index of the action, taken from label_map.
        sequences.append(window)
        labels.append(label_map[action])

In [None]:
# Sanity check of the loaded data: `sequences` holds one window per stored
# video and `labels` holds the matching class index for each window.
sequences_shape = np.array(sequences).shape
labels_shape = np.array(labels).shape

print(f'sequences의 shape : {sequences_shape}')
print(f'sequences의 길이 : {len(sequences)}')
print(f'labels의 shape : {labels_shape}')
print(f'labels의 길이 : {len(labels)}')

In [None]:
# Stack all windows into one feature array.
# Expected shape: (len(actions) * no_sequences, sequence_length, 1662) — TODO confirm against the output below.
X = np.array(sequences)

In [None]:
X.shape

In [None]:
# One-hot encode the integer class labels and cast the result to int.
y = to_categorical(labels).astype(int)

In [None]:
y

In [None]:
# Hold out 20% of the sequences for testing.
# random_state makes the split reproducible across runs, and stratify keeps the
# class proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Show the shapes of the train/test features and labels produced by the split.
print(f'X_train : {X_train.shape}')
print(f'train_label : {y_train.shape}')
print(f'X_test : {X_test.shape}')
print(f'test_label : {y_test.shape}')

## 7. Build and Train LSTM Neural Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import TensorBoard # 로깅

In [None]:
# TensorBoard callback that writes its logs under the "Logs" folder.
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

In [None]:
# LSTM classifier: three stacked LSTM layers followed by dense layers.
# The final softmax layer has one unit per action, so a prediction `res` is a
# probability vector and actions[np.argmax(res)] is the predicted action name.
model = Sequential([
    LSTM(64, return_sequences=True, activation='relu', input_shape=(30,1662)),  # (frames, keypoints)
    LSTM(128, return_sequences=True, activation='relu'),
    LSTM(64, return_sequences=False, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(actions.shape[0], activation='softmax'),
])

In [None]:
X.shape

In [None]:
# Toy probability vector used to illustrate how a prediction is decoded.
res = [.2, 0.7, 0.1]

In [None]:
# Index of the highest probability.
np.argmax(res)

In [None]:
# Action name stored at that index.
actions[np.argmax(res)]

In [None]:
# categorical_crossentropy is the usual loss for multi-class classification
# and categorical_accuracy the usual accuracy metric for it.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
r'''
    Logs\train 폴더에 들어간 뒤
    tensorboard --logdir=. 을 입력하면
    TensorBoard 화면이 나온다.
    ex)
    1step. cd suhwa_dataset\Logs\train
    2step. tensorboard --logdir=.
'''

In [None]:
# Print the devices reported by TensorFlow's device_lib.
import tensorflow as tf
# NOTE(review): device_lib is imported from tensorflow.python, which looks like an
# internal module path — confirm it exists in the installed TensorFlow version.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
# Train for 100 epochs on the training split, logging through the TensorBoard callback.
model.fit(X_train, y_train, epochs=100, callbacks=[tb_callback])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

## 8. Make Predictions

In [None]:
# Predict class probabilities for every test sequence.
res = model.predict(X_test)

In [None]:
# Predicted action of the first test sequence.
actions[np.argmax(res[0])]

In [None]:
# True action of the first test sequence, for comparison.
actions[np.argmax(y_test[0])]

## 9. Save Weights

In [None]:
# Save the trained model to an HDF5 file.
model.save('multi_pose.h5')

In [None]:
# Remove the in-memory model object; the next cell is meant to bring it back from the saved file.
del model

In [None]:
# Restore the model from the file written by model.save('multi_pose.h5').
# `model` was deleted in the previous cell, so model.load_weights() raised a
# NameError here. The file written by model.save() holds the complete model,
# so load_model() rebuilds it directly without redefining and recompiling the
# network first.
from tensorflow.keras.models import load_model

model = load_model('multi_pose.h5')

## 10. Evaluation using Confusion Matrix and Accuracy

In [None]:
'''
    Confusion Matrix를 통한 정확도 검증
'''
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score

In [None]:
yhat = model.predict(X_train)

In [None]:
ytrue = np.argmax(y_train, axis=1).tolist()
yhat = np.argmax(yhat, axis=1).tolist()

In [None]:
# Confusion matrices computed from the true and predicted class indices.
multilabel_confusion_matrix(ytrue,yhat)

In [None]:
# Fraction of predictions that match the true class.
accuracy_score(ytrue, yhat)

## 11. Test in Real Time

In [None]:
# Bar colours for prob_viz; the list is cycled when there are more classes than colours.
colors = [(245,117,16), (117,245,16), (16,117,245)]
def prob_viz(res, actions, input_frame, colors):
    """Draw one horizontal probability bar per class on a copy of ``input_frame``.

    Args:
        res: Iterable of per-class probabilities; ``res[i]`` belongs to
            ``actions[i]``.
        actions: Sequence of class names, indexed like ``res``.
        input_frame: Image to annotate; it is copied, not modified.
        colors: Non-empty sequence of colour tuples. It is reused cyclically,
            so it may be shorter than ``res`` (previously a shorter list
            raised IndexError).

    Returns:
        The annotated copy of ``input_frame``.
    """
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        top = 60 + num * 40                      # each class gets a 40-pixel row
        bar_color = colors[num % len(colors)]    # cycle colours instead of indexing past the end
        cv2.rectangle(output_frame, (0, top), (int(prob * 100), top + 30), bar_color, -1)
        cv2.putText(output_frame, actions[num], (0, top + 25), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)

    return output_frame

In [None]:
# Draw the probability bars on a frame and display the result with matplotlib.
# NOTE(review): prob_viz needs `res` to be a single 1-D probability vector and `image`
# to be a frame left over from an earlier cell — confirm both before running this cell.
plt.figure(figsize=(18,18))
plt.imshow(prob_viz(res, actions, image, colors))

In [None]:
# Sample video paths.
# NOTE(review): snow_format and file_format are not referenced anywhere else in the visible part of this file.
file_num = '0177'
snow_format = f'data/snow/KETI_SL_000000{file_num}.avi'

In [None]:
file_num = "0150"
file_format = f'train_data/ear/1.avi'

In [None]:
# All-zero keypoint vector (length 1662) used to fill frames that a video does not provide.
fill_zero = np.zeros(1662)

In [None]:
# Run the trained model on the first 20 "back" training videos and print the
# predicted action for each of them.
#
# The window fed to the model has exactly sequence_length frames, matching the
# number of frames loaded per sequence when the training windows were built:
# the first sequence_length frames of the video are used and shorter videos
# are padded with zero vectors. Previously a 240-frame window was built, which
# does not match the (30, 1662) input shape the model is declared with,
# overflowed on videos longer than 240 frames, and predicted only when the
# reported frame count was exactly reached.
threshold = 0.3   # confidence cut-off, read by a later cell
sentence = []

for i in range(20):
    cap = cv2.VideoCapture(f'train_data/back/{i}.avi')
    sequence = []

    try:
        # Set mediapipe model
        with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
            while cap.isOpened() and len(sequence) < sequence_length:

                # Read feed
                ret, frame = cap.read()

                if not ret:
                    break

                # Make detections
                image, results = mediapipe_detection(frame, holistic)

                # Draw landmarks (`image` is kept for the visualisation cell)
                draw_styled_landmarks(image, results)

                # Collect this frame's keypoint vector
                sequence.append(extract_keypoints(results))
    finally:
        # Release the capture even if detection fails.
        cap.release()
        cv2.destroyAllWindows()

    if not sequence:
        print(f'no frames read from video {i}, skipping')
        continue

    # Zero-pad short videos up to sequence_length frames.
    sequence.extend([fill_zero] * (sequence_length - len(sequence)))

    # Add the batch dimension: (sequence_length, 1662) => (1, sequence_length, 1662)
    res = model.predict(np.expand_dims(sequence, axis=0))[0]
    print(f'predict result : {actions[np.argmax(res)]}')

In [None]:
# Print the action with the highest probability in `res`.
print(f'predict result : {actions[np.argmax(res)]}')

In [None]:
# Check whether the highest probability in `res` exceeds the confidence threshold.
res[np.argmax(res)] > threshold

In [None]:
# 모델의 예측 규격에 맞게 데이터를 캡슐화
np.expand_dims(X_test[0].shape, axis=0)

In [None]:
model.predict(np.expand_dims(X_test[0], axis=0))