# Using Body language to classify diferents actions

## Background

The aim of this project is to develop a system capable of classifying different actions carried out by people, using as input images or videos, which can be captured live via a webcam or provided as pre-recorded files.

The project was made possible according to the [proposed planning](../docs/planning.md)

## Measure of Success

We don't have any criteria for classifying the actions that weight a metric such as precision or recall, so we'll use the F1-Score instead

# 0.0 Imports

In [1]:
import tarfile
import zipfile
import mediapipe as mp
import cv2
import pandas as pd
import numpy as np
from scipy.io import loadmat

## 0.1 Load data

- This data has an approximate quantity of 12 GB, to facilitate this we will provide intermediate tables with the appropriate treatments

In [2]:
image_archive_path = '../data/01_raw/mpii_human_pose_v1.tar.gz'
annotation_archive_path = '../data/01_raw/mpii_human_pose_v1_u12_2.zip'

with tarfile.open(image_archive_path, "r:gz") as tar:
    tar.extractall(path='../data/02_intermediate/')

with zipfile.ZipFile(annotation_archive_path, 'r') as zip_ref:
    zip_ref.extractall('../data/02_intermediate/')

In [42]:
annotation_path = '../data/02_intermediate/mpii_human_pose_v1_u12_2/mpii_human_pose_v1_u12_1.mat'

annotations = loadmat(annotation_path, squeeze_me=True)

# 1.0 Process images to cordinates

- Here I will browse through the annotations of the images to extract the information where for each image what action was taken.

In [68]:
release = annotations['RELEASE']

annolist = release['annolist'].item() if hasattr(release['annolist'], 'item') else release['annolist']
act = release['act'].item() if hasattr(release['act'], 'item') else release['act']

image_action_mapping = []

for idx, ann in enumerate(annolist):
    image_name = ann['image']['name'] if 'name' in ann['image'].dtype.names else "Unknown Image"
    
    if isinstance(act, np.ndarray) and idx < len(act) and 'act_name' in act[idx].dtype.names:
        action_name = act[idx]['act_name']
    else:
        action_name = "Unknown"
    
    image_action_mapping.append((image_name, action_name))

for mapping in image_action_mapping[:5]:
    print(f"Image: {mapping[0]}, Action: {mapping[1]}")

Image: 037454012.jpg, Action: []
Image: 095071431.jpg, Action: []
Image: 073199394.jpg, Action: []
Image: 059865848.jpg, Action: []
Image: 015601864.jpg, Action: curling


- Now I'm going to select images with only one action and choose 20 actions

In [78]:
df = pd.DataFrame(image_action_mapping, columns=['ImageName', 'Action'])

df_filtered = df[df['Action'] != "Unknown"]
df_cleaned = df_filtered[~df_filtered['Action'].str.contains(",| or ", regex=True)]

top_20_actions = df_cleaned['Action'].value_counts().head(20).index.tolist()

df_filtered = df_cleaned[df_cleaned['Action'].isin(top_20_actions)]

filtered_csv_path = '../data/03_primary/filtered_actions_top_20.csv'
df_filtered.to_csv(filtered_csv_path, index=False)

In [None]:
print(f"As 20 ações mais comuns são: {top_20_actions}")
print(df_filtered.head())

## 1.1 Extract landmarks

- Now for each image I'm going to use mediapipe to extract the poses and faces of the people.

In [107]:
mp_holistic = mp.solutions.holistic
mp_drawing = mp.solutions.drawing_utils

def extract_landmarks(image_path):
    with mp_holistic.Holistic(static_image_mode=True) as holistic:
        image = cv2.imread(image_path)
        results = holistic.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        return results

columns = ['ImageName', 'Action']

max_pose_landmarks = 33
max_face_landmarks = 468

for i in range(max_pose_landmarks):
    columns.extend([f'pose_{i}_x', f'pose_{i}_y', f'pose_{i}_z', f'pose_{i}_visibility'])

for i in range(max_face_landmarks):
    columns.extend([f'face_{i}_x', f'face_{i}_y', f'face_{i}_z'])

df_landmarks = pd.DataFrame(columns=columns)

for _, row in df_filtered.iterrows():
    image_path = f"../data/02_intermediate/images/{row['ImageName']}"
    results = extract_landmarks(image_path)
    data = [row['ImageName'], row['Action']]

    if results.pose_landmarks:
        pose = results.pose_landmarks.landmark
        pose_row = list(np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in pose]).flatten())
    else:
        pose_row = [np.nan] * (max_pose_landmarks * 4)

    if results.face_landmarks:
        face = results.face_landmarks.landmark
        face_row = list(np.array([[landmark.x, landmark.y, landmark.z] for landmark in face]).flatten())
    else:
        face_row = [np.nan] * (max_face_landmarks * 3)

    row_data = data + pose_row + face_row
    df_landmarks = pd.concat([df_landmarks, pd.DataFrame([row_data], columns=columns)], ignore_index=True)

df_landmarks.to_csv('../data/04_feature/final_landmarks_with_actions.csv', index=False)

  df_landmarks = pd.concat([df_landmarks, pd.DataFrame([row_data], columns=columns)], ignore_index=True)


In [4]:
df_landmarks

Unnamed: 0,ImageName,Action,pose_0_x,pose_0_y,pose_0_z,pose_0_visibility,pose_1_x,pose_1_y,pose_1_z,pose_1_visibility,...,face_464_z,face_465_x,face_465_y,face_465_z,face_466_x,face_466_y,face_466_z,face_467_x,face_467_y,face_467_z
0,084922341.jpg,ballroom,0.477999,0.243427,-0.104067,0.997500,0.480529,0.218856,-0.105465,0.996411,...,,,,,,,,,,
1,065761289.jpg,ballroom,0.623431,0.388648,-0.107929,0.999994,0.626444,0.378512,-0.097112,0.999991,...,0.001516,0.624713,0.375394,0.000605,0.630487,0.375629,0.004073,0.631177,0.374741,0.004253
2,056830860.jpg,ballroom,0.420640,0.365611,0.003073,0.997565,0.425081,0.349444,0.013141,0.997440,...,,,,,,,,,,
3,009367477.jpg,ballroom,,,,,,,,,...,,,,,,,,,,
4,036771580.jpg,ballroom,0.264926,0.227335,-0.112517,0.998056,0.268532,0.214265,-0.109486,0.996739,...,-0.004300,0.264555,0.216310,-0.004516,0.273764,0.212818,-0.008668,0.274791,0.210975,-0.009200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2596,025820296.jpg,resistance training,0.201862,0.143329,-0.007814,0.999397,0.194155,0.126794,-0.003160,0.999372,...,,,,,,,,,,
2597,097064125.jpg,resistance training,0.412948,0.522286,-0.326439,0.999448,0.413722,0.506519,-0.323732,0.999839,...,,,,,,,,,,
2598,059740789.jpg,resistance training,0.430848,0.540816,-0.193367,0.991907,0.432859,0.526751,-0.190976,0.971611,...,,,,,,,,,,
2599,000708647.jpg,resistance training,0.459628,0.382458,-0.084313,0.997350,0.459678,0.360366,-0.095997,0.997010,...,-0.003691,0.480774,0.390505,-0.003916,0.491623,0.389910,-0.009860,0.493047,0.388514,-0.010559
