In [1]:
import datetime
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.decomposition import PCA

# ===== Data ===== #
DFS = {
    'lag_cp' : pd.read_csv('data/bem_comportadas_laguardia.csv'),
    'lag_li' : pd.read_csv('data/livres_laguardia.csv'),
    'van_cp' : pd.read_csv('data/bem_comportadas_vanessa.csv'),
    'van_li' : pd.read_csv('data/livres_vanessa.csv'),
    'bru_cp' : pd.read_csv('data/bem_comportadas_bruno.csv'),
    'bru_li' : pd.read_csv('data/livres_bruno.csv'),
    'alm_cp' : pd.read_csv('data/bem_comportadas_almir.csv'),
}

---
## Identificando o momento de cada pose
---

In [None]:
for key, df in DFS.items():
    # add a fake column for timestamp
    df['timestamp'] = pd.date_range(start='1/1/2023', periods=df.shape[0], freq='500L')

In [13]:
# Transform the data into a df of poses and durations
DFS_POSES_DURATIONS = {}

for key, df in DFS.items():
    this_df_dict = {}
    this_pose = -1
    pose_count = 0
    # iterate over the rows of the dataframe
    for index, row in df.iterrows():
        # get the pose
        pose = row['pose']
        # if the pose is different from the previous one
        if pose != this_pose:
            # increase the pose count
            pose_count += 1
            # update the pose
            this_pose = pose
            # add the pose to the dict
            this_df_dict[f'p:{pose},c:{pose_count}'] = {
                'pose' : pose,
                'start' : row['timestamp'],
                'end' : row['timestamp'],
            }
        else:
            # update the end of the pose
            this_df_dict[f'p:{pose},c:{pose_count}']['end'] = row['timestamp']
    # create a dataframe from the dict
    dfs = pd.DataFrame.from_dict(this_df_dict, orient='index')
    # save the dataframe
    DFS_POSES_DURATIONS[key] = dfs

In [100]:
DFS_POSES_DURATIONS['lag_li']

Unnamed: 0,pose,start,end
"p:1,c:1",1,2023-01-01 00:00:00.000,2023-01-01 00:10:23.500
"p:4,c:2",4,2023-01-01 00:10:24.000,2023-01-01 00:14:06.000
"p:7,c:3",7,2023-01-01 00:14:06.500,2023-01-01 00:17:07.000
"p:1,c:4",1,2023-01-01 00:17:07.500,2023-01-01 00:18:16.000
"p:12,c:5",12,2023-01-01 00:18:16.500,2023-01-01 00:19:52.000
"p:1,c:6",1,2023-01-01 00:19:52.500,2023-01-01 00:26:32.500
"p:3,c:7",3,2023-01-01 00:26:33.000,2023-01-01 00:30:20.500
"p:5,c:8",5,2023-01-01 00:30:21.000,2023-01-01 00:31:02.500
"p:4,c:9",4,2023-01-01 00:31:03.000,2023-01-01 00:31:27.000


In [99]:
px.timeline(DFS_POSES_DURATIONS['lag_li'],
            x_start='start',
            x_end='end',
            y='pose',
            color='pose')

---
## Comparando dois momentos diferentes da mesma pose
---

In [21]:
DFS['van_li']

Unnamed: 0,p00,p01,p02,p03,p04,p05,p06,p07,p08,p09,p10,p11,pose,timestamp
0,0,0,0,0,953,1203,228,4018,778,484,237,904,3,2023-01-01 00:00:00.000
1,0,0,0,0,989,1227,243,4019,875,538,310,947,3,2023-01-01 00:00:00.500
2,0,0,0,0,986,1210,242,4028,863,532,303,940,3,2023-01-01 00:00:01.000
3,0,0,0,0,989,1232,245,4019,889,556,302,955,3,2023-01-01 00:00:01.500
4,0,0,0,0,1002,1233,246,4029,891,550,336,960,3,2023-01-01 00:00:02.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3756,3089,49,0,0,701,697,482,4009,487,387,652,1009,7,2023-01-01 00:31:18.000
3757,2999,267,0,0,623,584,486,4018,515,366,658,995,7,2023-01-01 00:31:18.500
3758,3067,0,0,0,594,558,477,4007,461,388,634,1009,7,2023-01-01 00:31:19.000
3759,3045,0,0,0,583,578,455,4016,475,365,622,1022,7,2023-01-01 00:31:19.500


In [103]:
# primeira ocorrencia de pose 3
pose3c1 = DFS['lag_li'][
    (DFS['lag_li']['timestamp'] > DFS_POSES_DURATIONS['lag_li']['start']['p:1,c:1']) &
    (DFS['lag_li']['timestamp'] < DFS_POSES_DURATIONS['lag_li']['end']['p:1,c:1'])
]
pose3c1 = pose3c1.copy()

# segunda ocorrencia de pose 3
pose3c2 = DFS['lag_li'][
    (DFS['lag_li']['timestamp'] > DFS_POSES_DURATIONS['lag_li']['start']['p:1,c:4']) &
    (DFS['lag_li']['timestamp'] < DFS_POSES_DURATIONS['lag_li']['end']['p:1,c:4'])
]
pose3c2 = pose3c2.copy()

# terceira ocorrencia de pose 3
pose3c3 = DFS['lag_li'][
    (DFS['lag_li']['timestamp'] > DFS_POSES_DURATIONS['lag_li']['start']['p:1,c:6']) &
    (DFS['lag_li']['timestamp'] < DFS_POSES_DURATIONS['lag_li']['end']['p:1,c:6'])
]
pose3c3 = pose3c3.copy()

In [104]:
# junta as duas ocorrencias e cria uma flag para cada uma
pose3c1['flag'] = 1
pose3c2['flag'] = 2
pose3c3['flag'] = 3
df_pose3 = pd.concat([pose3c1, pose3c2, pose3c3])

df_pose3

Unnamed: 0,p00,p01,p02,p03,p04,p05,p06,p07,p08,p09,p10,p11,pose,timestamp,flag
1,0,0,0,0,1512,1814,291,3950,737,347,78,0,1,2023-01-01 00:00:00.500,1
2,0,0,0,0,1553,1791,297,3935,760,351,122,0,1,2023-01-01 00:00:01.000,1
3,0,0,0,0,1563,1792,298,3935,775,355,130,0,1,2023-01-01 00:00:01.500,1
4,0,0,0,0,1542,1812,299,3950,785,368,94,0,1,2023-01-01 00:00:02.000,1
5,0,0,0,0,1562,1800,299,3941,796,351,86,0,1,2023-01-01 00:00:02.500,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3180,1863,1679,0,2277,1324,1319,189,3956,862,645,925,995,1,2023-01-01 00:26:30.000,3
3181,2176,1555,0,2427,1000,1445,163,3940,1008,649,1255,1180,1,2023-01-01 00:26:30.500,3
3182,2118,1783,0,2355,1121,1428,187,3946,961,636,977,1185,1,2023-01-01 00:26:31.000,3
3183,0,0,0,0,2106,1857,172,3958,986,646,1162,1114,1,2023-01-01 00:26:31.500,3


In [105]:
df_pose3['flag'].value_counts()

1    1246
3     799
2     136
Name: flag, dtype: int64

In [106]:
# separa o dataframa em features pro PCA e target
X = df_pose3[df.columns[:-2]]
X

Unnamed: 0,p00,p01,p02,p03,p04,p05,p06,p07,p08,p09,p10,p11
1,0,0,0,0,1512,1814,291,3950,737,347,78,0
2,0,0,0,0,1553,1791,297,3935,760,351,122,0
3,0,0,0,0,1563,1792,298,3935,775,355,130,0
4,0,0,0,0,1542,1812,299,3950,785,368,94,0
5,0,0,0,0,1562,1800,299,3941,796,351,86,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3180,1863,1679,0,2277,1324,1319,189,3956,862,645,925,995
3181,2176,1555,0,2427,1000,1445,163,3940,1008,649,1255,1180
3182,2118,1783,0,2355,1121,1428,187,3946,961,636,977,1185
3183,0,0,0,0,2106,1857,172,3958,986,646,1162,1114


In [108]:
# plot boxplots para cada coluna de pressão, com cores diferentes para cada flag
press_cols = ['p00', 'p01', 'p02', 'p03', 'p04', 'p05', 'p06', 'p07', 'p08','p09', 'p10', 'p11']

fig = make_subplots(rows=3, cols=4, subplot_titles=press_cols)

for i, col in enumerate(press_cols):
    fig.add_trace(go.Box(y=df_pose3[col], x=df_pose3['flag'], name=col), row=int(i/4)+1, col=i%4+1)

fig.update_layout(height=600, width=800, 
                  title_text="Boxplots de pressão para cada momento da pose 1 do Laguardia")
fig.show()

In [74]:
# aplica o PCA
pca = PCA(n_components=2)
pca.fit(X)
X_pca = pca.transform(X)
X_pca

array([[  89.78308604,  729.6856085 ],
       [  13.81627583,  592.28584159],
       [  95.62828097,  614.32088089],
       ...,
       [-332.54946947,  -71.89107527],
       [-238.02343852,   70.18253184],
       [1836.21136953,  644.41986535]])

In [77]:
# adiciona as componentes principais ao dataframe
df_pose3['pc1'] = X_pca[:,0]
df_pose3['pc2'] = X_pca[:,1]

# plota as componentes principais
fig = px.scatter(df_pose3, x='pc1', y='pc2', color='flag',
                 title='PCA das poses 7 de Bruno')
fig.show()