In [1]:
import numpy as np
import pandas as pd
import os

# Load each .npy file and convert the list of dictionaries to a DataFrame
dataframes = []
for file in os.listdir('dataset'):
    if file.endswith('.npy'):
        data = np.load('dataset/' + file, allow_pickle=True)
        dataframes.append(pd.DataFrame.from_records(data))

# Concatenate all DataFrames into a single DataFrame
merged_df = pd.concat(dataframes, ignore_index=True)
merged_df

Unnamed: 0,Word,Frame,X00,Y00,Z00,X01,Y01,Z01,X02,Y02,...,Z62,X63,Y63,Z63,X64,Y64,Z64,X65,Y65,Z65
0,Boy,1,-0.290488,-75.818914,-3.229877,-0.166410,-77.328566,-3.061451,-0.098528,-82.594196,...,-0.349284,0.610715,0.973082,-0.303378,0.405078,0.922159,-0.281479,0.559808,0.851646,-0.028510
1,Boy,2,-2.630732,163.577346,-2.202535,-2.630732,163.577346,-2.202535,-2.630732,163.577346,...,-0.346683,0.613981,0.946346,-0.395646,0.413621,0.845886,-0.267950,0.558445,0.851427,-0.036818
2,Boy,3,-2.649066,62.903229,-2.865200,-2.649066,62.903229,-2.865200,-2.649066,62.903229,...,-0.820756,0.612683,0.950004,-0.379825,0.448817,0.581546,-0.748774,0.557367,0.867107,-0.040340
3,Boy,4,-2.641445,73.387394,-6.816939,-2.641445,73.387394,-6.816939,-2.641445,73.387394,...,-0.892051,0.607402,0.961182,-0.300694,0.439404,0.449596,-0.823084,0.556838,0.878659,-0.022312
4,Boy,5,0.075973,-2.649841,-1.005892,0.190630,-5.362860,-1.022167,0.243538,-8.214614,...,-0.829807,0.604378,0.950633,-0.247957,0.445261,0.344964,-0.765876,0.552638,0.877294,-0.045415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14135,You,16,-0.255172,-14.686667,2.176972,-0.137746,-15.407824,3.206680,-0.088495,-16.454638,...,-0.189617,0.678464,0.430918,-0.998512,0.307135,0.832112,-0.110380,0.559508,0.746648,0.006331
14136,You,17,-0.247349,-18.125843,0.387121,-0.123823,-19.070908,-0.996675,-0.073921,-20.329048,...,-0.235708,0.673772,0.673639,-0.173289,0.301638,0.830727,-0.135007,0.558916,0.736754,0.016739
14137,You,18,-0.252560,-28.690787,5.913587,-0.133411,-30.150542,5.626362,-0.083474,-32.168021,...,-0.643596,0.653561,0.818583,-0.402061,0.293410,0.830436,-0.519518,0.550947,0.735413,0.027168
14138,You,19,-0.263622,-44.649560,6.507309,-0.145083,-46.797549,6.096631,-0.096428,-49.825290,...,-0.573453,0.627514,0.836671,-0.288298,0.281208,0.831862,-0.455801,0.543025,0.740520,0.026633


In [2]:
max_sequence_length = merged_df['Frame'].max()
max_sequence_length

20

In [3]:
def pad_sequences(group):
    # Calculate the number of padding rows needed
    padding_rows = max_sequence_length - len(group)
    
    # Create a DataFrame with padding rows filled with NaN (or any other padding value)
    padding_df = pd.DataFrame({
        'Word': [group['Word'].iloc[0]] * padding_rows,
        'Frame': np.arange(len(group) + 1, max_sequence_length + 1),
    })
    
    # Concatenate the original group with the padding DataFrame
    return pd.concat([group, padding_df], ignore_index=True)

# Group the DataFrame by 'Word' and apply the padding function
padded_df = merged_df.groupby('Word').apply(pad_sequences).reset_index(drop=True)
padded_df

Unnamed: 0,Word,Frame,X00,Y00,Z00,X01,Y01,Z01,X02,Y02,...,Z62,X63,Y63,Z63,X64,Y64,Z64,X65,Y65,Z65
0,Boy,1,-0.290488,-75.818914,-3.229877,-0.166410,-77.328566,-3.061451,-0.098528,-82.594196,...,-0.349284,0.610715,0.973082,-0.303378,0.405078,0.922159,-0.281479,0.559808,0.851646,-0.028510
1,Boy,2,-2.630732,163.577346,-2.202535,-2.630732,163.577346,-2.202535,-2.630732,163.577346,...,-0.346683,0.613981,0.946346,-0.395646,0.413621,0.845886,-0.267950,0.558445,0.851427,-0.036818
2,Boy,3,-2.649066,62.903229,-2.865200,-2.649066,62.903229,-2.865200,-2.649066,62.903229,...,-0.820756,0.612683,0.950004,-0.379825,0.448817,0.581546,-0.748774,0.557367,0.867107,-0.040340
3,Boy,4,-2.641445,73.387394,-6.816939,-2.641445,73.387394,-6.816939,-2.641445,73.387394,...,-0.892051,0.607402,0.961182,-0.300694,0.439404,0.449596,-0.823084,0.556838,0.878659,-0.022312
4,Boy,5,0.075973,-2.649841,-1.005892,0.190630,-5.362860,-1.022167,0.243538,-8.214614,...,-0.829807,0.604378,0.950633,-0.247957,0.445261,0.344964,-0.765876,0.552638,0.877294,-0.045415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14135,You,16,-0.255172,-14.686667,2.176972,-0.137746,-15.407824,3.206680,-0.088495,-16.454638,...,-0.189617,0.678464,0.430918,-0.998512,0.307135,0.832112,-0.110380,0.559508,0.746648,0.006331
14136,You,17,-0.247349,-18.125843,0.387121,-0.123823,-19.070908,-0.996675,-0.073921,-20.329048,...,-0.235708,0.673772,0.673639,-0.173289,0.301638,0.830727,-0.135007,0.558916,0.736754,0.016739
14137,You,18,-0.252560,-28.690787,5.913587,-0.133411,-30.150542,5.626362,-0.083474,-32.168021,...,-0.643596,0.653561,0.818583,-0.402061,0.293410,0.830436,-0.519518,0.550947,0.735413,0.027168
14138,You,19,-0.263622,-44.649560,6.507309,-0.145083,-46.797549,6.096631,-0.096428,-49.825290,...,-0.573453,0.627514,0.836671,-0.288298,0.281208,0.831862,-0.455801,0.543025,0.740520,0.026633


In [4]:
total_rows = len(merged_df)
num_videos = total_rows // 20

In [5]:
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

train_split = int(total_rows * train_ratio)
val_split = train_split + int(total_rows * val_ratio)


In [6]:
train_df = padded_df.iloc[:train_split]
val_df = padded_df.iloc[train_split:val_split]
test_df = padded_df.iloc[val_split:]

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")
train_df

Training set size: 9898
Validation set size: 2121
Test set size: 2121


Unnamed: 0,Word,Frame,X00,Y00,Z00,X01,Y01,Z01,X02,Y02,...,Z62,X63,Y63,Z63,X64,Y64,Z64,X65,Y65,Z65
0,Boy,1,-0.290488,-75.818914,-3.229877,-0.166410,-77.328566,-3.061451,-0.098528,-82.594196,...,-0.349284,0.610715,0.973082,-0.303378,0.405078,0.922159,-0.281479,0.559808,0.851646,-0.028510
1,Boy,2,-2.630732,163.577346,-2.202535,-2.630732,163.577346,-2.202535,-2.630732,163.577346,...,-0.346683,0.613981,0.946346,-0.395646,0.413621,0.845886,-0.267950,0.558445,0.851427,-0.036818
2,Boy,3,-2.649066,62.903229,-2.865200,-2.649066,62.903229,-2.865200,-2.649066,62.903229,...,-0.820756,0.612683,0.950004,-0.379825,0.448817,0.581546,-0.748774,0.557367,0.867107,-0.040340
3,Boy,4,-2.641445,73.387394,-6.816939,-2.641445,73.387394,-6.816939,-2.641445,73.387394,...,-0.892051,0.607402,0.961182,-0.300694,0.439404,0.449596,-0.823084,0.556838,0.878659,-0.022312
4,Boy,5,0.075973,-2.649841,-1.005892,0.190630,-5.362860,-1.022167,0.243538,-8.214614,...,-0.829807,0.604378,0.950633,-0.247957,0.445261,0.344964,-0.765876,0.552638,0.877294,-0.045415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9893,Parents,8,0.078565,-3.696548,-1.779066,0.112906,-5.095854,-1.917698,0.195637,-6.076751,...,-0.678216,0.589338,0.944744,-0.168253,0.451620,0.310572,-0.625051,0.530600,0.860472,-0.010854
9894,Parents,9,0.062591,-4.812546,-2.854190,0.191709,-5.405503,-2.971935,0.263003,-6.225743,...,-0.716471,0.588531,0.939794,-0.168598,0.440518,0.305957,-0.660765,0.532675,0.861604,-0.009045
9895,Parents,10,0.074528,-6.880069,-1.948132,0.186122,-6.972112,-2.019608,0.264631,-7.690690,...,-0.735241,0.588323,0.939107,-0.148097,0.436217,0.302292,-0.680691,0.533648,0.867738,-0.010981
9896,Parents,11,0.045960,-8.161325,-2.515860,0.163769,-8.196723,-2.559986,0.234525,-9.003735,...,-0.731106,0.588359,0.938447,-0.143474,0.436090,0.281910,-0.683369,0.535999,0.865807,-0.008149
