## Generate videos for each aligned gesture (for real pairs)

### Import packages and define paths

In [1]:
import os
import pandas as pd
from moviepy.editor import VideoFileClip, clips_array
from tqdm import tqdm

### define the input and output folders
# folder holding the MediaPipe output videos (three levels up from the current directory, then into 3_data)
mediapipe_folder = '../../../3_data/referential_task/mediapipe/output_videos/'
# folder for the generated videos of aligned gestures (one level up from the current directory)
output_real_folder = '../videos_for_aligned_gestures/'

### Concatenate two videos into one to display two videos side by side
Now we will concatenate the two speakers' video clips so that they are displayed next to each other. This allows us to check visually whether aligned gestures look alike and whether the pose was properly estimated.

In [2]:
### build the dataframe with the gesture alignment data plus the DTW distance of each comparison
gest_align_file = '../elan_annotation/gesture_form_similarity_coding_processed.csv'
dtw_file_real = '../processed/10_dtw_distance_modified/dtw_distance.csv'

# gesture alignment annotations (one row per annotated comparison)
df_gest_align = pd.read_csv(gest_align_file)

# from the DTW results, only the merge key (comparison_id) and average_distance are needed
df_dtw_real = pd.read_csv(dtw_file_real).loc[:, ['comparison_id', 'average_distance']]

# left merge: every annotation row is kept; average_distance is NaN when no DTW result matches
df_gest_align = pd.merge(df_gest_align, df_dtw_real, on=['comparison_id'], how='left')


### make sure the output folder exists before any video is written into it
os.makedirs(output_real_folder, exist_ok=True)

### for each row in the dataframe, cut the two gesture clips and save them side by side
for _, row in tqdm(df_gest_align.iterrows(), total=len(df_gest_align)):
    pair = "pair" + str(row['pairnr']).zfill(2)
    comparison_id = row['comparison_id']
    speaker_1 = row['speaker_1']
    speaker_2 = row['speaker_2']
    rounds = row['round']
    similarity_coding = row['gesture_similarity_coding'][4:8] # only keep the similarity_coding for shape and movement
    average_distance = row['average_distance']

    ### fail loudly on an unexpected speaker label; otherwise the time variables would be
    ### undefined (first row) or silently carried over from the previous row
    if speaker_1 not in ("A", "B"):
        raise ValueError(
            f"Unexpected speaker_1 value {speaker_1!r} for comparison_id {comparison_id!r}; expected 'A' or 'B'"
        )

    ### clip 1 uses the time columns of speaker_1, clip 2 those of the other participant
    # NOTE: the adjusted (*_msec_adj) columns are used; an earlier, disabled variant of this
    # cell read the unadjusted *_begin_msec / *_end_msec columns instead.
    other_speaker = "B" if speaker_1 == "A" else "A"
    start_time_1 = row[f'{speaker_1}_begin_msec_adj'] / 1000  # msec -> sec
    end_time_1 = row[f'{speaker_1}_end_msec_adj'] / 1000
    start_time_2 = row[f'{other_speaker}_begin_msec_adj'] / 1000
    end_time_2 = row[f'{other_speaker}_end_msec_adj'] / 1000

    ### get the video files for gesture
    video_file_1 = os.path.join(mediapipe_folder, f'{pair}_synced_pp{speaker_1}.mp4')
    video_file_2 = os.path.join(mediapipe_folder, f'{pair}_synced_pp{speaker_2}.mp4')
    output_path = os.path.join(
        output_real_folder,
        f'{pair}_{comparison_id}_{rounds}_{similarity_coding}_{round(average_distance, 2)}.mp4',
    )

    ### every clip that gets opened is registered here so it is closed even if a step fails
    opened_clips = []
    try:
        ### load the video files and extract the relevant portions
        source_1 = VideoFileClip(video_file_1)
        opened_clips.append(source_1)
        source_2 = VideoFileClip(video_file_2)
        opened_clips.append(source_2)
        video_1 = source_1.subclip(start_time_1, end_time_1)
        video_2 = source_2.subclip(start_time_2, end_time_2)

        ### combine the video clips and save the combined video
        combined_video = clips_array([[video_1, video_2]])
        opened_clips.append(combined_video)
        combined_video.write_videofile(output_path, codec='libx264', verbose=False, logger=None)
    finally:
        ### close the videos (most recently opened first)
        for clip in reversed(opened_clips):
            clip.close()

100%|██████████| 419/419 [18:14<00:00,  2.61s/it]
