In [None]:
from moviepy.editor import VideoFileClip, AudioFileClip
import glob
import pandas as pd
import re
import os
from IPython.display import Video, Audio
import subprocess
from pydub import AudioSegment

# Preprocessing of DST Data

#### Collecting Data and creating dataframe

In [None]:
# Get all DST video files.
# glob yields matches in a filesystem-dependent order; sorting keeps the file order
# (and with it the row order of everything built from this list) reproducible.
video_files_dst = sorted(
    glob.glob("/data/dst_tsst_22_bi_multi_nt_lab/raw/mainstudy/**/*.webm", recursive=True)
)

In [None]:
# Sort DST files by token into dictionary (token -> list of file paths).
# The token is the first run of two capital letters followed by digits found in the path.
token_pattern = re.compile(r'([A-Z]{2}\d+)')  # compiled once instead of once per file
videos_dst = {}
for file in video_files_dst:
    match = token_pattern.search(file)
    if match is None:
        # A path without a token would otherwise make `re.search(...)[0]` raise a
        # TypeError and abort the whole cell; report the file and keep going.
        print(f"Skipping file without token: {file}")
        continue
    # setdefault creates the list on first sight of a token, then appends to it
    videos_dst.setdefault(match[0], []).append(file)

In [None]:
# Keep only tokens with at most three files; more than three means multiple tests were started
videos_dst = dict(
    (token, paths) for token, paths in videos_dst.items() if not len(paths) > 3
)

In [None]:
# Build one dataframe row per token: the token plus the file path of each task
task_columns = ['speechTask', 'mathTask', 'introduction']
data = []

for token, file_paths in videos_dst.items():
    # Every task starts as None and stays None when no matching file is found
    path_by_task = dict.fromkeys(task_columns)
    for path in file_paths:
        # The first task name (in column order) contained in the path picks the column;
        # a later file for the same task replaces an earlier one
        task = next((name for name in task_columns if name in path), None)
        if task is not None:
            path_by_task[task] = path
    data.append([token] + [path_by_task[name] for name in task_columns])

dst_data = pd.DataFrame(data, columns=['token'] + task_columns)

display(dst_data)


#### Convert .webm to .wav and add to dataframe

In [None]:
# Function to convert webm to wav
def convert_to_wav(video_path, audio_path):
    """Read a .webm recording and write its audio as a .wav file.

    Parameters
    ----------
    video_path : str
        Path of the .webm file to read.
    audio_path : str
        Path of the .wav file to write. Its parent directory is created if missing.
    """
    # Make sure the output folder exists so the export does not fail on a fresh
    # processed-data directory. A bare file name has no directory part to create.
    target_dir = os.path.dirname(audio_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    audio = AudioSegment.from_file(video_path, format='webm')
    audio.export(audio_path, format='wav')


# Convert the speechTask and mathTask recordings of every token to .wav and store
# the resulting audio paths in new '<task>_audio' columns.
audio_dir = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/"
tasks_to_convert = ["speechTask", "mathTask"]

# Collect the new column values in plain lists and attach them after the loop,
# so the dataframe is not modified while it is being iterated.
audio_paths = {task: [] for task in tasks_to_convert}

for _, row in dst_data.iterrows():
    for task in tasks_to_convert:
        video_path = row[task]
        audio_path = audio_dir + row["token"] + "_dst_" + task + ".wav"
        # Conversion is skipped when the .wav already exists from an earlier run
        if not os.path.exists(audio_path):
            if pd.isna(video_path):
                # No recording for this task: there is nothing to convert, and the
                # dataframe should not point at an audio file that does not exist.
                audio_path = None
            else:
                convert_to_wav(video_path, audio_path)
        audio_paths[task].append(audio_path)

for task in tasks_to_convert:
    dst_data[task + "_audio"] = audio_paths[task]

display(dst_data)


In [None]:
# Show where the mathTask audio of the first row was written
first_math_audio = dst_data.loc[0, "mathTask_audio"]
print(first_math_audio)

In [None]:
# Render an audio player for the speechTask recording of the first row
first_speech_audio = dst_data.loc[0, "speechTask_audio"]
Audio(first_speech_audio)

In [None]:
# TODO: Cut out silences in speechTask
# TODO: Simplify the dataframe creation and the conversion? Only the speechTask is needed.