In [None]:
from moviepy.editor import VideoFileClip, AudioFileClip
import glob
import pandas as pd
import re
import os
from IPython.display import Video, Audio
import subprocess
from pydub import AudioSegment
import parselmouth
import matplotlib.pyplot as plt

# Preprocessing of DST Data

#### Collecting Data and creating dataframe

In [None]:
# Get all DST video files (searched recursively below the raw main-study folder).
# glob returns its matches in arbitrary order, so the list is sorted to make the
# grouping and file selection in the following cells reproducible between runs.
video_files_dst = sorted(
    glob.glob("/data/dst_tsst_22_bi_multi_nt_lab/raw/mainstudy/**/*.webm", recursive=True)
)

In [None]:
# Sort DST files by token into dictionary: {token: [file, file, ...]}
# A token is two capital letters followed by digits; the first such match found
# anywhere in the file path is used.
pattern = r'([A-Z]{2}\d+)'
token_regex = re.compile(pattern)  # compiled once instead of on every iteration

videos_dst = {}
unmatched_files_dst = []
for file in video_files_dst:
    token_match = token_regex.search(file)
    if token_match is None:
        # Without this guard indexing the failed match raises an opaque TypeError;
        # remember the file and report it after the loop instead.
        unmatched_files_dst.append(file)
        continue
    videos_dst.setdefault(token_match[0], []).append(file)

if unmatched_files_dst:
    print("Skipped", len(unmatched_files_dst), "file(s) without a token in the path:", unmatched_files_dst)

In [None]:
# Keep only tokens with at most three files; a longer file list means the
# participant started the test multiple times.
max_files_per_token = 3
videos_dst = {
    token: paths
    for token, paths in videos_dst.items()
    if not len(paths) > max_files_per_token
}

In [None]:
# Create dataframe with token and paths as columns.
# Only the speech task recording is needed, so the other files of a token are ignored.
data = []
tokens_without_speech_task = []

for token, file_paths in videos_dst.items():
    speech_task = None
    for path in file_paths:
        if "speechTask" in path:
            speech_task = path  # if several paths match, the last one wins

    if speech_task is None:
        # A row without a recording would put None into the frame and crash the
        # webm -> wav conversion later on, so the token is dropped and reported.
        tokens_without_speech_task.append(token)
        continue
    data.append([token, speech_task])

dst_data = pd.DataFrame(data, columns=['token', 'speechTask_webm'])

if tokens_without_speech_task:
    print("Dropped token(s) without a speechTask recording:", tokens_without_speech_task)

display(dst_data)


#### Convert .webm to .wav and add to dataframe

In [None]:
# Function to convert webm to wav
def convert_to_wav(video_path, audio_path):
    """Read a recording as 'webm' and export its audio as a 'wav' file.

    Parameters
    ----------
    video_path
        Path of the source recording; it is loaded with ``format='webm'``.
    audio_path
        Path the exported audio is written to with ``format='wav'``.
    """
    AudioSegment.from_file(video_path, format='webm').export(audio_path, format='wav')


# Convert each speech-task recording to .wav (existing .wav files are reused)
# and store the .wav path in the 'speechTask_audio' column.
wav_prefix = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/"
wav_suffix = "_dst_speechTask.wav"

for index, token, webm_path in zip(dst_data.index, dst_data["token"], dst_data['speechTask_webm']):
    wav_path = wav_prefix + token + wav_suffix
    already_converted = os.path.exists(wav_path)
    if not already_converted:
        convert_to_wav(webm_path, wav_path)
    dst_data.loc[index, 'speechTask_audio'] = wav_path

display(dst_data)


#### Cutting out silences
The DST speech task consists of three questions. For each question, the participants have 10 seconds to think of an answer, followed by 20 seconds to give it. The silent "thinking" phases should be cut out.

- Option A: always keep the fixed answer windows 10–30 s, 40–60 s and 70–90 s
- Option B: choose the cut points depending on the amplitude

In [None]:
# Plot the waveform of a few sample recordings and shade the fixed
# 10-30 s, 40-60 s and 70-90 s windows of option A in red.
for sample_token in ['NI433856', 'JK261022', 'TH313556', 'MG130123']:
    sample_paths = dst_data.loc[dst_data['token'] == sample_token, 'speechTask_audio']
    if sample_paths.empty:
        # The token is not in the dataframe (e.g. it was filtered out earlier);
        # taking .values[0] would raise an IndexError, so skip it with a note.
        print("No recording found for sample token", sample_token, "- skipping plot")
        continue
    sample_audio = sample_paths.values[0]

    snd = parselmouth.Sound(sample_audio)
    fig, ax = plt.subplots()
    ax.set_title(sample_token)
    ax.plot(snd.xs(), snd.values.T)
    ax.set_xlim([snd.xmin, snd.xmax])
    ax.set_ylabel("amplitude")
    ax.set_xlabel("time [s]")
    for window_start, window_end in [(10, 30), (40, 60), (70, 90)]:
        ax.axvspan(window_start, window_end, color='red', alpha=0.3)
    plt.show()
    plt.close(fig)  # release the figure so repeated runs do not accumulate open figures

In [None]:
# Show the .wav path stored for the row with index label 0
print(dst_data.loc[0, "speechTask_audio"])

In [None]:
# option A: cutting rigidly
# Set the desired start and end times for each part (in seconds)
part1_start = 10
part1_end = 30
part2_start = 40
part2_end = 60
part3_start = 70

for index, row in dst_data.iterrows():
    file = row["speechTask_audio"]

    # Create path/filename for new segments (same location, extension stripped)
    file_stem = os.path.splitext(file)[0]
    path1 = file_stem + "_part1.wav"
    path2 = file_stem + "_part2.wav"
    path3 = file_stem + "_part3.wav"

    # Only segments that do not exist yet are written; when all three are present
    # the recording does not need to be opened at all.
    missing_paths = [path for path in (path1, path2, path3) if not os.path.exists(path)]

    if missing_paths:
        # Load the original audio file
        audio_full = AudioFileClip(file)
        try:
            # set part3_end to duration of file, as sometimes it is a few
            # milliseconds shorter than 90 seconds
            part3_end = audio_full.duration

            segment_bounds = {
                path1: (part1_start, part1_end),
                path2: (part2_start, part2_end),
                path3: (part3_start, part3_end),
            }
            for path in missing_paths:
                segment_start, segment_end = segment_bounds[path]
                part = audio_full.subclip(segment_start, segment_end)
                part.write_audiofile(path, fps=16000)  # downsample to 16kHz
        finally:
            # Release the underlying reader; otherwise one stays open for every
            # recording processed in this loop.
            audio_full.close()

    # add paths to segments to dataframe
    dst_data.loc[index, 'part1'] = path1
    dst_data.loc[index, 'part2'] = path2
    dst_data.loc[index, 'part3'] = path3


In [None]:
# Show the dataframe, which now also holds the part1/part2/part3 segment paths
display(dst_data)

### Saving Dataframe as csv

In [None]:
# Write the dataframe to a CSV file in the processed audio folder
csv_path = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/dst_data.csv"
dst_data.to_csv(csv_path)

In [None]:
# Code Snippet to check specific files:
# print the path of one first-part segment (row label 5) and play it back
file = dst_data.loc[5, "part1"]
print(file)
Audio(file)