# Preprocessing of TSST Data

In [None]:
from moviepy.editor import VideoFileClip, AudioFileClip
import glob
import pandas as pd
import re
import os
from IPython.display import Video, Audio

#### Collection of Videos and Creation of Audio Files
First I collect all TSST video files (including the ones from the second camera, but they are currently not used) and convert the main-camera videos to wav audio files.

In [None]:
def drop_files_with_suffix(paths, suffix, extension):
    """
    Remove all paths whose file name ends in "<suffix>.<extension>" (case-insensitive).
    @param paths: list of file paths
    @param suffix: file-name ending directly in front of the extension, e.g. "_2"
    @param extension: file extension without the dot, e.g. "MOV"
    @return: new list with the matching paths removed, original order kept
    """
    # re.escape makes the dot (and anything else in the suffix) match literally
    pattern = re.compile(re.escape(suffix + "." + extension) + "$", re.IGNORECASE)
    return [path for path in paths if not pattern.search(path)]


# Get all TSST video files (.MOV = main camera, .MP4 = secondary camera)
raw_video_root = "/data/dst_tsst_22_bi_multi_nt_lab/raw/mainstudy"
video_files_tsst = glob.glob(raw_video_root + "/**/*.MOV", recursive=True)
video_files_tsst2 = glob.glob(raw_video_root + "/**/*.MP4", recursive=True)

# filter out all _2 videos (video camera splits after 12:30min into two files, speech task always in first video)
video_files_tsst = drop_files_with_suffix(video_files_tsst, "_2", "MOV")
# The MP4 list has to be filtered with its own extension; a "_2.MOV" pattern can never match an .MP4 path.
# NOTE(review): assumes the secondary camera marks continuation files with the same "_2" suffix -
# confirm against the real file names (the count printed below shows how many files remain).
video_files_tsst2 = drop_files_with_suffix(video_files_tsst2, "_2", "MP4")

# wrong filename (corrected on vmc, but not synched yet so here done by hand -> can be deleted afterwards)
video_files_tsst = drop_files_with_suffix(video_files_tsst, "_1b", "MOV")

print("I found", len(video_files_tsst), "TSST videos")
print("I found", len(video_files_tsst2), "secondary TSST videos")

In [None]:
# Convert all .MOV files into wav files - takes a while
audio_output_dir = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/"
audio_files_tsst = []
for input_file in video_files_tsst:
    # file name without directory and extension, independent of the extension length
    file_stem = os.path.splitext(os.path.basename(input_file))[0]
    output_file = audio_output_dir + file_stem + ".wav"
    print(input_file, output_file)
    # skip creating .wav file if it already exists
    if not os.path.exists(output_file):
        clip = VideoFileClip(input_file)
        try:
            clip.audio.write_audiofile(output_file) #, codec='pcm_s16le'
        finally:
            # always close the clip so its readers are released, even if writing fails
            clip.close()
    audio_files_tsst.append(output_file)


#### Segmentation of Audio File to only include speech task
Next, the start of the speech task is manually checked and saved in a dictionary.

In [None]:
# Start-times for segmentation in seconds after first camera start (manual inspection)
segment_starts = {'CZ513556_tsst_video': 35, 'CS181122_tsst_video': 30, 'PD513556_tsst_video_': 30, 'JK261022_tsst_video': 49, 'AZ573556_tsst_video': 0, 'JB011222_tsst_video': 23, 'DQ563556_tsst_video': 28, 'DK011122_tsst_video': 28, 'SB041122_tsst_video': 27, 'DC553556_tsst_video': 28, 'AS050123_tsst_video': 30, 'ML031122_tsst_video': 32, 'MK230123_tsst_video_1': 32, 'MX463556_tsst_video': 30, 'MG130123_tsst_video': 24, 'KO433656_tsst_video': 29, 'SB021122_tsst_video': 32, 'SE141122_tsst_video': 30, 'EC250123_tsst_video_1': 28, 'KK483556_tsst_video_1a': 33, 'SS291122_tsst_movie': 33, 'MS021222_tsst_video': 28, 'KT463556_tsst_video': -1, 'JB190123_tsst_video': 30, 'OQ503556_tsst_video': 28, 'NE563556_tsst_video': 28, "TB493656_tsst_video":29, "NI433856_tsst_video":30, "JM463656_tsst_video":28, "BS323856_tsst_video":27, "SB443756_tsst_video":30, "KH553656_tsst_video":28, "FC483856_tsst_video":28, "TF483656_tsst_video":28, "JH373756_tsst_video":26, "OM423756_tsst_video":29, "KK483556_tsst_video_1": 33}

In [None]:
# Collect the audio files that still need a manually determined start time:
# every video whose file name (without extension) has no entry in segment_starts yet
print(sorted(segment_starts.keys()))
audio_dir = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/"
video_stems = (video_path.rsplit('/', 1)[1][:-4] for video_path in video_files_tsst)
to_check = [audio_dir + stem + ".wav" for stem in video_stems if stem not in segment_starts]
print("to_check",to_check)

In [None]:
# Code Snippet to listen and manually transcribe start time for new audios
if to_check:
    sample_audio = to_check[0]
    print(sample_audio)
    # explicit display() is needed because the player is no longer the last expression of the cell
    display(Audio(sample_audio))
else:
    # nothing left to label: keep the cell runnable instead of failing on to_check[0]
    print("No new audio files to check - every video already has a start time in segment_starts")

In [None]:
def segment_audio(audio_file, start):
    """
    This function segments an audio file from a given start time to 5 minutes later, to capture only the speech task as an audio file.

    The segment is stored in the processed audio_files directory as "<original file name>_segment.wav".
    If that file already exists it is reused and the source audio is not opened at all.
    @param audio_file: path to audio file
    @param start: start of speech task in seconds; -1 means that no segment is created
    @return: path to segmented audio file, or None if start is -1
    """
    # sentinel check first, so that nothing is opened for recordings without a segment
    if start == -1:
        return None
    segment_length = 300 # 5minutes
    end = start + segment_length
    # file name without directory and extension
    file_stem = os.path.splitext(os.path.basename(audio_file))[0]
    new_path = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/" + file_stem + "_segment.wav"
    if not os.path.exists(new_path):
        # open the source only when a segment really has to be written, and always close it again
        audio = AudioFileClip(audio_file)
        try:
            segment = audio.subclip(start, end)
            segment.write_audiofile(new_path)
        finally:
            audio.close()
    return new_path


#### Creating and Filtering DataFrame

I first create a dataframe with token, video_path(s), audio_path, segmented audio path and segment start in seconds. I then merge it with the VAS self-assessed stress information from participant.csv and calculate the delta between the ratings before and after the stress test. I also filter out anyone who does not speak German as a first language.


In [None]:
# Group the video paths per participant token (first 8 characters of the file name)
video_data = {}
for video_file in (video_files_tsst + video_files_tsst2):
    token = video_file.split("/")[-1][:8]
    # paths containing "cam" are stored as TSST2, everything else as TSST
    name = "TSST2" if "cam" in video_file else "TSST"
    video_data.setdefault(token, {"TSST": None, "TSST2": None})[name] = video_file

# Create the speech-task segment for every labelled audio file and remember its paths per token
audio_dir = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/"
audio_data = {}
for audio_name, start_num in segment_starts.items():
    audio_file = audio_dir + audio_name + ".wav"
    # a later entry with the same token replaces the earlier one
    audio_data[audio_name[:8]] = {
        "audio_file": audio_file,
        "audio_segment": segment_audio(audio_file, start_num),
        "segment_start": start_num,
    }

# One row per token that has at least one video; tokens without labelled audio get None values
no_audio = {"audio_file": None, "audio_segment": None, "segment_start": None}
data = [
    [
        token,
        videos["TSST"],
        videos["TSST2"],
        audio_data.get(token, no_audio)["audio_file"],
        audio_data.get(token, no_audio)["audio_segment"],
        audio_data.get(token, no_audio)["segment_start"],
    ]
    for token, videos in video_data.items()
]

column_names = ["token", "TSST_video", "TSST2_video", "TSST_audio", "TSST_audio_segment", "segment_start"]
tsst_data = pd.DataFrame(data, columns=column_names)
display(tsst_data)

In [None]:
def get_video_duration(video_path):
    """
    Function to get the duration of a video
    @param video_path: path to video
    @return: duration of clip in seconds
    """
    clip = VideoFileClip(video_path)
    try:
        return clip.duration
    finally:
        # the function is called once per video, so close each clip instead of leaving it open
        clip.close()

In [None]:
# Load the participant table and show the token together with both VAS stress columns
participants_csv = "/data/dst_tsst_22_bi_multi_nt_lab/processed/participant.csv"
participants = pd.read_csv(participants_csv)
stress_overview_columns = ["token", "tsst_vas_stress_T1", "tsst_vas_stress_T2"]
display(participants.loc[:, stress_overview_columns])

In [None]:
# delete non-native German speakers from dataframe
is_non_native = participants['mothertongue'] != 'Deutsch'
tokens_to_drop = participants.loc[is_non_native, 'token'].tolist()
tsst_data = tsst_data[~tsst_data['token'].isin(tokens_to_drop)]

# delete all length less than 10minutes (original video) (and delete all NaN videos)
min_video_seconds = 600
has_long_video = tsst_data['TSST_video'].apply(
    lambda x: get_video_duration(x) >= min_video_seconds if pd.notnull(x) else False
)
tsst_data = tsst_data[has_long_video]

# add vas_stress data and calculate delta
stress_columns = ['tsst_vas_stress_T1', 'tsst_vas_stress_T2']
# Remove columns left over from an earlier run of this cell; otherwise a second run would merge the
# stress columns again, get _x/_y suffixes and fail on the delta calculation below.
tsst_data = tsst_data.drop(columns=stress_columns + ['stress_delta'], errors="ignore")
tsst_data = pd.merge(tsst_data, participants[['token'] + stress_columns], on="token", how="inner")
tsst_data['stress_delta'] = tsst_data['tsst_vas_stress_T2'] - tsst_data['tsst_vas_stress_T1']

display(tsst_data)

#### Saving DataFrame as csv

In [None]:
# Store the final table next to the audio files (to_csv also writes the DataFrame index by default)
tsst_csv_path = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/tsst_data.csv"
tsst_data.to_csv(tsst_csv_path)