In [None]:
import glob
import os

import pandas as pd
import pydub
from IPython.display import Audio, display
from moviepy.editor import VideoFileClip, AudioFileClip

In [None]:
# Get all TSST video files below the raw main-study directory.
# glob returns matches in arbitrary (filesystem-dependent) order; sorting keeps
# positional lookups such as video_files_tsst[3] stable from run to run.
video_files_tsst = sorted(glob.glob("/data/dst_tsst_22_bi_multi_nt_lab/raw/mainstudy/**/*.MOV", recursive=True))
video_files_tsst2 = sorted(glob.glob("/data/dst_tsst_22_bi_multi_nt_lab/raw/mainstudy/**/*.MP4", recursive=True))

print("I found", len(video_files_tsst), "TSST videos")
print("I found", len(video_files_tsst2), "secondary TSST videos")

In [None]:
# Convert all .MOV files into mp3 files - takes a while.
# audio_files_tsst collects the mp3 path of every video that has (or already had) an mp3.
audio_files_tsst = []
for input_file in video_files_tsst:
    # Same base name as the video, with the extension swapped for ".mp3".
    stem = os.path.splitext(os.path.basename(input_file))[0]
    output_file = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/" + stem + ".mp3"
    # Skip the slow extraction if the mp3 already exists.
    if not os.path.exists(output_file):
        clip = VideoFileClip(input_file)
        try:
            if clip.audio is None:
                # A video without an audio track cannot be converted; report it and move on.
                print("No audio track found, skipping", input_file)
                continue
            clip.audio.write_audiofile(output_file)
        finally:
            # Always release the reader held by the clip, even if writing failed.
            clip.close()
    # No Audio(...) preview here: a bare expression inside a loop is never rendered.
    audio_files_tsst.append(output_file)


In [None]:
# Start-times for segmentation in seconds after first camera start (manual inspection).
# Keys are the base names (without ".mp3") of the extracted audio files; the first
# eight characters of a key are used as the participant token.
# A value of -1 marks a recording for which no segment is cut.
segment_starts = {
    'CZ513556_tsst_video': 35,
    'CS181122_tsst_video': 30,
    'PD513556_tsst_video_': 30,
    'KK483556_tsst_video_1b': -1,
    'JK261022_tsst_video': 49,
    'EC250123_tsst_video_2': -1,
    'AZ573556_tsst_video': 0,
    'JB011222_tsst_video': 23,
    'DQ563556_tsst_video': 28,
    'DK011122_tsst_video': 28,
    'MK230123_tsst_video_2': -1,
    'SB041122_tsst_video': 27,
    'DC553556_tsst_video': 28,
    'AS050123_tsst_video': 30,
    'ML031122_tsst_video': 32,
    'MK230123_tsst_video_1': 32,
    'MX463556_tsst_video': 30,
    'MG130123_tsst_video': 24,
    'KO433656_tsst_video': 29,
    'JK261022_tsst_video_2': -1,
    'SB021122_tsst_video': 32,
    'SE141122_tsst_video': 30,
    'EC250123_tsst_video_1': 28,
    'KK483556_tsst_video_1a': 33,
    'SS291122_tsst_movie': 33,
    'MS021222_tsst_video': 28,
    'KT463556_tsst_video': -1,
    'JB190123_tsst_video': 30,
    'SB021122_tsst_video_2': -1,
    'OQ503556_tsst_video': 28,
    'NE563556_tsst_video': 28,
}

In [None]:
def segment_audio(audio_file, start, segment_length=300,
                  output_dir="/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/"):
    """Cut a fixed-length segment out of an audio file and store it as mp3.

    The segment is written to ``<output_dir>/<audio base name>_segment.mp3``.
    An already existing segment file is reused and the source audio is not opened.

    Args:
        audio_file: Path of the source audio file.
        start: Segment start in seconds; ``-1`` means "no segment for this file".
        segment_length: Segment duration in seconds (default 300, i.e. 5 minutes).
        output_dir: Directory the segment file is written to.

    Returns:
        The path of the segment file, or ``None`` when ``start`` is ``-1``.
    """
    # Decide first whether there is anything to do, so the source audio is
    # never opened for files that have no segment.
    if start == -1:
        return None

    stem = os.path.splitext(os.path.basename(audio_file))[0]
    new_path = os.path.join(output_dir, stem + "_segment.mp3")

    if not os.path.exists(new_path):
        audio = AudioFileClip(audio_file)
        try:
            end = start + segment_length
            # Do not request audio beyond the end of the recording.
            if audio.duration is not None:
                end = min(end, audio.duration)
            audio.subclip(start, end).write_audiofile(new_path)
        finally:
            # Release the underlying reader even if writing failed.
            audio.close()
    return new_path


In [None]:
# Map each participant token (first 8 characters of the file name) to its videos.
video_data = {}
for vfile in (video_files_tsst + video_files_tsst2):
    token = vfile.split("/")[-1][:8]
    # Paths containing "cam" are filed as the secondary (TSST2) recording.
    name = "TSST2" if "cam" in vfile else "TSST"
    video_data.setdefault(token, {"TSST": None, "TSST2": None})[name] = vfile

# Map each token to its audio file, its 5-minute segment and the segment start.
audio_data = {}
for audio_name, start_num in segment_starts.items():
    token = audio_name[:8]
    previous = audio_data.get(token)
    # Several recordings can share one token (e.g. "..._video" and "..._video_2").
    # A recording without a usable start (-1) must not replace one that has a
    # segment, otherwise the result depends on the order of segment_starts.
    if start_num == -1 and previous is not None and previous["segment_start"] != -1:
        continue
    audio_file = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/" + audio_name + ".mp3"
    audio_data[token] = {
        "audio_file": audio_file,
        "audio_segment": segment_audio(audio_file, start_num),
        "segment_start": start_num,
    }

# One row per token that has at least one video; audio columns stay None when
# no audio entry exists for the token.
no_audio = {"audio_file": None, "audio_segment": None, "segment_start": None}
data = []
for token, values in video_data.items():
    audio = audio_data.get(token, no_audio)
    data.append([
        token,
        values["TSST"],
        values["TSST2"],
        audio["audio_file"],
        audio["audio_segment"],
        audio["segment_start"],
    ])

tsst_data = pd.DataFrame(data, columns=["token", "TSST_video", "TSST2_video", "TSST_audio", "TSST_audio_segment", "segment_start"])

# Print the updated DataFrame
display(tsst_data)

In [None]:
# Load the participant table (one row per participant) from the processed data directory.
participants_csv = "/data/dst_tsst_22_bi_multi_nt_lab/processed/participant.csv"
participants = pd.read_csv(participants_csv)

In [None]:
# Function to get the duration of a video
def get_video_duration(video_path):
    """Return the duration of the video at ``video_path`` as reported by moviepy.

    The clip is closed before returning so that calling this once per row of a
    table does not accumulate open readers.
    """
    clip = VideoFileClip(video_path)
    try:
        return clip.duration
    finally:
        clip.close()

In [None]:
# delete non-native German speakers from dataframe
tokens_to_drop = participants.loc[participants['mothertongue'] != 'Deutsch', 'token'].tolist()
tsst_data = tsst_data[~tsst_data['token'].isin(tokens_to_drop)]

# delete all length less than 10minutes (original video)
# Tokens without a primary TSST video (None) cannot be measured and are dropped
# as well, instead of crashing the duration lookup. astype(bool) keeps the
# result a boolean mask even when the frame is empty.
has_long_video = tsst_data['TSST_video'].apply(
    lambda x: bool(pd.notna(x) and get_video_duration(x) >= 600)
).astype(bool)
tsst_data = tsst_data[has_long_video]

display(tsst_data)

In [None]:
from IPython.display import Video, Audio
import os

In [None]:
# Preview the mp3 path that would be derived for one example video.
input_file = video_files_tsst[3]
file_name = input_file.rsplit('/', 1)[1]
# Drop the three-character extension (keeping the dot) and append "mp3".
output_file = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/" + file_name[:-3] + "mp3"
print(input_file, output_file)


In [None]:
# Re-read the list of mp3 files that currently exist in the audio directory.
# NOTE(review): "**" is not a path component of its own here, so this presumably
# matches like "*.mp3" in that one directory only - confirm that is intended.
audio_pattern = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/**.mp3"
audio_files_tsst = glob.glob(audio_pattern, recursive=True)
print(audio_files_tsst)

In [None]:


# Show the first audio file found.
first_audio_file = audio_files_tsst[0]
print(first_audio_file)

In [None]:
# Extract the audio of one secondary (TSST2) video to judge its audio quality.
# input_file must be the path itself: VideoFileClip is created from it below.
input_file = video_files_tsst2[1]
output_file = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/test_TSST2_audio_quality.mp3"
# skip creating mp3, if already exists
if not os.path.exists(output_file):
    clip = VideoFileClip(input_file)
    try:
        clip.audio.write_audiofile(output_file)
    finally:
        # Release the reader held by the clip, even if writing failed.
        clip.close()
# Last expression of the cell, so the audio player is rendered.
Audio(output_file)

In [None]:
# Number of mp3 files found in the audio directory.
n_audio_files = len(audio_files_tsst)
print(n_audio_files)

In [None]:
# List every mp3 file found in the audio directory.
print(f"{audio_files_tsst}")