# Preprocessing of TSST Data

In [None]:
from moviepy.editor import VideoFileClip, AudioFileClip
import glob
import pandas as pd
import re
import os
from IPython.display import Video, Audio

#### Collection of Videos and Creation of Audio Files
First, I collect all TSST video files (including the ones from the second camera, which are currently not used) and extract the audio track of each main-camera video as a WAV file.

In [None]:
# Find every TSST recording below the raw study folder:
# .MOV files are collected as the main videos, .MP4 files as the secondary ones
video_files_tsst = glob.glob("/data/dst_tsst_22_bi_multi_nt_lab/raw/mainstudy/**/*.MOV", recursive=True)
video_files_tsst2 = glob.glob("/data/dst_tsst_22_bi_multi_nt_lab/raw/mainstudy/**/*.MP4", recursive=True)

# The video camera splits a recording after 12:30 min into two files; the speech task is
# always in the first one, so every "_2" continuation file is dropped.
second_part = re.compile("_2.MOV$", re.IGNORECASE)
video_files_tsst = [path for path in video_files_tsst if second_part.search(path) is None]
# NOTE(review): this list only holds *.MP4 paths, so the .MOV pattern never matches here and
# the filter currently removes nothing - confirm whether "_2.MP4" files should be excluded.
video_files_tsst2 = [path for path in video_files_tsst2 if second_part.search(path) is None]

# wrong filename (corrected on vmc, but not synched yet so here done by hand -> can be deleted afterwards)
misnamed_copy = re.compile("_1b.MOV$", re.IGNORECASE)
video_files_tsst = [path for path in video_files_tsst if misnamed_copy.search(path) is None]

print("I found", len(video_files_tsst), "TSST videos")
print("I found", len(video_files_tsst2), "secondary TSST videos")

In [None]:
# Extract the audio track of every main-camera .MOV file into a .wav file - takes a while
audio_files_tsst = []
for input_file in video_files_tsst:
    # same file name as the video, with the 3-letter extension replaced by "wav"
    output_file = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/" + input_file.rsplit('/', 1)[1][:-3] + "wav"
    print(input_file, output_file)
    # skip creating .wav file if it already exists
    if not os.path.exists(output_file):
        clip = VideoFileClip(input_file)
        try:
            clip.audio.write_audiofile(output_file) #, codec='pcm_s16le'
        finally:
            # release the ffmpeg readers; otherwise one stays open per converted video
            clip.close()
    audio_files_tsst.append(output_file)
    # Open the WAV once so that an unreadable file fails here, then release the reader
    # immediately instead of leaving it open for the rest of the session.
    AudioFileClip(output_file).close()


#### Segmentation of Audio File to only include speech task
Next, the start of the speech task is determined manually for each recording and stored in a dictionary.

In [None]:
# Start-times for segmentation in seconds after first camera start (manual inspection).
# Keys are the audio file names without the ".wav" extension; a start of -1 makes
# segment_audio() return None, i.e. no segment is produced for that recording.
segment_starts = {
    "CZ513556_tsst_video": 35,
    "CS181122_tsst_video": 30,
    "PD513556_tsst_video_": 30,
    "JK261022_tsst_video": 49,
    "AZ573556_tsst_video": 0,
    "JB011222_tsst_video": 23,
    "DQ563556_tsst_video": 28,
    "DK011122_tsst_video": 28,
    "SB041122_tsst_video": 27,
    "DC553556_tsst_video": 28,
    "AS050123_tsst_video": 30,
    "ML031122_tsst_video": 32,
    "MK230123_tsst_video_1": 32,
    "MX463556_tsst_video": 30,
    "MG130123_tsst_video": 24,
    "KO433656_tsst_video": 29,
    "SB021122_tsst_video": 32,
    "SE141122_tsst_video": 30,
    "EC250123_tsst_video_1": 28,
    "KK483556_tsst_video_1a": 33,
    "SS291122_tsst_movie": 33,
    "MS021222_tsst_video": 28,
    "KT463556_tsst_video": -1,
    "JB190123_tsst_video": 30,
    "OQ503556_tsst_video": 28,
    "NE563556_tsst_video": 28,
    "TB493656_tsst_video": 29,
    "NI433856_tsst_video": 30,
    "JM463656_tsst_video": 28,
    "BS323856_tsst_video": 27,
    "SB443756_tsst_video": 30,
    "KH553656_tsst_video": 28,
    "FC483856_tsst_video": 28,
    "TF483656_tsst_video": 28,
    "JH373756_tsst_video": 26,
    "OM423756_tsst_video": 29,
    "KK483556_tsst_video_1": 33,
    "TZ493156_tsst_video": 33,
    "NM443056_tsst_video": 28,
    "WV453056_tsst_video": 31,
    "BU563856_tsst_video": 31,
    "BI343156_tsst_video": 42,
    "ML373056_tsst_video": 29,
    "BC493156_tsst_video": 38,
    "UH473956_tsst_video": 30,
    "BH373056_tsst_video": 40,
    "TO523956_tsst_video": 30,
}

In [None]:
# Collect the .wav paths of all recordings that have no start time in segment_starts yet
audio_dir = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/"
# file name of each video without its 4-character extension (".MOV")
video_stems = (path.rsplit('/', 1)[1][:-4] for path in video_files_tsst)
to_check = [audio_dir + stem + ".wav" for stem in video_stems if stem not in segment_starts]
print("to_check",to_check)

In [None]:
# Code Snippet to listen and manually transcribe start time for new audios
# The index is hard-coded and raises IndexError when `to_check` holds fewer than 10 paths;
# presumably it is edited by hand to step through the pending recordings - confirm.
sample_audio = to_check[9]
print(sample_audio)

In [None]:
from moviepy.editor import AudioFileClip

# Open the selected recording as a moviepy audio clip
audio_clip = AudioFileClip(sample_audio)

# Display the audio clip in an inline player (this is audio only, not video)
audio_clip.ipython_display(width=400, maxduration=900)

In [None]:
# Alternative player: IPython's own audio widget for the same file
Audio(sample_audio)

In [None]:
def segment_audio(audio_file, start, segment_length=300):
    """
    This function cuts the speech task out of an audio file and stores it as a separate WAV file.

    The segment runs from ``start`` to ``start + segment_length`` seconds and is written to the
    processed audio folder as ``<file name>_segment.wav`` with a sampling rate of 16 kHz.
    A segment file that already exists is reused; the source audio is not opened in that case.
    @param audio_file: path to audio file
    @param start: start of speech task in seconds; -1 means that no segment is produced
    @param segment_length: length of the segment in seconds (default 300 = 5 minutes)
    @return: path to segmented audio file, or None if start is -1
    """
    # Check the sentinel before touching the file system, so such recordings are never opened
    if start == -1:
        return None
    stem = os.path.splitext(os.path.basename(audio_file))[0]
    new_path = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/" + stem + "_segment.wav"
    if not os.path.exists(new_path):
        audio = AudioFileClip(audio_file)
        try:
            segment = audio.subclip(start, start + segment_length)
            segment.write_audiofile(new_path, fps=16000) # resample to 16 kHz
        finally:
            # the subclip reads through the same reader, so close it only after writing
            audio.close()
    return new_path


#### Creating and Filtering DataFrame

I first create a dataframe with the token, the video path(s), the audio path, the segmented audio path and the segment start in seconds. I then merge it with the self-assessed VAS stress information from participant.csv, calculate the delta between the ratings before and after the stress test, and filter out everyone who does not speak German as a first language.


In [None]:
# Group the video paths by participant token (first 8 characters of the file name).
# A path containing "cam" is stored as the second-camera video, everything else as the main video.
video_data = {}
for video_path in video_files_tsst + video_files_tsst2:
    token = video_path.split("/")[-1][:8]
    camera = "TSST2" if "cam" in video_path else "TSST"
    video_data.setdefault(token, {"TSST": None, "TSST2": None})[camera] = video_path

# Cut the speech segment for every recording with a manually determined start time.
# If two recordings share a token, the one listed later in segment_starts wins.
audio_dir = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/"
audio_data = {}
for audio_name, start_num in segment_starts.items():
    audio_file = audio_dir + audio_name + ".wav"
    audio_segment = segment_audio(audio_file, start_num)
    audio_data[audio_name[:8]] = {
        "audio_file": audio_file,
        "audio_segment": audio_segment,
        "segment_start": start_num,
    }

# One row per token that has at least one video; the audio columns stay None
# when no start time is known for that token.
no_audio = {"audio_file": None, "audio_segment": None, "segment_start": None}
data = []
for token, videos in video_data.items():
    audio = audio_data.get(token, no_audio)
    data.append([
        token,
        videos["TSST"],
        videos["TSST2"],
        audio["audio_file"],
        audio["audio_segment"],
        audio["segment_start"],
    ])

tsst_data = pd.DataFrame(
    data,
    columns=["token", "TSST_video", "TSST2_video", "TSST_audio", "TSST_audio_segment", "segment_start"],
)
display(tsst_data)

In [None]:
def get_video_duration(video_path):
    """
    Function to get the duration of a video
    @param video_path: path to video
    @return: duration of clip in seconds
    """
    clip = VideoFileClip(video_path)
    try:
        return clip.duration
    finally:
        # Release the ffmpeg readers right away: this function is applied to every row of
        # the dataframe, and an unclosed clip keeps its reader open for each call.
        clip.close()

In [None]:
# Show all columns when a dataframe is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)

In [None]:
# get all participant information
participants = pd.read_csv("/data/dst_tsst_22_bi_multi_nt_lab/processed/participant.csv")
# show only the token and the two VAS stress ratings (T1 and T2) that the delta is computed from later
display(participants[["token", "tsst_vas_stress_T1", "tsst_vas_stress_T2"]])

In [None]:
# Tokens keyed by a running index (presumably the leave-one-out folds - confirm)
loo_token = {
    0: 'ML031122',
    1: 'MK230123',
    2: 'JB011222',
    3: 'DK011122',
    4: 'DC553556',
    5: 'WV453056',
    6: 'JB190123',
    7: 'SS291122',
    8: 'AS050123',
    9: 'TO523956',
    10: 'JK261022',
}

test_files_280 = ['NE563556', 'JM463656', 'TF483656']

# Segmented audio files of the tokens above, in the same order as loo_token
loo = [
    '/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/ML031122_tsst_video_segment.wav',
    '/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/MK230123_tsst_video_1_segment.wav',
    '/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/JB011222_tsst_video_segment.wav',
    '/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/DK011122_tsst_video_segment.wav',
    '/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/DC553556_tsst_video_segment.wav',
    '/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/WV453056_tsst_video_segment.wav',
    '/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/JB190123_tsst_video_segment.wav',
    '/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/SS291122_tsst_movie_segment.wav',
    '/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/AS050123_tsst_video_segment.wav',
    '/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/TO523956_tsst_video_segment.wav',
    '/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/JK261022_tsst_video_segment.wav',
]

# Listen to the first segment; audio_clip is kept for the moviepy player in the next cell
sample_audio = loo[0]
audio_clip = AudioFileClip(sample_audio)
Audio(sample_audio)



In [None]:
from moviepy.editor import AudioFileClip
# Display the audio clip (opened in the previous cell) in moviepy's inline player
audio_clip.ipython_display(width=400, maxduration=900)

In [None]:
# delete non-native German speakers from dataframe
print("before mothertongue", len(tsst_data))
tokens_to_drop = participants.loc[participants['mothertongue'] != 'Deutsch', 'token'].tolist()
tsst_data = tsst_data[~tsst_data['token'].isin(tokens_to_drop)]
print("after mothertongue", len(tsst_data))

# Keep only rows whose main video exists and lasts at least MIN_VIDEO_SECONDS.
# NOTE(review): the threshold applied here has always been 300 s (5 minutes), although the
# previous comment/print spoke of 10 minutes and the temporary cell further down uses 600 s.
# The value is left unchanged; confirm which limit is intended.
MIN_VIDEO_SECONDS = 300
tsst_data = tsst_data[tsst_data['TSST_video'].apply(lambda x: get_video_duration(x) >= MIN_VIDEO_SECONDS if pd.notnull(x) else False)]
print("after >=", MIN_VIDEO_SECONDS, "s original video", len(tsst_data))

# add vas_stress data and calculate delta (T2 minus T1)
stress_columns = ['token','tsst_vas_stress_T1','tsst_vas_stress_T2', 'panel_passivevas_speechstress0', 'panel_activevas_speechstress0']
tsst_data = pd.merge(tsst_data, participants[stress_columns], on="token", how="inner")
tsst_data['stress_delta'] = tsst_data['tsst_vas_stress_T2'] - tsst_data['tsst_vas_stress_T1']

# Panel stress is absolute, not relative, since there is only one measurement: average the
# passive and active rating. Rows lacking either rating are dropped first; .copy() makes the
# result an independent frame so the column assignment does not raise SettingWithCopyWarning.
tsst_data = tsst_data.dropna(subset=["panel_passivevas_speechstress0", "panel_activevas_speechstress0"]).copy()
tsst_data['panel_stress_speech_average'] = (tsst_data['panel_passivevas_speechstress0'] + tsst_data['panel_activevas_speechstress0']) / 2

print("after self-assessed stress merge", len(tsst_data))

display(tsst_data)

In [None]:
# Print the averaged panel stress rating for the three tokens listed in test_files_280
for i in range(3):
    is_test_token = tsst_data['token'] == test_files_280[i]
    # first matching row; raises IndexError if the token was filtered out of tsst_data
    panel_stress = tsst_data.loc[is_test_token, 'panel_stress_speech_average'].values[0]
    print(i, panel_stress)

In [None]:
# Save the filtered dataframe next to the audio files (the dataframe index is written as the first, unnamed column)
tsst_data.to_csv("/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/tsst_data.csv")

In [None]:
# TEMPORARY CODE UNTIL VM-PROBLEMS SORTED
# Reads the panel ratings from a copy in a personal home directory instead of participant.csv.
# NOTE(review): this cell filters and merges `tsst_data` a second time. If the regular
# filtering cell above has already been run, `tsst_data` already contains the three panel
# columns, and pandas then suffixes the overlapping columns with _x/_y - confirm that this
# cell is only meant as a replacement for that cell, not as an additional step.
panel = pd.read_csv("/homes/nziegenbein/panel_survey.csv")
# keep only rows where both panel ratings are present, then average the two ratings
panel_stress_speech = panel[["token", "panel_passivevas_speechstress0", "panel_activevas_speechstress0"]].dropna()
panel_stress_speech['panel_stress_speech_average'] = (panel_stress_speech['panel_passivevas_speechstress0'] + panel_stress_speech['panel_activevas_speechstress0']) / 2
display(panel_stress_speech)

# delete non-native German speakers from dataframe
tokens_to_drop = participants.loc[participants['mothertongue'] != 'Deutsch', 'token'].tolist()
tsst_data = tsst_data[~tsst_data['token'].isin(tokens_to_drop)]

# delete all rows whose original video is shorter than 600 s (10 minutes) or missing (NaN)
# NOTE(review): the regular filtering cell above uses 300 s for the same check - confirm which is intended
tsst_data = tsst_data[tsst_data['TSST_video'].apply(lambda x: get_video_duration(x) >= 600 if pd.notnull(x) else False)]

# add the panel stress ratings and their average (inner join: tokens without panel ratings are dropped)
tsst_data = pd.merge(tsst_data, panel_stress_speech, on="token", how="inner")
display(tsst_data)

## Trying Butterworth (lowpass) Filter to reduce noise

In [None]:
# Listen to the first remaining segment. .iloc[0] selects by position: a label lookup with [0]
# raises KeyError as soon as the row labelled 0 has been filtered out of tsst_data.
Audio(tsst_data["TSST_audio_segment"].iloc[0])

In [None]:
import numpy as np
from scipy.io import wavfile
from scipy.signal import butter, lfilter

def butter_lowpass(cutoff, fs, order=5):
    """
    Design a digital Butterworth low-pass filter.
    @param cutoff: cutoff frequency in Hz
    @param fs: sampling rate of the signal in Hz
    @param order: order of the filter
    @return: numerator (b) and denominator (a) coefficients of the filter
    """
    nyquist = 0.5 * fs
    # butter() takes the cutoff as a fraction of the Nyquist frequency
    normal_cutoff = cutoff / nyquist
    b, a = butter(order, normal_cutoff, btype='low', analog=False)
    return b, a

def butter_lowpass_filter(data, cutoff, fs, order=5, axis=-1):
    """
    Apply a Butterworth low-pass filter to a signal.
    @param data: array with the samples to filter
    @param cutoff: cutoff frequency in Hz
    @param fs: sampling rate of the signal in Hz
    @param order: order of the filter
    @param axis: axis of `data` that runs along time. The default (-1, the last axis) is
                 unchanged from before; pass axis=0 for arrays shaped (samples, channels),
                 which is the layout scipy.io.wavfile.read returns for multi-channel files.
    @return: filtered signal with the same shape as `data`
    """
    b, a = butter_lowpass(cutoff, fs, order=order)
    y = lfilter(b, a, data, axis=axis)
    return y

def _to_int16_pcm(samples, source_dtype):
    """Convert filtered samples to 16-bit PCM, scaled according to the dtype the WAV was read as."""
    if np.issubdtype(source_dtype, np.floating):
        # float WAV data is nominally in [-1, 1]
        scaled = samples * 32767
    elif source_dtype == np.uint8:
        # 8-bit WAV data is unsigned with its zero level at 128
        scaled = (samples - 128.0) * 256.0
    else:
        # signed integer PCM: rescale full scale to the int16 range (factor 1 for int16 input)
        scaled = samples * (32767.0 / np.iinfo(source_dtype).max)
    # clip before casting so filter overshoot cannot wrap around
    return np.clip(np.rint(scaled), -32768, 32767).astype(np.int16)

# Load the audio file (.iloc[0]: first remaining row, independent of its index label)
fs, data = wavfile.read(tsst_data["TSST_audio_segment"].iloc[0])

# Define the cutoff frequency and filter order
cutoff_freq = 1200  # Adjust this value based on your needs
order = 6

# Apply the low-pass filter.
# wavfile.read returns multi-channel audio as (samples, channels), while the filter runs along
# the last axis: put time last for filtering and restore the layout afterwards
# (.T leaves one-dimensional mono data unchanged).
filtered_data = butter_lowpass_filter(data.T, cutoff_freq, fs, order).T

temp_path = "/data/dst_tsst_22_bi_multi_nt_lab/processed/audio_files/temp_file.wav"
# Integer PCM samples are already at integer scale; multiplying them by 32767 again
# would overflow int16, so the scaling depends on the dtype the file was read as.
wavfile.write(temp_path, fs, _to_int16_pcm(filtered_data, data.dtype))

Audio(temp_path)


#### Saving DataFrame as csv

### Visualizing panel assessed stress distribution

In [None]:
# Load the raw LimeSurvey panel export; this rebinds `panel`, which earlier held panel_survey.csv
panel = pd.read_csv("/data/dst_tsst_22_bi_multi_nt_lab/raw/dstvtsst_limesurvey_panel_28-02-23.csv")
display(panel)

In [None]:
import matplotlib.pyplot as plt
# Histogram of the passive panel stress ratings in ten-point bins from 0 to 100
ten_point_bins = list(range(0, 101, 10))
panel['VAS0Stress0Passive[SQ001]'].hist(bins=ten_point_bins)
plt.title("Distribution of panel-assessed stress (passive)")
plt.show()

In [None]:
# Histogram of the active panel stress ratings in ten-point bins from 0 to 100
ten_point_bins = list(range(0, 101, 10))
panel['VAS0Stress0Active[SQ001]'].hist(bins=ten_point_bins)
plt.title("Distribution of panel-assessed stress (active)")
plt.show()

In [None]:
# Average the active and the passive rating per row (NaN if either is missing) and plot the distribution
active_rating = panel['VAS0Stress0Active[SQ001]']
passive_rating = panel['VAS0Stress0Passive[SQ001]']
panel['panel_stress'] = active_rating.add(passive_rating).div(2)
panel['panel_stress'].hist(bins=list(range(0, 101, 10)))
plt.title("Distribution of panel-assessed stress (averaged)")
plt.show()

In [None]:
# Absolute disagreement between the active and the passive rating per row
rating_gap = panel['VAS0Stress0Active[SQ001]'] - panel['VAS0Stress0Passive[SQ001]']
panel['panel_stress_difference'] = rating_gap.abs()
panel['panel_stress_difference'].hist(bins=list(range(0, 101, 10)))
plt.title("Distribution of difference in panel-assessed stress between panel members")
plt.show()

### Visualization of cortisol and alpha-amylase stress distribution

Stress reactivity is computed as peak reactivity according to Miller, i.e. the difference between the peak (the highest value after baseline) and the baseline (measurement T1).

In [None]:
# Show the full participant table, then the first amylase measurement as the cell output
display(participants)
participants["tsst_amylase_mean_1"]

In [None]:
# Plot one cortisol trajectory per participant over the five measurement points
x = ["T1", "T2", "T3", "T4", "T5"]
cortisol_columns = ['tsst_cortisol_mean_1', 'tsst_cortisol_mean_2', 'tsst_cortisol_mean_3', 'tsst_cortisol_mean_4', 'tsst_cortisol_mean_5']
# The values may still be strings with a decimal comma at this point (they are only converted
# to float in a later cell), and matplotlib would plot strings as categories. Convert a local
# copy to numbers here; `participants` itself is left untouched.
cortisol_values = participants[cortisol_columns].apply(
    lambda column: pd.to_numeric(column.astype(str).str.replace(',', '.', regex=False))
)
for index, row in cortisol_values.iterrows():
    plt.plot(x, row, marker='o', linestyle='--')
plt.xlabel('Time Points')
plt.ylabel('Cortisol Mean')
plt.title('Trajectories of Cortisol Mean')
# no plt.legend(): the lines carry no labels, so the call would only emit a warning
plt.show()

In [None]:
# Custom function to handle commas as dots while converting to float
def custom_to_float(value):
    """
    Convert a value to float, accepting strings that use a comma as decimal separator.
    @param value: number or numeric string, e.g. "1,5", "1.5" or 3
    @return: the value as float
    """
    has_decimal_comma = isinstance(value, str) and ',' in value
    return float(value.replace(',', '.') if has_decimal_comma else value)

In [None]:
# Convert the five cortisol columns to float (decimal commas are handled by custom_to_float).
# The converted values are written back into `participants` instead of replacing the whole
# frame with these five columns, so all other columns (e.g. the amylase measurements used
# further down) stay available.
cortisol_columns = ['tsst_cortisol_mean_1','tsst_cortisol_mean_2', 'tsst_cortisol_mean_3', 'tsst_cortisol_mean_4', 'tsst_cortisol_mean_5']
participants[cortisol_columns] = participants[cortisol_columns].apply(lambda column: column.map(custom_to_float))
# Find the maximum value among "T2" to "T5" columns for each row
participants['tsst_cortisol_peak'] = participants[cortisol_columns[1:]].max(axis=1)
# Calculate the difference between "baseline" and "peak"
participants['tsst_cortisol_peak_difference'] = participants['tsst_cortisol_peak'] - participants['tsst_cortisol_mean_1']
participants['tsst_cortisol_peak_difference'].hist(bins=[-5,0,5,10,15,20,25,30,35])
plt.title("Distribution of cortisol peak-reactivity")
plt.show()

In [None]:
# Show every column whose name contains "tsst_amylase_mean"
display(participants.filter(like="tsst_amylase_mean"))

In [None]:
# Convert the five amylase columns to float (decimal commas are handled by custom_to_float).
# The converted values are written back into `participants` instead of replacing the whole
# frame with these five columns, so all other columns stay available.
amylase_columns = ['tsst_amylase_mean_1','tsst_amylase_mean_2', 'tsst_amylase_mean_3', 'tsst_amylase_mean_4', 'tsst_amylase_mean_5']
participants[amylase_columns] = participants[amylase_columns].apply(lambda column: column.map(custom_to_float))

# Plot one amylase trajectory per participant over the five measurement points
x = ["T1", "T2", "T3", "T4", "T5"]
for index, row in participants[amylase_columns].iterrows():
    plt.plot(x, row, marker='o', linestyle='--')
plt.xlabel('Time Points')
plt.ylabel('Amylase Mean')
plt.title('Trajectories of Amylase Mean')
plt.show()

In [None]:
# Peak = highest amylase value among the measurements T2 to T5 of each row
later_measurements = ['tsst_amylase_mean_2', 'tsst_amylase_mean_3', 'tsst_amylase_mean_4', 'tsst_amylase_mean_5']
participants['tsst_amylase_peak'] = participants[later_measurements].max(axis=1)
# Peak reactivity = peak minus baseline (T1)
participants['tsst_amylase_peak_difference'] = participants['tsst_amylase_peak'].sub(participants['tsst_amylase_mean_1'])
# bins of width 50 from -100 to 700
participants['tsst_amylase_peak_difference'].hist(bins=list(range(-100, 701, 50)))
plt.title("Distribution of amylase peak-reactivity")
plt.show()