## 提取文本

In [62]:
import pandas as pd
def merge_dialogues(csv_file, label, participant, max_duration=60, segments=None):
    """Merge consecutive transcript rows into segments of bounded time span.

    The transcript is read as a tab-separated file with ``start_time``,
    ``stop_time`` and ``value`` columns. Rows are accumulated into the current
    segment while ``stop_time - segment_start <= max_duration``; the first row
    that would exceed that span closes the segment and starts a new one.

    Args:
        csv_file: Path of the tab-separated transcript file.
        label: Value stored in the first field of every emitted segment.
        participant: Value stored in the second field of every emitted segment.
        max_duration: Maximum span (same unit as the time columns) from the
            start of a segment to the stop time of its last row.
        segments: Optional list to extend in place. When omitted, a fresh
            list is created for this call.

    Returns:
        The list of segments, each
        ``[label, participant, start_time, stop_time, dialogue]``. It is the
        same object as ``segments`` when one was passed in.
    """
    # A literal ``[]`` default would be shared between calls and leak the
    # segments of one call into the next; create the list per call instead.
    if segments is None:
        segments = []

    df = pd.read_csv(csv_file, sep='\t')
    current_start_time = None
    current_end_time = None
    current_parts = []

    for _, row in df.iterrows():
        if pd.isna(row['value']):  # Skip rows where 'value' is NaN
            continue
        start_time = row['start_time']
        stop_time = row['stop_time']
        dialogue = str(row['value'])

        if current_start_time is None:
            # First usable row opens the first segment.
            current_start_time = start_time
            current_parts = [dialogue]
        elif stop_time - current_start_time <= max_duration:
            # Still inside the allowed span: extend the open segment.
            current_parts.append(dialogue)
        else:
            # Span exceeded: close the open segment and start a new one.
            segments.append([label, participant, current_start_time,
                             current_end_time, " ".join(current_parts)])
            current_start_time = start_time
            current_parts = [dialogue]
        current_end_time = stop_time

    # Add the last segment, but only when it spans at least 10 time units.
    # Segments closed inside the loop are not subject to this minimum.
    if current_parts and current_end_time - current_start_time >= 10:
        segments.append([label, participant, current_start_time,
                         current_end_time, " ".join(current_parts)])

    return segments

In [64]:
import os
# Input: the split file listing participants and labels.
file_path = './test_split.csv'

# Output: tab-separated table of merged dialogue segments.
output_file = "./test.csv"

# Only the participant ID and the binary PHQ label columns are read.
df = pd.read_csv(file_path, usecols=['Participant_ID', 'PHQ_Binary'])
segments = []

# Root directory that holds the "<id>_P" participant folders.
source_folder = './'

# Visit every participant and accumulate merged transcript segments.
for index, row in df.iterrows():
    participant_id = str(row['Participant_ID'])
    participant_folder = os.path.join(source_folder, participant_id + "_P")
    if not os.path.exists(participant_folder):
        print(f'Folder not found: {participant_folder}')
        continue

    source_file = os.path.join(participant_folder, participant_id + '_TRANSCRIPT.csv')
    if not os.path.exists(source_file):
        print(f'File not found: {source_file}')
        continue

    segments = merge_dialogues(source_file, row['PHQ_Binary'], participant_id, 60, segments)

# Write all collected segments in one go.
segment_table = pd.DataFrame(
    segments,
    columns=['label', 'participant', 'start_time', 'stop_time', 'dialogue'],
)
segment_table.to_csv(output_file, index=False, sep='\t')

## 提取音频

In [65]:
def add_column_to_csv(csv_file, new_column_name, new_column_values):
    """Add (or overwrite) a column in a tab-separated file, rewriting it in place.

    Args:
        csv_file: Path of the tab-separated file to read and overwrite.
        new_column_name: Name of the column to set.
        new_column_values: Values assigned to that column.
    """
    table = pd.read_csv(csv_file, sep='\t')
    table[new_column_name] = new_column_values
    # Write back to the same path, without the pandas index column.
    table.to_csv(csv_file, sep='\t', index=False)

In [7]:
import pandas as pd
import os
import librosa
import soundfile as sf
import numpy as np

# Load the tab-separated segment table.
# NOTE(review): the variables are named train_* but this cell reads "test.csv".
train_csv_file = "test.csv"
train_df = pd.read_csv(train_csv_file, sep='\t')

# Generator that cuts an audio file into fixed-length clips.
def split_audio(audio_path, target_folder, participant_id, index):
    """Yield the paths of six consecutive 10-second clips cut from an audio file.

    Clip ``i`` covers samples ``[i * 10 * sr, (i + 1) * 10 * sr)`` and is
    written to ``<target_folder>/<participant_id>_<index>_<i>.wav``. No padding
    is applied here, so a slice that runs past the end of the loaded audio is
    written as-is.

    Args:
        audio_path: Path of the audio file to load.
        target_folder: Directory that receives the clips (created if missing).
        participant_id: Used as the first part of each clip file name.
        index: Used as the second part of each clip file name.

    Yields:
        The path of each clip, right after it has been written.
    """
    # Load the audio, requesting a 16000 Hz sample rate.
    y, sr = librosa.load(audio_path, sr=16000)
    samples_per_clip = 10 * sr
    for clip_no in range(6):
        clip = y[clip_no * samples_per_clip:(clip_no + 1) * samples_per_clip]
        # Make sure the destination directory exists before writing.
        os.makedirs(target_folder, exist_ok=True)
        target_file = os.path.join(target_folder, f"{participant_id}_{index}_{clip_no}.wav")
        sf.write(target_file, clip, sr)
        yield target_file

# Iterate over every row of the segment table and cut its audio into clips.
for index, row in train_df.iterrows():
    participant_id = row['participant']
    participant_folder = str(participant_id) + "_P"

    # The participant folder must exist.
    if not os.path.exists(participant_folder):
        print(f"Folder not found for participant {participant_id}")
        continue

    audio_file = row['audio_segment_path']

    # The segment audio must exist. A missing (NaN) path is treated as
    # "not found" because os.path.exists raises TypeError on a float.
    if pd.isna(audio_file) or not os.path.exists(audio_file):
        print(f"Audio file not found for participant {participant_id}")
        continue

    # Per-row CSV listing the clips cut from this segment. The output
    # directory is created up front; to_csv does not create it.
    os.makedirs("audio_csv", exist_ok=True)
    new_csv_path = os.path.join("audio_csv", f"{participant_id}_{index}_audio_segments.csv")

    # Cut the audio and collect the clip paths in order.
    segment_paths = list(split_audio(audio_file, "audio_segment", participant_id, index))
    new_df = pd.DataFrame({'audio_segment_path': segment_paths})

    # Save the per-row CSV.
    new_df.to_csv(new_csv_path, index=False)

    # Record the per-row CSV path in the main table.
    train_df.loc[index, 'new_audio_segments_path'] = new_csv_path

# Save the updated table back to the file it was loaded from.
train_df.to_csv(train_csv_file, index=False, sep='\t')


In [None]:
import pandas as pd
import os
import librosa
import soundfile as sf
import numpy as np

# Load the tab-separated train.csv segment table.
train_csv_file = "train.csv"
train_df = pd.read_csv(train_csv_file, sep='\t')

# Cut one time window out of an audio file and save it.
def split_audio(audio_path, target_folder, participant_id, start_time, stop_time, index):
    """Write the ``[start_time, stop_time)`` window of an audio file as a WAV clip.

    A clip shorter than 10 seconds' worth of samples is zero-padded at the end
    up to that length; a longer clip is written unchanged.

    Args:
        audio_path: Path of the audio file to load.
        target_folder: Directory that receives the clip (created if missing).
        participant_id: Used as the first part of the clip file name.
        start_time: Window start, multiplied by the sample rate to get a sample index.
        stop_time: Window end, converted the same way.
        index: Used as the second part of the clip file name.

    Returns:
        Path of the written file, ``<target_folder>/<participant_id>_<index>.wav``.
    """
    # Load the audio, requesting a 16000 Hz sample rate.
    y, sr = librosa.load(audio_path, sr=16000)

    # Convert the time window to sample offsets and slice.
    first_sample, last_sample = int(start_time * sr), int(stop_time * sr)
    segment = y[first_sample:last_sample]

    # Zero-pad on the right when the clip is shorter than 10 seconds.
    shortfall = 10 * sr - len(segment)
    if shortfall > 0:
        segment = np.pad(segment, (0, shortfall), 'constant')

    os.makedirs(target_folder, exist_ok=True)
    target_file = os.path.join(target_folder, f"{participant_id}_{index}.wav")
    sf.write(target_file, segment, sr)
    return target_file

# Iterate over every row of the train table and cut up to six clips per row.
for index, row in train_df.iterrows():
    participant_id = row['participant']
    start_time = row['start_time']
    stop_time = row['stop_time']
    participant_folder = str(participant_id) + "_P"

    # The participant folder must exist.
    if not os.path.exists(participant_folder):
        print(f"Folder not found for participant {participant_id}")
        continue

    transcript_file = os.path.join(participant_folder, f"{participant_id}_TRANSCRIPT.csv")
    audio_file = os.path.join(participant_folder, f"{participant_id}_AUDIO.wav")

    # Both the transcript and the audio file are required.
    if not (os.path.exists(transcript_file) and os.path.exists(audio_file)):
        print(f"Transcript or audio file not found for participant {participant_id}")
        continue

    transcript_df = pd.read_csv(transcript_file, sep='\t')
    # Keep the sentences that lie entirely inside this row's time range.
    sentences = transcript_df[(transcript_df['start_time'] >= start_time) & (transcript_df['stop_time'] <= stop_time)]

    # Per-row CSV describing the clips cut for this row.
    new_csv_path = os.path.join("audio_csv", f"{participant_id}_{index}_audio_segments.csv")
    segment_rows = []

    # Merge sentences until they span 10 seconds, then cut a 10-second clip.
    segment_index = 0
    current_start_time = None
    current_stop_time = None
    for _, sentence in sentences.iterrows():
        if segment_index >= 6:
            break
        if current_start_time is None:
            current_start_time = sentence['start_time']
        current_stop_time = sentence['stop_time']
        segment_duration = current_stop_time - current_start_time

        if segment_duration >= 10:
            # Enough material: cut exactly 10 seconds from the run start.
            clip_stop_time = current_start_time + 10
        elif segment_index == 5:
            # Sixth clip: take what is there; split_audio pads it to 10 seconds.
            clip_stop_time = current_stop_time
        else:
            # Keep accumulating sentences.
            continue

        # The row index is part of the clip name so that clips of different
        # table rows for the same participant do not overwrite each other.
        segment_path = split_audio(audio_file, "audio", participant_id, current_start_time, clip_stop_time, f"{index}_{segment_index}")
        # Collect the record; it is turned into new_df below. (Assigning the
        # concatenation result to another name would leave new_df empty.)
        segment_rows.append({'start_time': current_start_time, 'stop_time': clip_stop_time, 'audio_segment_path': segment_path})
        segment_index += 1
        current_start_time = None

    # Only rows that produced all six clips are recorded.
    if segment_index == 6:
        new_df = pd.DataFrame(segment_rows, columns=['start_time', 'stop_time', 'audio_segment_path'])
        # to_csv does not create missing directories.
        os.makedirs("audio_csv", exist_ok=True)
        new_df.to_csv(new_csv_path, index=False)
        train_df.loc[index, 'new_audio_segments_path'] = new_csv_path

# Save the updated table with the same tab separator it was read with.
train_df.to_csv(train_csv_file, index=False, sep='\t')

In [67]:
import librosa
import soundfile as sf
import numpy as np


# Tab-separated segment table that this cell reads and later overwrites.
csv_file = "test.csv"

# Directory that receives the extracted audio clips.
target_folder = "audio"

# NOTE(review): `pd` and `os` are not imported in this cell; it relies on
# names defined by earlier notebook cells.
df = pd.read_csv(csv_file, sep='\t')

# Cut one time window out of an audio file and save it.
def split_audio(audio_path, target_folder, participant_id, start_time, stop_time, index, segment_duration=60):
    """Write the ``[start_time, stop_time)`` window of an audio file as a WAV clip.

    A clip shorter than ``segment_duration`` seconds' worth of samples is
    zero-padded at the end up to that length; a longer clip is written unchanged.

    Args:
        audio_path: Path of the audio file to load.
        target_folder: Directory that receives the clip (created if missing).
        participant_id: Used as the first part of the clip file name.
        start_time: Window start, multiplied by the sample rate to get a sample index.
        stop_time: Window end, converted the same way.
        index: Used as the second part of the clip file name.
        segment_duration: Minimum clip length in seconds, enforced by padding.

    Returns:
        Path of the written file, ``<target_folder>/<participant_id>_<index>.wav``.
    """
    # Load the audio, requesting a 16000 Hz sample rate.
    y, sr = librosa.load(audio_path, sr=16000)

    # Convert the time window to sample offsets and slice.
    first_sample, last_sample = int(start_time * sr), int(stop_time * sr)
    segment = y[first_sample:last_sample]

    # Zero-pad on the right when the clip is shorter than the requested length.
    shortfall = segment_duration * sr - len(segment)
    if shortfall > 0:
        segment = np.pad(segment, (0, shortfall), 'constant')

    os.makedirs(target_folder, exist_ok=True)
    target_file = os.path.join(target_folder, f"{participant_id}_{index}.wav")
    sf.write(target_file, segment, sr)
    return target_file


# One entry per table row: the clip path, or None when the source is missing.
new_column_values = []

# Iterate over every row of the segment table.
for index, row in df.iterrows():
    participant_folder = os.path.join(source_folder, str(row['participant']) + "_P")
    source_file = os.path.join(participant_folder, str(row['participant']) + '_AUDIO.wav')

    path = None
    if not os.path.exists(participant_folder):
        print(f'Folder not found: {participant_folder}')
    elif not os.path.exists(source_file):
        print(f'File not found: {source_file}')
    else:
        start_time = row['start_time']
        stop_time = row['stop_time']
        path = split_audio(source_file, target_folder, row['participant'], start_time, stop_time, index)

    # Always append, so the list stays aligned with the table rows;
    # df.insert below raises when the lengths differ.
    new_column_values.append(path)


# Insert the clip paths as the second column and write the table back.
column_index = 1
df.insert(column_index, 'audio_segment_path', new_column_values)
df.to_csv(csv_file, index=False, sep='\t')

## 提取视频

In [9]:
import pandas as pd
import os
import numpy as np

# Cut a time window out of a per-row visual feature file.
def split_visual(visual_path, target_folder, participant_id, start_time, stop_time, index, frame_rate=30, target_frames=150):
    """Extract the rows of a feature file that fall inside a time window.

    The window is converted to row positions with ``frame_rate`` and rows are
    taken by position, so this assumes one row per frame with the first row
    at time 0 (TODO confirm against the feature files).

    When the window holds more than ``target_frames`` rows, every
    ``total // target_frames``-th row is kept and the result is cut to the
    first ``target_frames`` rows. Because the step is floor-divided, that cut
    can drop rows from the end of the window (e.g. 299 rows with a target of
    150 gives step 1 and keeps only the first 150).

    Args:
        visual_path: Path of the feature file, with fields separated by ``', '``.
        target_folder: Directory that receives the output (created if missing).
        participant_id: Used in the output file name.
        start_time: Window start; multiplied by ``frame_rate`` for the first row.
        stop_time: Window end; multiplied by ``frame_rate`` for the end row (exclusive).
        index: Used in the output file name.
        frame_rate: Rows per unit of time.
        target_frames: Maximum number of rows to keep.

    Returns:
        Path of the written CSV,
        ``<target_folder>/visual_features_<participant_id>_<index>.csv``.
    """
    # Multi-character separators need the python parsing engine.
    visual_df = pd.read_csv(visual_path, sep=', ', engine='python')

    # Convert the time window to row positions.
    start_frame = int(start_time * frame_rate)
    end_frame = int(stop_time * frame_rate)

    # Sub-sample only when the window holds more rows than wanted.
    total_frames = end_frame - start_frame
    frame_skip = total_frames // target_frames if total_frames > target_frames else 1

    # Take the strided rows, then cap the count at target_frames.
    extracted_df = visual_df.iloc[start_frame:end_frame:frame_skip].iloc[:target_frames]

    # exist_ok avoids the check-then-create race and matches split_audio.
    os.makedirs(target_folder, exist_ok=True)

    output_file = os.path.join(target_folder, f'visual_features_{participant_id}_{index}.csv')
    extracted_df.to_csv(output_file, index=False)

    return output_file

# One entry per table row: the feature-file path, or None when the source is missing.
new_column_values = []

# Output directory for the extracted feature files.
target_folder = 'visual'

# Tab-separated segment table that this cell reads and later overwrites.
csv_file = "test.csv"

# Root directory that holds the "<id>_P" participant folders.
source_folder = "./"

# Read the segment table.
df = pd.read_csv(csv_file, sep='\t')

for index, row in df.iterrows():
    participant_folder = os.path.join(source_folder, str(row['participant']) + "_P")
    source_file = os.path.join(participant_folder, str(row['participant']) + '_CLNF_features.txt')

    path = None
    if not os.path.exists(participant_folder):
        print(f'Folder not found: {participant_folder}')
    elif not os.path.exists(source_file):
        print(f'File not found: {source_file}')
    else:
        start_time = row['start_time']
        stop_time = row['stop_time']
        path = split_visual(source_file, target_folder, row['participant'], start_time, stop_time, index)

    # Always append, so the list stays aligned with the table rows;
    # df.insert below raises when the lengths differ.
    new_column_values.append(path)


column_index = 1

# Insert the feature-file paths as the second column.
df.insert(column_index, 'visual_segment_path', new_column_values)

# Save the modified table back to the same file.
df.to_csv(csv_file, index=False, sep='\t')


In [2]:
import pandas as pd

# Read the tab-separated segment table.
csv_file = "test.csv"

# Remove the 'audio_segment_path' column when it is present; errors='ignore'
# makes the drop a no-op otherwise.
# NOTE(review): an earlier comment on this cell named 'visual_segment_path',
# while the code removes 'audio_segment_path' — confirm which is intended.
df = pd.read_csv(csv_file, sep='\t').drop(columns=['audio_segment_path'], errors='ignore')

# Write the table back to the same file.
df.to_csv(csv_file, sep='\t', index=False)