In [None]:
import os
import pandas as pd
import h5py
from mne import create_info
import torch
import numpy as np

# --- Existing functions (load_meg_data and the revised generate_meg_labels) ---
def load_meg_data(hdf5_file_path):
    """Load a MEG recording from an HDF5 file and build a matching MNE Info.

    The file must contain a ``'data'`` dataset, whose first axis is treated as
    the channel axis, and a ``'times'`` dataset of sample timestamps. The
    sampling frequency is derived from the spacing of the first two
    timestamps.

    Parameters:
        - hdf5_file_path: Path of the HDF5 file to read.

    Returns:
        - raw_data: NumPy array holding the full ``'data'`` dataset.
        - info: MNE Info with channels named ``'MEG 001'``, ``'MEG 002'``, ...
          (one per row of ``raw_data``), all typed ``'mag'``.

    Raises:
        - ValueError: If ``'times'`` has fewer than two entries, or if the
          first two timestamps are not strictly increasing.
    """
    with h5py.File(hdf5_file_path, 'r') as f:
        times = f['times'][:]
        # Validate the time axis before reading the (much larger) data array.
        if len(times) < 2:
            raise ValueError("Not enough time points in 'times' to determine sampling frequency.")
        dt = float(times[1] - times[0])
        # `not dt > 0` also rejects NaN, which would otherwise propagate
        # silently into the sampling frequency.
        if not dt > 0:
            raise ValueError(
                f"Time points in 'times' must be strictly increasing to determine "
                f"sampling frequency (first step is {dt})."
            )
        raw_data = f['data'][:]  # Materialise the dataset as a NumPy array

    sfreq = 1.0 / dt
    n_channels = raw_data.shape[0]
    channel_names = [f'MEG {i+1:03d}' for i in range(n_channels)]
    info = create_info(ch_names=channel_names, sfreq=sfreq, ch_types=['mag'] * n_channels)
    return raw_data, info

# Values of the TSV 'kind' column that count as speech; every other kind is silence.
_SPEECH_KINDS = ('word', 'phoneme')

# Slack, in samples, added before flooring a time-to-sample conversion.
# Binary floating point makes products such as 0.29 * 100 come out as
# 28.999999999999996; plain truncation would then land one sample early.
# The slack is far below one sample, so genuinely fractional positions
# still round down exactly as before.
_SAMPLE_TOLERANCE = 1e-6


def _time_to_sample(seconds, sfreq):
    """Convert a time offset in seconds to a sample index, flooring with float slack."""
    return int(np.floor(seconds * sfreq + _SAMPLE_TOLERANCE))


def generate_meg_labels(tsv_data, info, start_time, end_time):
    """
    Generates 0/1 speech/silence labels aligned to MEG sfreq, without visualization.

    Each event holds from its own onset until the next event (or the end of
    the window). The state at the start of the window is taken from the
    latest event before ``start_time``; if there is none, the window starts
    in silence. When several events share a timestamp, the one appearing
    first in ``tsv_data`` wins.

    Parameters:
        - tsv_data: DataFrame with a 'timemeg' column (event onset in seconds,
          convertible to float) and a 'kind' column. Rows need not be sorted.
          The frame is not modified.
        - info: Mapping providing the sampling frequency as info['sfreq']
          (e.g. an MNE Info object).
        - start_time, end_time: Time window (in seconds) to generate labels for.

    Returns:
        - aligned_labels: float32 NumPy array of 0/1 labels with
          floor((end_time - start_time) * sfreq) entries. Empty (after
          printing a warning) when the window contains no full sample.
    """
    event_times = tsv_data['timemeg'].astype(float).to_numpy()
    event_is_speech = tsv_data['kind'].isin(_SPEECH_KINDS).to_numpy()

    sfreq = info['sfreq']
    num_samples = _time_to_sample(end_time - start_time, sfreq)

    # Handle cases where num_samples might be 0 or negative
    if num_samples <= 0:
        print(f"Warning: Time window too small or invalid ({start_time}-{end_time}s) for sfreq {sfreq}Hz. Returning empty labels.")
        return np.zeros(0, dtype=np.float32)

    # A stable sort keeps file order among equal timestamps, which makes both
    # the "latest event before the window" and the duplicate resolution
    # deterministic. NaN timestamps sort last and fail every comparison below.
    order = np.argsort(event_times, kind='stable')
    event_times = event_times[order]
    event_is_speech = event_is_speech[order]

    # State in force when the window opens.
    preceding = np.flatnonzero(event_times < start_time)
    initial_is_speech = bool(event_is_speech[preceding[-1]]) if preceding.size else False

    # Events inside the window, one per distinct timestamp. np.unique reports
    # the first occurrence of each value.
    in_window = (event_times >= start_time) & (event_times <= end_time)
    window_times, first_occurrence = np.unique(event_times[in_window], return_index=True)
    window_is_speech = event_is_speech[in_window][first_occurrence]

    # Segment boundaries in samples, clipped to the window.
    onsets = [0]
    onsets.extend(
        min(num_samples, max(0, _time_to_sample(t - start_time, sfreq)))
        for t in window_times
    )
    offsets = onsets[1:] + [num_samples]
    states = [initial_is_speech] + [bool(flag) for flag in window_is_speech]

    # Segments are contiguous and non-overlapping, so starting from all-zero
    # labels only the speech segments need to be written.
    aligned_labels = np.zeros(num_samples, dtype=np.float32)
    for is_speech, onset, offset in zip(states, onsets, offsets):
        if is_speech:
            aligned_labels[onset:offset] = 1.0

    return aligned_labels


In [None]:
# --- Channel selection mask ---
# Indices of the MEG channels that are kept when the data is masked.
SENSORS_SPEECH_MASK = [
    18, 20, 22, 23, 45, 120, 138, 140, 142, 143, 145, 146,
    147, 149, 175, 176, 177, 179, 180, 198, 271, 272, 275,
]

# --- Batch-processing configuration (outputs are stored as NumPy arrays) ---
base_folder = "./libribrain/data"  # Replace with your dataset root folder
output_labels_folder = "./libribrain/data/labels_npy"  # Destination for the .npy label files
output_meg_data_folder = "./libribrain/data/meg_data_npy"  # Destination for the .npy MEG files

# Create both output folders up front; an existing folder is not an error.
os.makedirs(output_labels_folder, exist_ok=True)
os.makedirs(output_meg_data_folder, exist_ok=True)

# Echo the configuration, one setting per line.
print(
    f"Base folder: {base_folder}",
    f"Output Labels folder: {output_labels_folder}",
    f"Output MEG Data folder: {output_meg_data_folder}",
    f"SENSORS_SPEECH_MASK applied: {SENSORS_SPEECH_MASK}",
    "MEG data will be scaled by dividing by 1e-07.",
    sep="\n",
)


In [None]:
# Batch conversion: for every preprocessed MEG recording found under
# `base_folder`, write (1) a per-sample speech/silence label vector and
# (2) the channel-masked, rescaled MEG data, both as .npy files.

# File-name suffix of the preprocessed MEG recordings. It is swapped for
# '_events.tsv' / '_labels.npy' to derive the companion file names.
MEG_FILE_SUFFIX = '_proc-bads+headpos+sss+notch+bp+ds_meg.h5'

# MEG values are divided by this factor (i.e. multiplied by 1e+07) before saving.
scaling_factor = 1e-07

for root, dirs, files in os.walk(base_folder):
    for file in files:
        if not file.endswith("_meg.h5"):
            continue

        hdf5_file_path = os.path.join(root, file)

        # Derive the events TSV path: same location, but with the 'serialised'
        # directory replaced by 'events' and the MEG suffix replaced by
        # '_events.tsv'. Adjust this to your own layout and naming scheme.
        parts = hdf5_file_path.split(os.sep)
        if 'serialised' not in parts:
            print(f"Skipping {hdf5_file_path}: 'serialised' folder not found in path for TSV inference.")
            continue
        serialised_idx = parts.index('serialised')
        tsv_file_name = file.replace(MEG_FILE_SUFFIX, '_events.tsv')
        tsv_parts = parts[:serialised_idx] + ['events'] + parts[serialised_idx + 1:-1] + [tsv_file_name]
        # Re-join with os.sep rather than os.path.join(*parts): the latter
        # drops the empty leading component of an absolute POSIX path (losing
        # the root '/') and yields a drive-relative path for 'C:' on Windows.
        tsv_file_path = os.sep.join(tsv_parts)

        if not os.path.exists(tsv_file_path):
            print(f"Corresponding TSV file not found for: {tsv_file_path}. Skipping.")
            continue

        print(f"\nProcessing MEG: {hdf5_file_path}")
        print(f"With TSV: {tsv_file_path}")

        # Top-level batch boundary: report the failure and move on to the
        # next recording instead of aborting the whole run.
        try:
            tsv_data = pd.read_csv(tsv_file_path, sep='\t')
            meg_raw_np, info = load_meg_data(hdf5_file_path)

            # Total duration of the recording, so labels span the whole file.
            total_meg_samples = meg_raw_np.shape[1]
            total_duration = total_meg_samples / info['sfreq']

            # --- 1. Generate and save the labels (.npy) ---
            aligned_labels_np = generate_meg_labels(tsv_data, info, start_time=0, end_time=total_duration)

            if aligned_labels_np.size > 0:
                if aligned_labels_np.size != total_meg_samples:
                    # Labels and data are meant to be indexed sample by
                    # sample downstream, so make any length drift visible.
                    print(f"Warning: label count ({aligned_labels_np.size}) differs from MEG sample count "
                          f"({total_meg_samples}) for {hdf5_file_path}.")

                output_label_filename = file.replace(MEG_FILE_SUFFIX, '_labels.npy')
                output_label_full_path = os.path.join(output_labels_folder, output_label_filename)

                np.save(output_label_full_path, aligned_labels_np)
                print(f"Successfully saved labels (shape {aligned_labels_np.shape}) to: {output_label_full_path}")
            else:
                print(f"No labels generated for {hdf5_file_path}. Skipping label save.")

            # --- 2. Mask, rescale and save the MEG data (.npy) ---

            # a. Channel selection (channels are on the first axis).
            meg_masked_np = meg_raw_np[SENSORS_SPEECH_MASK, :]
            print(f"Original MEG shape: {meg_raw_np.shape}, Masked MEG shape: {meg_masked_np.shape}")

            # b. Rescaling.
            meg_scaled_np = meg_masked_np / scaling_factor

            # The file name marks the data as masked and scaled.
            output_meg_filename = file.replace('.h5', '_masked_scaled.npy')
            output_meg_full_path = os.path.join(output_meg_data_folder, output_meg_filename)

            np.save(output_meg_full_path, meg_scaled_np.astype(np.float32))  # Stored as float32
            print(f"Successfully saved masked and scaled MEG data (shape {meg_scaled_np.shape}) to: {output_meg_full_path}")

        except Exception as e:
            print(f"Error processing {hdf5_file_path} or {tsv_file_path}: {e}")
            continue

print("\nBatch processing complete.")