### 编辑人:苏则茜
### Project:Prob-Na-Learning
For motion extraction of worm trajectories on different shapes of Petri dish  
Although it is called 'general' version for motion parameter extraction, it works **only to single point gradient or non-gradient test plate**  
For probabilitic learning training plate, see ***Distribution_Extract*** codes


### 编辑日志
@23-12-02整理了所有处理和写出csv的步骤，输出为处理了基本运动参数和实验条件等信息的csv文件  
@23-12-09增加了距离区间面积标签；将时间区间默认设置为10min一组，距离区间默认5mm一组  
@23-12-11抽取运动参数批量化，可一次性跑一个文件下所有的实验文件，但是总时长需要一致  
@23-12-12增加角速率列  
@24-3-4与MP（multiple-point）的处理相区分，重命名为OP_Motion_Extract  
@24-3-29解决warning bug
@24-10-14加入了平滑，轨迹先平滑后计算运动参数

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
import matplotlib as mpl
import os
import csv
import matplotlib.ticker as ticker

In [2]:
# 定义角度计算公式
def ang_cal(vec_1,vec_2):
    dot_pro = np.dot(vec_1, vec_2)
    mod_1 = np.sqrt(np.dot(vec_1,vec_1))
    mod_2 = np.sqrt(np.dot(vec_2,vec_2)) 
    if mod_1 == 0 or mod_2 ==0:
        angle = 0
    else:
        cos = dot_pro/(mod_1*mod_2)
        if np.isnan(cos) == True:
            angle = np.nan
        else:  
            angle = np.arccos(round(cos,1))  #弧度制
    return angle

def clws_delta_phi(vec_1, vec_2, vec_0 = [1,0]):
    # vec_1 is the first vector, and it rotates to the vec_2
    vec_0 = np.array(vec_0)
    agl_1 = ang_cal(vec_1,vec_2)
    agl_2 = ang_cal(vec_0, vec_1)
    agl_3 = ang_cal(vec_0, vec_2)
    if agl_1 == 0:
        agl_1 = 0
        # print('angle = 0°')
    elif agl_1 == np.pi:
        agl_1 = np.pi
        # print('angle = 180°')
    elif vec_1[1] >= 0 and vec_2[1] >= 0:    #同时在第一第二象限
        if (agl_3 - agl_2) > 0:
            agl_1 = (-1)*np.abs(agl_1)
        else:
            agl_1 = np.abs(agl_1)
    elif vec_1[1] <= 0 and vec_2[1] <= 0:    #同时在第三第四象限
        if (agl_3 - agl_2) > 0:
            agl_1 = np.abs(agl_1)
        else:
            agl_1 = (-1)*np.abs(agl_1)
    elif vec_1[1] >= 0 and vec_2[1] <= 0:
        if (agl_3 + agl_2) > np.pi:
            agl_1 = (-1)*np.abs(agl_1)
        else:                                  #如果等于180°认为是顺时针旋转
            agl_1 = np.abs(agl_1)
    elif vec_1[1] <= 0 and vec_2[1] >= 0:
        if (agl_3 + agl_2) < np.pi:
            agl_1 = (-1)*np.abs(agl_1)
        else:                                  #如果等于180°认为是顺时针旋转
            agl_1 = np.abs(agl_1)   
    return agl_1*180/np.pi              # 输出为角度制

In [3]:
def Sliding_CTX_Calculation(df_worms, df_ROI, idx, hlf_spd_inv, hlf_spd_agl_inv, hlf_agl_inv, 
                            window_size=20, frame_rate=20, track_jump_frame=1, pixel_length=0.025):
    """
    优化后的速度、角速度和CTX计算函数，计算速度和角速度完全独立。
    """
    # 滤波与预处理
    df_slide = df_worms[df_worms.ID == idx].copy()
    df_slide['X'] = df_slide['X'].rolling(window=window_size, center=True).mean()
    df_slide['Y'] = df_slide['Y'].rolling(window=window_size, center=True).mean()

    # 数据准备
    trajectory = df_slide[['X', 'Y', 'Timestamp']].values.astype(float)
    x, y, timestamps = trajectory[:, 0], trajectory[:, 1], trajectory[:, 2] / frame_rate

    # 定义辅助函数
    def compute_delta(data, bins, scale_factor=1):
        return (data[bins:] - data[:-bins]) * scale_factor

    def smooth_velocity(x, y, timestamps, bins, pixel_length):
        delta_x = compute_delta(x, bins, pixel_length)
        delta_y = compute_delta(y, bins, pixel_length)
        delta_t = compute_delta(timestamps, bins)
        velocity = np.column_stack((delta_x / delta_t, delta_y / delta_t))
        speed = np.linalg.norm(velocity, axis=1)
        return velocity, speed, delta_t

    # 速度计算与异常检测
    bins_spd = int((hlf_spd_inv * frame_rate) // track_jump_frame) * 2
    velocity_spd, speed_spd, delta_t_spd = smooth_velocity(x, y, timestamps, bins_spd, pixel_length)

    mean_delta_t = np.nanmean(delta_t_spd)
    threshold = hlf_spd_inv * 2
    outlier_id = mean_delta_t > 1.25 * threshold

    if outlier_id:
        print(f'平均delta_t超出阈值 ({mean_delta_t:.2f} > {threshold:.2f}), ID: {idx}')

    # 补充 NaN 填充
    velocity_spd = np.pad(velocity_spd, ((bins_spd // 2, bins_spd // 2), (0, 0)), constant_values=np.nan)
    speed_spd = np.pad(speed_spd, (bins_spd // 2, bins_spd // 2), constant_values=np.nan)

    # 角速度计算
    bins_agl = int((hlf_spd_agl_inv * frame_rate) // track_jump_frame) * 2
    hlf_bins_agl = int((hlf_agl_inv * frame_rate) // track_jump_frame)
    velocity_agl, _, time_step_agl = smooth_velocity(x, y, timestamps, bins_agl, pixel_length)

    angular_velocity = np.zeros(len(time_step_agl))
    valid_idx = np.arange(hlf_bins_agl, len(time_step_agl) - hlf_bins_agl)

    for i in valid_idx:
        delta_phi = clws_delta_phi(velocity_agl[i - hlf_bins_agl], velocity_agl[i + hlf_bins_agl])
        delta_t = time_step_agl[i + hlf_bins_agl] - time_step_agl[i - hlf_bins_agl]
        angular_velocity[i] = delta_phi / delta_t if delta_t > 0 else 0

    # CTX计算
    points = np.array([list(map(int, df_ROI.loc[0, col])) for col in df_ROI.columns])
    vec_left, vec_right = points[1] - points[2], points[2] - points[1]
    ctx = velocity_spd[:, 0] / np.linalg.norm(velocity_spd, axis=1)
    bearing_left = np.arctan2(velocity_spd[:, 1], velocity_spd[:, 0]) - np.arctan2(vec_left[1], vec_left[0])
    bearing_right = np.arctan2(velocity_spd[:, 1], velocity_spd[:, 0]) - np.arctan2(vec_right[1], vec_right[0])

    # 合并结果
    df_slide['speed'] = speed_spd
    df_slide['x_velocity'], df_slide['y_velocity'] = velocity_spd[:, 0], velocity_spd[:, 1]
    df_slide['agl_velocity'] = angular_velocity
    df_slide['agl_speed'] = np.abs(angular_velocity)
    df_slide['CTX'] = ctx
    df_slide['bearing_left'] = bearing_left
    df_slide['CTX_left'] = np.cos(bearing_left)
    df_slide['bearing_right'] = bearing_right
    df_slide['CTX_right'] = np.cos(bearing_right)
    df_slide['X_org'] = df_worms[df_worms.ID == idx]['X']
    df_slide['Y_org'] = df_worms[df_worms.ID == idx]['Y']

    return df_slide, idx if outlier_id else False


In [4]:
# 速度，角速度，bearing angle, Dist_to_center, ctx计算函数
# 输入：筛选为虫子的csv文件，画圆的csv文件，ID索引，速度平滑，角速度速度平滑，角速度平滑，追踪帧率，跳帧数量，像素长度
def Sliding_CTX_Calculation(df_worms, df_ROI, idx, sm_inv, spd_sm_inv, hlf_sm_inv,window_size = 20, frame_rate = 20, track_jump_frame = 1, pixel_length = 0.025):
    # 只适用于线性梯度，因为CTX只计算到左和右边缘的
    # 速度平滑窗s sm_inv
    # 计算角速度的速度平滑窗s spd_sm_inv
    # 角速度平滑床s hlf_sm_inv
    
    # ======================================数据平滑=================
    # 1. 移动窗口平滑
    # 对X和Y坐标进行移动平均
    df_slide = df_worms[df_worms.ID==idx].copy()
    df_slide['X'] = df_slide['X'].rolling(window=window_size, center=True).mean()
    df_slide['Y'] = df_slide['Y'].rolling(window=window_size, center=True).mean()

    # ======================================速度计算=================
    half_bins_spd = int((sm_inv*frame_rate)//track_jump_frame)
    bins_spd = 2*half_bins_spd
#     print(f'速度总平滑窗为{sm_inv*2}s,bins数量为{2*half_bins_spd}个')

    trajectory_0 = df_slide[['X','Y','Timestamp']].values      # 提取x,y坐标和时间戳
    trajectory_1 = (trajectory_0.copy()).astype('float')                            # 转float数据类型
    x = trajectory_1[:,0]
    y = trajectory_1[:,1]
    delta_x = (x[bins_spd:]-x[:len(x)-bins_spd])*pixel_length                       # 得到减去头尾数据点的delta_x和delta_y
    delta_y = (y[bins_spd:]-y[:len(y)-bins_spd])*pixel_length
    time_step = trajectory_1[:,2]/frame_rate    
    time_step_vec = time_step[bins_spd:]-time_step[:len(time_step)-bins_spd]        # 得到减去头尾数据点的对应delta-x和delta-y的时间
    print('平均时间间隔'+str(np.average(time_step_vec))+'应该等于总平滑窗'+str(sm_inv*2))
    
    # 根据平均时长筛选轨迹
    mean_time_inv = np.average(time_step_vec)   # 平均时间间隔
    n = 1.25                                    # 平均时间间隔长度
    outlier_id = False
    if mean_time_inv/(sm_inv*2) >= n:
        outlier_id = True
        print(f'实际平均间隔大于{str(n)}倍，，ID为{idx}')
    velocity_bef = np.dstack((delta_x/time_step_vec,delta_y/time_step_vec))[0]      # 使用dstack函数合并x,y方向计算的速度为一个矩阵
    speed_bef = np.linalg.norm(velocity_bef, axis = 1).reshape(-1,1)
    nan_bin_vec = np.full((half_bins_spd,2),np.nan)
    nan_bin_spd = np.full((half_bins_spd,1),np.nan)
    velocity = np.vstack((nan_bin_vec, velocity_bef, nan_bin_vec))                  # 速度向量
    speed = np.vstack((nan_bin_spd, speed_bef, nan_bin_spd))                        # 速率向量
#     print(f'补齐nan后的速度形状:{len(velocity)}应该等于轨迹形状')
    df_idx = df_slide[['ID','X','Y','Timestamp']].copy()      
    df_idx['speed'] = pd.Series(speed[:,0],index = df_idx.index)                    # 将speed和velocity加入dataframe
    df_idx['x_velocity'] = pd.Series(velocity[:,0],index = df_idx.index)
    df_idx['y_velocity'] = pd.Series(velocity[:,1],index = df_idx.index)
    
    # ======================================================角速度计算===========

    half_bins_spd_agl = int((spd_sm_inv*frame_rate)//track_jump_frame)
    bins_spd_agl = 2*half_bins_spd_agl                                 # 角速度速度平滑窗
    hlf_agl_bins = int((hlf_sm_inv*frame_rate)//track_jump_frame)      # 角速度半平滑窗
    trajectory_0 = df_slide[df_slide['ID']==idx][['X','Y','Timestamp']].values      # 提取x,y坐标和时间戳
    trajectory_1 = (trajectory_0.copy()).astype('float')                            # 转float数据类型
    x = trajectory_1[:,0]
    y = trajectory_1[:,1]
    delta_x = (x[bins_spd_agl:]-x[:len(x)-bins_spd_agl])*pixel_length                                      # 得到减去头尾数据点的delta_x和delta_y
    delta_y = (y[bins_spd_agl:]-y[:len(y)-bins_spd_agl])*pixel_length
    time_step = trajectory_1[:,2]/frame_rate    
    time_step_vec = time_step[bins_spd_agl:]-time_step[:len(time_step)-bins_spd_agl]        # 得到减去头尾数据点的对应delta-x和delta-y的时间
    velocity_bef = np.dstack((delta_x/time_step_vec,delta_y/time_step_vec))[0]             # 使用dstack函数合并x,y方向计算的速度为一个矩阵
    speed_bef = np.linalg.norm(velocity_bef, axis = 1).reshape(-1,1)
    nan_bin_vec = np.full((half_bins_spd_agl,2),np.nan)
    nan_bin_spd = np.full((half_bins_spd_agl,1),np.nan)
    velocity = np.vstack((nan_bin_vec, velocity_bef, nan_bin_vec))
    speed = np.vstack((nan_bin_spd, speed_bef, nan_bin_spd))[:,0]
    time_step_vector = np.vstack((nan_bin_spd, time_step_vec.reshape(-1,1), nan_bin_spd))[:,0]
    
    df_speed_cal = pd.DataFrame(speed,columns = ['speed'])              # 生成一个包含speed的df并提取索引
    
    idx_vec = df_speed_cal.index                                        # 生成索引列表时先掐头去尾再取非零值
    df_washed_speed = df_speed_cal[half_bins_spd_agl:len(idx_vec)-half_bins_spd_agl]
    idx_nz_vec = df_washed_speed[df_washed_speed['speed']!=0].index     # 索引是相对于总长度，但是这个列表中直接去掉了头尾速度为nan的和速率为0的点的索引
    df_speed_cal['agl_velocity'] = pd.Series([np.nan]*len(idx_vec))

    for i in range(hlf_agl_bins,len(idx_nz_vec)-hlf_agl_bins):
        n = hlf_agl_bins
        agl_i = clws_delta_phi(velocity[idx_nz_vec[i-n]],velocity[idx_nz_vec[i+n]])        
        delta_t_i = time_step[idx_nz_vec[i+n]]-time_step[idx_nz_vec[i-n]]
        agl_vel_i = agl_i/delta_t_i

        if np.isnan(agl_vel_i) == False:
            df_speed_cal['agl_velocity'].loc[idx_nz_vec[i]] = agl_vel_i
        else:
#             print(velocity[idx_nz_vec[i-n]],velocity[idx_nz_vec[i+n]])
            df_speed_cal['agl_velocity'].loc[idx_nz_vec[i]] = 0 
    # 将angular_velocity和angular_speed加入dataframe(df_idx)
    df_idx['agl_velocity'] = pd.Series(df_speed_cal['agl_velocity'].values,index = df_idx.index)
    df_idx.loc[:,'x_velocity_agl'] = velocity[:,0]
    df_idx.loc[:,'y_velocity_agl'] = velocity[:,1]
    df_idx.loc[df_idx.speed == 0, 'agl_velocity'] = 0                 # 将speed为0的点角速度设为0
    df_idx['agl_speed'] = np.abs(df_idx['agl_velocity'])
    
    # ===========================================计算CTX=====================================
    points = []
    for col in df_ROI.columns[:]:
        p = df_ROI.loc[0,col]
        points.append([int(p[0]), int(p[1])])
    points = np.array(points)

    # 目标方向（left and right）
    vec_left = points[1]-points[2]
#     print(f'CTX_left的目标方向向量：{vec_left}')
    vec_right = points[2]-points[1]
#     print(f'CTX_right的目标方向向量：{vec_right}')
    # 计算bearing angle和CTX
    # 提取所有数据点速度向量
    vel_vec = df_idx[['x_velocity','y_velocity']].values
    bearing_left = []
    bearing_right = []
    # 不经过角度直接计算ctx
    ctxs = []
    for i in range(len(vel_vec)):
        agl_i_left = clws_delta_phi(vec_left,vel_vec[i,:])
        agl_i_left = agl_i_left/180*np.pi     # bearing angle使用的是弧度制
        bearing_left.append(agl_i_left)                                      # 使用弧度制
        
        vel_vec_i = vel_vec[i,:]
        ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)
        
        ctxs.append(ctx_i)
        
        agl_i_right = clws_delta_phi(vec_right,vel_vec[i,:])
        agl_i_right = agl_i_right/180*np.pi
        bearing_right.append(agl_i_right)
        
    df_idx['CTX'] = pd.Series(ctxs, index = df_idx.index)
    df_idx['bearing_left'] = pd.Series(bearing_left, index = df_idx.index)
    df_idx['CTX_left']=df_idx['bearing_left'].apply(np.cos,axis = 0)
    df_idx['bearing_right'] = pd.Series(bearing_right, index = df_idx.index)
    df_idx['CTX_right']=df_idx['bearing_right'].apply(np.cos,axis = 0)
    df_idx['X_org'] = df_worms[df_worms.ID==idx]['X']                     # 将平滑前的轨迹也加入df
    df_idx['Y_org'] = df_worms[df_worms.ID==idx]['Y']
    
    if outlier_id:
        return df_idx, idx
    else:
        return df_idx, outlier_id

In [5]:
def Insert_SharpTurn(df_als, min_agl = 90):
    print('=======================开始事件分类===================================')
    print('注：事件分类仅完成分类（turn = 1, run = 0），tunrning rate的计算较为灵活，在汇总分析作图时使用')
    # 新建一列为Event
    if 'Event' in df_als:
        df_als = df_als.drop('Event', axis = 1)
    df_als.loc[:,'Event'] = 0
    df_als.loc[df_als.agl_speed >= min_agl, 'Event'] = 1
    return df_als

In [6]:
# 边缘裁剪
def Cut_edge(df, x_cut, y_cut, pixel_length = 0.025):
    # 距离换算
    x_cut_pixel = x_cut/pixel_length
    y_cut_pixel = y_cut/pixel_length

    print(f'裁剪的像素距离为：X {x_cut_pixel}， Y {y_cut_pixel}')
    x_range = [min(df['X'].dropna()), max(df['X'].dropna())]
    y_range = [min(df['Y'].dropna()), max(df['Y'].dropna())]
    print('原X,Y距离范围分别为',x_range, y_range)
    x_cut_range = [min(df['X'].dropna())+x_cut_pixel, max(df['X'].dropna())-x_cut_pixel]
    y_cut_range = [min(df['Y'].dropna())+y_cut_pixel, max(df['Y'].dropna())-y_cut_pixel]
    print('裁剪后距离范围分别为',x_cut_range, y_cut_range)
    
    # 裁剪
    df_cut = df[(df.X >= x_cut_range[0]) & (df.X <= x_cut_range[1]) & (df.Y >= y_cut_range[0]) & (df.Y <= y_cut_range[1])]

    return df_cut

In [7]:
def realign_xy(x, y, sin_phi, cos_phi):
    """
    旋转坐标 (x, y) 角度，由 sin_phi 和 cos_phi 定义。
    """
    rotation_mat = np.array([[cos_phi, sin_phi], [-sin_phi, cos_phi]])
    return np.matmul(rotation_mat, np.array([x, y]))

def realign_coordinate(df, df_roi):
    """
    旋转坐标轴，将X轴对齐到两琼脂块中心的连线。
    """
    # 提取points并调整原点到左下角点
    points = np.array([list(map(int, p.strip('()').split(','))) for p in df_roi.iloc[0, :-1]])
    origin = points[1]
    points -= origin
    df_realign = df.copy()
    df_realign[['X', 'Y']] -= origin

    # 计算旋转角度
    new_x_axis = points[2] - points[1]
    mod = np.linalg.norm(new_x_axis)
    sin_phi, cos_phi = new_x_axis[1] / mod, new_x_axis[0] / mod

    # 旋转df
    df_realign[['X', 'Y']] = df_realign.apply(
        lambda row: realign_xy(row.X, row.Y, sin_phi, cos_phi), axis=1, result_type='expand'
    )

    # 旋转points
    rotated_points = np.array([realign_xy(p[0], p[1], sin_phi, cos_phi) for p in points])

    # 恢复原点
    df_realign[['X', 'Y']] += origin
    rotated_points += origin

    # 返回结果
    point_fin = {f'point{i+1}': [rotated_points[i]] for i in range(4)}
    df_ROI_fin = pd.DataFrame(point_fin)
    return df_realign, df_ROI_fin

In [8]:
# 批量处理单个实验数据的主函数
def process_experiment_data(df, df_ROI, file_info, time_len=3, plate_type=0, noise_lim=100, 
                            time_inv_min=5, max_dist=40, dist_inv=5, x_cut=2, y_cut=2, 
                            ws=40, track_jump_frame=1, remove_outlier=True, pixel_length=0.025):
    """
    对单个实验数据进行处理，去噪、重定位坐标并计算相关特征。

    参数：
        df: 实验数据 DataFrame
        df_ROI: ROI 数据
        file_info: 文件信息列表 [日期, 条件, 分组ID]
        time_len: 实验时长（小时）
        plate_type: 平板类型（0: 圆形，1: 方形）
        noise_lim: 噪声上限
        time_inv_min: 时间间隔（分钟）
        max_dist: 最大距离（mm）
        dist_inv: 距离分组间隔（mm）
        x_cut, y_cut: 边缘剪裁大小
        ws: 滑动窗口大小
        track_jump_frame: 允许的跳帧数
        remove_outlier: 是否移除离群点
        pixel_length: 单个像素长度（mm）

    返回：
        处理后的事件数据 DataFrame
    """
    # 提取 ROI 顶点
    points = [list(map(int, p.strip('()').split(','))) for p in df_ROI.iloc[0, :-1]]

    # 去噪并筛选虫子轨迹
    df_worms = df[df.Diagonal_Length > noise_lim]
    print(f'Noise limit: {noise_lim}, Worm trajectories: {len(df_worms.ID.unique())} IDs')

    # 坐标轴重定位
    df_worms_aligned, df_ROI_aligned = realign_coordinate(df_worms, df_ROI)
    unique_ids = df_worms_aligned['ID'].unique()
    frame_rate = max(df_worms_aligned.Timestamp) / (time_len * 3600)
    print(f'Frame rate: {frame_rate}')

    # 滑动窗口特征计算
    processed_data = []
    outliers = []
    for worm_id in unique_ids:
        try:
            data, outlier_id = Sliding_CTX_Calculation(
                df_worms_aligned, df_ROI_aligned, worm_id, 1, 0.5, 0.5, ws, frame_rate, track_jump_frame, pixel_length
            )
            processed_data.append(data)
            if outlier_id:
                outliers.append(outlier_id)
        except Exception as e:
            print(f'Error processing ID {worm_id}: {e}')

    # 合并所有轨迹数据
    df_processed = pd.concat(processed_data, ignore_index=True)

    # 去除离群点
    if remove_outlier:
        df_processed = df_processed[~df_processed.ID.isin(outliers)]
        print(f'Removed outliers: {len(outliers)} IDs')

    # 计算与中线的位移
    mid_x = (points[1][0] + points[2][0] + points[0][0] + points[3][0]) / 4
    df_processed['Disp_to_mid'] = df_processed['X'] - mid_x

#     # 分割时间和距离区间
#     df_processed = Insert_Period_idx(df_processed, time_inv_min, time_len)
#     df_processed = Insert_Dist_idx(
#         df_processed, df_ROI_aligned, plate_type, max_dist // dist_inv, max_dist, dist_inv, pixel_length
#     )
    df_processed = Cut_edge(df_processed, x_cut, y_cut, pixel_length)

    # 事件分类
    df_processed = Insert_SharpTurn(df_processed, min_agl=90)
#     df_processed['frame_rate'] = frame_rate
    df_processed['Date'] = file_info[0]
    
    # 判断是否多因素设计,如果是，设置为多列，按照Condition+0，1，2编号
    condition_feature = file_info[-2]
    condition_ls = condition_feature.split('-')
    for i, c in enumerate(condition_ls):
        if len(condition_ls) > 1:
            con_col = 'Condition'+str(i)
            df_processed[con_col] = c
        else:
            df_processed['Condition'] = c
    df_processed['Group_id'] = file_info[-1]

    print(f'Processed data for: Date={file_info[0]}, Conditions={condition_ls}, Group ID={file_info[-1]}')

    return df_processed


# 批量处理

待处理文件的文件夹（包含merge文件夹中有merge后的文件，以及rectangle文件）

In [9]:
def find_merge_files(path_dirs, key_word,slice_vec=[]):
    """查找符合条件的文件并记录路径。"""
    merge_files, merge_paths, merge_dirs = [], [], []
    for path_dir in path_dirs:
        merge_dir = os.path.join(path_dir, 'merge_result')
        files = os.listdir(merge_dir)
        filtered_files = [f for f in files if 'second_clear' in f and (key_word in f if key_word else True)]
        merge_files.extend(filtered_files)
        merge_paths.extend([os.path.join(merge_dir, f) for f in filtered_files])
        merge_dirs.extend([path_dir] * len(filtered_files))
        
        # 如果有切片，可以只选择其中一部分文件跑代码
        if slice_vec:
            merge_files = merge_files[slice_vec[0]:slice_vec[1]]
            merge_paths = merge_paths[slice_vec[0]:slice_vec[1]]
            merge_dirs = merge_dirs[slice_vec[0]:slice_vec[1]]
    return merge_files, merge_paths, merge_dirs

def save_data(df, output_path, save_as_pickle=True):
    """保存数据为指定格式（pickle 或 csv）。"""
    if save_as_pickle:
        df.to_pickle(output_path)
    else:
        df.to_csv(output_path, index=False)
    print(f"数据已保存到: {output_path}")

def process_single_file(merge_path, merge_file, merge_dir, key_params, plate_types, plate_type_idx):
    """处理单个文件。"""
    # 加载轨迹数据和 ROI 数据
    df_track = pd.read_csv(merge_path)
    feature_name = merge_file.split('_second_clear')[0]
    roi_file = f"{feature_name}_{plate_types[plate_type_idx]}_info.csv"
    roi_path = os.path.join(merge_dir, roi_file)
    df_ROI = pd.read_csv(roi_path)
    
    # 解析文件名并打印处理信息
    file_info = feature_name.split('_')
    print(f"开始处理文件: {file_info}")
    
    # 调用主处理函数
    df_als = process_experiment_data(
        df_track, df_ROI, file_info,
        time_len=key_params[0], plate_type=plate_type_idx, noise_lim=key_params[1],
        time_inv_min=key_params[2], max_dist=key_params[3], track_jump_frame=1,
        x_cut=2, y_cut=2, ws=40
    )
    return df_als, feature_name

def process_all_files(merge_paths, merge_files, merge_dirs, key_params, plate_types, plate_type_idx, save_as_pickle, concat_pkl, columns_for_concat):
    """批量处理所有文件。"""
    all_data = []
    for merge_path, merge_file, merge_dir in zip(merge_paths, merge_files, merge_dirs):
        df_als, feature_name = process_single_file(merge_path, merge_file, merge_dir, key_params, plate_types, plate_type_idx)
        
        # 保存单文件结果
        output_file = os.path.join(
            merge_dir,
            f"{feature_name.split('.avi')[0]}_smh-als.{'pkl' if save_as_pickle else 'csv'}"
        )
        save_data(df_als, output_file, save_as_pickle)
        
        # 如果需要合并，记录指定列
        if concat_pkl:
            all_data.append(df_als[columns_for_concat] if columns_for_concat else df_als)
        # 合并需要用到file_info的前两项
        file_info = feature_name.split('_')
        file_keyinfo = f'{file_info[0]}_{file_info[1]}'
    return all_data, file_keyinfo

def save_concatenated_data(all_data, output_path):
    """保存合并后的数据。"""
    if not all_data:
        return
    df_combined = pd.concat(all_data, ignore_index=True)
    df_combined = df_combined[df_combined.speed != 0].reset_index(drop=True)
    df_combined.to_pickle(output_path)
    print(f"合并后的数据已保存到: {output_path}")

In [11]:
# 主函数
def main():
    # 配置参数
    path_dirs = [r'Z:\data space+\C. elegans chemotaxis\2025\20250113_45start\2']
    key_word = ''
    # 选择哪一段数据进行处理
    slice_vec = [3,6]
    # 时长，noise_lim，time_inv, max_dist
    key_params = [1.5, 250, 5, 100]
    
    plate_types = ['circle', 'rectangle']
    plate_type_idx = 1
    save_as_pickle = True
    concat_pkl = False
    columns_for_concat = []
    
    
    # 查找文件
    merge_files, merge_paths, merge_dirs = find_merge_files(path_dirs, key_word, slice_vec)
    print(f"共找到 {len(merge_files)} 个文件，准备处理...")
    print('\n'.join(merge_files))
    # 批量处理文件
    all_data,file_keyinfo = process_all_files(
        merge_paths, merge_files, merge_dirs, key_params,
        plate_types, plate_type_idx, save_as_pickle, concat_pkl, columns_for_concat
    )
    
    # 保存合并数据
    if concat_pkl:
        # 以日期_文件名+关键词信息输出总的df为pickle
        concat_output_path = os.path.join(path_dirs[0], f'{file_keyinfo}_{key_word}_all.pkl')
        save_concatenated_data(all_data, concat_output_path)

# 运行主函数
if __name__ == "__main__":
    main()

共找到 3 个文件，准备处理...
20250113_Na-CTX_2.25gNaPre_0gNa-stdLB-cl-45start_1_second_clear.csv
20250113_Na-CTX_2.25gNaPre_0gNa-stdLB-cl-45start_2_second_clear.csv
20250113_Na-CTX_2.25gNaPre_0gNa-stdLB-op-45start_1_second_clear.csv
开始处理文件: ['20250113', 'Na-CTX', '2.25gNaPre', '0gNa-stdLB-cl-45start', '1']
Noise limit: 250, Worm trajectories: 34 IDs
Frame rate: 20.000740740740742
平均时间间隔1.9999890725931748应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔2.002135977261498应该等于总平滑窗2
平均时间间隔1.9999259286693085应该等于总平滑窗2
平均时间间隔1.9999451144514329应该等于总平滑窗2
平均时间间隔1.999925928669308应该等于总平滑窗2
平均时间间隔2.004095599862516应该等于总平滑窗2
平均时间间隔2.0002089136958277应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999259286693083应该等于总平滑窗2
平均时间间隔2.0026601297726945应该等于总平滑窗2
平均时间间隔1.9999259286693085应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999259286693083应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999259286693085应该等于总平滑窗2
平均时间间隔1.999944452848228应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.999944452848228应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.999944452848228应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999259286693085应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999467460517242应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999259286693087应该等于总平滑窗2
平均时间间隔2.0040163146986605应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999379706125207应该等于总平滑窗2
平均时间间隔2.0002215475405127应该等于总平滑窗2
平均时间间隔2.0026440693840293应该等于总平滑窗2
平均时间间隔1.9999259286693085应该等于总平滑窗2
平均时间间隔1.9999259286693083应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.999925928669308应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔2.003350119465293应该等于总平滑窗2
平均时间间隔2.002308894250688应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999259286693085应该等于总平滑窗2
平均时间间隔1.9999259286693087应该等于总平滑窗2
平均时间间隔1.9999259286693083应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999259286693083应该等于总平滑窗2
平均时间间隔1.9999259286693083应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999259286693085应该等于总平滑窗2
平均时间间隔1.9999259286693083应该等于总平滑窗2
Removed outliers: 0 IDs
裁剪的像素距离为：X 80.0， Y 80.0
原X,Y距离范围分别为 [127.0, 3220.8] [49.0, 3005.15]
裁剪后距离范围分别为 [207.0, 3140.8] [129.0, 2925.15]
注：事件分类仅完成分类（turn = 1, run = 0），tunrning rate的计算较为灵活，在汇总分析作图时使用
Processed data for: Date=20250113, Conditions=['0gNa', 'stdLB', 'cl', '45start'], Group ID=1
数据已保存到: Z:\data space+\C. elegans chemotaxis\2025\20250113_45start\2\20250113_Na-CTX_2.25gNaPre_0gNa-stdLB-cl-45start_1_smh-als.pkl
开始处理文件: ['20250113', 'Na-CTX', '2.25gNaPre', '0gNa-stdLB-cl-45start', '2']
Noise limit: 250, Worm trajectories: 52 IDs
Frame rate: 20.000555555555554
平均时间间隔1.9999444459876121应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999444459876121应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔2.000426195115304应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔2.001611622084634应该等于总平滑窗2
平均时间间隔2.001611622084634应该等于总平滑窗2
平均时间间隔1.9999444459876115应该等于总平滑窗2
平均时间间隔1.9999444459876117应该等于总平滑窗2
平均时间间隔1.9999444459876115应该等于总平滑窗2
平均时间间隔1.9999444459876115应该等于总平滑窗2
平均时间间隔2.0040511080943624应该等于总平滑窗2
平均时间间隔2.0006111951588763应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999444459876117应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999444459876121应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔2.000185291543868应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999471719702824应该等于总平滑窗2
平均时间间隔1.9999444459876117应该等于总平滑窗2
平均时间间隔1.9999530560463596应该等于总平滑窗2
平均时间间隔2.0001396895065824应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999444459876117应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999444459876117应该等于总平滑窗2
平均时间间隔1.9999444459876115应该等于总平滑窗2
平均时间间隔1.9999444459876117应该等于总平滑窗2
平均时间间隔1.999944445987612应该等于总平滑窗2
平均时间间隔1.9999444459876115应该等于总平滑窗2
平均时间间隔1.9999444459876115应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔2.0000706146909395应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999444459876117应该等于总平滑窗2
平均时间间隔1.999944445987612应该等于总平滑窗2
平均时间间隔1.9999980031764175应该等于总平滑窗2
平均时间间隔2.0033988954852266应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔2.0065753784826206应该等于总平滑窗2
平均时间间隔1.999944445987612应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999444459876117应该等于总平滑窗2
平均时间间隔1.999944445987612应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999444459876117应该等于总平滑窗2
平均时间间隔1.9999444459876117应该等于总平滑窗2
平均时间间隔1.9999444459876117应该等于总平滑窗2
平均时间间隔1.999944445987612应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.999944445987612应该等于总平滑窗2
平均时间间隔2.000004401752398应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.999986236869549应该等于总平滑窗2
平均时间间隔1.9999640840909698应该等于总平滑窗2
平均时间间隔1.9999444459876112应该等于总平滑窗2
平均时间间隔1.9999444459876115应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999444459876117应该等于总平滑窗2
平均时间间隔1.999944445987612应该等于总平滑窗2
平均时间间隔1.9999444459876117应该等于总平滑窗2
平均时间间隔1.9999444459876117应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.999944445987612应该等于总平滑窗2
平均时间间隔1.999944445987612应该等于总平滑窗2
平均时间间隔2.0000557919310715应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔2.0000557919310715应该等于总平滑窗2
Removed outliers: 0 IDs
裁剪的像素距离为：X 80.0， Y 80.0
原X,Y距离范围分别为 [78.59776828552087, 3959.9127369480602] [63.96428300084216, 2985.2764113728986]
裁剪后距离范围分别为 [158.59776828552089, 3879.9127369480602] [143.96428300084216, 2905.2764113728986]
注：事件分类仅完成分类（turn = 1, run = 0），tunrning rate的计算较为灵活，在汇总分析作图时使用
Processed data for: Date=20250113, Conditions=['0gNa', 'stdLB', 'cl', '45start'], Group ID=2
数据已保存到: Z:\data space+\C. elegans chemotaxis\2025\20250113_45start\2\20250113_Na-CTX_2.25gNaPre_0gNa-stdLB-cl-45start_2_smh-als.pkl
开始处理文件: ['20250113', 'Na-CTX', '2.25gNaPre', '0gNa-stdLB-op-45start', '1']
Noise limit: 250, Worm trajectories: 13 IDs
Frame rate: 20.000740740740742
平均时间间隔2.0005559433230173应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999259286693083应该等于总平滑窗2
平均时间间隔1.9999259286693083应该等于总平滑窗2
平均时间间隔1.9999259286693083应该等于总平滑窗2
平均时间间隔1.9999259286693085应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔1.9999259286693083应该等于总平滑窗2
平均时间间隔1.9999259286693085应该等于总平滑窗2
平均时间间隔1.999937632355689应该等于总平滑窗2
平均时间间隔2.0000685283043147应该等于总平滑窗2
平均时间间隔2.0000090157136334应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔2.001896784474846应该等于总平滑窗2
平均时间间隔2.0238301149405062应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


平均时间间隔2.029921090913559应该等于总平滑窗2


  ctx_i = vel_vec_i[0]/np.linalg.norm(vel_vec_i)


Removed outliers: 0 IDs
裁剪的像素距离为：X 80.0， Y 80.0
原X,Y距离范围分别为 [54.46408918333153, 2174.390862286615] [67.39723296382697, 2981.186468521223]
裁剪后距离范围分别为 [134.46408918333154, 2094.390862286615] [147.39723296382698, 2901.186468521223]
注：事件分类仅完成分类（turn = 1, run = 0），tunrning rate的计算较为灵活，在汇总分析作图时使用
Processed data for: Date=20250113, Conditions=['0gNa', 'stdLB', 'op', '45start'], Group ID=1
数据已保存到: Z:\data space+\C. elegans chemotaxis\2025\20250113_45start\2\20250113_Na-CTX_2.25gNaPre_0gNa-stdLB-op-45start_1_smh-als.pkl


In [9]:
import pickle
def save_concatenated_data(pickle_folder_ls, output_path,
                           columns_for_concat=[], key_1='.pkl',
                           key_2='', key_3='',
                           nokey_1='*'):
    """保存合并后的数据。"""
    
    pickle_files= []
    file_ls_all = []
    for pickle_folder in pickle_folder_ls:
        file_ls = [f for f in os.listdir(pickle_folder) if (key_1 in f) and(key_2 in f) and(key_3 in f) and (nokey_1 not in f) ]
        # 读取所有pickle文件并合并为一个Dataframe
        file_ls_all+=file_ls
        pickle_files += [os.path.join(pickle_folder, f) for f in file_ls]
    print(f'共{len(file_ls_all)}个文件:\n', *file_ls_all, sep='\n')
    # 读取所有数据并合并
    all_data = []
    for pickle_file in pickle_files:
        print('reading:', pickle_file, '\n')
        with open(pickle_file, 'rb') as f:
            df_p = pickle.load(f)  # 加载 Pickle 文件
            if len(columns_for_concat):
                all_data.append(df_p[columns_for_concat] if columns_for_concat else df_p)
            else:
                all_data.append(df_p)
                
    df_als = pd.concat(all_data, ignore_index=True)
    df_als = df_als[df_als.speed != 0].reset_index(drop=True)      # 删除速度为0的点并且重置index
    df_als.to_pickle(output_path)
    print(f"\n合并后的数据已保存到: {output_path}")

In [10]:
paths_pickles = [r'Z:\data space+\C. elegans chemotaxis\2025\20250114\1',
                r'Z:\data space+\C. elegans chemotaxis\2025\20250114\2',
                r'Z:\data space+\C. elegans chemotaxis\2025\20250114\3',
                r'Z:\data space+\C. elegans chemotaxis\2025\20250114_1\1',
                r'Z:\data space+\C. elegans chemotaxis\2025\20250114_1\2',
                r'Z:\data space+\C. elegans chemotaxis\2025\20250114_1\3',
                ]

output_folder = r'Z:\data space+\C. elegans chemotaxis\2025\20250114'

concat_name = '20250114_Na-CTX_18diffusion.pkl'
output_path = os.path.join(output_folder, concat_name)
save_concatenated_data(paths_pickles, output_path,key_1='smh-als', columns_for_concat=[])

共32个文件:

20240114_Na-CTX_2.25gNaPre_0gNa-0gLB-op-45start_1_smh-als.pkl
20240114_Na-CTX_2.25gNaPre_0gNa-0gLB-op-45start_2_smh-als.pkl
20240114_Na-CTX_2.25gNaPre_0gNa-stdLB-cl-45start_1_smh-als.pkl
20240114_Na-CTX_2.25gNaPre_0gNa-stdLB-cl-45start_2_smh-als.pkl
20240114_Na-CTX_2.25gNaPre_0gNa-stdLB-op-45start_1_smh-als.pkl
20240114_Na-CTX_2.25gNaPre_0gNa-stdLB-op-45start_2_smh-als.pkl
20240114_Na-CTX_2.25gNaPre_4.5gNa-0gLB-op-45start_1_smh-als.pkl
20240114_Na-CTX_2.25gNaPre_4.5gNa-0gLB-op-45start_2_smh-als.pkl
20240114_Na-CTX_2.25gNaPre_4.5gNa-stdLB-cl-45start_1_smh-als.pkl
20240114_Na-CTX_2.25gNaPre_4.5gNa-stdLB-cl-45start_2_smh-als.pkl
20240114_Na-CTX_2.25gNaPre_4.5gNa-stdLB-op-45start_1_smh-als.pkl
20240114_Na-CTX_2.25gNaPre_4.5gNa-stdLB-op-45start_2_smh-als.pkl
20250114_Na-CTX_2.25gNaPre_0gNa-0gLB-cl-45start_1_smh-als.pkl
20250114_Na-CTX_2.25gNaPre_0gNa-0gLB-cl-45start_2_smh-als.pkl
20250114_Na-CTX_2.25gNaPre_4.5gNa-0gLB-cl-45start_1_smh-als.pkl
20250114_Na-CTX_2.25gNaPre_4.5gNa-0gLB-

In [11]:
path=r'Z:\data space+\C. elegans chemotaxis\2025\20250114\20250114_Na-CTX_18diffusion.pkl'
df = pd.read_pickle(path)

0           20240114
1           20240114
2           20240114
3           20240114
4           20240114
              ...   
20720288    20250114
20720289    20250114
20720290    20250114
20720291    20250114
20720292    20250114
Name: Date, Length: 20720293, dtype: object

In [13]:
df.columns

Index(['ID', 'X', 'Y', 'Timestamp', 'speed', 'x_velocity', 'y_velocity',
       'agl_velocity', 'x_velocity_agl', 'y_velocity_agl', 'agl_speed', 'CTX',
       'bearing_left', 'CTX_left', 'bearing_right', 'CTX_right', 'X_org',
       'Y_org', 'Disp_to_mid', 'Event', 'Date', 'Condition0', 'Condition1',
       'Condition2', 'Condition3', 'Group_id'],
      dtype='object')

In [14]:
df = df.rename(columns={'Condition0':'TrainCon', 'Condition1':'LBType',
                  'Condition2':'Lid',
                  'Condition3':'TestCon'})