### Preprocess all data in a loop

In [4]:
import pandas as pd
import numpy as np
import warnings
from scipy.interpolate import interp1d

# Dataset identifiers to process in this run. process_dataset uses each entry
# both to build the input/output file names and to select the row of the
# scaling table whose 'Group' value equals it.
# The commented-out lines hold further identifiers (presumably the full set of
# recordings -- confirm); only the uncommented line below is processed.
dataset_names = [
    #"DYAD02F_", "DYAD06F_", "DYAD06NF_", "DYAD10F_", "DYAD10NF_", "DYAD11F_",
    #"DYAD11NF_", "DYAD12F_", "DYAD12NF_", "DYAD14F_", "DYAD14NF_", "DYAD15F_",
    #"DYAD15I_", "DYAD15NF_", "DYAD16F_", "DYAD16I_", "DYAD16NF_", "DYAD18F_",
    #"DYAD18I_", "DYAD18NF_", "DYAD21F_", "DYAD21NF_", "DYAD23F_", "DYAD23NF_",
    #"DYAD24F_", "DYAD24NF_"
    "DYAD06NF_", "DYAD10NF_", "DYAD11NF_", "DYAD14F_", "DYAD16F_", "DYAD21NF_", "DYAD24NF_"
]

# Base directories for input and output.
# process_dataset joins these with file names by plain string concatenation,
# so the trailing slash on each path is required.
base_dir = "/Users/ruzenkakaldenbach/Desktop/Drive/raw_data_adjusted/"
output_dir = "/Users/ruzenkakaldenbach/Desktop/Drive/Loopy_preprocessed_data/"

# Path to the scaling workbook. process_dataset reads its 'Group' and
# 'Conversion 1px to mm' columns.
scaling_file_path = "/Users/ruzenkakaldenbach/Desktop/Drive/DESK_Measurements_ALL_with_scaling.xlsx"

# Load the scaling table once at module level so every process_dataset call
# reuses the same DataFrame instead of re-reading the workbook.
PixDistConvert = pd.read_excel(scaling_file_path)

# Width, in seconds, of the time bins the raw samples are averaged into.
_BIN_WIDTH_S = 0.25

# Single-letter code used in output column names -> colour word that appears
# in the raw 'name' column (e.g. 'red_center', 'red_front').
_COLOR_MAP = {'r': 'red', 'b': 'blue', 'y': 'yellow'}

# Suffix used in output column names -> marker word in the raw 'name' column.
_PART_MAP = {'c': 'center', 'f': 'front'}

# Output column order: x/y of the center (c) and front (f) marker per colour,
# followed by the start time of the bin.
_OUTPUT_COLUMNS = ['xrc', 'yrc', 'xrf', 'yrf',
                   'xbc', 'ybc', 'xbf', 'ybf',
                   'xyc', 'yyc', 'xyf', 'yyf',
                   'frame_timestamp']


def _bin_marker_positions(dat_raw):
    """Average each marker's x_tr/y_tr over consecutive 0.25 s bins.

    Returns a DataFrame with one row per bin and the columns listed in
    ``_OUTPUT_COLUMNS``. The four coordinates of a colour are filled only
    when both its center and its front marker have at least one sample in
    the bin; otherwise they are NaN.
    """
    timestamps = dat_raw['frame_timestamp']
    coords = dat_raw[['x_tr', 'y_tr']]

    # The name tests do not depend on the bin, so evaluate them once instead
    # of once per bin. na=False treats a missing name as "no match" rather
    # than producing a mask that cannot be used for boolean indexing.
    marker_rows = {
        (code, part): dat_raw['name'].str.contains(
            f'{color}_{part_name}', regex=False, na=False)
        for code, color in _COLOR_MAP.items()
        for part, part_name in _PART_MAP.items()
    }

    # NOTE(review): int() truncates the largest timestamp, so samples after
    # the last whole second are never binned -- confirm this is intended.
    bin_starts = np.arange(0, int(np.max(timestamps)), _BIN_WIDTH_S)

    # Collect plain dicts and build the frame once; growing a DataFrame with
    # pd.concat inside the loop copies every previous row on each iteration.
    rows = []
    for sec in bin_starts:
        # Half-open window [sec, sec + 0.25).
        in_bin = (timestamps >= sec) & (timestamps < sec + _BIN_WIDTH_S)
        row = {'frame_timestamp': sec}

        for code in _COLOR_MAP:
            center = coords[in_bin & marker_rows[code, 'c']]
            front = coords[in_bin & marker_rows[code, 'f']]
            if center.empty or front.empty:
                continue  # leave this colour missing for the bin
            row[f'x{code}c'] = center['x_tr'].mean()
            row[f'y{code}c'] = center['y_tr'].mean()
            row[f'x{code}f'] = front['x_tr'].mean()
            row[f'y{code}f'] = front['y_tr'].mean()

        rows.append(row)

    binned = pd.DataFrame(rows, columns=_OUTPUT_COLUMNS)
    # Guarantee numeric columns even when a marker never appears.
    return binned.astype(float)


def _interpolate_column_by_index(df, column_name):
    """Fill NaNs in ``df[column_name]`` by linear interpolation over the index.

    Values outside the range of valid samples are linearly extrapolated.
    A line cannot be defined by fewer than two points, so a column with
    fewer than two valid samples is left unchanged and a warning is issued
    instead of aborting the whole dataset. ``df`` is modified in place and
    also returned.
    """
    positions = np.asarray(df.index, dtype=float)
    values = df[column_name].to_numpy(dtype=float)
    valid = ~np.isnan(values)

    if valid.sum() < 2:
        warnings.warn(
            f"Column '{column_name}' has fewer than two valid samples; "
            "it was not interpolated.",
            stacklevel=2,
        )
        return df

    f = interp1d(positions[valid], values[valid],
                 kind='linear', fill_value='extrapolate')
    df[column_name] = f(positions)
    return df


def _add_squared_distances(df):
    """Add the dist_<part>_<pair> columns and return their names in order.

    Each column is the squared Euclidean distance between the same marker
    (center or front) of two colours.
    """
    distance_columns = []
    for part in ('c', 'f'):
        for a, b in (('r', 'b'), ('r', 'y'), ('b', 'y')):
            name = f'dist_{part}_{a}{b}'
            df[name] = ((df[f'x{a}{part}'] - df[f'x{b}{part}'])**2
                        + (df[f'y{a}{part}'] - df[f'y{b}{part}'])**2)
            distance_columns.append(name)
    return distance_columns


def process_dataset(dat_name: str) -> None:
    """Preprocess one tracking dataset and write it to a CSV file.

    Steps: read ``{base_dir}{dat_name}_adjusted.xlsx``, average the marker
    coordinates in 0.25 s bins, linearly interpolate missing bins, compute
    the pairwise squared distances between colours, multiply them by the
    dataset's scaling factor, and save the result as
    ``{output_dir}Loopy_{dat_name}_preprocessed.csv``.

    Args:
        dat_name: Dataset identifier; it must equal a value in the 'Group'
            column of the scaling table ``PixDistConvert``.

    Returns:
        None. The result is written to disk.

    Raises:
        IndexError: If the scaling table has no row for ``dat_name``.
    """
    print(f"Processing {dat_name}...")

    # Load the dataset (e.g., 'DYAD06NF__adjusted.xlsx')
    file_path = f"{base_dir}{dat_name}_adjusted.xlsx"
    dat_raw = pd.read_excel(file_path)

    # Extract the scaling factor for the current dataset
    matches = PixDistConvert.loc[PixDistConvert['Group'] == dat_name,
                                 'Conversion 1px to mm']
    if matches.empty:
        raise IndexError(
            f"No scaling factor found for group {dat_name!r} in the scaling table."
        )
    scaling_factor = matches.values[0]

    # Bin the raw samples, then fill the gaps in every coordinate column.
    DF = _bin_marker_positions(dat_raw)
    coordinate_columns = [col for col in DF.columns if col != 'frame_timestamp']
    for column in coordinate_columns:
        DF = _interpolate_column_by_index(DF, column)

    # Squared distances between colours, for center and front markers.
    distance_columns = _add_squared_distances(DF)

    # NOTE(review): these columns are squared pixel distances, but they are
    # multiplied once by the linear px -> mm factor. mm^2 would need the
    # factor squared (or take the square root first for mm). Left unchanged
    # so the output stays comparable with files already produced -- confirm
    # the intended units.
    DF[distance_columns] = DF[distance_columns] * scaling_factor

    # Save the processed dataset to the specified directory
    output_file = f"{output_dir}Loopy_{dat_name}_preprocessed.csv"
    DF.to_csv(output_file, index=False)
    print(f"Saved processed data to {output_file}.")

# Run the preprocessing for every configured dataset, in list order.
# Exceptions raised by process_dataset are not caught here, so a failure
# stops the loop and the remaining datasets are not processed.
for dataset_name in dataset_names:
    process_dataset(dataset_name)

# Reached only when every dataset above completed without raising.
print("All datasets processed.")


Processing DYAD06NF_...


KeyboardInterrupt: 