### Preprocess all data in a loop

In [1]:
import pandas as pd
import numpy as np
import warnings
from scipy.interpolate import interp1d

# Datasets handled by this run. Every name keeps its trailing underscore: the name is
# spliced verbatim into the input/output file names and compared against the 'Group'
# column of the scaling table inside process_dataset().
dataset_names = [
    # Longer list, currently commented out (presumably the full set of recordings — confirm):
    #"DYAD02F_", "DYAD06F_", "DYAD06NF_", "DYAD10F_", "DYAD10NF_", "DYAD11F_",
    #"DYAD11NF_", "DYAD12F_", "DYAD12NF_", "DYAD14F_", "DYAD14NF_", "DYAD15F_",
    #"DYAD15I_", "DYAD15NF_", "DYAD16F_", "DYAD16I_", "DYAD16NF_", "DYAD18F_",
    #"DYAD18I_", "DYAD18NF_", "DYAD21F_", "DYAD21NF_", "DYAD23F_", "DYAD23NF_",
    #"DYAD24F_", "DYAD24NF_"
    # Subset that is actually processed:
    "DYAD06NF_", "DYAD10NF_", "DYAD11NF_", "DYAD14F_", "DYAD16F_", "DYAD21NF_", "DYAD24NF_"
]

# Input and output directories. Both must end with '/' because file names are appended
# by plain string formatting, not by a path-joining helper.
# NOTE(review): these are machine-specific absolute paths; adjust before running elsewhere.
base_dir = "/Users/ruzenkakaldenbach/Desktop/Drive/raw_data_adjusted/"
output_dir = "/Users/ruzenkakaldenbach/Desktop/Drive/Loopy_preprocessed_data/"

# Spreadsheet holding one pixel-to-millimetre conversion factor per dataset
# (columns 'Group' and 'Conversion 1px to mm' are read by process_dataset()).
scaling_file_path = "/Users/ruzenkakaldenbach/Desktop/Drive/DESK_Measurements_ALL_with_scaling.xlsx"

# Load the scaling table once at module level; every dataset looks its factor up in it.
PixDistConvert = pd.read_excel(scaling_file_path)

# Define the function to calculate unit vectors
def unit_vector(vector):
    """Return `vector` scaled to length 1.

    Parameters
    ----------
    vector : array-like of numbers
        Any sequence or array; it is converted to a float NumPy array.

    Returns
    -------
    numpy.ndarray
        `vector / ||vector||`. When the magnitude is exactly zero there is no
        direction to normalise, so an all-zero array with the same shape as the
        input is returned instead of dividing by zero.
    """
    # Coerce first so lists/tuples behave like arrays and the result is always float
    vector = np.asarray(vector, dtype=float)
    magnitude = np.linalg.norm(vector)  # Euclidean length of the vector
    if magnitude == 0:
        # Match the input's shape rather than assuming a 2-D vector
        return np.zeros_like(vector)
    return vector / magnitude

# Angle helper: how far apart do two directions point?
def angle_between(v1, v2):
    """Compute the angle, in radians, separating two unit vectors.

    The dot product of two unit vectors is the cosine of the angle between
    them. Floating-point rounding can push that value marginally outside
    [-1, 1], where arccos is undefined, so it is clamped to that interval
    before the inverse cosine is taken.
    """
    cosine = np.dot(v1, v2)
    bounded_cosine = np.clip(cosine, -1.0, 1.0)
    return np.arccos(bounded_cosine)

# Marker colours tracked in the recordings: one-letter column code -> prefix used in the 'name' column
_COLOR_PREFIXES = {'r': 'red', 'b': 'blue', 'y': 'yellow'}

# Coordinate columns in output order: xrc, yrc, xrf, yrf, xbc, ...
# (axis, colour code, then c = center dot / f = front dot)
_COORDINATE_COLUMNS = [
    f'{axis}{code}{part}'
    for code in _COLOR_PREFIXES
    for part in ('c', 'f')
    for axis in ('x', 'y')
]

# Dyads in the order their warning/'facing' columns are produced, with the label used in warnings
_DYAD_LABELS = {'ry': 'Red-Yellow', 'by': 'Blue-Yellow', 'rb': 'Red-Blue'}


def _bin_marker_means(dat_raw, bin_starts, bin_width):
    """Average every marker's x_tr/y_tr inside each time bin; returns one float row per bin."""
    timestamps = dat_raw['frame_timestamp']

    # Which rows belong to which marker does not depend on the bin, so compute it once.
    # regex=False: the patterns are plain text; na=False: rows without a name match nothing.
    marker_masks = {
        (code, part): dat_raw['name'].str.contains(f'{color}_{part}', regex=False, na=False)
        for code, color in _COLOR_PREFIXES.items()
        for part in ('center', 'front')
    }

    records = []
    for start in bin_starts:
        # Half-open interval [start, start + bin_width)
        in_bin = (timestamps >= start) & (timestamps < start + bin_width)
        record = {'frame_timestamp': start}

        for code in _COLOR_PREFIXES:
            center = dat_raw.loc[in_bin & marker_masks[(code, 'center')], ['x_tr', 'y_tr']]
            front = dat_raw.loc[in_bin & marker_masks[(code, 'front')], ['x_tr', 'y_tr']]

            # A colour is only recorded when BOTH of its dots were seen in this bin;
            # otherwise its four columns stay missing and are interpolated later.
            if len(center) > 0 and len(front) > 0:
                record[f'x{code}c'] = center['x_tr'].mean()
                record[f'y{code}c'] = center['y_tr'].mean()
                record[f'x{code}f'] = front['x_tr'].mean()
                record[f'y{code}f'] = front['y_tr'].mean()

        records.append(record)

    # Build the frame once (appending with pd.concat inside the loop is quadratic).
    # Passing `columns` fixes the column order and keeps never-observed columns as NaN.
    return pd.DataFrame(records, columns=_COORDINATE_COLUMNS + ['frame_timestamp']).astype(float)


def _interpolate_by_index(values):
    """Fill gaps in one column by linear interpolation over the row index.

    Values before the first / after the last observation are linearly extrapolated
    (fill_value='extrapolate'), so long leading or trailing gaps can drift.
    """
    observed = values.notna()
    n_observed = int(observed.sum())
    if n_observed < 2:
        # interp1d needs two support points; fail with the column name instead of a scipy message
        raise ValueError(
            f"Column '{values.name}' has only {n_observed} valid value(s); "
            "at least 2 are required for linear interpolation."
        )
    fit = interp1d(values.index[observed], values[observed], kind='linear', fill_value='extrapolate')
    return fit(values.index)


def process_dataset(dat_name):
    """Preprocess one dataset and write the result to CSV.

    Steps: load `<base_dir><dat_name>_adjusted.xlsx`, average each marker's position
    in 0.25 s bins, interpolate missing bins, then derive pairwise distances (scaled
    by the dataset's pixel-to-mm factor), 'facing' flags, head-orientation vectors,
    unit vectors and pairwise angles in degrees. The frame is saved to
    `<output_dir>Loopy_<dat_name>_processed.csv`.

    Parameters
    ----------
    dat_name : str
        Dataset name as listed in `dataset_names`; must also appear in the
        'Group' column of the scaling table.

    Returns
    -------
    pandas.DataFrame
        The processed frame that was written to disk, so callers can inspect it.

    Raises
    ------
    IndexError
        If the scaling table has no row for `dat_name`.
    ValueError
        If there are no time bins, a coordinate column has fewer than two valid
        values, or NaNs remain after interpolation.
    """
    print(f"Processing {dat_name}...")

    # Load the dataset (e.g., 'DYAD06NF__adjusted.xlsx')
    file_path = f"{base_dir}{dat_name}_adjusted.xlsx"
    dat_raw = pd.read_excel(file_path)

    # Extract the scaling factor for the current dataset
    scaling_matches = PixDistConvert.loc[PixDistConvert['Group'] == dat_name, 'Conversion 1px to mm']
    if scaling_matches.empty:
        # IndexError is what the bare `.values[0]` lookup raised before; keep the type, add context
        raise IndexError(f"No scaling factor found for group {dat_name!r} in the scaling table.")
    scaling_factor = scaling_matches.values[0]

    # Bin starts every 0.25 s from 0 up to the last whole second of the recording.
    # NOTE(review): int() truncates, so samples after the last whole second are not
    # binned — confirm this is intended.
    bin_width = 0.25
    bin_starts = np.arange(0, int(np.max(dat_raw['frame_timestamp'])), bin_width)
    if len(bin_starts) == 0:
        raise ValueError(f"{dat_name}: no time bins to process (recording spans less than 1 second).")

    DF = _bin_marker_means(dat_raw, bin_starts, bin_width)

    # Interpolate missing values for all coordinate columns
    for column in _COORDINATE_COLUMNS:
        DF[column] = _interpolate_by_index(DF[column])

    # Sanity check after the interpolation step
    if DF.isna().any().any():
        print("Error: Interpolation did not fill all NaN values.")
        print(DF[DF.isna().any(axis=1)])  # Print rows with NaN values
        raise ValueError("Interpolation failed to fill all missing values.")
    else:
        print("Success: No NaN values remain in the DataFrame after interpolation.")

    # Euclidean distances between every pair of children, for center dots then front dots
    distance_columns = []
    for part in ('c', 'f'):
        for first, second in ('rb', 'ry', 'by'):
            column = f'dist_{part}_{first}{second}'
            DF[column] = np.sqrt(
                (DF[f'x{first}{part}'] - DF[f'x{second}{part}'])**2
                + (DF[f'y{first}{part}'] - DF[f'y{second}{part}'])**2
            )
            distance_columns.append(column)

    # Convert all distances from pixels to millimetres
    DF[distance_columns] = DF[distance_columns] * scaling_factor

    # A pair counts as "facing" (1) when the front dots are closer together than the
    # center dots. Exact ties cannot be classified either way and fall to 0, so warn.
    for pair, label in _DYAD_LABELS.items():
        center_dist = DF[f'dist_c_{pair}']
        front_dist = DF[f'dist_f_{pair}']
        if (center_dist == front_dist).any():
            warnings.warn(f"Some rows have equal distances for central and front dots in {label} dyad (dist_c_{pair} == dist_f_{pair}).")
        DF[f'facing_{pair}'] = np.where(center_dist > front_dist, 1, 0)

    # Head orientation vectors: from the center dot to the front dot of each child
    for code in _COLOR_PREFIXES:
        DF[f'vect_x_{code}'] = DF[f'x{code}f'] - DF[f'x{code}c']
        DF[f'vect_y_{code}'] = DF[f'y{code}f'] - DF[f'y{code}c']

    # Unit vectors per child. The file's unit_vector() is still applied to each row,
    # but over plain NumPy values rather than DataFrame.apply(axis=1), which builds a
    # Series for every row.
    unit_vectors = {}
    for code in _COLOR_PREFIXES:
        unit_vectors[code] = np.array(
            [
                unit_vector([vx, vy])
                for vx, vy in zip(DF[f'vect_x_{code}'].to_numpy(), DF[f'vect_y_{code}'].to_numpy())
            ],
            dtype=float,
        )
        DF[f'unit_vect_x_{code}'] = unit_vectors[code][:, 0]
        DF[f'unit_vect_y_{code}'] = unit_vectors[code][:, 1]

    # Angle in degrees between the head orientations of each pair
    for first, second in ('ry', 'rb', 'by'):
        DF[f'deg_{first}{second}'] = [
            np.rad2deg(angle_between(u, v))
            for u, v in zip(unit_vectors[first], unit_vectors[second])
        ]

    # Save the processed dataset to the specified directory
    output_file = f"{output_dir}Loopy_{dat_name}_processed.csv"
    DF.to_csv(output_file, index=False)
    print(f"Saved processed data to {output_file}.")

    return DF

# Run the preprocessing pipeline once per configured dataset, in list order.
# Each call reads one Excel file and writes one CSV; an exception in any dataset
# stops the loop, so later datasets are not processed.
for dataset_name in dataset_names:
    process_dataset(dataset_name)

# Only reached when every dataset above completed without raising.
print("All datasets processed.")


Processing DYAD06NF_...
Success: No NaN values remain in the DataFrame after interpolation.
Saved processed data to /Users/ruzenkakaldenbach/Desktop/Drive/Loopy_preprocessed_data/Loopy_DYAD06NF__processed.csv.
Processing DYAD10NF_...
Success: No NaN values remain in the DataFrame after interpolation.
Saved processed data to /Users/ruzenkakaldenbach/Desktop/Drive/Loopy_preprocessed_data/Loopy_DYAD10NF__processed.csv.
Processing DYAD11NF_...
Success: No NaN values remain in the DataFrame after interpolation.
Saved processed data to /Users/ruzenkakaldenbach/Desktop/Drive/Loopy_preprocessed_data/Loopy_DYAD11NF__processed.csv.
Processing DYAD14F_...
Success: No NaN values remain in the DataFrame after interpolation.
Saved processed data to /Users/ruzenkakaldenbach/Desktop/Drive/Loopy_preprocessed_data/Loopy_DYAD14F__processed.csv.
Processing DYAD16F_...
Success: No NaN values remain in the DataFrame after interpolation.
Saved processed data to /Users/ruzenkakaldenbach/Desktop/Drive/Loopy_pr

In [2]:
# NOTE(review): DF is a local variable inside process_dataset(), so it does not exist
# at notebook level and this cell raises NameError. To inspect results, load one of
# the CSV files written to output_dir instead.
DF

NameError: name 'DF' is not defined