### Preprocess all data in a loop

In [3]:
import pandas as pd
import numpy as np
import warnings
from scipy.interpolate import interp1d
import os

# Dataset name prefixes to process. Each entry is used verbatim as a file-name
# prefix (input: "<base_dir><name>adjusted.xlsx") and as the value matched
# against the 'Group' column of the scaling table, so the trailing underscore
# is part of the name.
# NOTE: only "DYAD02F_" is currently active; every other name below is
# commented out (the `#` right after the first string disables the rest of
# that line as well).
dataset_names = [
    "DYAD02F_"#, "DYAD06F_", "DYAD06NF_", "DYAD10F_", "DYAD10NF_", "DYAD11F_",
    #"DYAD11NF_", "DYAD12F_", "DYAD12NF_", "DYAD14F_", "DYAD14NF_", "DYAD15F_",
    #"DYAD15I_", "DYAD15NF_", "DYAD16F_", "DYAD16I_", "DYAD16NF_", "DYAD18F_",
    #"DYAD18I_", "DYAD18NF_", "DYAD21F_", "DYAD21NF_", "DYAD23F_", "DYAD23NF_",
    #"DYAD24F_", "DYAD24NF_"
]

# Base directories for input and output.
# Both are concatenated directly with file names, so they must end with "/".
base_dir = "/Users/ruzenkakaldenbach/Desktop/Behaviour/raw_data_transformed/"
output_dir = "/Users/ruzenkakaldenbach/Desktop/Behaviour/Loopy_preprocessed_data/"

# Path to the Excel workbook holding the per-dataset pixel scaling factors
scaling_file_path = "/Users/ruzenkakaldenbach/Desktop/Drive/DESK_Measurements_ALL_with_scaling.xlsx"

# Load the scaling table once at module level; process_dataset() reads its
# 'Group' and 'Conversion 1px to mm' columns.
PixDistConvert = pd.read_excel(scaling_file_path)

# Define the function to calculate unit vectors
def unit_vector(vector):
    """Return ``vector`` scaled to unit length.

    Args:
        vector: Array-like of numbers (this file passes 2-element ``[x, y]``
            vectors, but any length is accepted).

    Returns:
        numpy.ndarray of floats with the same shape as ``vector``. A vector of
        magnitude zero has no direction, so a zero vector of the same shape is
        returned instead of dividing by zero.
    """
    vector = np.asarray(vector, dtype=float)
    magnitude = np.linalg.norm(vector)  # Euclidean length of the vector
    if magnitude == 0:
        # Match the input's shape and float dtype rather than assuming 2-D ints.
        return np.zeros_like(vector)
    return vector / magnitude

# Define the function to calculate the angle between two unit vectors
def angle_between(v1, v2):
    """Return the angle, in radians, between ``v1`` and ``v2``.

    Both arguments are expected to already be unit vectors: the function takes
    the arccosine of their dot product and performs no normalisation itself.
    """
    cosine = np.dot(v1, v2)
    # Floating-point rounding can push the dot product marginally outside
    # [-1, 1], where arccos is undefined, so bound it first.
    bounded_cosine = np.clip(cosine, -1.0, 1.0)
    return np.arccos(bounded_cosine)

# Total length of the moving-correlation window, in seconds
window_size = 10  # 5 seconds before + 5 seconds after
# Same window expressed in rows of the resampled data. The divisor 0.25 must
# match the 0.25 s bin width used when process_dataset() resamples the raw data.
window_frames = int(window_size / 0.25)  # Convert to frames (each frame = 0.25s) = 40 frames per 10s window

# Function to calculate moving correlation
def calculate_moving_correlation(df, col1, col2, window_frames):
    """Compute a moving Pearson correlation between two columns of ``df``.

    For every row position ``i`` that has a full window around it, the
    correlation is taken over the rows ``[i - half, i + half)`` where
    ``half = window_frames // 2`` (the start row is included, the end row is
    not).

    Args:
        df: DataFrame containing ``col1`` and ``col2``.
        col1: Name of the first column.
        col2: Name of the second column.
        window_frames: Window length in rows.

    Returns:
        A list with exactly ``len(df)`` entries. The first and last ``half``
        entries are ``None`` because no full window fits there. An entry is
        also ``None`` (and a ``UserWarning`` is emitted) when its window
        contains a NaN in either column. If ``df`` is too short to hold a
        single full window, every entry is ``None``.
    """
    half_window = window_frames // 2
    n_rows = len(df)

    # Without this guard the leading + trailing padding alone would be longer
    # than ``df`` and the result could not be assigned back as a column.
    if n_rows < 2 * half_window:
        return [None] * n_rows

    first = df[col1]
    second = df[col2]
    # Flag rows with a missing value once, instead of rescanning every window.
    has_nan = (first.isna() | second.isna()).to_numpy()

    correlations = [None] * half_window  # no full window for the first rows
    for i in range(half_window, n_rows - half_window):
        start, stop = i - half_window, i + half_window
        if has_nan[start:stop].any():
            warnings.warn(f"NaN detected in window for i={i} ({col1}, {col2})", UserWarning)
            correlations.append(None)
        else:
            correlations.append(first.iloc[start:stop].corr(second.iloc[start:stop]))

    correlations.extend([None] * half_window)  # no full window for the last rows
    return correlations

# Define the function to process a single dataset
def process_dataset(dat_name):
    """Preprocess one raw tracking file and save the result as CSV.

    Steps, in order:
      1. Read ``<base_dir><dat_name>adjusted.xlsx`` (skipped with a message if
         the file does not exist).
      2. Resample into 0.25 s bins: for each colour (red, blue, yellow) the
         mean ``x_tr`` / ``y_tr`` of the "<colour>_center" and "<colour>_front"
         rows is taken per bin.
      3. Fill bins without data by linear interpolation over the row index.
      4. Derive pairwise distances (multiplied by the dataset's
         'Conversion 1px to mm' factor), facing flags, orientation vectors,
         unit vectors, pairwise angles in degrees, mean positions and moving
         correlations.
      5. Write everything to ``<output_dir>Loopy_<dat_name>_processed.csv``.

    Args:
        dat_name: Dataset name prefix, e.g. ``"DYAD02F_"``. It is used both in
            the file names and to look up the row in the scaling table.

    Returns:
        None. The result is written to disk.

    Raises:
        ValueError: If the scaling table has no row for ``dat_name``, if a
            coordinate column has fewer than two valid bins to interpolate
            from, or if NaN values remain after interpolation.
    """
    file_path = f"{base_dir}{dat_name}adjusted.xlsx"

    # Skip processing if file does not exist
    if not os.path.exists(file_path):
        print(f"Skipping {dat_name}: File not found at {file_path}")
        return
    print(f"Processing {dat_name}...")
    dat_raw = pd.read_excel(file_path)

    # Extract the scaling factor for the current dataset; fail with a clear
    # message instead of an IndexError when the group is not listed.
    scaling_matches = PixDistConvert.loc[
        PixDistConvert['Group'] == dat_name, 'Conversion 1px to mm'
    ].values
    if len(scaling_matches) == 0:
        raise ValueError(f"No scaling factor found for group '{dat_name}' in the scaling table.")
    scaling_factor = scaling_matches[0]

    # Map the one-letter colour codes used in column names to the colour names
    # used in the raw 'name' column.
    color_map = {
        'r': 'red',
        'b': 'blue',
        'y': 'yellow'
    }

    # Coordinate columns in their output order:
    # xrc, yrc, xrf, yrf, xbc, ybc, xbf, ybf, xyc, yyc, xyf, yyf
    # (x/y + colour code + c(enter)/f(ront)).
    coordinate_columns = [
        f'{axis}{color_code}{dot}'
        for color_code in color_map
        for dot in ('c', 'f')
        for axis in ('x', 'y')
    ]

    # Bin start times: steps of 0.25 s up to the max timestamp.
    # NOTE(review): int() truncates the max timestamp, so data after the last
    # whole second is not binned -- confirm this is intended.
    bin_width = 0.25
    Seconds_025 = np.arange(0, int(np.max(dat_raw['frame_timestamp'])), bin_width)

    # The name masks do not depend on the bin, so compute them once.
    # regex=False: the patterns are plain substrings; na=False: rows with a
    # missing name simply match nothing.
    timestamps = dat_raw['frame_timestamp']
    names = dat_raw['name']
    dot_masks = {
        (color_code, dot): names.str.contains(f'{color_name}_{dot}', regex=False, na=False)
        for color_code, color_name in color_map.items()
        for dot in ('center', 'front')
    }

    # Collect one dict per bin and build the DataFrame once; concatenating
    # inside the loop copies the whole frame on every iteration.
    rows = []
    for sec in Seconds_025:
        # Rows whose timestamp falls in [sec, sec + 0.25)
        in_bin = (timestamps >= sec) & (timestamps < sec + bin_width)
        row = {'frame_timestamp': sec}

        for color_code in color_map:
            datcenter = dat_raw.loc[in_bin & dot_masks[color_code, 'center'], ['x_tr', 'y_tr']]
            datfront = dat_raw.loc[in_bin & dot_masks[color_code, 'front'], ['x_tr', 'y_tr']]

            # Only record the colour when both its dots were seen in this bin;
            # otherwise the four values stay missing and are interpolated below.
            if len(datcenter) > 0 and len(datfront) > 0:
                row[f'x{color_code}c'] = datcenter['x_tr'].mean()
                row[f'y{color_code}c'] = datcenter['y_tr'].mean()
                row[f'x{color_code}f'] = datfront['x_tr'].mean()
                row[f'y{color_code}f'] = datfront['y_tr'].mean()

        rows.append(row)

    # Keys absent from a row become NaN; force float so every column is numeric.
    DF = pd.DataFrame(rows, columns=coordinate_columns + ['frame_timestamp']).astype(float)

    # Interpolate missing values for all coordinate columns (linear over the
    # row index, linearly extrapolated at the ends).
    for column in coordinate_columns:
        valid = DF[column].notna()
        if valid.sum() < 2:
            raise ValueError(
                f"Cannot interpolate column '{column}' for {dat_name}: "
                f"only {int(valid.sum())} valid value(s)."
            )
        f = interp1d(DF.index[valid], DF.loc[valid, column], kind='linear', fill_value='extrapolate')
        DF[column] = f(DF.index)

    # After interpolation step
    if DF.isna().any().any():
        print("Error: Interpolation did not fill all NaN values.")
        print(DF[DF.isna().any(axis=1)])  # Print rows with NaN values
        raise ValueError("Interpolation failed to fill all missing values.")
    else:
        print("Success: No NaN values remain in the DataFrame after interpolation.")

    # Euclidean distances between each pair of colours, first for the center
    # dots then for the front dots (column order: rb, ry, by).
    distance_columns = []
    for dot in ('c', 'f'):
        for first, second in (('r', 'b'), ('r', 'y'), ('b', 'y')):
            dist_col = f'dist_{dot}_{first}{second}'
            DF[dist_col] = np.sqrt(
                (DF[f'x{first}{dot}'] - DF[f'x{second}{dot}'])**2
                + (DF[f'y{first}{dot}'] - DF[f'y{second}{dot}'])**2
            )
            distance_columns.append(dist_col)

    # Apply scaling factor to all distance columns
    DF[distance_columns] = DF[distance_columns] * scaling_factor

    dyad_labels = (('ry', 'Red-Yellow'), ('by', 'Blue-Yellow'), ('rb', 'Red-Blue'))

    # Warn on exact ties: the facing flag below treats a tie as "not facing".
    for pair, label in dyad_labels:
        if (DF[f'dist_c_{pair}'] == DF[f'dist_f_{pair}']).any():
            warnings.warn(f"Some rows have equal distances for central and front dots in {label} dyad (dist_c_{pair} == dist_f_{pair}).")

    # Facing flag: 1 when the front dots are closer together than the center
    # dots, 0 otherwise.
    for pair, _ in dyad_labels:
        DF[f'facing_{pair}'] = np.where(DF[f'dist_c_{pair}'] > DF[f'dist_f_{pair}'], 1, 0)

    # Head orientation vectors: from the center dot to the front dot
    for color_code in color_map:
        DF[f'vect_x_{color_code}'] = DF[f'x{color_code}f'] - DF[f'x{color_code}c']
        DF[f'vect_y_{color_code}'] = DF[f'y{color_code}f'] - DF[f'y{color_code}c']

    # Unit vectors for red, blue, and yellow (row by row through unit_vector)
    for color_code in color_map:
        orientation = DF[[f'vect_x_{color_code}', f'vect_y_{color_code}']].to_numpy()
        unit_rows = [unit_vector(vec) for vec in orientation]
        DF[f'unit_vect_x_{color_code}'], DF[f'unit_vect_y_{color_code}'] = zip(*unit_rows)

    # Angle in degrees between the orientation unit vectors of each dyad
    for first, second in (('r', 'y'), ('r', 'b'), ('b', 'y')):
        units_first = DF[[f'unit_vect_x_{first}', f'unit_vect_y_{first}']].to_numpy()
        units_second = DF[[f'unit_vect_x_{second}', f'unit_vect_y_{second}']].to_numpy()
        DF[f'deg_{first}{second}'] = [
            np.rad2deg(angle_between(u, v)) for u, v in zip(units_first, units_second)
        ]

    # Mean position per colour (midpoint of center and front dots)
    for color_code in ('b', 'y', 'r'):
        DF[f'x{color_code}'] = (DF[f'x{color_code}c'] + DF[f'x{color_code}f']) / 2
        DF[f'y{color_code}'] = (DF[f'y{color_code}c'] + DF[f'y{color_code}f']) / 2

    # Moving correlations of the mean positions, per dyad and per axis
    for first, second in (('r', 'y'), ('b', 'y'), ('r', 'b')):
        for axis in ('x', 'y'):
            DF[f'{axis}_corr_{first}{second}'] = calculate_moving_correlation(
                DF, f'{axis}{first}', f'{axis}{second}', window_frames
            )

    # Save the processed dataset to the specified directory (created if
    # missing, so a long run does not fail at the very last step).
    os.makedirs(output_dir, exist_ok=True)
    output_file = f"{output_dir}Loopy_{dat_name}_processed.csv"
    DF.to_csv(output_file, index=False)
    print(f"Saved processed data to {output_file}.")

# Process every dataset listed in dataset_names, one after another.
# A dataset whose input file is missing is skipped by process_dataset();
# any exception raised while processing stops the loop.
for dataset_name in dataset_names:
    process_dataset(dataset_name)

# Reached only when the loop finished without raising.
print("All datasets processed.")


Processing DYAD02F_...
Success: No NaN values remain in the DataFrame after interpolation.
Saved processed data to /Users/ruzenkakaldenbach/Desktop/Behaviour/Loopy_preprocessed_data/Loopy_DYAD02F__processed.csv.
All datasets processed.


In [4]:
import pandas as pd

# Path to a previously saved, processed file.
# NOTE(review): this points at DYAD24NF_, which is not in the active
# dataset_names list above -- presumably written by an earlier run; confirm
# the file is current before relying on it.
file_path = "/Users/ruzenkakaldenbach/Desktop/Behaviour/Loopy_preprocessed_data/Loopy_DYAD24NF__processed.csv"

# Load the CSV into a DataFrame
df = pd.read_csv(file_path)

# Display the DataFrame (as the last expression of the cell it is rendered
# by the notebook; nothing is printed explicitly)
df


Unnamed: 0,xrc,yrc,xrf,yrf,xbc,ybc,xbf,ybf,xyc,yyc,...,vect_y_y,unit_vect_x_r,unit_vect_y_r,unit_vect_x_b,unit_vect_y_b,unit_vect_x_y,unit_vect_y_y,deg_ry,deg_rb,deg_by
0,813.748157,411.468976,841.726360,409.320596,803.035023,664.417601,778.179454,691.245274,1025.198740,401.020129,...,24.314813,0.997065,-0.076562,-0.679631,0.733554,-0.717827,0.696221,140.266385,137.205786,3.060600
1,815.997536,411.895140,842.798184,407.661840,777.059541,665.894386,756.116163,694.524140,1024.929080,400.569711,...,24.320153,0.987754,-0.156021,-0.590414,0.807100,-0.720523,0.693431,145.073708,135.162402,9.911306
2,818.619205,407.995233,843.195954,404.161857,761.221597,665.474297,763.140311,700.588511,1024.550603,400.568416,...,24.101949,0.988053,-0.154112,0.054561,0.998510,-0.721637,0.692272,145.055127,95.737665,49.317462
3,820.900081,394.877467,843.426849,394.545665,753.502903,666.800267,772.678241,701.386882,1024.222991,400.789033,...,23.702317,0.999892,-0.014728,0.484880,0.874580,-0.722175,0.691710,137.078200,61.839218,75.238982
4,820.430642,387.608287,842.961063,389.072835,755.299462,663.609011,770.080790,701.316197,1024.391713,400.041072,...,23.551657,0.997894,0.064866,0.364963,0.931022,-0.722686,0.691177,132.557487,64.875501,67.681986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2411,1164.005728,597.700526,1136.936311,568.575285,703.622442,339.471875,713.391266,373.346110,1018.202565,459.800252,...,-9.377915,-0.680783,-0.732486,0.277093,0.960843,-0.951327,-0.308182,29.145480,153.181946,124.036466
2412,1139.741309,559.959324,1112.472235,532.674201,632.805029,358.064600,644.784066,393.255353,947.440519,441.983715,...,-8.038315,-0.706899,-0.707315,0.322245,0.946656,-0.954470,-0.298308,27.660857,153.815582,126.154725
2413,1109.953263,514.291235,1081.035354,496.144789,592.690290,413.657986,622.559272,442.526801,874.926358,419.030871,...,-1.359933,-0.847039,-0.531531,0.719043,0.694966,-0.998431,-0.056002,28.898540,168.084441,139.185901
2414,1068.683937,488.737348,1036.407360,480.491199,566.237935,459.396627,616.595072,484.740869,808.957729,384.281016,...,12.361350,-0.968879,-0.247533,0.893249,0.449563,-0.906548,0.422102,39.298979,167.615933,128.316954
