In [8]:
import pandas as pd
import statistics

In [9]:
def read_trajectory(filepath):
    """
    Load the first three space-separated columns of a trajectory file.

    The file is parsed without a header row, so the columns of the returned
    frame are labelled 0, 1 and 2.

    Args:
        filepath (str): the path to the file

    Returns:
        pd.DataFrame containing the first three fields of the parsed rows
    """
    # Only the leading three fields are kept; anything further right is dropped.
    wanted_columns = range(3)
    return pd.read_csv(filepath, sep=" ", header=None, usecols=wanted_columns)

In [10]:
def create_dataframes_by_timestamp(data):
    """
    Creates separate DataFrames for each timestamp in the data.

    Rows are scanned top to bottom and classified by the value in their first
    column:
        * "t"        -> starts a new timestamp,
        * "b" or "E" -> header row, skipped,
        * otherwise  -> a position row; its first three values are stored as
                        X, Y and Z under the most recently started timestamp.

    Position rows that appear before the first "t" row do not belong to any
    timestamp and are ignored.

    Values are stored exactly as found in ``data`` (no numeric conversion is
    performed here).

    Args:
        data (pd.DataFrame): Dataframe containing particle positions. Only the
            first three columns are read, by position.

    Returns:
        list[pd.DataFrame]: List of DataFrames with columns "X", "Y", "Z",
            each representing a single timestamp, in order of appearance.
    """

    # nothing to scan (also avoids indexing columns that may not exist)
    if len(data) == 0:
        return []

    # one inner list per timestamp for each coordinate
    xrows, yrows, zrows = [], [], []

    # walk the three columns in lockstep instead of doing a positional
    # lookup for every single cell
    rows = zip(data.iloc[:, 0], data.iloc[:, 1], data.iloc[:, 2])

    for comx, comy, comz in rows:

        # a timestamp row opens a fresh set of coordinate lists
        if comx == "t":
            xrows.append([])
            yrows.append([])
            zrows.append([])
            continue

        # skip the remaining header labels
        if comx == "b" or comx == "E":
            continue

        # position row seen before any timestamp was started: ignore it
        if not xrows:
            continue

        # append the position to the timestamp currently being filled
        xrows[-1].append(comx)
        yrows[-1].append(comy)
        zrows[-1].append(comz)

    return [
        pd.DataFrame({"X": x, "Y": y, "Z": z})
        for x, y, z in zip(xrows, yrows, zrows)
    ]

In [19]:
# t1 = read_trajectory("20C_0.75M/trajectory_1_20C_0.75M_sim.dat")
# d1 = create_dataframes_by_timestamp(t1)

In [18]:
# t2 = read_trajectory("20C_0.75M/trajectory_2_20C_0.75M_sim.dat")
# d2 = create_dataframes_by_timestamp(t2)

# t3 = read_trajectory("20C_0.75M/trajectory_3_20C_0.75M_sim.dat")
# d3 = create_dataframes_by_timestamp(t3)

In [11]:
def average_values(df1, df2, df3):

    """
    Averages three trials position-by-position, one timestamp at a time.

    For every timestamp index ``i`` and every row ``j`` of ``df1[i]``, the
    values at row ``j`` of the first three columns of ``df1[i]``, ``df2[i]``
    and ``df3[i]`` are converted to float and averaged.

    ``df1`` acts as the reference: it determines how many timestamps and how
    many rows per timestamp are averaged. Extra timestamps or rows in ``df2``
    / ``df3`` are not used.

    Args:
        df1 (list[pd.DataFrame]): trial 1, one data frame per timestamp
        df2 (list[pd.DataFrame]): trial 2, one data frame per timestamp
        df3 (list[pd.DataFrame]): trial 3, one data frame per timestamp

    Returns:
        list[pd.DataFrame]: one data frame per timestamp with columns
            "X", "Y", "Z" holding the averaged values

    Raises:
        IndexError: if ``df2`` or ``df3`` has fewer timestamps than ``df1``,
            or fewer rows than ``df1`` at some timestamp.

    """

    if len(df2) < len(df1) or len(df3) < len(df1):
        raise IndexError(
            "every trial must contain at least as many timestamps as the first trial"
        )

    #creates a list to store the dfs of each time point
    avg_dfs = []

    #iterates through all the timestamps of the reference trial
    for i in range(len(df1)):

        frames = (df1[i], df2[i], df3[i])

        #row count comes from THIS timestamp's frame, not from the first one
        n_rows = len(frames[0])

        if any(len(frame) < n_rows for frame in frames[1:]):
            raise IndexError(
                f"timestamp {i}: every trial must contain at least {n_rows} rows"
            )

        averaged = {}

        #columns are addressed by position: 0 -> X, 1 -> Y, 2 -> Z
        for col_idx, label in enumerate(("X", "Y", "Z")):

            #same column from each trial, limited to the reference row count
            trial_columns = [frame.iloc[:n_rows, col_idx] for frame in frames]

            #mean of the three trials' values at each row
            averaged[label] = [
                statistics.mean([float(a), float(b), float(c)])
                for a, b, c in zip(*trial_columns)
            ]

        #creates a complete dataframe for a time point and stores it in the avg_dfs list
        avg_dfs.append(pd.DataFrame(averaged))

    return avg_dfs

In [21]:
# a1 = average_values(d1, d2, d3)
# a1

[            X          Y         Z
 0   -1.932173   3.235102  3.169494
 1   -2.356763   2.939592  3.366319
 2   -2.771465   2.525240  3.371740
 3   -3.003159   2.058256  3.175190
 4   -3.007023   1.539964  2.943754
 ..        ...        ...       ...
 190 -4.484302 -10.458519  2.879936
 191 -4.875264 -10.759186  3.012116
 192 -5.266078 -11.000011  2.818392
 193 -5.569338 -11.041165  2.431092
 194 -5.870528 -11.018176  1.906220
 
 [195 rows x 3 columns],
             X          Y         Z
 0   -1.960268   2.995823  3.146984
 1   -2.472219   2.774948  3.210085
 2   -2.819217   2.393275  3.061413
 3   -2.968243   1.933809  2.816784
 4   -2.843604   1.433355  2.671049
 ..        ...        ...       ...
 190 -4.575874 -10.083079  2.300663
 191 -4.843157 -10.286539  2.501842
 192 -5.182512 -10.477471  2.496667
 193 -5.577931 -10.606664  2.382725
 194 -5.862251 -10.676159  2.073877
 
 [195 rows x 3 columns],
             X          Y         Z
 0   -2.283237   3.018581  3.528703
 1   -2.75

### Public Function:

`prep_trajectory_data` is the notebook's public interface: it is the only function intended to be called directly, and it calls the functions defined above.

In [12]:
def prep_trajectory_data(filepaths):

    """

    Reads three trajectory files and averages them timestamp by timestamp

    Each file is loaded with read_trajectory, split into per-timestamp
    data frames with create_dataframes_by_timestamp, and the first three
    results are combined with average_values.

    Args:
        filepaths (list): list of filepaths --> SHOULD ONLY CONTAIN 3 FILEPATHS
            (every path is read, but only the first three are averaged;
            fewer than three raises an IndexError)

    Returns:
        whatever average_values returns for the three trajectories: the
        averaged positional values, one entry per timestamp

    """

    #load and split every file, keeping the results in the order the paths were given
    per_file_frames = [
        create_dataframes_by_timestamp(read_trajectory(path))
        for path in filepaths
    ]

    #the three trials that get averaged together
    trial_1 = per_file_frames[0]
    trial_2 = per_file_frames[1]
    trial_3 = per_file_frames[2]

    return average_values(trial_1, trial_2, trial_3)

In [13]:
# avg_traj_20C_75M = prep_trajectory_data(["20C_0.75M/trajectory_1_20C_0.75M_sim.dat",
#                                        "20C_0.75M/trajectory_2_20C_0.75M_sim.dat",
#                                        "20C_0.75M/trajectory_3_20C_0.75M_sim.dat"])

In [14]:
# avg_traj_20C_75M

[            X          Y         Z
 0   -1.932173   3.235102  3.169494
 1   -2.356763   2.939592  3.366319
 2   -2.771465   2.525240  3.371740
 3   -3.003159   2.058256  3.175190
 4   -3.007023   1.539964  2.943754
 ..        ...        ...       ...
 190 -4.484302 -10.458519  2.879936
 191 -4.875264 -10.759186  3.012116
 192 -5.266078 -11.000011  2.818392
 193 -5.569338 -11.041165  2.431092
 194 -5.870528 -11.018176  1.906220
 
 [195 rows x 3 columns],
             X          Y         Z
 0   -1.960268   2.995823  3.146984
 1   -2.472219   2.774948  3.210085
 2   -2.819217   2.393275  3.061413
 3   -2.968243   1.933809  2.816784
 4   -2.843604   1.433355  2.671049
 ..        ...        ...       ...
 190 -4.575874 -10.083079  2.300663
 191 -4.843157 -10.286539  2.501842
 192 -5.182512 -10.477471  2.496667
 193 -5.577931 -10.606664  2.382725
 194 -5.862251 -10.676159  2.073877
 
 [195 rows x 3 columns],
             X          Y         Z
 0   -2.283237   3.018581  3.528703
 1   -2.75