## Smoothing

### tidy participant data 
extract info from txt file
convert to csv with time, and velocities in xyz directions

In [1]:
import csv
import re

# convert txt controller input file to csv with time and velocity extracted
# input: path to txt, path to generated csv
def txt_to_csv(txt_file, csv_file):
    with open(txt_file, 'r') as file, open(csv_file, 'w', newline='') as csv_out:
        writer = csv.writer(csv_out)
        writer.writerow(['time', 'x', 'y', 'z', 'Magnitude'])  # Write CSV header

        for line in file:
            line = line.strip()
            if line:
                result = extract_time_and_velocity(line)
                if result:
                    time_value, x, y, z = result
                    if x == 0 and y == 0 and z == 0:
                        continue  # Skip lines with zero velocity
                    magnitude = (x**2 + y**2 + z**2)**0.5
                    writer.writerow([time_value, x, y, z, magnitude])

# helper function to extract the time and velocity values from each line
# input: line of csv
# output: extraced values
def extract_time_and_velocity(line):
    # Use a regular expression to capture time and velocity components
    time_match = re.search(r'Time:\s*([0-9.]+)', line)
    velocity_match = re.search(r'Velocity:\s*\(([^,]+),\s*([^,]+),\s*([^)]+)\)', line)

    if time_match and velocity_match:
        time_value = float(time_match.group(1))
        x, y, z = velocity_match.groups()
        return time_value, float(x), float(y), float(z)
    return NotImplemented



# # Example usage
# txt_file = 'data.txt'  # Path to your txt file
# csv_file = 'cleaned_data.csv'  # Output CSV file

# txt_to_csv(txt_file, csv_file)

### apply spline
Get dataset with time, velocity, acceleration, jerk

In [2]:
import numpy as np
import pandas as pd
from scipy.interpolate import UnivariateSpline
import matplotlib.pyplot as plt

# smooth velocity points over time 
# input: path to generated csv file
def spline(csv_file):
  # Load the CSV data
  data = pd.read_csv('cleaned_data.csv') # results of txt_to_csv
  time = data['time'].values
  velocity = data['Magnitude'].values

  # Fit a spline to the filtered velocity data
  # s value controls degree of smoothing
  spline = UnivariateSpline(time, velocity, s=1)

  # Compute the first and second derivative of the spline (acceleration, jerk)
  acceleration = spline.derivative(n=1)(time)
  jerk = spline.derivative(n=2)(time)

  data["Acceleration"] = acceleration
  data["Jerk"] = jerk

  data.to_csv(csv_file, index=False)

# # Example Usage
# spline('cleaned_data.csv', 'smoothed.csv')

### plotting 
ignore:: only used to test univariate spline + include figures in paper



In [None]:
# Plot the original data, smoothed spline, and derived acceleration
plt.figure(figsize=(12, 6))

# Plot the smoothed velocity data
plt.subplot(3, 1, 1)
plt.plot(time, velocity, 'o', label='Original Velocity Data')
plt.plot(time, spline(time), '-', label='Smoothed Spline')
plt.xlabel('Time (s)')
plt.ylabel('Velocity')
plt.legend()

# Plot the acceleration data
plt.subplot(3, 1, 2)
plt.plot(time, acceleration, label='Acceleration (First Derivative)')
plt.xlabel('Time (s)')
plt.ylabel('Acceleration')
plt.legend()

# Plot the jerk data
plt.subplot(3, 1, 3)
plt.plot(time, jerk, label='Jerk (Second Derivative)')
plt.xlabel('Time (s)')
plt.ylabel('Jerk')
plt.legend()

plt.tight_layout()
plt.show()

### Combine & run spline

In [10]:
def smooth_participant(input, output):
  # convert txt to csv & smooth data
  txt_to_csv(input, 'cleaned_data.csv')
  spline('cleaned_data.csv', output)
  return output

In [None]:
# run for every file in every bridge folder
easy1 = os.listdir('easy1')
easy2 = os.listdir('easy2')
hard1 = os.listdir('hard1')
hard2 = os.listdir('hard2')

for file in easy1:
  input = 'easy1/' + file
  participant_name = file[:-4]
  output = 'easy1_smoothed/' + participant_name + '.csv'
  smooth_participant(input, output)

for file in easy2:
  input = 'easy2/' + file
  participant_name = file[:-4]
  output = 'easy2_smoothed/' + participant_name + '.csv'
  smooth_participant(input, output)
  
for file in hard1:
  input = 'hard1/' + file
  participant_name = file[:-4]
  output = 'hard1_smoothed/' + participant_name + '.csv'
  smooth_participant(input, output)

for file in hard2:
  input = 'hard2/' + file
  participant_name = file[:-4]
  output = 'hard2_smoothed/' + participant_name + '.csv'
  smooth_participant(input, output)

## Applying Times

### import timecard

In [5]:
import pandas as pd
import os

# Load the timecard file and split by bridge
times = pd.read_csv('timecards/combined_timecard_3seconds.csv')

times = times.dropna(subset=['Start'])
times = times[~times["Label"].astype(str).str.contains("Base", case=False, na=False)]
times = times[~times["Label"].astype(str).str.contains("FA", case=False, na=False)]

easy1_times = times[times['Study Name'] == 'Bridge 1']
easy2_times = times[times['Study Name'] == 'Bridge 2']
hard1_times = times[times['Study Name'] == 'Bridge 3']
hard2_times = times[times['Study Name'] == 'Bridge 4']

easy1_times.head(60)

Unnamed: 0,Study Name,Respondent,Start,End,Label
0,Bridge 1,20002,681604.76,684604.76,Screen recording 2
1,Bridge 1,20002,105236.07,108236.07,Crack 1 Hit
2,Bridge 1,20002,113953.9,116953.9,Crack 2 Hit
3,Bridge 1,20002,129727.52,132727.52,Crack 3 Hit
4,Bridge 1,20002,140906.03,143906.03,Crack 4 Hit
5,Bridge 1,20002,147468.52,149968.52,Crack 5 Hit
6,Bridge 1,20002,170255.04,173255.04,Base 1
7,Bridge 1,20002,225519.07,228519.07,Crack 12 Hit
8,Bridge 1,20002,244836.19,247836.19,Base 2
9,Bridge 1,20002,249000.86,250800.86,Crack 11 Hit


### Calculate for each bridge 
this section is very much not modular :/

easy1:

In [14]:
# adjust time based on recording start time
adjust = 0

# Iterate through each row in easy1_times
for index, row in easy1_times.iterrows():
    respondent = row['Respondent']
    start_time = row['Start']
    end_time = row['End']
    # first row of bridge gives time to adjust
    if row['Label'][0:16] == 'Screen recording':
      adjust = row['Start']

    # Construct the file path for the participant's CSV file
    file_path = f'smoothed/easy1_smoothed/{respondent}.csv'

    # Check if the file exists
    if os.path.exists(file_path):
        # Load the participant's CSV file
        participant_data = pd.read_csv(file_path)

        # Filter the data between the start and end times
        # adjust for milliseconds & start time
        filtered_data = participant_data[(participant_data['time'] >= (start_time-adjust)/1000) & (participant_data['time'] <= (end_time-adjust)/1000)]

        # If there is no data for the time interval, skip row
        if filtered_data.empty:
            continue

        # calculate and append the results to the timecard
        easy1_times.at[index, 'avg_velocity'] = filtered_data['Magnitude'].mean()
        easy1_times.at[index, 'sd_velocity'] = filtered_data['Magnitude'].std()
        easy1_times.at[index, 'med_velocity'] = filtered_data['Magnitude'].median()
        easy1_times.at[index, 'avg_acceleration'] = filtered_data['Acceleration'].mean()
        easy1_times.at[index, 'sd_acceleration'] = filtered_data['Acceleration'].std()
        easy1_times.at[index, 'med_acceleration'] = filtered_data['Acceleration'].median()
        easy1_times.at[index, 'avg_jerk'] = filtered_data['Jerk'].mean()
        easy1_times.at[index, 'sd_jerk'] = filtered_data['Jerk'].std()
        easy1_times.at[index, 'med_jerk'] = filtered_data['Jerk'].median()
    else:
        # If the participant file does not exist, append a NaN or default value
        easy1_times.at[index, 'avg_velocity'] = None
        easy1_times.at[index, 'sd_velocity'] = None
        easy1_times.at[index, 'med_velocity'] = None
        easy1_times.at[index, 'avg_acceleration'] = None
        easy1_times.at[index, 'sd_acceleration'] = None
        easy1_times.at[index, 'med_acceleration'] = None
        easy1_times.at[index, 'avg_jerk'] = None
        easy1_times.at[index, 'sd_jerk'] = None
        easy1_times.at[index, 'med_jerk'] = None

# drop irrelevant labels
easy1_times.drop(columns=['Start', 'End'], inplace=True, axis=1)

# Save the updated easy1_times.csv file
easy1_times.to_csv('easy1_velocities.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  easy1_times.at[index, 'avg_velocity'] = filtered_data['Magnitude'].mean()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  easy1_times.at[index, 'sd_velocity'] = filtered_data['Magnitude'].std()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  easy1_times.at[index, 'med_velocity'] = filtered_data['Magn

repeat for easy2

In [5]:
adjust = 0

# Iterate through each row in easy2_times
for index, row in easy2_times.iterrows():
    respondent = row['Respondent']
    start_time = row['Start']
    end_time = row['End']
    if row['Label'][0:16] == 'Screen recording':
      adjust = row['Start']

    # Construct the file path for the participant's CSV file
    file_path = f'easy2_smoothed/{respondent}.csv'

    # Check if the file exists
    if os.path.exists(file_path):
        # Load the participant's CSV file
        participant_data = pd.read_csv(file_path)

        # Filter the data between the start and end times
        filtered_data = participant_data[(participant_data['time'] >= (start_time-adjust)/1000) & (participant_data['time'] <= (end_time-adjust)/1000)]

        # If there is no data for the time interval, skip row
        if filtered_data.empty:
            continue

        # Append the results to the list
        easy2_times.at[index, 'avg_velocity'] = filtered_data['Magnitude'].mean()
        easy2_times.at[index, 'sd_velocity'] = filtered_data['Magnitude'].std()
        easy2_times.at[index, 'med_velocity'] = filtered_data['Magnitude'].median()
        easy2_times.at[index, 'avg_acceleration'] = filtered_data['Acceleration'].mean()
        easy2_times.at[index, 'sd_acceleration'] = filtered_data['Acceleration'].std()
        easy2_times.at[index, 'med_acceleration'] = filtered_data['Acceleration'].median()
        easy2_times.at[index, 'avg_jerk'] = filtered_data['Jerk'].mean()
        easy2_times.at[index, 'sd_jerk'] = filtered_data['Jerk'].std()
        easy2_times.at[index, 'med_jerk'] = filtered_data['Jerk'].median()
    else:
        # If the participant file does not exist, append a NaN or default value
        easy2_times.at[index, 'avg_velocity'] = None
        easy2_times.at[index, 'sd_velocity'] = None
        easy2_times.at[index, 'med_velocity'] = None
        easy2_times.at[index, 'avg_acceleration'] = None
        easy2_times.at[index, 'sd_acceleration'] = None
        easy2_times.at[index, 'med_acceleration'] = None
        easy2_times.at[index, 'avg_jerk'] = None
        easy2_times.at[index, 'sd_jerk'] = None
        easy2_times.at[index, 'med_jerk'] = None

easy2_times.drop(columns=['Start', 'End'], inplace=True, axis=1)

# Save the updated easy1_times.csv file
easy2_times.to_csv('easy2_velocities.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  easy2_times.at[index, 'avg_velocity'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  easy2_times.at[index, 'sd_velocity'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  easy2_times.at[index, 'med_velocity'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try 

repeat for hard 1

In [7]:
adjust = 0

# Iterate through each row in hard1_times
for index, row in hard1_times.iterrows():
    respondent = row['Respondent']
    start_time = row['Start']
    end_time = row['End']
    if row['Label'][0:16] == 'Screen recording':
      adjust = row['Start']

    # Construct the file path for the participant's CSV file
    file_path = f'hard1_smoothed/{respondent}.csv'

    # Check if the file exists
    if os.path.exists(file_path):
        # Load the participant's CSV file
        participant_data = pd.read_csv(file_path)

        # Filter the data between the start and end times
        filtered_data = participant_data[(participant_data['time'] >= (start_time-adjust)/1000) & (participant_data['time'] <= (end_time-adjust)/1000)]

        # If there is no data for the time interval, skip row
        if filtered_data.empty:
            continue

        # Append the results to the list
        hard1_times.at[index, 'avg_velocity'] = filtered_data['Magnitude'].mean()
        hard1_times.at[index, 'sd_velocity'] = filtered_data['Magnitude'].std()
        hard1_times.at[index, 'med_velocity'] = filtered_data['Magnitude'].median()
        hard1_times.at[index, 'avg_acceleration'] = filtered_data['Acceleration'].mean()
        hard1_times.at[index, 'sd_acceleration'] = filtered_data['Acceleration'].std()
        hard1_times.at[index, 'med_acceleration'] = filtered_data['Acceleration'].median()
        hard1_times.at[index, 'avg_jerk'] = filtered_data['Jerk'].mean()
        hard1_times.at[index, 'sd_jerk'] = filtered_data['Jerk'].std()
        hard1_times.at[index, 'med_jerk'] = filtered_data['Jerk'].median()
    else:
        # If the participant file does not exist, append a NaN or default value
        hard1_times.at[index, 'avg_velocity'] = None
        hard1_times.at[index, 'sd_velocity'] = None
        hard1_times.at[index, 'med_velocity'] = None
        hard1_times.at[index, 'avg_acceleration'] = None
        hard1_times.at[index, 'sd_acceleration'] = None
        hard1_times.at[index, 'med_acceleration'] = None
        hard1_times.at[index, 'avg_jerk'] = None
        hard1_times.at[index, 'sd_jerk'] = None
        hard1_times.at[index, 'med_jerk'] = None

hard1_times.drop(columns=['Start', 'End'], inplace=True, axis=1)

# Save the updated easy1_times.csv file
hard1_times.to_csv('hard1_velocities.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard1_times.at[index, 'avg_velocity'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard1_times.at[index, 'sd_velocity'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard1_times.at[index, 'med_velocity'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try 

repeat for hard2

In [8]:
adjust = 0

# Iterate through each row in hard2_times
for index, row in hard2_times.iterrows():
    respondent = row['Respondent']
    start_time = row['Start']
    end_time = row['End']
    if row['Label'][0:16] == 'Screen recording':
      adjust = row['Start']

    # Construct the file path for the participant's CSV file
    file_path = f'hard2_smoothed/{respondent}.csv'

    # Check if the file exists
    if os.path.exists(file_path):
        # Load the participant's CSV file
        participant_data = pd.read_csv(file_path)

        # Filter the data between the start and end times
        filtered_data = participant_data[(participant_data['time'] >= (start_time-adjust)/1000) & (participant_data['time'] <= (end_time-adjust)/1000)]

        # If there is no data for the time interval, skip row
        if filtered_data.empty:
            continue

        # Append the results to the list
        hard2_times.at[index, 'avg_velocity'] = filtered_data['Magnitude'].mean()
        hard2_times.at[index, 'sd_velocity'] = filtered_data['Magnitude'].std()
        hard2_times.at[index, 'med_velocity'] = filtered_data['Magnitude'].median()
        hard2_times.at[index, 'avg_acceleration'] = filtered_data['Acceleration'].mean()
        hard2_times.at[index, 'sd_acceleration'] = filtered_data['Acceleration'].std()
        hard2_times.at[index, 'med_acceleration'] = filtered_data['Acceleration'].median()
        hard2_times.at[index, 'avg_jerk'] = filtered_data['Jerk'].mean()
        hard2_times.at[index, 'sd_jerk'] = filtered_data['Jerk'].std()
        hard2_times.at[index, 'med_jerk'] = filtered_data['Jerk'].median()
    else:
        # If the participant file does not exist, append a NaN or default value
        hard2_times.at[index, 'avg_velocity'] = None
        hard2_times.at[index, 'sd_velocity'] = None
        hard2_times.at[index, 'med_velocity'] = None
        hard2_times.at[index, 'avg_acceleration'] = None
        hard2_times.at[index, 'sd_acceleration'] = None
        hard2_times.at[index, 'med_acceleration'] = None
        hard2_times.at[index, 'avg_jerk'] = None
        hard2_times.at[index, 'sd_jerk'] = None
        hard2_times.at[index, 'med_jerk'] = None

hard2_times.drop(columns=['Start', 'End'], inplace=True, axis=1)

# Save the updated easy1_times.csv file
hard2_times.to_csv('hard2_velocities.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard2_times.at[index, 'avg_velocity'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard2_times.at[index, 'sd_velocity'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hard2_times.at[index, 'med_velocity'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try 

### Combine bridges

In [9]:
easy1 = pd.read_csv('easy1_velocities.csv')
easy2 = pd.read_csv('easy2_velocities.csv')
hard1 = pd.read_csv('hard1_velocities.csv')
hard2 = pd.read_csv('hard2_velocities.csv')

easy1['Study Name'] = "Bridge 1"
easy2['Study Name'] = "Bridge 2"
hard1['Study Name'] = "Bridge 3"
hard2['Study Name'] = "Bridge 4"

combined = pd.concat([easy1, easy2, hard1, hard2])

#Drop rows where they are expliclty omitted
combined.drop(combined[(combined["Respondent"]==20019)].index, inplace=True)
combined.drop(combined[(combined["Respondent"]==20040)].index, inplace=True)

#Drop cracks that are all hit
combined.drop(combined[(combined['Study Name'].str.contains("Bridge 1", na=False)) & (combined['Label'].str.lower().str.contains("crack 3 hit", na=False))].index, inplace=True)
combined.drop(combined[(combined['Study Name'].str.contains("Bridge 2", na=False)) & (combined['Label'].str.lower().str.contains("crack 3 hit", na=False))].index, inplace=True)
combined.drop(combined[(combined['Study Name'].str.contains("Bridge 2", na=False)) & (combined['Label'].str.lower().str.contains("crack 10 hit", na=False))].index, inplace=True)
combined.drop(combined[(combined['Study Name'].str.contains("Bridge 2", na=False)) & (combined['Label'].str.lower().str.contains("crack 14 hit", na=False))].index, inplace=True)
combined.drop(combined[(combined['Study Name'].str.contains("Bridge 2", na=False)) & (combined['Label'].str.lower().str.contains("crack 15 hit", na=False))].index, inplace=True)
combined.drop(combined[(combined['Study Name'].str.contains("Bridge 3", na=False)) & (combined['Label'].str.lower().str.contains("crack 4 hit", na=False))].index, inplace=True)
combined.drop(combined[(combined['Study Name'].str.contains("Bridge 3", na=False)) & (combined['Label'].str.lower().str.contains("crack 5 hit", na=False))].index, inplace=True)
combined.drop(combined[(combined['Study Name'].str.contains("Bridge 3", na=False)) & (combined['Label'].str.lower().str.contains("crack 17 hit", na=False))].index, inplace=True)
combined.drop(combined[(combined['Study Name'].str.contains("Bridge 3", na=False)) & (combined['Label'].str.lower().str.contains("crack 20 hit", na=False))].index, inplace=True)
combined.drop(combined[(combined['Study Name'].str.contains("Bridge 4", na=False)) & (combined['Label'].str.lower().str.contains("crack 4 hit", na=False))].index, inplace=True)
combined.drop(combined[(combined['Study Name'].str.contains("Bridge 4", na=False)) & (combined['Label'].str.lower().str.contains("crack 15 hit", na=False))].index, inplace=True)
combined.drop(combined[(combined['Study Name'].str.contains("Bridge 4", na=False)) & (combined['Label'].str.lower().str.contains("crack 16 hit", na=False))].index, inplace=True)


combined = combined[~combined["Label"].astype(str).str.contains("Screen recording", case=False, na=False)]

combined.to_csv('velocities/velocities_3seconds.csv', index=False)
