In [31]:
import pandas as pd

# Load the CSV into a DataFrame
df = pd.read_csv("data/finalized_data/combined_processed_SaccadeTable.csv")  # Replace with your CSV file path
distance_df = pd.read_csv("data/finalized_data/combined_distance_table.csv")  # Replace with your CSV file path

# Define expected bridges and cracks
expected_bridges = [f"Bridge {i}" for i in range(1, 5)]
expected_cracks = [f"crack {i}" for i in range(1, 21)]

# Group rows by respondent name
missing_data = []

# Ensure AOI Label and Study Name are in lowercase for consistent matching
df['AOI Label'] = df['AOI Label'].str.lower()
df['Study Name'] = df['Study Name'].str.lower()

# Iterate over each respondent
for respondent, group in df.groupby("Respondent Name"):
    # Check for each bridge
    for bridge in expected_bridges:
        bridge_group = group[group['Study Name'] == bridge.lower()]
        if bridge_group.empty:
            missing_data.append({"Respondent Name": respondent, "Study Name": bridge, "Missing": "All cracks"})
            continue
        
        # Check for each crack in the current bridge
        present_cracks = set(
            bridge_group['AOI Label']
            .str.extract(r'(crack \d+)', expand=False)
            .dropna()
        )
        missing_cracks = set(expected_cracks) - present_cracks
        
        if missing_cracks:
            missing_data.append({
                "Respondent Name": respondent,
                "Study Name": bridge,
                "Missing": ", ".join(missing_cracks)
            })

# Create a DataFrame of missing entries
missing_df = pd.DataFrame(missing_data)

# Print missing entries
print("Missing Entries:")
print(missing_df)

# Save to CSV for later reference
missing_df.to_csv("missing_entries.csv", index=False)
print("Missing data saved to 'missing_entries.csv'")


Missing Entries:
     Respondent Name Study Name  \
0              20002   Bridge 1   
1              20002   Bridge 2   
2              20002   Bridge 3   
3              20002   Bridge 4   
4              20003   Bridge 1   
..               ...        ...   
159            20045   Bridge 4   
160            20046   Bridge 1   
161            20046   Bridge 2   
162            20046   Bridge 3   
163            20046   Bridge 4   

                                               Missing  
0      crack 14, crack 17, crack 13, crack 10, crack 3  
1    crack 14, crack 17, crack 19, crack 1, crack 1...  
2    crack 11, crack 14, crack 17, crack 16, crack ...  
3    crack 14, crack 15, crack 16, crack 8, crack 4...  
4                                           All cracks  
..                                                 ...  
159  crack 14, crack 19, crack 12, crack 15, crack ...  
160  crack 11, crack 17, crack 1, crack 12, crack 1...  
161  crack 14, crack 19, crack 15, crack 16, crac

In [32]:
import pandas as pd

# Load the CSV into a DataFrame
distance_df = pd.read_csv("data/finalized_data/combined_distance_table.csv")  # Replace with your CSV file path

# Group rows by respondent name
missing_data_2 = []

# Ensure AOI Label and Study Name are in lowercase for consistent matching
distance_df['Label'] = distance_df['Label'].str.lower()
distance_df['Study Name'] = distance_df['Study Name'].str.lower()

# Iterate over each respondent
for respondent, group in distance_df.groupby("Respondent Name"):
    # Check for each bridge
    for bridge in expected_bridges:
        bridge_group = group[group['Study Name'] == bridge.lower()]
        if bridge_group.empty:
            missing_data_2.append({"Respondent Name": respondent, "Study Name": bridge, "Missing": "All cracks"})
            continue
        
        # Check for each crack in the current bridge
        present_cracks = set(
            bridge_group['Label']
            .str.extract(r'(crack \d+)', expand=False)
            .dropna()
        )
        missing_cracks = set(expected_cracks) - present_cracks
        
        if missing_cracks:
            missing_data_2.append({
                "Respondent Name": respondent,
                "Study Name": bridge,
                "Missing": ", ".join(missing_cracks)
            })

# Create a DataFrame of missing entries
missing_df_2 = pd.DataFrame(missing_data_2)

# Print missing entries
print("Missing Entries:")
print(missing_df_2)

# Save to CSV for later reference
missing_df_2.to_csv("missing_entries_2.csv", index=False)


Missing Entries:
     Respondent Name Study Name  \
0              20002   Bridge 1   
1              20002   Bridge 2   
2              20002   Bridge 3   
3              20002   Bridge 4   
4              20003   Bridge 1   
..               ...        ...   
163            20045   Bridge 4   
164            20046   Bridge 1   
165            20046   Bridge 2   
166            20046   Bridge 3   
167            20046   Bridge 4   

                                               Missing  
0                                              crack 3  
1      crack 14, crack 19, crack 15, crack 10, crack 3  
2    crack 11, crack 17, crack 4, crack 5, crack 10...  
3                          crack 16, crack 4, crack 15  
4                                           All cracks  
..                                                 ...  
163                        crack 16, crack 4, crack 15  
164                        crack 3, crack 12, crack 15  
165    crack 14, crack 19, crack 15, crack 10, cr

In [33]:
import pandas as pd

# Load the CSV into a DataFrame
fixation_df = pd.read_csv("data/finalized_data/fixation_aoi.csv")  # Replace with your CSV file path

# Define expected bridges and cracks
expected_bridges = [f"Bridge {i}" for i in range(1, 5)]
expected_cracks = [f"crack {i}" for i in range(1, 21)]

# Group rows by respondent name
missing_data_3 = []

# Ensure AOI Label and Study Name are in lowercase for consistent matching
fixation_df['Label'] = fixation_df['Label'].str.lower()
fixation_df['Study Name'] = fixation_df['Study Name'].str.lower()

# Iterate over each respondent
for respondent, group in fixation_df.groupby("Respondent Name"):
    # Check for each bridge
    for bridge in expected_bridges:
        bridge_group = group[group['Study Name'] == bridge.lower()]
        if bridge_group.empty:
            missing_data_3.append({"Respondent Name": respondent, "Study Name": bridge, "Missing": "All cracks"})
            continue
        
        # Check for each crack in the current bridge
        present_cracks = set(
            bridge_group['Label']
            .str.extract(r'(crack \d+)', expand=False)
            .dropna()
        )
        missing_cracks = set(expected_cracks) - present_cracks
        
        if missing_cracks:
            missing_data_3.append({
                "Respondent Name": respondent,
                "Study Name": bridge,
                "Missing": ", ".join(missing_cracks)
            })

# Create a DataFrame of missing entries
missing_df_3 = pd.DataFrame(missing_data_3)

# Print missing entries
print("Missing Entries:")
print(missing_df_3)

# Save to CSV for later reference
missing_df_3.to_csv("missing_entries_3.csv", index=False)


Missing Entries:
    Respondent Name Study Name                                 Missing
0             20002   Bridge 2                                crack 19
1             20002   Bridge 3                      crack 11, crack 10
2             20003   Bridge 1                              All cracks
3             20003   Bridge 2                              All cracks
4             20003   Bridge 3  crack 11, crack 12, crack 13, crack 10
..              ...        ...                                     ...
86            20044   Bridge 1                                crack 17
87            20044   Bridge 2                                crack 19
88            20045   Bridge 2                              All cracks
89            20046   Bridge 1             crack 6, crack 12, crack 15
90            20046   Bridge 2                                crack 19

[91 rows x 3 columns]


In [30]:

# Assuming missing_data and missing_data_2 are pandas DataFrames
# Replace these with your actual data loading code if necessary
missing_data = pd.read_csv("missing_entries.csv")
missing_data_2 = pd.read_csv("missing_entries_2.csv")
missing_data_3 = pd.read_csv("missing_entries_3.csv")

# Perform an outer join to identify differences
comparison = pd.merge(
    missing_data_2, missing_data_3,
    on=["Respondent Name", "Study Name", "Missing"],  # Columns to compare
    how="outer",
    indicator=True  # This will add a column to indicate the source of the row
)

# Find rows unique to missing_data
unique_to_missing_data = comparison[comparison["_merge"] == "left_only"]

# Find rows unique to missing_data_2
unique_to_missing_data_2 = comparison[comparison["_merge"] == "right_only"]

# Print the results
print("Rows in missing_data but not in missing_data_2:")
print(unique_to_missing_data)

print("\nRows in missing_data_2 but not in missing_data:")
print(unique_to_missing_data_2)

# Save results for further inspection
unique_to_missing_data.to_csv("unique_to_missing_data.csv", index=False)
unique_to_missing_data_2.to_csv("unique_to_missing_data_2.csv", index=False)

print("\nDifferences saved to 'unique_to_missing_data.csv' and 'unique_to_missing_data_2.csv'")


Rows in missing_data but not in missing_data_2:
     Respondent Name Study Name  \
0              20002   Bridge 1   
1              20002   Bridge 2   
2              20002   Bridge 3   
3              20002   Bridge 4   
6              20003   Bridge 3   
..               ...        ...   
163            20045   Bridge 4   
164            20046   Bridge 1   
165            20046   Bridge 2   
166            20046   Bridge 3   
167            20046   Bridge 4   

                                               Missing     _merge  
0                                              crack 3  left_only  
1      crack 14, crack 19, crack 15, crack 10, crack 3  left_only  
2    crack 11, crack 17, crack 4, crack 5, crack 10...  left_only  
3                          crack 16, crack 4, crack 15  left_only  
6    crack 11, crack 17, crack 12, crack 13, crack ...  left_only  
..                                                 ...        ...  
163                        crack 16, crack 4, crack 15 

In [36]:
import pandas as pd

# Load the CSVs into DataFrames
df = pd.read_csv("data/finalized_data/combined_processed_SaccadeTable.csv")
distance_df = pd.read_csv("data/finalized_data/combined_distance_table.csv")

# Perform an outer join to find rows in `distance_df` that are not in `df`
comparison = pd.merge(
    distance_df,
    df,
    on=["Study Name", "Respondent Name", "Label"],  # Columns to compare
    how="left",
    indicator=True  # Add a column to indicate source
)

# Filter rows that exist only in `distance_df`
only_in_distance = comparison[comparison["_merge"] == "left_only"]

# Drop the `_merge` column to clean up the output
only_in_distance = only_in_distance.drop(columns=["_merge"])

# Print the rows that are unique to `distance_df`
print("Rows in distance_df but not in df:")
print(only_in_distance)

# Save the output for further inspection if needed
only_in_distance.to_csv("unique_to_distance.csv", index=False)
print("Unique rows saved to 'unique_to_distance.csv'")


Rows in distance_df but not in df:
     Study Name  Respondent Name          Label  Distance  \
3      Bridge 3            20002    Crack 6 Hit       2.0   
9      Bridge 3            20002  Crack 14 Miss       2.0   
11     Bridge 3            20002   Crack 16 Hit       2.0   
12     Bridge 3            20002   Crack 18 Hit       2.0   
22     Bridge 3            20003  Crack 15 Miss       3.0   
...         ...              ...            ...       ...   
2117   Bridge 2            20046    Crack 2 Hit       2.0   
2119   Bridge 2            20046    Crack 5 Hit       2.0   
2120   Bridge 2            20046   Crack 6 Miss       4.0   
2123   Bridge 2            20046    Crack 9 Hit       2.0   
2127   Bridge 2            20046   Crack 16 HIt       3.0   

      Saccade Duration Mean  Saccade Duration Std  Saccade Duration Median  \
3                       NaN                   NaN                      NaN   
9                       NaN                   NaN                      NaN  