In [123]:
# how to run it:
# % java -cp "libs/*:data/*:src" analysis.main

In [41]:
# Prepare eye tracking data for analysis.
# The d2 lab has software that can batch analyze the eye tracking data.
# It requires the format: p{ID}_all_gaze.csv, p{ID}_fixations.csv
# The eye tracking data is copied to a folder called "batch_process_me"

import os
import shutil

# Source folder holding one sub-folder per participant, and the name of the
# folder that receives the copied eye tracking files.
study_data_path = "ILS Official Study Data"
dest_folder = "batch_process_me"
parent_dir = "."  # Current directory. You can specify another path as needed

# Full path of the destination folder (parent_dir/dest_folder)
path = os.path.join(parent_dir, dest_folder)

# Create the destination folder if it doesn't exist
os.makedirs(path, exist_ok=True)

# Only entries whose name starts with "p" are treated as participant folders
participant_dirs = [d for d in os.listdir(study_data_path) if d.startswith("p")]

# File name endings of the files that get copied
wanted_suffixes = ("_all_gaze.csv", "_fixations.csv")

for participant_dir in participant_dirs:
    participant_path = os.path.join(study_data_path, participant_dir)

    # Skip plain files whose name merely starts with "p"
    if not os.path.isdir(participant_path):
        continue

    for file in os.listdir(participant_path):
        if file.endswith(wanted_suffixes):
            file_path = os.path.join(participant_path, file)
            # Copy into `path` (the folder created above) rather than the bare
            # `dest_folder` name: the two differ as soon as parent_dir is not
            # the current working directory.
            shutil.copy(file_path, path)

print("Files copied to", dest_folder)

Files copied to batch_process_me


In [72]:
# Now that we have computed our gaze metrics,
# we combine them into a single CSV

import pandas as pd
import os

# Folder with one sub-folder per participant, and the name of the combined output
base_directory = "gaze_metrics"
combined_gaze_metrics_csv = "all_participants_gaze_metrics.csv"

# One DataFrame per participant is collected here before concatenation
all_data = []

for item in os.listdir(base_directory):
    participant_dir = os.path.join(base_directory, item)
    csv_file = os.path.join(participant_dir, "combineResults.csv")

    # Only sub-folders that actually contain a combineResults.csv are used
    if not (os.path.isdir(participant_dir) and os.path.exists(csv_file)):
        continue

    df = pd.read_csv(csv_file)

    # The folder name is assumed to be 'p<number>'; drop the leading
    # character and keep the number as the participant ID
    participant_id = int(item[1:])
    all_data.append(df.assign(Participant=participant_id))

# Stack every participant's rows and order them by participant number
combined_data = pd.concat(all_data, ignore_index=True).sort_values(by='Participant')

# Make 'Participant' the first column
combined_data.insert(0, 'Participant', combined_data.pop('Participant'))

# Write the combined table without the index column
combined_data.to_csv(combined_gaze_metrics_csv, index=False)

print(f"All files have been combined into {combined_gaze_metrics_csv}")

All files have been combined into all_participants_gaze_metrics.csv


In [111]:
# With the gaze metrics for each participant combined,
# It is time to add performance scores 
# (additional subjective data to be added later)

# First, we combine the performance scores into a single csv.
# Folder with one sub-folder per participant, and the name of the combined output
scores_directory = "pilot_scores"
combined_scores_csv = "all_participants_scores.csv"

# One DataFrame per participant is collected here before concatenation
all_scores_data = []

for item in os.listdir(scores_directory):
    participant_dir = os.path.join(scores_directory, item)

    # Plain files in the scores folder are ignored
    if not os.path.isdir(participant_dir):
        continue

    # Each participant folder is expected to hold '<folder name>_score.csv'
    scores_csv_file = os.path.join(participant_dir, f"{item}_score.csv")
    if not os.path.exists(scores_csv_file):
        continue

    scores_df = pd.read_csv(scores_csv_file)

    # The folder name is assumed to be 'p<number>'; drop the leading
    # character and store the number as the first column
    participant_number = int(item[1:])
    scores_df.insert(0, 'Participant', participant_number)

    all_scores_data.append(scores_df)

# Stack every participant's rows and order them by participant number
combined_scores_data = (
    pd.concat(all_scores_data, ignore_index=True)
    .sort_values(by='Participant')
)

# Write the combined table without the index column
combined_scores_data.to_csv(combined_scores_csv, index=False)

print(f"All scores have been combined into {combined_scores_csv}")

All scores have been combined into all_participants_scores.csv


In [112]:
# Here we combine the scores and the gaze metrics into a single CSV.

import pandas as pd

# Input files (written by the two combining steps) and the merged output file
gaze_metrics_csv = "all_participants_gaze_metrics.csv"
scores_csv = "all_participants_scores.csv"
combined_csv = "combined_gaze_metrics_and_scores.csv"

# Read the gaze metrics and scores into DataFrames
gaze_metrics_df = pd.read_csv(gaze_metrics_csv)
scores_df = pd.read_csv(scores_csv)

# Participants present in only one of the two tables. The merge below is an
# inner join, so their rows are left out of the combined file; list them so
# the loss is visible instead of silent.
gaze_ids = gaze_metrics_df['Participant']
score_ids = scores_df['Participant']
gaze_without_scores = gaze_ids[~gaze_ids.isin(score_ids)].unique().tolist()
scores_without_gaze = score_ids[~score_ids.isin(gaze_ids)].unique().tolist()
if gaze_without_scores:
    print(f"Warning: participants with gaze metrics but no scores were dropped: {gaze_without_scores}")
if scores_without_gaze:
    print(f"Warning: participants with scores but no gaze metrics were dropped: {scores_without_gaze}")

# Merge the DataFrames on the 'Participant' column (inner join: only
# participants found in both tables are kept)
combined_df = pd.merge(gaze_metrics_df, scores_df, on='Participant')

# Save the combined DataFrame to a new CSV file
combined_df.to_csv(combined_csv, index=False)

print(f"Gaze metrics and scores have been combined into {combined_csv}")

Gaze metrics and scores have been combined into combined_gaze_metrics_and_scores.csv


In [115]:
# Time to split the participants on expertise
# Using approach score here because that's what we have.

data_csv = "combined_gaze_metrics_and_scores.csv"
data_df = pd.read_csv(data_csv)

# Highest "Approach Score" first
sorted_df = data_df.sort_values(by='Approach Score', ascending=False)

# Row position at which the sorted table is cut in two; with an odd number
# of rows the lower group receives the extra row
middle_index = len(sorted_df) // 2

upper_half = sorted_df.iloc[:middle_index]  # Rows with the higher scores
lower_half = sorted_df.iloc[middle_index:]  # Rows with the lower scores

# Write each group to its own CSV file
for group_csv, group_df in (
    ('higher_approach_scores.csv', upper_half),
    ('lower_approach_scores.csv', lower_half),
):
    group_df.to_csv(group_csv, index=False)

print("The data has been saved into 'higher_approach_scores.csv' and 'lower_approach_scores.csv'.")

In [149]:
# Extracting pilot experience in hours from pre-study survey

import pandas as pd

# Pre-study survey responses (Excel export)
file_path = 'ILS Official Study Data/Questionaires/Pre-study Survey  (Responses).xlsx'

# Load the Excel file into a DataFrame
df = pd.read_excel(file_path)

# For reference, the columns listed for the survey sheet:
# all_columns = [
#     "Timestamp",
#     "Enter your Last Name, First Name",
#     "Enter your Email",
#     "1. Which category below would best describe you?",
#     "2. What pilot license/certificate/endorsements do you currently hold? Select all that apply.",
#     "3. What is your total flight time to date (in hours)? (Including in a real aircraft or in an FAA approved full flight simulator or training device)",
#     "4. Approximately how many hours in a FAA approved full flight simulator or training device do you currently hold?",
#     "5. How many actual instrument hours do you have? (actual instrument conditions in an aircraft)",
#     "6. How many simulated instrument hours do you have? (simulated in a real aircraft and/or in FAA approved flight simulator or training device)",
#     "7. How many instrument approaches have you done within the last 6 months?",
#     "8. In the past two years, approximately how many total hours have you spent using an at-home simulator such as X-Plane or Microsoft Flight Simulator?",
#     "9. How many hours per week do you play console/PC video games?",
#     "Enter your Airmen Certificate Number",
#     "Enter your age",
#     "Participant ID",
#     "How did they hear about the study?",
#     "Notes",
#     "FAA Airmen Inquiry Database"
# ]

# Columns kept for the experience table; questions 4-9 are currently left out
selected_columns = [
    "Participant ID",
    "1. Which category below would best describe you?",
    "2. What pilot license/certificate/endorsements do you currently hold? Select all that apply.",
    "3. What is your total flight time to date (in hours)? (Including in a real aircraft or in an FAA approved full flight simulator or training device)",
    #"4. Approximately how many hours in a FAA approved full flight simulator or training device do you currently hold?",
    #"5. How many actual instrument hours do you have? (actual instrument conditions in an aircraft)",
    #"6. How many simulated instrument hours do you have? (simulated in a real aircraft and/or in FAA approved flight simulator or training device)",
    #"7. How many instrument approaches have you done within the last 6 months?",
    #"8. In the past two years, approximately how many total hours have you spent using an at-home simulator such as X-Plane or Microsoft Flight Simulator?",
    #"9. How many hours per week do you play console/PC video games?"
]

# Restrict the survey to the selected columns
df_selected = df[selected_columns]

# Keep only rows whose participant ID looks like "p{number}"; rows with a
# missing ID are treated as non-matching
has_participant_id = df_selected['Participant ID'].str.match(r'^p\d+$', na=False)
df_filtered = df_selected[has_participant_id].copy()

# Strip the 'p' and store the ID as an integer so it sorts numerically
df_filtered['Participant ID'] = (
    df_filtered['Participant ID'].str.replace('p', '').astype(int)
)

# Order by participant ID and use it as the index
df_sorted = df_filtered.sort_values('Participant ID').set_index('Participant ID')

# Save the pilot experience table; the index (Participant ID) is written as
# the first CSV column
pilot_experience_csv = "pilot_experience.csv"
df_sorted.to_csv(pilot_experience_csv)

print(f"Pilot experience data saved to {pilot_experience_csv}")

Pilot experience data saved to pilot_experience.csv


In [153]:
# Here we combine the expertise, scores, and the gaze metrics into a single CSV.

import pandas as pd

# Input files (gaze metrics + scores, pilot experience) and the merged output file
gaze_metrics_and_scores_csv = "combined_gaze_metrics_and_scores.csv"
pilot_experience_csv = "pilot_experience.csv"
combined_3_csv = "combined_experience_gaze_metrics_and_scores.csv"

# Read both tables into DataFrames
gaze_metrics_and_scores_df = pd.read_csv(gaze_metrics_and_scores_csv)
pilot_experience_df = pd.read_csv(pilot_experience_csv)

# Participants present in only one of the two tables. The merge below is an
# inner join, so their rows are left out of the combined file; list them so
# the loss is visible instead of silent.
metric_ids = gaze_metrics_and_scores_df['Participant']
experience_ids = pilot_experience_df['Participant ID']
metrics_without_experience = metric_ids[~metric_ids.isin(experience_ids)].unique().tolist()
experience_without_metrics = experience_ids[~experience_ids.isin(metric_ids)].unique().tolist()
if metrics_without_experience:
    print(f"Warning: participants with gaze metrics/scores but no survey data were dropped: {metrics_without_experience}")
if experience_without_metrics:
    print(f"Warning: participants with survey data but no gaze metrics/scores were dropped: {experience_without_metrics}")

# Merge on the participant number, which is called 'Participant' in the
# metrics table and 'Participant ID' in the experience table
combined_df = pd.merge(
    gaze_metrics_and_scores_df,
    pilot_experience_df,
    left_on='Participant',
    right_on='Participant ID',
)

# Drop 'Participant ID': after the merge it duplicates 'Participant'
combined_df = combined_df.drop(columns='Participant ID')

# Save the combined DataFrame to a new CSV file
combined_df.to_csv(combined_3_csv, index=False)

print(f"Gaze metrics and scores and experience have been combined into {combined_3_csv}")

Gaze metrics and scores and experience have been combined into combined_experience_gaze_metrics_and_scores.csv


In [154]:
# Summary statistics (count, mean, std, min, quartiles, max) of the survey's
# total flight time column
flight_hours_column = "3. What is your total flight time to date (in hours)? (Including in a real aircraft or in an FAA approved full flight simulator or training device)"
print(combined_df[flight_hours_column].describe())

count       32.000000
mean      1132.962500
std       3056.684582
min         70.000000
25%        188.475000
50%        358.050000
75%        912.500000
max      17527.000000
Name: 3. What is your total flight time to date (in hours)? (Including in a real aircraft or in an FAA approved full flight simulator or training device), dtype: float64


In [155]:
# Split the pilot data on median flight hours

data_csv = "combined_experience_gaze_metrics_and_scores.csv"
data_df = pd.read_csv(data_csv)

# Most total flight hours first
sorted_df = data_df.sort_values(
    by="3. What is your total flight time to date (in hours)? (Including in a real aircraft or in an FAA approved full flight simulator or training device)",
    ascending=False,
)

# Row position at which the sorted table is cut in two; with an odd number
# of rows the lower group receives the extra row
middle_index = len(sorted_df) // 2

upper_half = sorted_df.iloc[:middle_index]  # Rows with more hours
lower_half = sorted_df.iloc[middle_index:]  # Rows with fewer hours

# Write each group to its own CSV file
for group_csv, group_df in (
    ('higher_hours.csv', upper_half),
    ('lower_hours.csv', lower_half),
):
    group_df.to_csv(group_csv, index=False)

print("The data has been saved into 'higher_hours.csv' and 'lower_hours.csv'.")

The data has been saved into 'higher_hours.csv' and 'lower_hours.csv'.
