In [91]:
import pandas as pd
from functools import reduce
import glob 
import os

# Merge the per-timecard feature CSVs, the distance table and the personal data into one DataFrame

def takeTimecards(extension="3seconds.csv", data_root="../data/finalized_data"):
    """Merge the feature tables of one timecard into a single DataFrame.

    The saccade, fixation and sensor CSVs whose file name ends with
    ``extension`` are outer-merged with ``combined_distance_table.csv`` on
    ("Study Name", "Respondent", "Label"); ``personal_data_merged.csv`` is then
    left-joined on "Respondent".  The row count is printed after each of the
    two merge stages.

    Args:
        extension: File-name suffix selecting the timecard, e.g.
            "5seconds.csv".  Matching is case-insensitive.
        data_root: Directory holding the ``fixation``, ``saccade_filled`` and
            ``sensor`` sub-directories plus the distance and personal-data
            CSVs.  Defaults to the original hard-coded location.

    Returns:
        The merged ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: If a sub-directory holds no CSV ending with
            ``extension``.
    """
    key_columns = ["Study Name", "Respondent", "Label"]
    # Source header (stripped, lower-cased) -> canonical merge-key name.
    canonical_names = {
        "aoi label": "Label",
        "respondent name": "Respondent",
        "bridge": "Study Name",
    }
    # Lower-case the suffix too: the path is lower-cased before comparison, so
    # an upper-case character in `extension` could otherwise never match.
    suffix = extension.lower()

    def load_timecard(subdirectory):
        """Read the CSV in `subdirectory` whose name ends with the suffix."""
        directory = os.path.join(data_root, subdirectory)
        # glob returns files in arbitrary order; sort so the choice is
        # deterministic when several files share the suffix.
        for file_path in sorted(glob.glob(os.path.join(directory, "*.csv"))):
            if file_path.lower().endswith(suffix):
                return pd.read_csv(file_path)
        # Fail here with a clear message instead of returning None and
        # crashing later with an unrelated AttributeError.
        raise FileNotFoundError(
            f"No CSV ending with {extension!r} found in {directory!r}"
        )

    def rename_columns(frame):
        """Rename the known key-column variants to their canonical names."""
        mapping = {}
        for col in frame.columns:
            canonical = canonical_names.get(col.strip().lower())
            if canonical is not None:
                mapping[col] = canonical
        return frame.rename(columns=mapping)

    def normalize_keys(frame):
        """Strip and lower-case the text merge keys so they compare equal."""
        # `frame` is the fresh object returned by rename(), so assigning to it
        # does not touch the frames that were read from disk.
        # NOTE(review): "Respondent" is merged on as-is; confirm that it needs
        # no normalisation.
        frame["Study Name"] = frame["Study Name"].str.strip().str.lower()
        frame["Label"] = frame["Label"].str.strip().str.lower()
        return frame

    # The velocity table was previously read here as well but never used, so
    # it is no longer loaded.
    fixation_data = load_timecard("fixation")
    saccade_data = load_timecard("saccade_filled")
    sensor_data = load_timecard("sensor")
    distance_data = pd.read_csv(
        os.path.join(data_root, "combined_distance_table.csv")
    )

    # The order fixes the column order of the merged result.
    frames = [
        normalize_keys(rename_columns(frame))
        for frame in (distance_data, saccade_data, fixation_data, sensor_data)
    ]

    # Outer merge keeps key combinations that are missing from some tables.
    merged_df = reduce(
        lambda left, right: pd.merge(left, right, on=key_columns, how="outer"),
        frames,
    )
    print(len(merged_df))

    personal_data = pd.read_csv(
        os.path.join(data_root, "personal_data_merged.csv")
    )
    final_merged_df = merged_df.merge(personal_data, on="Respondent", how="left")
    print(len(final_merged_df))
    return final_merged_df

    # Save the merged DataFrame to a new CSV file
    # final_merged_df.to_csv(f'../data/merged_data/merged_data_{extension}', index=False)
    # print(f"Saved at merged_data_{extension}")
    
# Encode the hit/miss labels of the merged DataFrame and save the filtered result as CSV
def filterData(df, extension, directory_name):
    """Drop identifier columns, encode hit/miss labels and save the table.

    Steps:
      1. Drop "Study Name" and "Respondent" (required), and "Start", "End"
         and "Distance" when present.
      2. Drop rows whose "Label" contains "base" (case-insensitive).
      3. Replace labels containing "hit" with 0 and, otherwise, labels
         containing "miss" with 1 (case-insensitive).  Any other label is
         printed and left unchanged.
      4. Print the hit/miss counts and the miss fraction.
      5. Write the result to
         ``../data/merged_data/<directory_name>/filtered_merged_<extension>``.

    Args:
        df: DataFrame with at least "Study Name", "Respondent" and a
            string-valued "Label" column.  It is not modified.
        extension: Suffix of the output file name, e.g. "5seconds.csv".
        directory_name: Sub-directory of ``../data/merged_data`` to write to;
            created if it does not exist.

    Returns:
        None.

    Raises:
        KeyError: If "Study Name", "Respondent" or "Label" is missing.
    """
    df = df.drop(columns=["Study Name", "Respondent"])
    # These columns are optional, hence errors="ignore".
    df = df.drop(columns=["Start", "End", "Distance"], errors="ignore")

    # Compare on lower-cased labels: the merge step lower-cases them, so a
    # case-sensitive "Base" test would never match.
    labels = df["Label"].str.lower()
    is_base = labels.str.contains("base", regex=False, na=False)
    # Filter by boolean mask, not index label, so a non-unique index cannot
    # drop or overwrite the wrong rows.
    df = df.loc[~is_base].copy()
    labels = labels.loc[~is_base]

    # "hit" takes precedence over "miss", as in an if/elif chain.
    is_hit = labels.str.contains("hit", regex=False, na=False)
    is_miss = ~is_hit & labels.str.contains("miss", regex=False, na=False)

    # Should not happen: report labels that are neither hit nor miss.
    for unknown_label in labels[~(is_hit | is_miss)]:
        print(unknown_label)

    # Work on an object-dtype copy so integer codes can sit next to any
    # leftover text labels regardless of the column's string dtype.
    encoded = df["Label"].astype(object)
    encoded[is_hit] = 0
    encoded[is_miss] = 1
    df["Label"] = encoded

    label_count = {"miss": int(is_miss.sum()), "hit": int(is_hit.sum())}
    print(label_count)
    total = label_count["miss"] + label_count["hit"]
    if total:
        print(label_count["miss"] / total)
    else:
        # Avoid ZeroDivisionError when no row is labelled hit or miss.
        print("no hit/miss labels found")

    output_dir = f"../data/merged_data/{directory_name}"
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(f"{output_dir}/filtered_merged_{extension}", index=False)
    print(f"saved at filtered_merged_{extension}")

# Run the pipeline for one timecard: merge its tables, then filter, encode and
# save the result.
# `time` is the file-name suffix passed to takeTimecards and reused by
# filterData in the output file name.
# NOTE(review): the name `time` would shadow the stdlib `time` module if that
# is imported elsewhere in the notebook - confirm before relying on it.
time = "5seconds.csv"
# Sub-directory of ../data/merged_data that filterData writes into.
directory_name = "no_distance"
final_merged_df = takeTimecards(time)
filterData(final_merged_df, time, directory_name)

2061
2061
{'miss': 477, 'hit': 1584}
0.2314410480349345
saved at filtered_merged_5seconds.csv
