In [1]:
import params
import os
import re
import pandas as pd
import glob

In [2]:
# Functions to categorise trials data to simpler columns.

def categorise_correct(row):
    '''Return 'yes' when the subject's answer matches the trial type and 'no' otherwise.

    A trial counts as unaltered only when both 'angleChange' and 'SensoMotoric Delay' are 0.
    The matching 'QuestionResult' is 1 for an unaltered trial and 0 for an altered one.'''
    unaltered = row['angleChange'] == 0 and row['SensoMotoric Delay'] == 0
    expected_answer = 1 if unaltered else 0
    return 'yes' if row['QuestionResult'] == expected_answer else 'no'

def categorise_location(row, translator):
    '''Look up the butterfly location for a trial row.

    The row's 'setup task Number' value is used as the key into `translator`,
    and the mapped value is returned unchanged.'''
    task_number = row['setup task Number']
    return translator[task_number]
    
def categorise_spatial_side(row):
    '''Classify the spatial alteration of a trial row by the sign of 'angleChange'.

    Returns 'left' for a negative angle, 'right' for a positive angle and 'normal' otherwise.'''
    angle_change = row['angleChange']
    if angle_change < 0:
        return 'left'
    return 'right' if angle_change > 0 else 'normal'

def categorise_spatial_alteration_level(row, alteration_levels):
    '''Return the spatial alteration level of a trial row as a string.

    row: mapping/Series providing 'angleChange'.
    alteration_levels: iterable of the alteration magnitudes used in the experiment, in any order.

    Returns '1' when 'angleChange' is 0 (no alteration). Otherwise the levels are ranked in
    ascending order and the rank of the level matching 'angleChange' (in either direction) is
    returned, starting at '2' for the smallest level. Returns None when no level matches.

    The input list is not modified, and any number of levels is supported.'''
    angle_change = row['angleChange']
    if angle_change == 0:
        return '1'
    # sorted() works on a copy, so the caller's (shared) list is never reordered.
    for rank, level in enumerate(sorted(alteration_levels), start=2):
        if angle_change == level or angle_change == -1 * level:
            return str(rank)
    return None
    
def categorise_temporal_alteration_level(row, alteration_levels):
    '''Return the temporal alteration level of a trial row as a string.

    row: mapping/Series providing 'SensoMotoric Delay'.
    alteration_levels: iterable of the delay values used in the experiment, in any order.

    Returns '1' when 'SensoMotoric Delay' is 0 (no alteration). Otherwise the levels are ranked
    in ascending order and the rank of the level equal to the delay is returned, starting at '2'
    for the smallest level. Returns None when no level matches.

    The input list is not modified, and any number of levels is supported.'''
    delay = row['SensoMotoric Delay']
    if delay == 0:
        return '1'
    # sorted() works on a copy, so the caller's (shared) list is never reordered.
    for rank, level in enumerate(sorted(alteration_levels), start=2):
        if delay == level:
            return str(rank)
    return None

In [3]:
# Preprocessing, Stage I - Extraction.
# For every subject directory: read the results and trials CSVs of each part, merge them on
# 'TrialNumber', keep only the columns used later and write one CSV per part.

# Columns kept from the merged results/trials tables.
kept_columns = ['TrialNumber', 'block number', 'SensoMotoric Delay', 'angleChange', 'ResponseTime',
                'setup task Number', 'QuestionResult']

# Read results and trials files.
subjects = [subject for subject in os.listdir(params.data_dir) if re.search(params.data_dir_pattern, subject)]

for subject in subjects:
    usedsubnum = subject.split('Sub')[1]
    # Not used further in this cell; the lookup is kept so that a subject number missing from
    # the translator fails here, before any file is read or written.
    realsubnum = params.subnum_translator[int(usedsubnum)]

    # (results glob pattern, trials glob pattern) for each part, relative to the subject directory.
    part_patterns = {
        "part1": (params.part1_results_path, params.part1_trials_path),
        "part2": (params.part2_results_path, params.part2_trials_path),
        "part3": (params.part3_results_path, params.part3_trials_path),
    }

    # Read and merge every part before writing anything, so a missing input file does not
    # leave a half-written subject directory behind.
    merged_parts = {}
    for part, (results_pattern, trials_pattern) in part_patterns.items():
        results = pd.read_csv(glob.glob(params.data_dir + subject + results_pattern)[0])
        trials = pd.read_csv(glob.glob(params.data_dir + subject + trials_pattern)[0])
        # prepare data to merge answers with trials: drop the last row of the trials file and
        # align the name of the trial-number column with the results file.
        trials = trials.iloc[:-1].rename(columns={'#trial number': 'TrialNumber'})
        merged_parts[part] = pd.merge(results, trials, on='TrialNumber')

    # create the subject's output directory (and the preprocessed output directory) if missing.
    # os.path.join gives the same "<output dir>/<subject>" layout that Stage II and Stage III
    # read from, whether or not preprocessed_output_dir ends with a separator.
    subject_output_dir = os.path.join(params.preprocessed_output_dir, subject)
    os.makedirs(subject_output_dir, exist_ok=True)

    # choose only the columns we will use and save the files.
    for part, merged in merged_parts.items():
        merged_clean = merged[kept_columns]
        merged_clean.to_csv(os.path.join(subject_output_dir, part + ".csv"), index=False)

In [4]:
# Preprocessing, Stage II - Calculation.

# Calculate the simpler columns using the categorise functions in order to continue with further analysis.
subjects = [subject for subject in os.listdir(params.preprocessed_output_dir) if re.search(params.data_dir_pattern, subject)]

# Rows with this 'setup task Number' are removed before categorisation.
# NOTE(review): presumably a non-trial marker that has no entry in tasknum_translator - confirm.
excluded_task_number = 66

for subject in subjects:
    subject_dir = os.path.join(params.preprocessed_output_dir, subject)
    # Only the Stage I outputs (names starting with "part") are processed. The previous pattern
    # "^part*" applied the * to the letter t and therefore matched any name starting with "par".
    parts = [part for part in os.listdir(subject_dir) if re.search(r"^part", part)]
    for part in parts:
        data = pd.read_csv(os.path.join(subject_dir, part))
        # .copy() so the new columns are added to an independent frame, not a view of the CSV data.
        data = data.loc[data['setup task Number'] != excluded_task_number].copy()
        data['correct'] = data.apply(categorise_correct, axis=1)
        data['butterflyLocation'] = data.apply(categorise_location, axis=1, args=(params.tasknum_translator,))
        data['temporalAlterationLevel'] = data.apply(categorise_temporal_alteration_level, axis=1,
                                                     args=(params.temporal_alteration_levels,))
        data['spatialAlterationLevel'] = data.apply(categorise_spatial_alteration_level, axis=1,
                                                    args=(params.spatial_alteration_levels,))
        data['angleSide'] = data.apply(categorise_spatial_side, axis=1)
        data.to_csv(os.path.join(subject_dir, "preprocessed_" + part), index=False)
        

In [11]:
# Preprocessing, Stage III - Combination.
# Builds one summary row per subject from the Stage II files and writes all rows to all_subjects.csv.

def _percent_question_result_one(trials, mask):
    '''Return the percentage (rounded to 3 decimals) of the rows of `trials` selected by the
    boolean `mask` whose 'QuestionResult' equals 1.

    Returns NaN when the mask selects no rows, instead of dividing by zero.'''
    selected = trials.loc[mask]
    if len(selected) == 0:
        return float('nan')
    positive = int((selected['QuestionResult'] == 1).sum())
    return round(positive / len(selected) * 100, 3)


all_subjects_columns = ['subjectNum', 'subjectID', 'startingAlteration', 'TemporalBlockL1', 'TemporalBlockL2',
                        'TemporalBlockL3','TemporalBlockL4', 'SpatialBlockL1', 'SpatialBlockL2', 'SpatialBlockL3',
                        'SpatialBlockL4', 'CombinedBlockT1S1', 'CombinedBlockT1S2', 'CombinedBlockT1S3',
                        'CombinedBlockT1S4', 'CombinedBlockT2S1', 'CombinedBlockT2S2', 'CombinedBlockT2S3',
                        'CombinedBlockT2S4', 'CombinedBlockT3S1', 'CombinedBlockT3S2', 'CombinedBlockT3S3',
                        'CombinedBlockT3S4', 'CombinedBlockT4S1', 'CombinedBlockT4S2', 'CombinedBlockT4S3',
                        'CombinedBlockT4S4', 'LocBotAngL', 'LocBotAngN', 'LocBotAngR', 'LocMidAngL',
                        'LocMidAngN', 'LocMidAngR', 'LocTopAngL', 'LocTopAngN', 'LocTopAngR',
                        'LeftSpatial', 'NormalSpatial', 'RightSpatial', 'LeftSpatialTotal', 'NormalSpatialTotal',
                        'RightSpatialTotal', 'ResponseTimeT1S1', 'ResponseTimeT1S2', 'ResponseTimeT1S3',
                        'ResponseTimeT1S4', 'ResponseTimeT2S1', 'ResponseTimeT2S2', 'ResponseTimeT2S3',
                        'ResponseTimeT2S4', 'ResponseTimeT3S1', 'ResponseTimeT3S2', 'ResponseTimeT3S3',
                        'ResponseTimeT3S4', 'ResponseTimeT4S1', 'ResponseTimeT4S2', 'ResponseTimeT4S3',
                        'ResponseTimeT4S4']

# Creating a row for each subject with the percent self attribution of each condition
# (percentage of trials with QuestionResult == 1).
subject_rows = []
subjects = [subject for subject in os.listdir(params.preprocessed_output_dir) if re.search(params.data_dir_pattern, subject)]
for subject in subjects:
    subject_dir = os.path.join(params.preprocessed_output_dir, subject)
    # Stage II outputs only. The previous pattern "preprocessed_part*" was unanchored and applied
    # the * to the letter t, so it matched any name containing "preprocessed_par".
    preprocessed_parts = sorted(part for part in os.listdir(subject_dir)
                                if re.search(r"^preprocessed_part", part))
    subject_id = int(subject.split("Sub")[1].split(".csv")[0])
    subject_row = [params.subnum_translator[subject_id],  # subjectNum
                   subject_id]                            # subjectID

    # Each part is read exactly once; sorted order is part1, part2, part3.
    part1 = pd.read_csv(os.path.join(subject_dir, preprocessed_parts[0]))
    part2 = pd.read_csv(os.path.join(subject_dir, preprocessed_parts[1]))
    combined_part = pd.read_csv(os.path.join(subject_dir, preprocessed_parts[2]))

    # Divide results into blocks for block dependant analysis.
    if len(set(part1['angleChange'].tolist())) == 1:
        # A single angleChange value means part1 is Temporal and part2 is Spatial.
        subject_row.append('Temporal')
        temporal_part, spatial_part = part1, part2
    else:
        # This means part1 is Spatial and part2 is Temporal.
        subject_row.append('Spatial')
        spatial_part, temporal_part = part1, part2

    # Levels are assumed to be numbered 1..n in the Stage II files. nunique() ignores missing
    # values, which would otherwise each be counted as a separate level.
    # TemporalBlockL1, TemporalBlockL2, TemporalBlockL3, TemporalBlockL4
    for level in range(1, temporal_part['temporalAlterationLevel'].nunique() + 1):
        subject_row.append(_percent_question_result_one(
            temporal_part, temporal_part['temporalAlterationLevel'] == level))

    # SpatialBlockL1, SpatialBlockL2, SpatialBlockL3, SpatialBlockL4
    for level in range(1, spatial_part['spatialAlterationLevel'].nunique() + 1):
        subject_row.append(_percent_question_result_one(
            spatial_part, spatial_part['spatialAlterationLevel'] == level))

    # Level ranges of the combined part, shared by the CombinedBlock and ResponseTime columns.
    combined_temporal_levels = range(1, combined_part['temporalAlterationLevel'].nunique() + 1)
    combined_spatial_levels = range(1, combined_part['spatialAlterationLevel'].nunique() + 1)

    # 'CombinedBlockT1S1' ... 'CombinedBlockT4S4' (temporal level is the outer loop).
    for temporal_level in combined_temporal_levels:
        for spatial_level in combined_spatial_levels:
            subject_row.append(_percent_question_result_one(
                combined_part,
                (combined_part['temporalAlterationLevel'] == temporal_level) &
                (combined_part['spatialAlterationLevel'] == spatial_level)))

    # 'LocBotAngL', 'LocBotAngN', 'LocBotAngR', 'LocMidAngL', 'LocMidAngN', 'LocMidAngR', 'LocTopAngL', 'LocTopAngN',
    # 'LocTopAngR'
    # only combined part; values are visited in sorted order.
    combined_angles = sorted(set(combined_part['angleSide'].tolist()))
    for location in sorted(set(combined_part['butterflyLocation'].tolist())):
        for angle in combined_angles:
            subject_row.append(_percent_question_result_one(
                combined_part,
                (combined_part['butterflyLocation'] == location) & (combined_part['angleSide'] == angle)))

    # LeftSpatial, NormalSpatial, RightSpatial
    # only combined_part
    for angle in combined_angles:
        subject_row.append(_percent_question_result_one(combined_part, combined_part['angleSide'] == angle))

    # LeftSpatialTotal, NormalSpatialTotal, RightSpatialTotal
    # combined_part + spatial_part
    combined_and_spatial = pd.concat([combined_part, spatial_part], ignore_index=True)
    for angle in sorted(set(combined_and_spatial['angleSide'].tolist())):
        subject_row.append(_percent_question_result_one(
            combined_and_spatial, combined_and_spatial['angleSide'] == angle))

    # 'ResponseTimeT1S1' ... 'ResponseTimeT4S4': mean ResponseTime per temporal x spatial level.
    for temporal_level in combined_temporal_levels:
        for spatial_level in combined_spatial_levels:
            subject_row.append(round(combined_part.loc[(combined_part['temporalAlterationLevel'] == temporal_level) &
                                                       (combined_part['spatialAlterationLevel'] == spatial_level),
                                                       'ResponseTime'].mean(), 3))

    # A subject with missing levels/locations/angles yields a short row; fail with a clear message
    # rather than silently shifting values under the wrong column names.
    if len(subject_row) != len(all_subjects_columns):
        raise ValueError("Subject %s produced %d values but %d columns are expected"
                         % (subject, len(subject_row), len(all_subjects_columns)))
    subject_rows.append(subject_row)

# Build the frame once from all collected rows.
all_subjects_df = pd.DataFrame(subject_rows, columns=all_subjects_columns)
all_subjects_df.to_csv(os.path.join(params.preprocessed_output_dir, "all_subjects.csv"), index=False)