In [28]:
import numpy as np
import pandas as pd
from scipy import stats
import os


### Data Import and Cleaning

In [29]:
# Getting names of files from results folder
## For python Script
# script_dir = os.path.dirname(os.path.abspath(__file__))
# results_folder = os.path.join(script_dir, "results_data")
results_folder = r"Visual_Cancellation_RAW/results_data"
clicks_folder = r"Visual_Cancellation_RAW/clicks_data"

results_raw = os.listdir(results_folder)
clicks_raw = os.listdir(clicks_folder)

print("List of Results Files :")
print(results_raw)

print("List of Click Files :")
print(clicks_raw)


List of Results Files :
['sub01_star_cancellation_results.csv', 'sub02_star_cancellation_results.csv', 'sub04_star_cancellation_results.csv', 'sub05_star_cancellation_results.csv', 'sub06_star_cancellation_results.csv', 'sub07_star_cancellation_results.csv', 'sub09_star_cancellation_results.csv', 'sub10_star_cancellation_results.csv', 'sub11_star_cancellation_results.csv', 'sub12_star_cancellation_results.csv', 'sub13_star_cancellation_results.csv', 'sub14_star_cancellation_results.csv', 'sub15_star_cancellation_results.csv', 'sub16_star_cancellation_results.csv', 'sub17_star_cancellation_results.csv', 'sub18_star_cancellation_results.csv', 'sub19_star_cancellation_results.csv', 'sub03_star_cancellation_results.csv']
List of Click Files :
['sub01_star_cancellation_clicks.csv', 'sub02_star_cancellation_clicks.csv', 'sub04_star_cancellation_clicks.csv', 'sub05_star_cancellation_clicks.csv', 'sub06_star_cancellation_clicks.csv', 'sub07_star_cancellation_clicks.csv', 'sub09_star_cancellati

##### Results Data

In [30]:
# Creating a list of dataframes and merging them into one dataframe
results_columns = ["participant_id","participant_group","round_index","time_used","targets_total",
                   "found_total","omissions_total","q1_hits","q2_hits","q3_hits","q4_hits","q1_misses",
                   "q2_misses","q3_misses","q4_misses"]

results_df = []
for i in results_raw:
    df = pd.read_csv(results_folder + "/" + i, header = None, names = results_columns)
    results_df.append(df)

results_df = pd.concat(results_df, ignore_index=True)

results_df

Unnamed: 0,participant_id,participant_group,round_index,time_used,targets_total,found_total,omissions_total,q1_hits,q2_hits,q3_hits,q4_hits,q1_misses,q2_misses,q3_misses,q4_misses
0,1,1,1,30.0,112,44,68,20,17,7,0,4,7,31,26
1,1,1,2,30.0,112,50,62,24,0,26,0,0,34,0,28
2,1,1,3,30.0,112,51,61,24,0,23,4,0,34,0,27
3,1,1,1,30.0,112,37,75,22,15,0,0,10,10,33,22
4,1,1,2,30.0,112,40,72,16,11,13,0,2,17,13,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,18,01,3,30.00,112,43,69,10,18,6,9,13,6,27,23
99,participant_id,participant_group,round_index,time_used,targets_total,found_total,omissions_total,q1_hits,q2_hits,q3_hits,q4_hits,q1_misses,q2_misses,q3_misses,q4_misses
100,3,2,1,30,112,44,68,30,3,11,0,4,21,20,23
101,3,2,2,30,112,50,62,8,23,15,4,17,14,5,26


In [31]:
# Removing headers from saving task saving mechanic
preprocessed_results = results_df[results_df["participant_id"] != "participant_id"]
preprocessed_results

Unnamed: 0,participant_id,participant_group,round_index,time_used,targets_total,found_total,omissions_total,q1_hits,q2_hits,q3_hits,q4_hits,q1_misses,q2_misses,q3_misses,q4_misses
0,1,1,1,30.0,112,44,68,20,17,7,0,4,7,31,26
1,1,1,2,30.0,112,50,62,24,0,26,0,0,34,0,28
2,1,1,3,30.0,112,51,61,24,0,23,4,0,34,0,27
3,1,1,1,30.0,112,37,75,22,15,0,0,10,10,33,22
4,1,1,2,30.0,112,40,72,16,11,13,0,2,17,13,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,18,01,2,30.00,112,43,69,7,5,16,15,11,23,10,25
98,18,01,3,30.00,112,43,69,10,18,6,9,13,6,27,23
100,3,2,1,30,112,44,68,30,3,11,0,4,21,20,23
101,3,2,2,30,112,50,62,8,23,15,4,17,14,5,26


In [32]:
# Consistent formating and type check
preprocessed_results = pd.DataFrame(preprocessed_results)

col_format_results = {
    "participant_id" : int,
    "participant_group" : int,
    "round_index" : int,
    "time_used" : float,
    "targets_total" : int,
    "found_total" : int,
    "omissions_total" : int,
    "q1_hits" : int,
    "q2_hits" : int,
    "q3_hits" : int,
    "q4_hits" : int,
    "q2_misses" : int,
    "q3_misses" : int,
    "q4_misses" : int,
}

for col, format in col_format_results.items():
    preprocessed_results[col] = preprocessed_results[col].astype(format)

preprocessed_results

Unnamed: 0,participant_id,participant_group,round_index,time_used,targets_total,found_total,omissions_total,q1_hits,q2_hits,q3_hits,q4_hits,q1_misses,q2_misses,q3_misses,q4_misses
0,1,1,1,30.0,112,44,68,20,17,7,0,4,7,31,26
1,1,1,2,30.0,112,50,62,24,0,26,0,0,34,0,28
2,1,1,3,30.0,112,51,61,24,0,23,4,0,34,0,27
3,1,1,1,30.0,112,37,75,22,15,0,0,10,10,33,22
4,1,1,2,30.0,112,40,72,16,11,13,0,2,17,13,40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,18,1,2,30.0,112,43,69,7,5,16,15,11,23,10,25
98,18,1,3,30.0,112,43,69,10,18,6,9,13,6,27,23
100,3,2,1,30.0,112,44,68,30,3,11,0,4,21,20,23
101,3,2,2,30.0,112,50,62,8,23,15,4,17,14,5,26


In [33]:
# grouping by participant_id and only taking the first 3 results in each group + converting copy to new dataframe
preprocessed_results = preprocessed_results.groupby("participant_id").head(3)

# Sort participant_id
preprocessed_results = preprocessed_results.sort_values("participant_id")

# reset indexing
preprocessed_results = preprocessed_results.reset_index(drop=True)

preprocessed_results

Unnamed: 0,participant_id,participant_group,round_index,time_used,targets_total,found_total,omissions_total,q1_hits,q2_hits,q3_hits,q4_hits,q1_misses,q2_misses,q3_misses,q4_misses
0,1,1,1,30.0,112,44,68,20,17,7,0,4,7,31,26
1,1,1,2,30.0,112,50,62,24,0,26,0,0,34,0,28
2,1,1,3,30.0,112,51,61,24,0,23,4,0,34,0,27
3,2,1,1,30.0,112,31,81,4,1,26,0,28,24,7,22
4,2,1,2,30.0,112,36,76,7,12,4,13,11,16,22,27
5,2,1,3,30.0,112,32,80,6,3,13,10,17,21,20,22
6,3,2,1,30.0,112,44,68,30,3,11,0,4,21,20,23
7,3,2,2,30.0,112,50,62,8,23,15,4,17,14,5,26
8,3,2,3,30.0,112,43,69,17,0,13,13,15,34,12,8
9,4,2,3,30.0,112,44,68,8,3,28,5,15,21,5,27


##### Clicks Data

In [34]:
# Creating large clicks dataframe
clicks_columns = ["participant_id", "participant_group","round_index" ,"click_time" ,"click_x" ,"click_y" ,"click_quadrant" ,"was_target" ,"target_quadrant"]

clicks_df = []
for i in clicks_raw:
    file = pd.read_csv(clicks_folder + "/" + i, names=clicks_columns)
    clicks_df.append(file)

clicks_df = pd.concat(clicks_df, ignore_index=True)

clicks_df.drop_duplicates(inplace=True)

clicks_df

Unnamed: 0,participant_id,participant_group,round_index,click_time,click_x,click_y,click_quadrant,was_target,target_quadrant
0,1,1,0,1.796,1097.0,310.0,2,1,2
1,1,1,0,2.904,654.0,587.0,3,1,3
2,1,1,0,3.629,193.0,644.0,3,1,3
3,1,1,0,4.279,234.0,809.0,3,1,3
4,1,1,0,5.305,1141.0,663.0,4,1,4
...,...,...,...,...,...,...,...,...,...
4477,3,2,3,27.919,1635,857,4,1,4
4478,3,2,3,28.779,1737,752,4,1,4
4479,3,2,3,29.224,1665,693,4,1,4
4480,3,2,3,29.613,1620,666,4,0,0


In [35]:
# Removing extra headers and practice rounds
clicks_df = clicks_df[clicks_df["participant_id"] != "participant_id"]
preprocessed_clicks = clicks_df[clicks_df["round_index"] != 0]
preprocessed_clicks

Unnamed: 0,participant_id,participant_group,round_index,click_time,click_x,click_y,click_quadrant,was_target,target_quadrant
8,1,1,1,1.765,220.0,134.0,1,1,1
9,1,1,1,2.273,309.0,104.0,1,1,1
10,1,1,1,2.815,327.0,175.0,1,1,1
11,1,1,1,3.357,246.0,274.0,1,1,1
12,1,1,1,4.024,459.0,282.0,1,1,1
...,...,...,...,...,...,...,...,...,...
4477,3,2,3,27.919,1635,857,4,1,4
4478,3,2,3,28.779,1737,752,4,1,4
4479,3,2,3,29.224,1665,693,4,1,4
4480,3,2,3,29.613,1620,666,4,0,0


In [36]:
# Ensuring consistent formating
preprocessed_clicks = pd.DataFrame(preprocessed_clicks)

col_format_clicks = {
    "participant_id" : int,
    "participant_group" : int,
    "round_index" : int,
    "click_time" : float,
    "click_x" : float,
    "click_y" : float,
    "click_quadrant" : int,
    "was_target" : int,
    "target_quadrant" : int,
}

for col, format in col_format_clicks.items():
    preprocessed_clicks[col] = preprocessed_clicks[col].astype(format)


preprocessed_clicks

Unnamed: 0,participant_id,participant_group,round_index,click_time,click_x,click_y,click_quadrant,was_target,target_quadrant
8,1,1,1,1.765,220.0,134.0,1,1,1
9,1,1,1,2.273,309.0,104.0,1,1,1
10,1,1,1,2.815,327.0,175.0,1,1,1
11,1,1,1,3.357,246.0,274.0,1,1,1
12,1,1,1,4.024,459.0,282.0,1,1,1
...,...,...,...,...,...,...,...,...,...
4477,3,2,3,27.919,1635.0,857.0,4,1,4
4478,3,2,3,28.779,1737.0,752.0,4,1,4
4479,3,2,3,29.224,1665.0,693.0,4,1,4
4480,3,2,3,29.613,1620.0,666.0,4,0,0


In [37]:
# Removing accidental clicks
preprocessed_clicks = preprocessed_clicks[preprocessed_clicks["was_target"] == 1]
preprocessed_clicks

Unnamed: 0,participant_id,participant_group,round_index,click_time,click_x,click_y,click_quadrant,was_target,target_quadrant
8,1,1,1,1.765,220.0,134.0,1,1,1
9,1,1,1,2.273,309.0,104.0,1,1,1
10,1,1,1,2.815,327.0,175.0,1,1,1
11,1,1,1,3.357,246.0,274.0,1,1,1
12,1,1,1,4.024,459.0,282.0,1,1,1
...,...,...,...,...,...,...,...,...,...
4475,3,2,3,27.016,1676.0,908.0,4,1,4
4477,3,2,3,27.919,1635.0,857.0,4,1,4
4478,3,2,3,28.779,1737.0,752.0,4,1,4
4479,3,2,3,29.224,1665.0,693.0,4,1,4


In [38]:
# Sorting by participant_id
preprocessed_clicks = preprocessed_clicks.sort_values("participant_id")

# reset indexing
preprocessed_clicks = preprocessed_clicks.reset_index(drop=True)

preprocessed_clicks

Unnamed: 0,participant_id,participant_group,round_index,click_time,click_x,click_y,click_quadrant,was_target,target_quadrant
0,1,1,1,1.765,220.0,134.0,1,1,1
1,1,1,2,29.529,826.0,550.0,3,1,3
2,1,1,3,2.230,468.0,68.0,1,1,1
3,1,1,3,3.022,346.0,316.0,1,1,1
4,1,1,3,4.422,133.0,274.0,1,1,1
...,...,...,...,...,...,...,...,...,...
2156,18,1,3,28.224,628.0,199.0,1,1,1
2157,18,1,3,29.024,733.0,266.0,1,1,1
2158,18,1,3,29.840,777.0,395.0,1,1,1
2159,18,1,1,29.597,1515.0,728.0,4,1,4


### Exporting Data

In [39]:
# creates cleaned folder if there is not one and places dataset in them

cleaned_dir = "cleaned_cancellation_data"
os.makedirs(cleaned_dir, exist_ok=True)

preprocessed_results.to_csv(os.path.join(cleaned_dir, "cleaned_cancellation_results"), index = False)
preprocessed_clicks.to_csv(os.path.join(cleaned_dir, "cleaned_cancellation_clicks"), index = False)

In [40]:
preprocessed_clicks["participant_id"].value_counts()

participant_id
14    154
15    148
1     145
3     145
18    133
4     132
12    131
9     130
11    128
10    127
17    124
6     121
16    118
7     116
5     112
2      99
13     98
Name: count, dtype: int64