### Data Analysis Headphone study

In this notebook we will do some data analysis on data acquired at KD2Lab.\
Let's start by importing the relevant tooling and loading the data

In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter


In [2]:
df = pd.read_csv('all_apps_wide-2025-03-06.csv')

In [3]:
df.head()

Unnamed: 0,participant.id_in_session,participant.code,participant.label,participant._is_bot,participant._index_in_pages,participant._max_page_index,participant._current_app_name,participant._current_page_name,participant.time_started_utc,participant.visited,...,headphones_finish_3.1.subsession.round_number,debriefing_3.1.player.id_in_group,debriefing_3.1.player.role,debriefing_3.1.player.payoff,debriefing_3.1.player.mathsRating,debriefing_3.1.player.clickRating,debriefing_3.1.player.mathSkill,debriefing_3.1.player.clickSkill,debriefing_3.1.group.id_in_subsession,debriefing_3.1.subsession.round_number
0,1,8gp06sxa,,0,370,371,debriefing_3,Debriefing,2025-03-06 14:50:25.669358,1,...,1,1,,0.0,,,,,1,1
1,2,sswre0ps,,0,371,371,debriefing_3,ThankYou,2025-03-06 14:51:12.130240,1,...,1,2,,0.0,7.0,7.0,2.0,3.0,1,1
2,3,ga0lmmg0,,0,371,371,debriefing_3,ThankYou,2025-03-06 14:51:30.677287,1,...,1,3,,0.0,3.0,2.0,4.0,4.0,1,1
3,4,uzvf1ov2,,0,370,371,debriefing_3,Debriefing,2025-03-06 16:07:49.459604,1,...,1,4,,0.0,,,,,1,1
4,5,ni3ulblj,,0,370,371,debriefing_3,Debriefing,2025-03-06 16:07:49.473199,1,...,1,5,,0.0,,,,,1,1


In [4]:
print(df.keys)

<bound method NDFrame.keys of    participant.id_in_session participant.code  participant.label  \
0                          1         8gp06sxa                NaN   
1                          2         sswre0ps                NaN   
2                          3         ga0lmmg0                NaN   
3                          4         uzvf1ov2                NaN   
4                          5         ni3ulblj                NaN   

   participant._is_bot  participant._index_in_pages  \
0                    0                          370   
1                    0                          371   
2                    0                          371   
3                    0                          370   
4                    0                          370   

   participant._max_page_index participant._current_app_name  \
0                          371                  debriefing_3   
1                          371                  debriefing_3   
2                          371        

In [5]:
df.shape

(5, 1513)

In [6]:
#getting more data on the dataset
print(df.describe())

       participant.id_in_session  participant.label  participant._is_bot  \
count                   5.000000                0.0                  5.0   
mean                    3.000000                NaN                  0.0   
std                     1.581139                NaN                  0.0   
min                     1.000000                NaN                  0.0   
25%                     2.000000                NaN                  0.0   
50%                     3.000000                NaN                  0.0   
75%                     4.000000                NaN                  0.0   
max                     5.000000                NaN                  0.0   

       participant._index_in_pages  participant._max_page_index  \
count                     5.000000                          5.0   
mean                    370.400000                        371.0   
std                       0.547723                          0.0   
min                     370.000000             

# Cleaning the data
### okay, now that we've seen that there is a lot of data for each of the 5 players. 

However, in the df.head we can see that player 2 and 3 seem to have more responses in some questions.\
We will have to check if this is only in some questions or if we might have to focus only on these participants.\
Let's start out by getting rid of all the columns filled with purely NaNs and see what those columns are. 



In [7]:
df_all_nan = df.loc[:, df.isna().all()]
df_clean = df.loc[:, ~df.isna().all()]


In [8]:
print(df_all_nan.shape)
print(df_clean.shape)


(5, 721)
(5, 792)


In [9]:
print(df_all_nan.keys())

Index(['participant.label', 'participant.mturk_worker_id',
       'participant.mturk_assignment_id', 'participant.puzzle_levels',
       'participant.puzzle_imgs', 'participant.puzzle_imgs_medium',
       'participant.puzzle_imgs_easy', 'participant.puzzle_rounds',
       'participant.puzzle_difficulty_selection', 'participant.math_levels',
       ...
       'headphones_finish_3.1.player.maia30',
       'headphones_finish_3.1.player.maia31',
       'headphones_finish_3.1.player.maia32',
       'headphones_finish_3.1.player.maia33',
       'headphones_finish_3.1.player.maia34',
       'headphones_finish_3.1.player.maia35',
       'headphones_finish_3.1.player.maia36',
       'headphones_finish_3.1.player.maia37',
       'headphones_finish_3.1.player.maia38c', 'debriefing_3.1.player.role'],
      dtype='object', length=721)


In [10]:
df_clean.head()

Unnamed: 0,participant.id_in_session,participant.code,participant._is_bot,participant._index_in_pages,participant._max_page_index,participant._current_app_name,participant._current_page_name,participant.time_started_utc,participant.visited,participant.payoff,...,headphones_finish_3.1.group.id_in_subsession,headphones_finish_3.1.subsession.round_number,debriefing_3.1.player.id_in_group,debriefing_3.1.player.payoff,debriefing_3.1.player.mathsRating,debriefing_3.1.player.clickRating,debriefing_3.1.player.mathSkill,debriefing_3.1.player.clickSkill,debriefing_3.1.group.id_in_subsession,debriefing_3.1.subsession.round_number
0,1,8gp06sxa,0,370,371,debriefing_3,Debriefing,2025-03-06 14:50:25.669358,1,0.0,...,1,1,1,0.0,,,,,1,1
1,2,sswre0ps,0,371,371,debriefing_3,ThankYou,2025-03-06 14:51:12.130240,1,0.0,...,1,1,2,0.0,7.0,7.0,2.0,3.0,1,1
2,3,ga0lmmg0,0,371,371,debriefing_3,ThankYou,2025-03-06 14:51:30.677287,1,0.0,...,1,1,3,0.0,3.0,2.0,4.0,4.0,1,1
3,4,uzvf1ov2,0,370,371,debriefing_3,Debriefing,2025-03-06 16:07:49.459604,1,0.0,...,1,1,4,0.0,,,,,1,1
4,5,ni3ulblj,0,370,371,debriefing_3,Debriefing,2025-03-06 16:07:49.473199,1,0.0,...,1,1,5,0.0,,,,,1,1


### Deleting the "Ghost Players"

It seems like player 0,3 and 4 are not real players or are lacking too much information.\
Therefore we will delete these empty rows

In [11]:
df_clean.drop(index= [0,3,4], inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean.drop(index= [0,3,4], inplace= True)


In [12]:
df_clean.head()

Unnamed: 0,participant.id_in_session,participant.code,participant._is_bot,participant._index_in_pages,participant._max_page_index,participant._current_app_name,participant._current_page_name,participant.time_started_utc,participant.visited,participant.payoff,...,headphones_finish_3.1.group.id_in_subsession,headphones_finish_3.1.subsession.round_number,debriefing_3.1.player.id_in_group,debriefing_3.1.player.payoff,debriefing_3.1.player.mathsRating,debriefing_3.1.player.clickRating,debriefing_3.1.player.mathSkill,debriefing_3.1.player.clickSkill,debriefing_3.1.group.id_in_subsession,debriefing_3.1.subsession.round_number
1,2,sswre0ps,0,371,371,debriefing_3,ThankYou,2025-03-06 14:51:12.130240,1,0.0,...,1,1,2,0.0,7.0,7.0,2.0,3.0,1,1
2,3,ga0lmmg0,0,371,371,debriefing_3,ThankYou,2025-03-06 14:51:30.677287,1,0.0,...,1,1,3,0.0,3.0,2.0,4.0,4.0,1,1


### Let's try to find the columns which contain data regarding the "whack-a-mole"-task
In the csv file the math task is referrenced as "mathTask", while the whack-a-mole task is labelled "clickTask"

In [13]:
# creating a partial df with only the columns that contain the string 'clickTask' in the key/column name
df_clickTask = df_clean.loc[:, df_clean.columns.str.contains('clickTask')]
df_clickTask.head()

Unnamed: 0,clickTask_1.1.player.id_in_group,clickTask_1.1.player.payoff,clickTask_1.1.player.fss06,clickTask_1.1.player.fss08,clickTask_1.1.player.fss09,clickTask_1.1.player.tlx_single,clickTask_1.1.player.difficulty,clickTask_1.1.player.mr_mood,clickTask_1.1.player.mr_sleepy,clickTask_1.1.player.mr_motivy,...,clickTask_3.8.player.mr_motivy,clickTask_3.8.player.mf_01,clickTask_3.8.player.mf_02,clickTask_3.8.player.mf_03,clickTask_3.8.player.mf_04,clickTask_3.8.player.headset_comfort,clickTask_3.8.player.rest_actions_eo,clickTask_3.8.player.rest_actions_ec,clickTask_3.8.group.id_in_subsession,clickTask_3.8.subsession.round_number
1,2,0.0,4,6,6,17,2,100,99,100,...,54,7,7,1,1,2,;onLoad;Thu Mar 06 2025 18:07:49 GMT+0100 (Mit...,;onLoad;Thu Mar 06 2025 18:07:17 GMT+0100 (Mit...,1,8
2,3,0.0,6,7,5,17,3,56,35,42,...,16,2,2,5,5,4,;onLoad;Thu Mar 06 2025 18:08:56 GMT+0100 (Mit...,;onLoad;Thu Mar 06 2025 18:08:23 GMT+0100 (Mit...,1,8


In [14]:
# getting all columns where "mathTask" is present in the key name
df_mathTask =  df_clean.loc[:, df_clean.columns.str.contains('mathTask')]
df_mathTask.head()


Unnamed: 0,mathTask_1.1.player.id_in_group,mathTask_1.1.player.payoff,mathTask_1.1.player.fss06,mathTask_1.1.player.fss08,mathTask_1.1.player.fss09,mathTask_1.1.player.tlx_single,mathTask_1.1.player.difficulty,mathTask_1.1.player.mr_mood,mathTask_1.1.player.mr_sleepy,mathTask_1.1.player.mr_motivy,...,mathTask_3.8.player.mr_motivy,mathTask_3.8.player.mf_01,mathTask_3.8.player.mf_02,mathTask_3.8.player.mf_03,mathTask_3.8.player.mf_04,mathTask_3.8.player.headset_comfort,mathTask_3.8.player.rest_actions_eo,mathTask_3.8.player.rest_actions_ec,mathTask_3.8.group.id_in_subsession,mathTask_3.8.subsession.round_number
1,2,0.0,2,6,7,14,2,85,82,90,...,50,7,7,1,1,2,;onLoad;Thu Mar 06 2025 17:58:34 GMT+0100 (Mit...,;onLoad;Thu Mar 06 2025 17:58:01 GMT+0100 (Mit...,1,8
2,3,0.0,6,6,5,17,4,72,75,73,...,13,2,2,6,6,4,;onLoad;Thu Mar 06 2025 17:58:21 GMT+0100 (Mit...,;onLoad;Thu Mar 06 2025 17:57:49 GMT+0100 (Mit...,1,8


### Now that we know that we know which data we have, let's check which data we are missing
More specifically let's check if clickTask columns are missing data

In [15]:
# Let's get rid of the unnecessary lines:
df_all_nan.drop(index= [0,3,4], inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all_nan.drop(index= [0,3,4], inplace= True)


In [16]:
# checking which columns from the original df contain the string "clickTask" in the column name and simultaneously contain NaNs
nan_clickTask = df_all_nan.loc[:, df_all_nan.columns.str.contains("clickTask")]
nan_clickTask.head()

Unnamed: 0,clickTask_1.1.player.role,clickTask_1.1.player.mf_01,clickTask_1.1.player.mf_02,clickTask_1.1.player.mf_03,clickTask_1.1.player.mf_04,clickTask_1.1.player.headset_comfort,clickTask_1.1.player.rest_actions_eo,clickTask_1.1.player.rest_actions_ec,clickTask_1.1.player.currRound,clickTask_1.2.player.role,...,clickTask_3.7.player.rest_actions_ec,clickTask_3.7.player.click_actions,clickTask_3.7.player.currRound,clickTask_3.8.player.role,clickTask_3.8.player.fss06,clickTask_3.8.player.fss08,clickTask_3.8.player.fss09,clickTask_3.8.player.difficulty,clickTask_3.8.player.click_actions,clickTask_3.8.player.currRound
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,


In [17]:
# Now the same for the mathTask
nan_mathTask = df_all_nan.loc[:, df_all_nan.columns.str.contains('mathTask')]
nan_mathTask.head()

Unnamed: 0,mathTask_1.1.player.role,mathTask_1.1.player.mf_01,mathTask_1.1.player.mf_02,mathTask_1.1.player.mf_03,mathTask_1.1.player.mf_04,mathTask_1.1.player.headset_comfort,mathTask_1.1.player.rest_actions_eo,mathTask_1.1.player.rest_actions_ec,mathTask_1.1.player.currRound,mathTask_1.2.player.role,...,mathTask_3.7.player.rest_actions_ec,mathTask_3.7.player.math_actions,mathTask_3.7.player.currRound,mathTask_3.8.player.role,mathTask_3.8.player.fss06,mathTask_3.8.player.fss08,mathTask_3.8.player.fss09,mathTask_3.8.player.difficulty,mathTask_3.8.player.math_actions,mathTask_3.8.player.currRound
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,


##### Looks like we've quite some data missing
However it seems like the NaN values are always the same.
Let's check this for the reaction task.


In [23]:
missing_atts_click = dict(Counter(col.split('.player.')[1] for col in nan_clickTask.columns))

print(missing_atts_click)


{'role': 24, 'mf_01': 21, 'mf_02': 21, 'mf_03': 21, 'mf_04': 21, 'headset_comfort': 21, 'rest_actions_eo': 21, 'rest_actions_ec': 21, 'currRound': 24, 'fss06': 6, 'fss08': 6, 'fss09': 6, 'tlx_single': 3, 'difficulty': 6, 'mr_mood': 3, 'mr_sleepy': 3, 'mr_motivy': 3, 'click_actions': 6}


In [24]:
missing_atts_math = dict(Counter(col.split('.player.')[1] for col in  nan_mathTask.columns))

print(missing_atts_math) 

{'role': 24, 'mf_01': 21, 'mf_02': 21, 'mf_03': 21, 'mf_04': 21, 'headset_comfort': 21, 'rest_actions_eo': 21, 'rest_actions_ec': 21, 'currRound': 24, 'fss06': 6, 'fss08': 6, 'fss09': 6, 'tlx_single': 3, 'difficulty': 6, 'mr_mood': 3, 'mr_sleepy': 3, 'mr_motivy': 3, 'math_actions': 6}
