In [1]:
import pandas as pd

In [12]:
# One-time code to de-identify data; original file deleted
# raw = pd.read_csv('Voice_DATA_2022-02-25_1744.csv', index_col=0, encoding="iso-8859-1")
# raw = raw.drop(['mobile_id', 'first_name', 'last_name', 'address', 
#                 'telephone_1', 'telephone_2','email','dob','live_in_boston_area'], axis=1)
# raw.to_csv('deID_voice_data.csv')


In [139]:
pheno = pd.read_csv('deID_voice_data.csv', index_col=0)

# per BIDS spec: https://bids-specification.readthedocs.io/en/stable/03-modality-agnostic-files.html#participants-file
# participants.tsv "MUST contain the column participant_id, 
# which MUST consist of sub-<label> values identifying one row for each participant"

pheno=pheno.rename({'voice_id':'participant_id'}, axis=1)

#drop participant ID NaN rows
pheno = pheno.loc[pheno.participant_id.dropna().index].reset_index(drop=True)

#drop test subjects
test_sub = ['5555','voice1000','1000']
pheno = pheno[~pheno.participant_id.isin(test_sub)]

# confirming that 955 is the only subject appearing twice in the phenotype data
# pheno.participant_id.value_counts()
# [p for p in pheno.participant_id.value_counts()  if p>1]

#convert from just the number to sub-voice### per the imaging naming convention
#if statement prevents adding it multiple times by accident
if 'sub-voice' not in pheno.participant_id.iloc[0]:
    pheno.participant_id = [f'sub-voice{pid}' for pid in pheno.participant_id if 'sub-voice']

pheno

Unnamed: 0,redcap_event_name,redcap_survey_identifier,initial_screening_timestamp,participant_id,date_enrolled,sex,age,weight,mental_illness,mental_illness_text,...,miscnotes_excluded,miscnotes_excludedreason,miscnotes_excludedreasontxt,miscnotes_complete,mri_session001,mri_session002,mri_session003,mri_session004,meg_session001,scandates_complete
3,session001_arm_1,,2015-06-11 10:46:43,sub-voice876,2015-06-11,1.0,49.0,190.0,1,depression,...,1.0,,,1,,,,,,0.0
5,session001_arm_1,,2015-06-11 15:04:13,sub-voice874,2015-06-11,1.0,23.0,180.0,0,,...,1.0,1.0,,1,,,,,,0.0
6,session001_arm_1,,2015-06-12 10:22:21,sub-voice875,2015-06-12,1.0,65.0,164.0,1,MDD,...,,,,0,,,,,,0.0
7,session001_arm_1,,2015-06-12 13:49:32,sub-voice873,2015-06-12,1.0,51.0,220.0,1,mdd self id,...,,,,0,,,,,,0.0
8,session001_arm_1,,2015-06-21 17:17:03,sub-voice872,2015-06-21,1.0,47.0,150.0,1,mdd,...,,,,0,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,session001_arm_1,,2017-09-11 11:25:15,sub-voice953,2017-09-11,1.0,24.0,140.0,0,,...,,,,0,,,,,,0.0
115,session001_arm_1,,2017-09-11 14:47:33,sub-voice952,2017-09-11,0.0,22.0,134.0,0,,...,,,,0,,,,,,0.0
116,session001_arm_1,,2017-10-17 16:12:30,sub-voice951,2017-10-17,1.0,46.0,200.0,0,,...,,,,0,,,,,,0.0
117,session001_arm_1,,2017-11-03 13:30:04,sub-voice950,2017-11-03,1.0,20.0,107.0,0,,...,,,,0,,,,,,0.0


In [None]:
#Handedness fix
#Commonly used optional columns in participants.tsv files are age, sex, handedness, strain, and strain_rrid. 
#String value indicating one of "left", "right", "ambidextrous".
#For "left", use one of these values: left, l, L, LEFT, Left.
#For "right", use one of these values: right, r, R, RIGHT, Right.
#For "ambidextrous", use one of these values: ambidextrous, a, A, AMBIDEXTROUS, Ambidextrous.
pheno=pheno.rename({'handed':'handedness'}, axis=1)



In [18]:
#From data dictionary, the question was "are you right handed" where "yes" is assumed to mean "1" and therefore RH
# which is supported by the distribution of handedness in the dataset
data_dict = pd.read_csv('Voice_DataDictionary_2022-02-25.csv', index_col=0, encoding="iso-8859-1")
data_dict.loc['handed']

Form Name                                         initial_screening
Section Header                                                  NaN
Field Type                                                    yesno
Field Label                                   Are you right handed?
Choices, Calculations, OR Slider Labels                         NaN
Field Note                                                      NaN
Text Validation Type OR Show Slider Number                      NaN
Text Validation Min                                             NaN
Text Validation Max                                             NaN
Identifier?                                                     NaN
Branching Logic (Show field only if...)                         NaN
Required Field?                                                   y
Custom Alignment                                                NaN
Question Number (surveys only)                                  NaN
Matrix Group Name                               

In [126]:
pheno['handedness'].value_counts()

handedness
1.0    78
0.0    15
Name: count, dtype: int64

In [134]:
handed_dict = {1.0:'r',0.0:'l'}
pheno = pheno.replace({'handedness':handed_dict})
pheno.handedness.value_counts()

handedness
r    78
l    15
Name: count, dtype: int64

In [150]:
# Fixing sex
# String value indicating phenotypical sex, one of "male", "female", "other".
# For "male", use one of these values: male, m, M, MALE, Male.
# For "female", use one of these values: female, f, F, FEMALE, Female.
# For "other", use one of these values: other, o, O, OTHER, Other.

data_dict.loc['sex']

# 0 is Female, 1 is Male

Form Name                                       initial_screening
Section Header                                                NaN
Field Type                                                  radio
Field Label                                                Gender
Choices, Calculations, OR Slider Labels       0, Female | 1, Male
Field Note                                                    NaN
Text Validation Type OR Show Slider Number                    NaN
Text Validation Min                                           NaN
Text Validation Max                                           NaN
Identifier?                                                   NaN
Branching Logic (Show field only if...)                       NaN
Required Field?                                                 y
Custom Alignment                                              NaN
Question Number (surveys only)                                NaN
Matrix Group Name                                             NaN
Name: sex,

In [151]:
sex_dict = {1.0:'male',0.0:'female'}
pheno = pheno.replace({'sex':sex_dict})
pheno.sex.value_counts()

sex
male      57
female    48
Name: count, dtype: int64

In [152]:
pheno

Unnamed: 0,redcap_event_name,redcap_survey_identifier,initial_screening_timestamp,participant_id,date_enrolled,sex,age,weight,mental_illness,mental_illness_text,...,miscnotes_excluded,miscnotes_excludedreason,miscnotes_excludedreasontxt,miscnotes_complete,mri_session001,mri_session002,mri_session003,mri_session004,meg_session001,scandates_complete
3,session001_arm_1,,2015-06-11 10:46:43,sub-voice876,2015-06-11,male,49.0,190.0,1,depression,...,1.0,,,1,,,,,,0.0
5,session001_arm_1,,2015-06-11 15:04:13,sub-voice874,2015-06-11,male,23.0,180.0,0,,...,1.0,1.0,,1,,,,,,0.0
6,session001_arm_1,,2015-06-12 10:22:21,sub-voice875,2015-06-12,male,65.0,164.0,1,MDD,...,,,,0,,,,,,0.0
7,session001_arm_1,,2015-06-12 13:49:32,sub-voice873,2015-06-12,male,51.0,220.0,1,mdd self id,...,,,,0,,,,,,0.0
8,session001_arm_1,,2015-06-21 17:17:03,sub-voice872,2015-06-21,male,47.0,150.0,1,mdd,...,,,,0,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,session001_arm_1,,2017-09-11 11:25:15,sub-voice953,2017-09-11,male,24.0,140.0,0,,...,,,,0,,,,,,0.0
115,session001_arm_1,,2017-09-11 14:47:33,sub-voice952,2017-09-11,female,22.0,134.0,0,,...,,,,0,,,,,,0.0
116,session001_arm_1,,2017-10-17 16:12:30,sub-voice951,2017-10-17,male,46.0,200.0,0,,...,,,,0,,,,,,0.0
117,session001_arm_1,,2017-11-03 13:30:04,sub-voice950,2017-11-03,male,20.0,107.0,0,,...,,,,0,,,,,,0.0


In [153]:
pheno.to_csv('participants.tsv', sep='\t')

In [154]:
#move participants.tsv to the right level
!cp participants.tsv ../..

In [3]:
p = pd.read_csv('../../participants.tsv', sep='\t', index_col=0)

In [4]:
p

Unnamed: 0,redcap_event_name,redcap_survey_identifier,initial_screening_timestamp,participant_id,date_enrolled,sex,age,weight,mental_illness,mental_illness_text,...,miscnotes_excluded,miscnotes_excludedreason,miscnotes_excludedreasontxt,miscnotes_complete,mri_session001,mri_session002,mri_session003,mri_session004,meg_session001,scandates_complete
3,session001_arm_1,,2015-06-11 10:46:43,sub-voice876,2015-06-11,male,49.0,190.0,1.0,depression,...,1.0,,,1,,,,,,0.0
5,session001_arm_1,,2015-06-11 15:04:13,sub-voice874,2015-06-11,male,23.0,180.0,0.0,,...,1.0,1.0,,1,,,,,,0.0
6,session001_arm_1,,2015-06-12 10:22:21,sub-voice875,2015-06-12,male,65.0,164.0,1.0,MDD,...,,,,0,,,,,,0.0
7,session001_arm_1,,2015-06-12 13:49:32,sub-voice873,2015-06-12,male,51.0,220.0,1.0,mdd self id,...,,,,0,,,,,,0.0
8,session001_arm_1,,2015-06-21 17:17:03,sub-voice872,2015-06-21,male,47.0,150.0,1.0,mdd,...,,,,0,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,session001_arm_1,,2017-09-11 11:25:15,sub-voice953,2017-09-11,male,24.0,140.0,0.0,,...,,,,0,,,,,,0.0
115,session001_arm_1,,2017-09-11 14:47:33,sub-voice952,2017-09-11,female,22.0,134.0,0.0,,...,,,,0,,,,,,0.0
116,session001_arm_1,,2017-10-17 16:12:30,sub-voice951,2017-10-17,male,46.0,200.0,0.0,,...,,,,0,,,,,,0.0
117,session001_arm_1,,2017-11-03 13:30:04,sub-voice950,2017-11-03,male,20.0,107.0,0.0,,...,,,,0,,,,,,0.0


In [60]:
cat = 'beckdepressionii_timestamp'

In [61]:
p[cat].value_counts()

beckdepressionii_timestamp
2015-06-11 15:20:42    1
2016-12-16 11:06:51    1
2016-09-08 11:15:42    1
2016-09-08 16:16:22    1
2016-09-28 17:03:08    1
2016-10-03 14:27:50    1
2016-11-15 16:13:27    1
2016-11-22 15:15:53    1
2016-11-28 16:34:23    1
2017-01-24 18:05:22    1
2015-10-03 16:56:00    1
2017-02-16 11:26:22    1
2017-04-20 09:45:03    1
2017-08-28 11:35:22    1
2017-09-11 11:28:05    1
2017-09-11 14:49:38    1
2017-10-17 16:15:10    1
2017-11-03 13:32:08    1
2016-07-15 10:53:45    1
2016-07-07 14:13:33    1
2016-07-05 17:42:01    1
2017-04-21 10:35:52    1
2015-07-16 10:40:45    1
2016-01-11 11:41:56    1
2015-10-16 11:11:37    1
2015-10-21 10:09:15    1
2015-11-02 13:09:24    1
2015-11-06 11:04:09    1
2015-11-16 14:02:48    1
2015-11-23 14:28:46    1
2016-01-27 14:03:42    1
2016-02-23 17:21:15    1
2016-02-26 13:49:44    1
2016-03-01 13:13:31    1
2016-03-16 11:24:22    1
2016-03-30 11:26:15    1
2016-05-13 17:38:41    1
2018-01-30 15:21:40    1
Name: count, dtype: int

In [63]:
data_dict#.loc[cat]#['Field Label']

Unnamed: 0_level_0,Form Name,Section Header,Field Type,Field Label,"Choices, Calculations, OR Slider Labels",Field Note,Text Validation Type OR Show Slider Number,Text Validation Min,Text Validation Max,Identifier?,Branching Logic (Show field only if...),Required Field?,Custom Alignment,Question Number (surveys only),Matrix Group Name
"ï»¿""Variable / Field Name""",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
study_id,initial_screening,,text,Study ID,,,,,,,,y,,,
voice_id,initial_screening,,text,Voice ID Number only,,"e.g. voice###, enter ###",integer,,,,,y,,,
mobile_id,initial_screening,,text,Mobile Survey ID,,,,,,y,,,,,
date_enrolled,initial_screening,Demographic Characteristics,text,Date subject signed consent,,YYYY-MM-DD,date_ymd,,,,,y,,,
first_name,initial_screening,,text,First Name,,,,,,y,,y,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mri_session001,scandates,,text,MRI session001,,YYYY-MM-DD,date_ymd,,,,,,,,
mri_session002,scandates,,text,MRI session002,,YYYY-MM-DD,date_ymd,,,,,,,,
mri_session003,scandates,,text,MRI session003,,YYYY-MM-DD,date_ymd,,,,,,,,
mri_session004,scandates,,text,MRI session004,,YYYY-MM-DD,date_ymd,,,,,,,,
