In [1]:
import pandas as pd
import numpy as np
import os
import re
import ast

In [2]:
data_dir = '/Users/xiaoqianxiao/UKB/data'
participantsInfo_file = 'participants.csv'
participantsInfo_file_path = os.path.join(data_dir,participantsInfo_file)
participantsInfo = pd.read_csv(participantsInfo_file_path)

In [3]:
neuroticism_fields = ['1920', '1930', '1940', '1950', '1960', '1970', '1980', '1990', 
                      '2000', '2010', '2020', '2030']
# anxiety status files:
anxiety_status_fields = ['1970','1980','1990','2070']
#demographic factors' field ids, including: eid, sex, age at the first scan, IQ and EA (5)
demographic_fields = ['31','21003_i2','20016_i2','6138_i2']
# current depression or anxiety status while scanning (3)
current_status_fields = ['2050_i2','2060_i2','2070_i2']
# Self_Reported_Mental_Health (1)
Self_Reported_Mental_Health_fields = ['29000','20002_i2','21062']
# Ever_Diagnosed_Mental_Health_Problem (1)
Ever_Diagnosed_Mental_Health_Problem_fields = ['20544']
# Self reported history depression: CIDI (13)
history_depression_fields = ['20436','20439','20440','20446','20441','20449','20536','20532','20435','20450','20437']
# Self reported history anxiety: CIDI (18)
history_anxiety_fields = ['20421','20420','20538','20425','20542','20543','20540','20541','20539','20537','20418','20426','20423','20429','20419','20422','20417','20427']
# PHQ (9)
PHQ_fields = ['20514','20510','20517','20519','20511','20507','20508','20518','20513']
# GAD7 (7)
GAD7_fields = ['20506','20509','20520','20515','20516','20505','20512']
GAD7_followup_fields = ['28735','29059','29060','29061','29062','29063','29064']
# hospital data: ICD10 and ICD9 (2)
hospital_data_fields = ['41270','41271']
# fMRI data
fMRI_fields = ['31016','31018','31019','31015','31014']
control_fields = ['20544','20002','20514','20510','20517','20519','20511','20507','20508','20518','20513', '20506','20509','20520','20515','20516','20505','20512']
all_fields_ids = demographic_fields + current_status_fields + Ever_Diagnosed_Mental_Health_Problem_fields + history_depression_fields + history_anxiety_fields + PHQ_fields + GAD7_fields + GAD7_followup_fields + hospital_data_fields + control_fields + fMRI_fields + Self_Reported_Mental_Health_fields + anxiety_status_fields + neuroticism_fields
print("Number of  all fields needed: ", len(all_fields_ids))

Number of  all fields needed:  104


In [4]:
def fields_for_id_x(field_id):
    field_id = str(field_id)
    field_items = re.split(r'[,\s_]+', field_id)
    if len(field_items) == 1:
        fields = 'p{}'.format(field_items[0])
    elif len(field_items) == 2:
        fields = 'p{}_i{}'.format(field_items[0], field_items[1])
    return fields

import pandas as pd
from ast import literal_eval as safe_literal_eval
import numpy as np

def extract_strings(target_prefixes, icd_codes):
    """
    Extract ICD-10 codes that start with any of the target prefixes.
    
    Parameters:
    - target_prefixes (list): List of prefixes to match.
    - icd_codes (list): List of ICD-10 codes.

    Returns:
    - list: List of matching ICD-10 codes.
    """
    return [code for code in icd_codes if any(code.startswith(prefix) for prefix in target_prefixes)]


In [5]:
import pandas as pd
import ast

dl = []

for i in range(participantsInfo.shape[0]):
    value = participantsInfo['p6138_i2'].iloc[i]
    
    if pd.notna(value):
        try:
            parsed = ast.literal_eval(value) if isinstance(value, str) and value.strip().startswith('[') else value
            if isinstance(parsed, list):
                max_val = max(parsed)
            else:
                max_val = parsed

            if max_val == -3 or max_val == -7:
                dl.append(participantsInfo.index[i])
            elif max_val == 1:
                participantsInfo.loc[i, 'p6138_i2'] = 20
            elif max_val == 2:
                participantsInfo.loc[i, 'p6138_i2'] = 13
            elif max_val in [3, 4]:
                participantsInfo.loc[i, 'p6138_i2'] = 10
            elif max_val == 5:
                participantsInfo.loc[i, 'p6138_i2'] = 19
            elif max_val == 6:
                participantsInfo.loc[i, 'p6138_i2'] = 15
        except (ValueError, SyntaxError):
            print(f"Skipping row {i} due to parse error: {value}")
    # NaNs are ignored as per original code

# Drop the unwanted rows
participantsInfo.drop(dl, inplace=True)


In [6]:
participantsInfo.loc[:,'p6138_i2'].unique()

array([nan, 10, 20, 15, 19, 13], dtype=object)

In [7]:
# get subset with participants have EA data series:
participants_withEA = participantsInfo.dropna(subset=['p6138_i2'])
# deal with GAD nan and prefer not to response
GAD7_fields_name = [fields_for_id_x(field) for field in GAD7_fields]
# get subset with participants with GAD record:
participants_withGAD = participants_withEA.dropna(subset=GAD7_fields_name)
# get subset with participants with GAD response: get rid of -818 
participants_GAD = participants_withGAD[~participants_withGAD[GAD7_fields_name].isin([-818]).any(axis=1)]
participants_GAD[GAD7_fields_name] = participants_GAD[GAD7_fields_name] - 1
# get subset with participants with GAD response: replace -818 with 0 [396 in total]
#participants_withGAD[GAD7_fields_name] = participants_withGAD[GAD7_fields_name].replace(-818, 0)
# get subset with participants have fMRI data series:
participants_withfMRI_cortical = participants_GAD.dropna(subset=['p31016_i2'])
participants_withfMRI = participants_withfMRI_cortical.dropna(subset=['p31019_i2'])
print("number of participants with fMRI time series: {}".format(participants_withfMRI.shape[0]))
df_fMRI = pd.DataFrame(participants_withfMRI['eid'])

number of participants with fMRI time series: 5076


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  participants_GAD[GAD7_fields_name] = participants_GAD[GAD7_fields_name] - 1


In [8]:
participants_withfMRI.columns

Index(['Unnamed: 0.5', 'Unnamed: 0.4', 'Unnamed: 0.3', 'Unnamed: 0.2',
       'Unnamed: 0.1', 'Unnamed: 0', 'eid', 'p3063_i0_a0', 'p3063_i0_a1',
       'p3063_i0_a2',
       ...
       'p2030_i0', 'p2030_i1', 'p2030_i2', 'p2030_i3',
       'kdm_ba_InitialAssessmentVisit',
       'kdm_acceleration_InitialAssessmentVisit',
       'kdm_ba_RepeatAssessmentVisit',
       'kdm_acceleration_RepeatAssessmentVisit', 'GAD_baseline',
       'GAD_followup'],
      dtype='object', length=337)

In [9]:
# get subset with participants have EA data series:
participants_withEA = participantsInfo.dropna(subset=['p6138_i2'])
# deal with GAD nan and prefer not to response
GAD7_fields_name = [fields_for_id_x(field) for field in GAD7_fields]
# get subset with participants with GAD record:
participants_withGAD = participants_withEA.dropna(subset=GAD7_fields_name)
# get subset with participants with GAD response: get rid of -818 
participants_GAD = participants_withGAD[~participants_withGAD[GAD7_fields_name].isin([-818]).any(axis=1)]
participants_GAD[GAD7_fields_name] = participants_GAD[GAD7_fields_name] - 1
# get subset with participants with GAD response: replace -818 with 0 [396 in total]
#participants_withGAD[GAD7_fields_name] = participants_withGAD[GAD7_fields_name].replace(-818, 0)
# get subset with participants have fMRI data series:
participants_withfMRI_cortical_repeat = participants_GAD.dropna(subset=['p31016_i2', 'p31016_i3'])
participants_withfMRI_repeat = participants_withfMRI_cortical.dropna(subset=['p31019_i2', 'p31019_i3'])
print("number of participants with fMRI time series: {}".format(participants_withfMRI_repeat.shape[0]))
df_fMRI_repeat = pd.DataFrame(participants_withfMRI_repeat['eid'])

number of participants with fMRI time series: 383


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  participants_GAD[GAD7_fields_name] = participants_GAD[GAD7_fields_name] - 1


In [10]:
l1 = []
l2 = []
for i in range(participants_withfMRI.shape[0]):
    #depression
    if participants_withfMRI['p2050_i2'].iloc[i] >= 3 or participants_withfMRI['p2060_i2'].iloc[i] >= 3:
        l1.append(participants_withfMRI['eid'].iloc[i])
    #anxiety
    if participants_withfMRI['p2070_i2'].iloc[i] >= 3:
        l2.append(participants_withfMRI['eid'].iloc[i])

In [11]:
#CIDI
history_anxiety_fields = ['20421','20420','20538','20425','20542','20543','20540','20541','20539','20537','20418','20426','20423','20429','20419','20422','20417','20427']
df_fMRI['history_anxiety_num_criterias'] = None
df_fMRI['history_anxiety_num_symptoms'] = None
history_anxiety_subjs = []
for i in range(participants_withfMRI.shape[0]):
    num_criterias = 0
    if participants_withfMRI['p20421'].iloc[i] == 1:
        num_criterias = num_criterias + 1
    if participants_withfMRI['p20420'].iloc[i] >= 6 or participants_withfMRI['p20420'].iloc[i] == -999:
        num_criterias = num_criterias + 1
    if participants_withfMRI['p20538'].iloc[i] == 1:
        num_criterias = num_criterias + 1
    if participants_withfMRI['p20425'].iloc[i] == 1 or participants_withfMRI['p20542'].iloc[i] == 1:
        num_criterias = num_criterias + 1
    if participants_withfMRI['p20543'].iloc[i] == 2 or participants_withfMRI['p20540'].iloc[i] == 1:
        num_criterias = num_criterias + 1
    if participants_withfMRI['p20541'].iloc[i] == 1 or participants_withfMRI['p20539'].iloc[i] == 3 or participants_withfMRI['p20537'].iloc[i] == 3:
        num_criterias = num_criterias + 1
    if participants_withfMRI['p20418'].iloc[i] >= 2:
        num_criterias = num_criterias + 1
    
    # 3 somatic symptoms out of
    num_symptoms = 0
    if participants_withfMRI['p20426'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI['p20423'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI['p20429'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI['p20419'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI['p20422'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI['p20417'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI['p20427'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
        
    if num_symptoms >= 3:
        num_criterias = num_criterias + 1
    
    if num_criterias == 8:
        history_anxiety_subjs.append(participants_withfMRI['eid'].iloc[i])
        
    df_fMRI.iloc[i, df_fMRI.columns.get_loc('history_anxiety_num_criterias')] = num_criterias
    df_fMRI.iloc[i, df_fMRI.columns.get_loc('history_anxiety_num_symptoms')] = num_symptoms
df_fMRI['history_anxiety'] = df_fMRI['history_anxiety_num_criterias'].apply(lambda x: (x == 8)).replace((1,0),('True','False'))
print("number of history anxiety: ", df_fMRI.loc[df_fMRI['history_anxiety']==True, 'history_anxiety'].shape[0])

number of history anxiety:  366


In [12]:
#CIDI
history_anxiety_fields = ['20421','20420','20538','20425','20542','20543','20540','20541','20539','20537','20418','20426','20423','20429','20419','20422','20417','20427']
df_fMRI_repeat['history_anxiety_num_criterias'] = None
df_fMRI_repeat['history_anxiety_num_symptoms'] = None
history_anxiety_subjs = []
for i in range(participants_withfMRI_repeat.shape[0]):
    num_criterias = 0
    if participants_withfMRI_repeat['p20421'].iloc[i] == 1:
        num_criterias = num_criterias + 1
    if participants_withfMRI_repeat['p20420'].iloc[i] >= 6 or participants_withfMRI_repeat['p20420'].iloc[i] == -999:
        num_criterias = num_criterias + 1
    if participants_withfMRI_repeat['p20538'].iloc[i] == 1:
        num_criterias = num_criterias + 1
    if participants_withfMRI_repeat['p20425'].iloc[i] == 1 or participants_withfMRI_repeat['p20542'].iloc[i] == 1:
        num_criterias = num_criterias + 1
    if participants_withfMRI_repeat['p20543'].iloc[i] == 2 or participants_withfMRI_repeat['p20540'].iloc[i] == 1:
        num_criterias = num_criterias + 1
    if participants_withfMRI_repeat['p20541'].iloc[i] == 1 or participants_withfMRI_repeat['p20539'].iloc[i] == 3 or participants_withfMRI_repeat['p20537'].iloc[i] == 3:
        num_criterias = num_criterias + 1
    if participants_withfMRI_repeat['p20418'].iloc[i] >= 2:
        num_criterias = num_criterias + 1
    
    # 3 somatic symptoms out of
    num_symptoms = 0
    if participants_withfMRI_repeat['p20426'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI_repeat['p20423'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI_repeat['p20429'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI_repeat['p20419'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI_repeat['p20422'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI_repeat['p20417'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI_repeat['p20427'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
        
    if num_symptoms >= 3:
        num_criterias = num_criterias + 1
    
    if num_criterias == 8:
        history_anxiety_subjs.append(participants_withfMRI_repeat['eid'].iloc[i])
        
    df_fMRI_repeat.iloc[i, df_fMRI_repeat.columns.get_loc('history_anxiety_num_criterias')] = num_criterias
    df_fMRI_repeat.iloc[i, df_fMRI_repeat.columns.get_loc('history_anxiety_num_symptoms')] = num_symptoms
df_fMRI_repeat['history_anxiety'] = df_fMRI_repeat['history_anxiety_num_criterias'].apply(lambda x: (x == 8)).replace((1,0),('True','False'))
print("number of history anxiety: ", df_fMRI_repeat.loc[df_fMRI_repeat['history_anxiety']==True, 'history_anxiety'].shape[0])

number of history anxiety:  27


In [13]:
anxiety_trait_fields = ['1970_2', '1980_2', '1990_2', 
                      '2000_2', '2010_2']
anxiety_trait_fields_name = [fields_for_id_x(field) for field in anxiety_trait_fields]
df_fMRI['anxiety_trait_score'] = participants_withfMRI[anxiety_trait_fields_name].sum(axis = 1)

In [14]:
type(participants_withfMRI['p2000_i2'].unique())

numpy.ndarray

In [33]:
participants_withfMRI.filter(like='p1930')

Unnamed: 0,p1930_i0,p1930_i1,p1930_i2,p1930_i3
1,0.0,0.0,0.0,
20,0.0,,0.0,
24,0.0,,0.0,
54,-1.0,,1.0,
61,0.0,,0.0,-1.0
...,...,...,...,...
99320,0.0,,0.0,
99327,1.0,0.0,0.0,0.0
99337,0.0,,1.0,
99346,0.0,,0.0,


In [17]:
# Current anxiety at baseline
GAD7_fields_name = [fields_for_id_x(field) for field in GAD7_fields]
df_fMRI['GAD7_score'] = participants_withfMRI[GAD7_fields_name].sum(axis = 1)
df_fMRI['Current_Anxiety'] = False
df_fMRI.loc[df_fMRI['GAD7_score'] >= 10,'Current_Anxiety'] = True
df_fMRI.loc[df_fMRI['GAD7_score'] < 5,'Current_no_Anxiety'] = True
print('number of Current_Anxiety:', df_fMRI['Current_Anxiety'].sum())
print('number of Current_no_Anxiety:', df_fMRI['Current_no_Anxiety'].sum())

number of Current_Anxiety: 193
number of Current_no_Anxiety: 4262


In [18]:
# Current anxiety at baseline with two fMRI scan
GAD7_fields_name = [fields_for_id_x(field) for field in GAD7_fields]
df_fMRI_repeat['GAD7_score'] = participants_withfMRI_repeat[GAD7_fields_name].sum(axis = 1)
df_fMRI_repeat['Current_Anxiety'] = False
df_fMRI_repeat.loc[df_fMRI_repeat['GAD7_score'] >= 10,'Current_Anxiety'] = True
df_fMRI_repeat.loc[df_fMRI_repeat['GAD7_score'] < 5,'Current_no_Anxiety'] = True
print('number of Current_Anxiety:', df_fMRI_repeat['Current_Anxiety'].sum())
print('number of Current_no_Anxiety:', df_fMRI_repeat['Current_no_Anxiety'].sum())

number of Current_Anxiety: 13
number of Current_no_Anxiety: 322


In [19]:
# Current anxiety at follow up 
GAD7_fields_name = [fields_for_id_x(field) for field in GAD7_followup_fields]
df_fMRI['GAD7_score_followup'] = participants_withfMRI[GAD7_fields_name].sum(axis = 1)
df_fMRI['Current_Anxiety_followup'] = False
df_fMRI.loc[df_fMRI['GAD7_score_followup'] >= 10,'Current_Anxiety_followup'] = True
df_fMRI.loc[df_fMRI['GAD7_score_followup'] < 5,'Current_no_Anxiety_followup'] = True
print('number of Current_Anxiety_followup:', df_fMRI['Current_Anxiety_followup'].sum())
print('number of Current_no_Anxiety_followup:', df_fMRI['Current_no_Anxiety_followup'].sum())

number of Current_Anxiety_followup: 147
number of Current_no_Anxiety_followup: 4450


In [20]:
# Current anxiety at follow up  with two fMRI scan
GAD7_fields_name = [fields_for_id_x(field) for field in GAD7_followup_fields]
df_fMRI_repeat['GAD7_score_followup'] = participants_withfMRI_repeat[GAD7_fields_name].sum(axis = 1)
df_fMRI_repeat['Current_Anxiety_followup'] = False
df_fMRI_repeat.loc[df_fMRI_repeat['GAD7_score_followup'] >= 10,'Current_Anxiety_followup'] = True
df_fMRI_repeat.loc[df_fMRI_repeat['GAD7_score_followup'] < 5,'Current_no_Anxiety_followup'] = True
print('number of Current_Anxiety_followup:', df_fMRI_repeat['Current_Anxiety_followup'].sum())
print('number of Current_no_Anxiety_followup:', df_fMRI_repeat['Current_no_Anxiety_followup'].sum())

number of Current_Anxiety_followup: 15
number of Current_no_Anxiety_followup: 336


In [32]:
n1 = df_fMRI.loc[(df_fMRI['Current_Anxiety'] == True) & (df_fMRI['Current_Anxiety_followup'] == True)].shape[0]
print(f'number of Anxiety at baseline and follow-up: {n1}')
n2 = df_fMRI.loc[(df_fMRI['Current_Anxiety'] == True) & (df_fMRI['Current_Anxiety_followup'] == False)].shape[0]
print(f'number of Anxiety at baseline but not follow-up: {n2}')
n3 = df_fMRI.loc[(df_fMRI['Current_Anxiety'] == False) & (df_fMRI['Current_Anxiety_followup'] == True)].shape[0]
print(f'number of Anxiety at follow-up but not baseline: {n3}')
n4 = df_fMRI.loc[(df_fMRI['Current_Anxiety'] == False) & (df_fMRI['Current_Anxiety_followup'] == False)].shape[0]
print(f'number of no Anxiety at baseline and follow-up: {n4}')
n5 = df_fMRI.loc[(df_fMRI['Current_Anxiety'] == True) & (df_fMRI['Diagnosed_ICD10'] == True)].shape[0]
print(f'number of current Anxiety with diagnose: {n5}')
n6 = df_fMRI.loc[(df_fMRI['Current_Anxiety'] == True) & (df_fMRI['Diagnosed_ICD10'] == False)].shape[0]
print(f'number of current Anxiety with no diagnose: {n6}')
n7 = df_fMRI.loc[(df_fMRI['Current_Anxiety'] == True) & (df_fMRI['history_anxiety']==True)].shape[0]
print(f'number of current Anxiety with anxiety history: {n7}')
n8 = df_fMRI.loc[(df_fMRI['Current_Anxiety'] == True) & (df_fMRI['history_anxiety']==False)].shape[0]
print(f'number of current Anxiety with no anxiety history: {n8}')

number of Anxiety at baseline and follow-up: 47
number of Anxiety at baseline but not follow-up: 146
number of Anxiety at follow-up but not baseline: 100
number of no Anxiety at baseline and follow-up: 4783
number of current Anxiety with diagnose: 29
number of current Anxiety with no diagnose: 164
number of current Anxiety with anxiety history: 82
number of current Anxiety with no anxiety history: 111


In [22]:
# participants have repeated fMRI data
n1 = df_fMRI_repeat.loc[(df_fMRI_repeat['Current_Anxiety'] == True) & (df_fMRI_repeat['Current_Anxiety_followup'] == True)].shape[0]
print(f'number of Anxiety at baseline and follow-up: {n1}')
n2 = df_fMRI_repeat.loc[(df_fMRI_repeat['Current_Anxiety'] == True) & (df_fMRI_repeat['Current_Anxiety_followup'] == False)].shape[0]
print(f'number of Anxiety at baseline but not follow-up: {n2}')
n3 = df_fMRI_repeat.loc[(df_fMRI_repeat['Current_Anxiety'] == False) & (df_fMRI_repeat['Current_Anxiety_followup'] == True)].shape[0]
print(f'number of Anxiety at follow-up but not baseline: {n3}')
n4 = df_fMRI_repeat.loc[(df_fMRI_repeat['Current_Anxiety'] == False) & (df_fMRI_repeat['Current_Anxiety_followup'] == False)].shape[0]
print(f'number of no Anxiety at baseline and follow-up: {n4}')
n5 = df_fMRI_repeat.loc[(df_fMRI_repeat['Current_Anxiety'] == True) & (df_fMRI_repeat['history_anxiety']==True)].shape[0]
print(f'number of current Anxiety with anxiety history: {n5}')
n6 = df_fMRI_repeat.loc[(df_fMRI_repeat['Current_Anxiety'] == True) & (df_fMRI_repeat['history_anxiety']==False)].shape[0]
print(f'number of current Anxiety with no anxiety history: {n6}')

number of Anxiety at baseline and follow-up: 3
number of Anxiety at baseline but not follow-up: 10
number of Anxiety at follow-up but not baseline: 12
number of no Anxiety at baseline and follow-up: 358
number of current Anxiety with anxiety history: 4
number of current Anxiety with no anxiety history: 9


In [23]:
# hospital data-linkage: 
# Data-Field 41270 
from ast import literal_eval
df_fMRI['ICD10'] = participants_withfMRI['p41270'].apply(
    lambda x: extract_strings(['F40','F41'], literal_eval(x))
    if isinstance(x, str)
    else []
)
df_fMRI['Diagnosed_ICD10'] = False
df_fMRI.loc[df_fMRI['ICD10'].apply(lambda x: x != []), 'Diagnosed_ICD10'] = True
print('number of Diagnosed in ICD10:', df_fMRI['Diagnosed_ICD10'].sum())
# Data-Field 41271 
df_fMRI['ICD9'] = participants_withfMRI['p41271'].apply(
    lambda x: extract_strings(['300'], literal_eval(x))
    if isinstance(x, str)
    else []
)
df_fMRI['Diagnosed_ICD9'] = False
df_fMRI.loc[df_fMRI['ICD9'].apply(lambda x: x != []), 'Diagnosed_ICD9'] = True
print('number of Diagnosed in ICD9:', df_fMRI['Diagnosed_ICD9'].sum())

number of Diagnosed in ICD10: 176
number of Diagnosed in ICD9: 0


In [24]:
import pandas as pd
from collections import Counter
from ast import literal_eval as safe_literal_eval

# Subtype dictionary
subtype_dic = {
    'F400': 'Agoraphobia',
    'F401': 'Social_phobias',
    'F402': 'Specific_phobias',
    'F408': 'Other_phobic_anxiety_disorders',
    'F409': 'Unspecified_phobic_anxiety_disorder',
    'F410': 'Panic_disorder',
    'F411': 'GAD',
    'F412': 'Mixed_anxiety_and_depressive_disorder',
    'F413': 'Other_mixed_anxiety_disorders',
    'F418': 'Other_specified_anxiety_disorders',
    'F419': 'Unspecified_anxiety_disorders',
    'F42': 'OCD'
}

# Function to find the first matching subtype from a list of ICD-10 codes
def get_subtype(icd_codes, subtype_dic):
    """
    Match a list of ICD-10 codes to the first matching subtype in the dictionary.
    
    Parameters:
    - icd_codes (list): List of ICD-10 diagnosis codes.
    - subtype_dic (dict): Dictionary of subtypes with ICD-10 codes as keys.

    Returns:
    - str: Matching subtype name, or None if no match is found.
    """
    if isinstance(icd_codes, list):  # Ensure the input is a list
        for code in icd_codes:
            for subtype_code, subtype_name in subtype_dic.items():
                if code.startswith(subtype_code):  # Match ICD code prefix
                    return subtype_name
    return None

# Apply the function to the dataset
df_fMRI.loc[:, 'ICD10_diagnose'] = participants_withfMRI['p41270'].apply(
    lambda x: get_subtype(safe_literal_eval(x), subtype_dic) if isinstance(x, str) else None
)

# Count occurrences of each subtype using Counter
subtype_counts = Counter(
    subtype for subtype in df_fMRI['ICD10_diagnose'] if subtype is not None
)

# Print results in a readable format
print("Subtype Counts:")
for subtype, count in subtype_counts.items():
    print(f"{subtype}: {count}")


Subtype Counts:
GAD: 4
Unspecified_anxiety_disorders: 114
Specific_phobias: 15
Mixed_anxiety_and_depressive_disorder: 31
Panic_disorder: 10
Social_phobias: 1
OCD: 1
Other_phobic_anxiety_disorders: 1


In [25]:
# Self-reported lifetime professional diagnosis [Data-Field 29000]
Self_Reported_MedicalCondition = participants_withfMRI.filter(like='29000')

# Anxiety disorder codes (from 10 to 15 inclusive)
anxiety_codes = list(range(10, 16))

# Function to safely check for a code in the list
def safe_check_code(row, target_code):
    """
    Safely check if a target code exists in a list of codes.
    
    Parameters:
    - row: Input data (string expected to be a list representation)
    - target_code: Code to search for in the list
    
    Returns:
    - bool: True if code is found, False otherwise
    """
    try:
        if isinstance(row, str) and row.strip():
            parsed_row = safe_literal_eval(row)  # Convert string to list safely
            if isinstance(parsed_row, list):
                return target_code in parsed_row
    except (ValueError, SyntaxError):  # Handle invalid string parsing
        pass
    return False

# Loop through the anxiety disorder codes and add a binary column for each
for code in anxiety_codes:
    col_name = f"Self_Reported_diagnosis_{code}"  # Dynamically name the column
    df_fMRI[col_name] = Self_Reported_MedicalCondition['p29000'].apply(
        lambda row: safe_check_code(row, code)
    )

# Combine all Self_Reported_diagnosis columns into one final binary column
columns_self_reported = df_fMRI.filter(like='Self_Reported_diagnosis_')
df_fMRI['Self_Reported_diagnosis'] = columns_self_reported.apply(
    lambda row: row.any(), axis=1
)

# Convert True/False to 'True'/'False' strings
df_fMRI['Self_Reported_diagnosis'] = df_fMRI['Self_Reported_diagnosis'].replace(
    {True: 'True', False: 'False'}
)

# Print the number of self-reported diagnoses
num_self_reported = df_fMRI['Self_Reported_diagnosis'].eq('True').sum()
print("Number of self-reported diagnoses:", num_self_reported)

Number of self-reported diagnoses: 262


In [26]:
Self_Reported_MedicalCondition = participants_withfMRI.filter(like='20002_i2')

# Target anxiety codes for self-reported medical conditions
anxiety_codes = [1615, 1287]

# Function to check if a target code exists in the row
def contains_code(row, code):
    """
    Check if the given code exists in any column of the row.

    Parameters:
    - row: pd.Series, row of the DataFrame
    - code: int, target code to match

    Returns:
    - bool: True if code is found, False otherwise
    """
    return (row == code).any()

# Dynamically create binary columns for each code
for code in anxiety_codes:
    col_name = f"Self_Reported_MedicalCondition_{code}"
    df_fMRI[col_name] = Self_Reported_MedicalCondition.apply(
        lambda row: contains_code(row, code), axis=1
    )

# Combine all individual diagnosis columns into a final summary column
columns_med_conditions = df_fMRI.filter(like='Self_Reported_MedicalCondition_')
df_fMRI['Self_Reported_MedicalCondition'] = columns_med_conditions.any(axis=1)

# Convert True/False to 'True'/'False' strings
df_fMRI['Self_Reported_MedicalCondition'] = df_fMRI['Self_Reported_MedicalCondition'].replace(
    {True: 'True', False: 'False'}
)

# Print the number of self-reported medical conditions
num_self_reported = df_fMRI['Self_Reported_MedicalCondition'].eq('True').sum()
print("Number of self-reported medical conditions:", num_self_reported)

Number of self-reported medical conditions: 142


In [27]:
# 3. Self-reporting of Ever been offered/sought treatment for anxiety [Data-Field 21062]
Self_Reported_MedicalCondition = participants_withfMRI.filter(like='21062')
anxiety_code = [1]
for i in range(len(anxiety_code)):
    code = anxiety_code[i]
    print(code)
    added_item = 'Self_Reported_Treatment'
    df_fMRI[added_item] = Self_Reported_MedicalCondition['p21062'].apply(lambda row: (row == code)).replace((1,0),('True','False'))
print("number of self-reporting of medical conditions: ", df_fMRI.loc[df_fMRI[added_item]==True, added_item].shape[0])

1
number of self-reporting of medical conditions:  969


In [28]:
GAD_score_file_path = os.path.join(data_dir,'gad_score.csv')
df_fMRI[['eid','GAD7_score']].to_csv(GAD_score_file_path, index=False)

In [29]:
# subjects with anxiety
judgment_columns = ['history_anxiety', 'Diagnosed_ICD10', 'Diagnosed_ICD9', 'Self_Reported_diagnosis', 'Self_Reported_Treatment', 'Self_Reported_MedicalCondition', 'Current_Anxiety', ]
df_fMRI['anxiety'] = df_fMRI[judgment_columns].apply(lambda row: (row == True).any(), axis=1).replace((1,0),('True','False'))
print("number of anxiety: ", df_fMRI.loc[df_fMRI['anxiety']==True, 'anxiety'].shape[0])

number of anxiety:  1206


In [30]:
print(f'Number of participants with fMRI time series: {df_fMRI.shape[0]}')
for c in judgment_columns:
    n = sum(df_fMRI[c]==True)
    print(f'Number of participants with {c}: {n}')

Number of participants with fMRI time series: 5076
Number of participants with history_anxiety: 366
Number of participants with Diagnosed_ICD10: 176
Number of participants with Diagnosed_ICD9: 0
Number of participants with Self_Reported_diagnosis: 0
Number of participants with Self_Reported_Treatment: 969
Number of participants with Self_Reported_MedicalCondition: 0
Number of participants with Current_Anxiety: 193


In [31]:
#history_columns = ['history_anxiety', 'Diagnosed_ICD10', 'Diagnosed_ICD9','Current_Anxiety']
hospital_columns = ['Diagnosed_ICD10', 'Diagnosed_ICD9']
#hospital_columns = ['Diagnosed_ICD10']
df_fMRI['hospital_anxiety'] = df_fMRI[hospital_columns].apply(lambda row: (row == True).any(), axis=1).replace((1,0),('True','False'))
sum((df_fMRI['hospital_anxiety'] == True))

176