In [206]:
import pandas as pd
import numpy as np
import os
import re
import ast

In [207]:
data_dir = '/Users/xiaoqianxiao/UKB/data'
participantsInfo_file = 'participants.csv'
participantsInfo_file_path = os.path.join(data_dir,participantsInfo_file)
participantsInfo = pd.read_csv(participantsInfo_file_path)

In [208]:
#demographic factors' field ids, including: eid, sex, age at the first scan, IQ and EA (5)
demographic_fields = ['31','21003_2','20016_2','6138_2']
# current depression or anxiety status while scanning (3)
current_status_fields = ['2050_2','2060_2','2070_2']
# Self_Reported_Mental_Health (1)
Self_Reported_Mental_Health_fields = ['29000','20002_i2','21062']
# Ever_Diagnosed_Mental_Health_Problem (1)
Ever_Diagnosed_Mental_Health_Problem_fields = ['20544']
# Self reported history depression: CIDI (13)
history_depression_fields = ['20436','20439','20440','20446','20441','20449','20536','20532','20435','20450','20437']
# Self reported history anxiety: CIDI (18)
history_anxiety_fields = ['20421','20420','20538','20425','20542','20543','20540','20541','20539','20537','20418','20426','20423','20429','20419','20422','20417','20427']
# PHQ (9)
PHQ_fields = ['20514','20510','20517','20519','20511','20507','20508','20518','20513']
# GAD7 (7)
GAD7_fields = ['20506','20509','20520','20515','20516','20505','20512']
# hospital data: ICD10 and ICD9 (2)
hospital_data_fields = ['41270','41271']
# fMRI data
fMRI_fields = ['31016','31018','31019','31015','31014']

In [209]:
def fields_for_id_x(field_id):
    field_id = str(field_id)
    field_items = re.split(r'[,\s_]+', field_id)
    if len(field_items) == 1:
        fields = 'p{}'.format(field_items[0])
    elif len(field_items) == 2:
        fields = 'p{}_i{}'.format(field_items[0], field_items[1])
    return fields

def extract_strings(string_patten, strings_list):
    # Filter strings that start with "F4"
    return [s for s in strings_list if s.startswith(string_patten)]

def safe_literal_eval(x):
    try:
        return ast.literal_eval(x) if isinstance(x, str) else x
    except (ValueError, SyntaxError):
        return []

In [210]:
participantsInfo.head()

Unnamed: 0,eid,p31,p21003_i2,p20016_i2,p6138_i2,p2050_i2,p2060_i2,p2070_i2,p20544,p20436,...,p20002_i2_a25,p20002_i2_a26,p20002_i2_a27,p20002_i2_a28,p20002_i2_a29,p20002_i2_a30,p20002_i2_a31,p20002_i2_a32,p20002_i2_a33,p21062
0,1000011,0,,,,,,,,3.0,...,,,,,,,,,,0.0
1,1000047,1,65.0,5.0,[3],1.0,1.0,1.0,,,...,,,,,,,,,,0.0
2,1000190,0,,,,,,,,,...,,,,,,,,,,
3,1000240,1,,,,,,,,,...,,,,,,,,,,
4,1000258,1,,,,,,,,,...,,,,,,,,,,


In [211]:
participantsInfo.columns

Index(['eid', 'p31', 'p21003_i2', 'p20016_i2', 'p6138_i2', 'p2050_i2',
       'p2060_i2', 'p2070_i2', 'p20544', 'p20436',
       ...
       'p20002_i2_a25', 'p20002_i2_a26', 'p20002_i2_a27', 'p20002_i2_a28',
       'p20002_i2_a29', 'p20002_i2_a30', 'p20002_i2_a31', 'p20002_i2_a32',
       'p20002_i2_a33', 'p21062'],
      dtype='object', length=102)

In [212]:
# transfer EA
dl = []
for i in range(participantsInfo.shape[0]):
    # Prefer not to answer
    if participantsInfo['p6138_i2'].iloc[i] == -3:
        dl.append(participantsInfo.index[i])
    if participantsInfo['p6138_i2'].iloc[i] == 1:
        participantsInfo['p6138_i2'].iloc[i] = 20
    if participantsInfo['p6138_i2'].iloc[i] == 2:
        participantsInfo['p6138_i2'].iloc[i] = 13
    if participantsInfo['p6138_i2'].iloc[i] == 3:
        participantsInfo['p6138_i2'].iloc[i] = 10
    if participantsInfo['p6138_i2'].iloc[i] == 4:
        participantsInfo['p6138_i2'].iloc[i] = 10
    if participantsInfo['p6138_i2'].iloc[i] == 5:
        participantsInfo['p6138_i2'].iloc[i] = 19 
    if participantsInfo['p6138_i2'].iloc[i] == 6:
        participantsInfo['p6138_i2'].iloc[i] = 15
    # None of the above
    if participantsInfo['p6138_i2'].iloc[i] == -7:
        dl.append(participantsInfo.index[i])   
participantsInfo.drop(dl,inplace=True)

In [213]:
# get subset with participants have fMRI data series:
participants_withfMRI = participantsInfo.dropna(subset=['p31016_i2'])
print("number of participants with fMRI time series: {}".format(participants_withfMRI.shape[0]))
df_fMRI = pd.DataFrame(participants_withfMRI['eid'])

number of participants with fMRI time series: 40396


In [214]:
 # select individuals with depression or anxiety 
l1 = []
l2 = []
for i in range(participants_withfMRI.shape[0]):
    #depression
    if participants_withfMRI['p2050_i2'].iloc[i] >= 3 or participants_withfMRI['p2060_i2'].iloc[i] >= 3:
        l1.append(participants_withfMRI['eid'].iloc[i])
    #anxiety
    if participants_withfMRI['p2070_i2'].iloc[i] >= 3:
        l2.append(participants_withfMRI['eid'].iloc[i])

In [215]:
# Define history anxiety subjects
# Initialize an empty list to store all items
GAD_fields_name = []
# Iterate over each field_id in the list of all_fields_ids
for field_id in GAD7_fields:
    # Get the list of items for the current field_id using fields_for_id_x
    name_for_field = fields_for_id_x(field_id)
    GAD_fields_name.append(name_for_field)
print(GAD_fields_name)

['p20506', 'p20509', 'p20520', 'p20515', 'p20516', 'p20505', 'p20512']


In [216]:
history_anxiety_fields = ['20421','20420','20538','20425','20542','20543','20540','20541','20539','20537','20418','20426','20423','20429','20419','20422','20417','20427']

In [217]:
df_fMRI['history_anxiety_num_criterias'] = None
df_fMRI['history_anxiety_num_symptoms'] = None
history_anxiety_subjs = []
for i in range(participants_withfMRI.shape[0]):
    num_criterias = 0
    if participants_withfMRI['p20421'].iloc[i] == 1:
        num_criterias = num_criterias + 1
    if participants_withfMRI['p20420'].iloc[i] >= 6 or participants_withfMRI['p20420'].iloc[i] == -999:
        num_criterias = num_criterias + 1
    if participants_withfMRI['p20538'].iloc[i] == 1:
        num_criterias = num_criterias + 1
    if participants_withfMRI['p20425'].iloc[i] == 1 or participants_withfMRI['p20542'].iloc[i] == 1:
        num_criterias = num_criterias + 1
    if participants_withfMRI['p20543'].iloc[i] == 2 or participants_withfMRI['p20540'].iloc[i] == 1:
        num_criterias = num_criterias + 1
    if participants_withfMRI['p20541'].iloc[i] == 1 or participants_withfMRI['p20539'].iloc[i] == 3 or participants_withfMRI['p20537'].iloc[i] == 3:
        num_criterias = num_criterias + 1
    if participants_withfMRI['p20418'].iloc[i] >= 2:
        num_criterias = num_criterias + 1
    
    # 3 somatic symptoms out of
    num_symptoms = 0
    if participants_withfMRI['p20426'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI['p20423'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI['p20429'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI['p20419'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI['p20422'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI['p20417'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
    if participants_withfMRI['p20427'].iloc[i] == 1:
        num_symptoms = num_symptoms + 1
        
    if num_symptoms >= 3:
        num_criterias = num_criterias + 1
    
    if num_criterias == 8:
        history_anxiety_subjs.append(participants_withfMRI['eid'].iloc[i])
        
    df_fMRI.iloc[i, df_fMRI.columns.get_loc('history_anxiety_num_criterias')] = num_criterias
    df_fMRI.iloc[i, df_fMRI.columns.get_loc('history_anxiety_num_symptoms')] = num_symptoms
df_fMRI['history_anxiety'] = df_fMRI['history_anxiety_num_criterias'].apply(lambda x: (x == 8)).replace((1,0),('True','False'))
print("number of history anxiety: ", df_fMRI.loc[df_fMRI['history_anxiety']==True, 'history_anxiety'].shape[0])

number of history anxiety:  1909


In [218]:
df_fMRI['GAD7_score'] = participants_withfMRI[GAD_fields_name].sum(axis = 1)

In [219]:
# Current anxiety
df_fMRI['GAD7_score'] = participants_withfMRI[GAD_fields_name].sum(axis = 1)
df_fMRI['Current_Anxiety'] = False
df_fMRI.loc[df_fMRI['GAD7_score'] >= 10,'Current_Anxiety'] = True

In [220]:
# hospital data-linkage: 
# Data-Field 41270 
df_fMRI['ICD10'] = participants_withfMRI['p41270'].apply(lambda x: extract_strings('F4', safe_literal_eval(x)) if isinstance(safe_literal_eval(x), list) else [])
df_fMRI['Diagnosed_ICD10'] = False
df_fMRI.loc[df_fMRI['ICD10'].apply(lambda x: x != []), 'Diagnosed_ICD10'] = True
print('number of Diagnosed in ICD10:', df_fMRI['Diagnosed_ICD10'].sum())
# Data-Field 41271 
df_fMRI['ICD9'] = participants_withfMRI['p41271'].apply(lambda x: extract_strings('300', safe_literal_eval(x)) if isinstance(safe_literal_eval(x), list) else [])
df_fMRI['Diagnosed_ICD9'] = False
df_fMRI.loc[df_fMRI['ICD9'].apply(lambda x: x != []), 'Diagnosed_ICD9'] = True
print('number of Diagnosed in ICD9:', df_fMRI['Diagnosed_ICD9'].sum())

number of Diagnosed in ICD10: 1518
number of Diagnosed in ICD9: 3


In [221]:
# Self reported
# 1. self-reporting a lifetime professional diagnosis of one of the core five anxiety disorders [Data-Field 29000]
Self_Reported_MedicalCondition = participants_withfMRI.filter(like='29000')
anxiety_code = list(range(10,16))

for i in range(len(anxiety_code)):
    code = anxiety_code[i]
    #print(code)
    added_item = 'Self_Reported_diagnosis'+ str(code)
    df_fMRI[added_item] = Self_Reported_MedicalCondition['p29000'].apply(lambda row: (code in safe_literal_eval(row)) if isinstance(safe_literal_eval(row), list) else [])

columns_Self_Reported_diagnosis = df_fMRI.filter(like='Self_Reported_diagnosis')
df_fMRI['Self_Reported_diagnosis'] = columns_Self_Reported_diagnosis.apply(lambda row: (row == True).any(), axis=1).replace((1,0),('True','False'))
print("number of self-reporting of diagnosis: ", df_fMRI.loc[df_fMRI['Self_Reported_diagnosis']==True, 'Self_Reported_diagnosis'].shape[0])

number of self-reporting of diagnosis:  2025


In [222]:
# 2. Self-reporting of medical condition:Non-cancer illness code, self-reported[Data-Field 20002]
Self_Reported_MedicalCondition = participants_withfMRI.filter(like='20002_i2')
anxiety_code = [1615,1287]
for i in range(len(anxiety_code)):
    code = anxiety_code[i]
    #print(code)
    added_item = 'Self_Reported_MedicalCondition'+ str(code)
    df_fMRI[added_item] = Self_Reported_MedicalCondition.apply(lambda row: (row == code).any(), axis=1).replace((1,0),('True','False'))
    
columns_Self_Reported_MedicalCondition = df_fMRI.filter(like='Self_Reported_MedicalCondition')
df_fMRI['Self_Reported_MedicalCondition'] = columns_Self_Reported_MedicalCondition.apply(lambda row: (row == True).any(), axis=1).replace((1,0),('True','False'))
print("number of self-reporting of medical conditions: ", df_fMRI.loc[df_fMRI['Self_Reported_MedicalCondition']==True, 'Self_Reported_MedicalCondition'].shape[0])

number of self-reporting of medical conditions:  1096


In [223]:
# 3. Self-reporting of Ever been offered/sought treatment for anxiety [Data-Field 21062]
Self_Reported_MedicalCondition = participants_withfMRI.filter(like='21062')
anxiety_code = [1]
for i in range(len(anxiety_code)):
    code = anxiety_code[i]
    print(code)
    added_item = 'Self_Reported_Treatment'
    df_fMRI[added_item] = Self_Reported_MedicalCondition['p21062'].apply(lambda row: (row == code)).replace((1,0),('True','False'))
print("number of self-reporting of medical conditions: ", df_fMRI.loc[df_fMRI[added_item]==True, added_item].shape[0])

1
number of self-reporting of medical conditions:  6673


In [224]:
df_fMRI.columns

Index(['eid', 'history_anxiety_num_criterias', 'history_anxiety_num_symptoms',
       'history_anxiety', 'GAD7_score', 'Current_Anxiety', 'ICD10',
       'Diagnosed_ICD10', 'ICD9', 'Diagnosed_ICD9',
       'Self_Reported_diagnosis10', 'Self_Reported_diagnosis11',
       'Self_Reported_diagnosis12', 'Self_Reported_diagnosis13',
       'Self_Reported_diagnosis14', 'Self_Reported_diagnosis15',
       'Self_Reported_diagnosis', 'Self_Reported_MedicalCondition1615',
       'Self_Reported_MedicalCondition1287', 'Self_Reported_MedicalCondition',
       'Self_Reported_Treatment'],
      dtype='object')

In [225]:
# subjects with anxiety
judgment_columns = ['history_anxiety', 'Current_Anxiety', 'Diagnosed_ICD10', 'Diagnosed_ICD9', 'Self_Reported_diagnosis', 'Self_Reported_Treatment', 'Self_Reported_MedicalCondition']
df_fMRI['anxiety'] = df_fMRI[judgment_columns].apply(lambda row: (row == True).any(), axis=1).replace((1,0),('True','False'))
print("number of anxiety: ", df_fMRI.loc[df_fMRI['anxiety']==True, 'anxiety'].shape[0])

number of anxiety:  13324


In [230]:
anxiety_fMRI_eid = df_fMRI.loc[df_fMRI['anxiety']==True, 'eid']
output_path = os.path.join(data_dir, 'needed_participants.csv')
anxiety_fMRI_eid.to_csv(output_path, index=False)

In [288]:
# get fMRI data path
imagingData_file_path = os.path.join(data_dir, 'imaging_eids.csv')
imagingInfo = pd.read_csv(imagingData_file_path)
df_data = imagingInfo.loc[imagingInfo['eid'].isin(anxiety_fMRI_eid)]

output_path = os.path.join(data_dir, 'anxiety_fMRI_participants.csv')
df_data.to_csv('anxiety_fMRI_participants.csv', index=False)

In [286]:
df_data.columns

Index(['filepath', 'param', 'eid', 'field_id', 'ins'], dtype='object')

In [290]:
df_data.shape

(71604, 5)

In [291]:
612 * 71604 /(1024 * 1024)

41.79158020019531