In [1]:
import pandas as pd
import os

# Load the assessment export; the Q-CHAT totals are taken from its
# 'qchat_total' column further down.
csv_file = '../tor/bayley_data.csv'
df = pd.read_csv(csv_file, sep=',')

# Keep only rows whose 'redcap_event_name' equals '18_month_assessmen_arm_1'.
# NOTE(review): the label is spelled 'assessmen' (no final 't'); presumably that
# is how the event is named in the export -- confirm against the CSV before
# "correcting" it.
is_18_month = df['redcap_event_name'].eq('18_month_assessmen_arm_1')

# Derive the subject folder name by prefixing 'participationid' with 'sub-'.
# .assign() returns a new frame, so `df` itself is left untouched.
filtered_df = df.loc[is_18_month].assign(
    subj=lambda frame: 'sub-' + frame['participationid']
)

# Narrow down to the two columns used afterwards: 'subj' and 'qchat_total'.
result_df = filtered_df.loc[:, ['subj', 'qchat_total']]

### Removing subject rows which do not have a folder in the dataset

In [2]:
# One boolean per row of result_df: True when a directory named after the
# subject exists under the anat pipeline folder, False otherwise.
mask = [
    os.path.isdir(os.path.join('../tor/rel3_dhcp_anat_pipeline/', subj_name))
    for subj_name in result_df['subj']
]

# Keep only the subjects that have a folder on disk.
df_new = result_df[mask]

### Removing subject rows with an empty qchat_total value

In [3]:
# Keep only the rows that carry a qchat_total value (missing/NaN rows are dropped)
has_score = df_new['qchat_total'].notna()
df_new = df_new.loc[has_score]

### Removing subject rows with a qchat_total value of -999

In [4]:
# Drop rows whose qchat_total is -999.
# NOTE(review): -999 looks like a missing-value sentinel in the export -- confirm.
# The mask is computed from df_new itself rather than the unfiltered `df`, so the
# boolean indexer and the frame being indexed always share the same index and
# the filter does not depend on pandas aligning two different frames.
df_new = df_new.loc[df_new['qchat_total'] != -999]

### Verify that the Q-CHAT subjects match the folder names

In [5]:
# Build the verification frame as a new object: assigning the column through a
# plain alias of df_new would modify df_new from inside a "verify" cell and can
# raise SettingWithCopyWarning. 'subj' is cast to str so it can be used as a
# path component below.
df_test = df_new.assign(subj=df_new['subj'].astype(str))

# Report, for every remaining subject, whether its folder is present under the
# anat pipeline directory.
for folder in df_test['subj']:
    folder_path = os.path.join('../tor/rel3_dhcp_anat_pipeline/', folder)

    if os.path.isdir(folder_path):
        print(f"Folder {folder} exists")
    else:
        print(f"Folder {folder} does not exist")

Folder sub-CC00050XX01 exists
Folder sub-CC00052XX03 exists
Folder sub-CC00054XX05 exists
Folder sub-CC00057XX08 exists
Folder sub-CC00058XX09 exists
Folder sub-CC00059XX10 exists
Folder sub-CC00060XX03 exists
Folder sub-CC00061XX04 exists
Folder sub-CC00063AN06 exists
Folder sub-CC00063BN06 exists
Folder sub-CC00064XX07 exists
Folder sub-CC00065XX08 exists
Folder sub-CC00066XX09 exists
Folder sub-CC00067XX10 exists
Folder sub-CC00068XX11 exists
Folder sub-CC00071XX06 exists
Folder sub-CC00072XX07 exists
Folder sub-CC00073XX08 exists
Folder sub-CC00074XX09 exists
Folder sub-CC00075XX10 exists
Folder sub-CC00076XX11 exists
Folder sub-CC00077XX12 exists
Folder sub-CC00078XX13 exists
Folder sub-CC00080XX07 exists
Folder sub-CC00082XX09 exists
Folder sub-CC00084XX11 exists
Folder sub-CC00086XX13 exists
Folder sub-CC00087AN14 exists
Folder sub-CC00087BN14 exists
Folder sub-CC00091XX10 exists
Folder sub-CC00096XX15 exists
Folder sub-CC00097XX16 exists
Folder sub-CC00098AN17 exists
Folder sub

### Matching out.csv rows with the filtered Q-CHAT rows (`df_new`)

In [6]:
# Load out.csv and keep a single row per subject: for a repeated 'subj' only the
# first occurrence is retained. The intent is to keep the first scan of preterm
# babies and discard their later scans.
# NOTE(review): this relies on out.csv listing the first scan before later ones -- confirm.
df_out = pd.read_csv('../tor/out.csv').drop_duplicates(subset='subj', keep='first')

# Inner-join the Q-CHAT rows with the out.csv rows on 'subj'. validate makes
# pandas raise if 'subj' is not unique on both sides of the join.
merged_df = df_new.merge(df_out, how='inner', on='subj', validate='one_to_one')

# Persist the joined table without the index column.
merged_df.to_csv('../tor/paths_labels.csv', index=False)

print('Data extracted and saved to ../tor/paths_labels.csv')

Data extracted and saved to ../tor/paths_labels.csv
