In [32]:
import numpy as np
import h5py as h5
import pandas as pd
import re

# Input locations for the three datasets used below.
combined_path = '/projects/jp/adni-autoencoder/combined.h5'
cortical_path = '/projects/nikhil/ADNI_prediction/input_datasets/CT/scans_AAL.csv'
clinical_path = '/projects/francisco/data/ADNI/ADNI_Merge_filter.csv'

# ID patterns
# Participant id: the text between 'ADNI_' and the first following '_MR'
# (read it with .group(0); the look-arounds keep the delimiters out).
id_participant = re.compile(r"""
 (?<=ADNI_)      # Match the first string after ADNI_
 (.*?)          # Lazy quantifier so it only grabs the first immediate match.
 (?=_MR)        # End at the _MR
""", re.VERBOSE)

# Image id: after an 'S' directly followed by digits and '_', capture the
# shortest run of text up to the next '_' (read it with .group(1)).
# Raw string so that '\d' is not treated as an (invalid) string escape.
id_image = re.compile(r'(?<=S)\d+_(.*?)(?=_)')

In [6]:
# Collect the unique file names listed for every split of the combined HDF5
# file. The context manager closes the file as soon as the names have been
# read, so no open handle lingers for the rest of the kernel session
# (`combined` stays bound, but refers to a closed file afterwards).
filenames = set()
with h5.File(combined_path, 'r') as combined:
    for split in ['train', 'valid', 'test']:
        # Each split stores its file names in a dataset named 'l_<split>_files'.
        filenames.update(combined['l_{}_files'.format(split)])

In [30]:
# part > img id lookup
# Maps the participant id found in each file name to the image id found in
# the same name. When several files share a participant id, only one image id
# is kept (later assignments overwrite earlier ones, and `filenames` is a set,
# so which one wins is not defined).
participants = {}
for fname in filenames:
    participant_match = id_participant.search(fname)
    image_match = id_image.search(fname)
    if participant_match is None or image_match is None:
        # Report names that do not follow the expected pattern and move on,
        # without hiding unrelated errors behind a bare `except`.
        print(fname)
        continue
    participants[participant_match.group(0)] = image_match.group(1)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('{} unique mappings found'.format(len(participants)))

698 unique mappings found


In [77]:
from pandas import Series

# Load clinical and cortical datasets
clinical = pd.read_csv(clinical_path)
cortical = pd.read_csv(cortical_path)

# Filter ADNI1 Baseline subjects
is_adni1_baseline = (
    (clinical.ORIGPROT == 'ADNI1')
    & (clinical.COLPROT == 'ADNI1')
    & (clinical.VISCODE == 'bl')
)

# Keep only subjects whose PTID has an image id in the `participants` lookup
has_image_id = clinical['PTID'].isin(list(participants))

# Explicit copy: a column is inserted below and must not touch `clinical`.
baseline_adni_1 = clinical.loc[is_adni1_baseline & has_image_id].copy()

# Every remaining PTID is a key of `participants`, so a plain lookup is
# enough; no per-row membership re-check is needed.
img_id_col = baseline_adni_1['PTID'].map(participants).tolist()

# Add image id to clinical table (as the third column):
baseline_adni_1.insert(2, 'IID', img_id_col)

In [83]:
from pandas import merge

# Give the cortical table the same key column name ('IID') as the clinical one
cortical = cortical.rename(columns={'ID': 'IID'})

# Join the clinical and cortical rows that share an image id
merged = baseline_adni_1.merge(cortical, on=['IID'])

# Write the joined table to disk
merged.to_csv('/projects/francisco/data/ADNI/ct_clinical.csv')