In [42]:
import pandas as pd
import numpy as np
import scipy as sp
import math
pd.set_option('display.max_columns', None)
from sklearn import svm
import sklearn
from sklearn.model_selection import train_test_split

## Select ADNI GO/2 & baseline clinical rows

In [43]:
# ADNIMERGE: the key ADNI tables (ADNI1, ADNI GO, ADNI2) merged into one file
x = pd.read_csv('ADNIMERGE_MAY15.2014.csv')

# keep only rows collected under the ADNI GO / ADNI2 protocols (drops ADNI1)
in_go_or_2 = x['COLPROT'].isin(['ADNIGO', 'ADNI2'])
ADNI_GO_2 = x.loc[in_go_or_2]

# keep only rows whose baseline diagnosis is one of the known labels
# (this also removes rows where DX.bl is NaN)
has_bl_dx = ADNI_GO_2['DX.bl'].isin(['CN', 'LMCI', 'AD', 'EMCI', 'SMC'])
ADNI_GO_2 = ADNI_GO_2.loc[has_bl_dx]

# keep only the baseline visit ('bl') and renumber the rows from 0
is_baseline = ADNI_GO_2['VISCODE'].isin(['bl'])
ADNI_GO_2_bl = ADNI_GO_2.loc[is_baseline].reset_index(drop=True)

## Filter out missing values

In [44]:
# Columns removed before modelling: identifiers / bookkeeping fields and
# columns with too many missing values.
COLS_DROP = [
    'PTID', 'VISCODE', 'SITE', 'COLPROT', 'ORIGPROT', 'EXAMDATE',
    'PIB.bl', 'MMSE.bl', 'Years.bl', 'Month.bl', 'Month', 'M', 'PIB',
    'EXAMDATE.bl', 'DX',
]
# drop them from the baseline frame and renumber the rows from 0
d = ADNI_GO_2_bl.drop(COLS_DROP, axis='columns').reset_index(drop=True)

# Drop all rows with NaN
# d = d.dropna(how='any')
# d = d.reset_index(drop=True)

In [45]:
# number of rows per baseline diagnosis label (DX.bl)
d['DX.bl'].value_counts()

EMCI    311
CN      187
LMCI    164
AD      148
SMC     106
Name: DX.bl, dtype: int64

In [None]:
# keep only samples labelled CN (Cognitively Normal) or AD (Alzheimer's Disease)
# ADNI_GO_2_bl_label = ADNI_GO_2_bl.loc[ADNI_GO_2_bl['DX.bl'].isin(['CN','AD'])]

In [46]:
# Integer-encode the categorical demographic columns. This is label encoding,
# not one-hot "dummy" variables: every distinct value in a column is replaced
# by the rank of its first appearance in that column (0, 1, 2, ...).
COLS_CATEGO = ['PTGENDER','PTETHCAT','PTRACCAT','PTMARRY']
for col in COLS_CATEGO:
    # pd.unique preserves first-appearance order, so the codes are the same as
    # a row-by-row scan would assign, without iterating over every row and
    # writing one cell at a time.
    first_seen = pd.unique(d[col])
    code_of = {value: code for code, value in enumerate(first_seen)}
    # NOTE(review): a missing value (NaN) is treated as one more category and
    # receives its own integer code, so such rows are NOT removed by a later
    # dropna() -- confirm this is intended.
    # Assigning the whole column at once also gives it an integer dtype
    # instead of leaving integers inside an object column.
    d[col] = d[col].map(code_of)

In [47]:
# Columns kept from the clinical table, in output order. RID is the key the
# later cells merge on; DX.bl (last) is the baseline diagnosis.
KEEP_COLS = [
    'RID', 'ADAS11', 'ADAS13', 'AGE', 'APOE4', 'AV45', 'CDRSB',
    'EcogPtDivatt', 'EcogPtLang', 'EcogPtMem', 'EcogPtOrgan',
    'EcogPtPlan', 'EcogPtTotal', 'EcogPtVisspat',
    'EcogSPDivatt', 'EcogSPLang', 'EcogSPMem', 'EcogSPOrgan',
    'EcogSPPlan', 'EcogSPTotal', 'EcogSPVisspat',
    'Entorhinal', 'FAQ', 'FDG', 'Fusiform', 'Hippocampus', 'ICV',
    'MMSE', 'MOCA', 'MidTemp',
    'PTEDUCAT', 'PTETHCAT', 'PTGENDER', 'PTMARRY', 'PTRACCAT',
    'RAVLT.forgetting', 'RAVLT.immediate', 'RAVLT.learning',
    'RAVLT.perc.forgetting', 'Ventricles', 'WholeBrain',
    'DX.bl',
]
# select those columns and expose the diagnosis under the name 'label_clin'
d = d.loc[:, KEEP_COLS].rename(columns={'DX.bl': 'label_clin'})

## Read in genomic SNP data

In [48]:
# Load the SNP table and derive the integer RID key used by the clinical table.
gwas = pd.read_csv('adni_clin_gwas.csv')
# RID is the third '_'-separated field of PTID, parsed as an integer.
# Converting here, before the column subset is taken, avoids assigning into a
# slice of the frame afterwards (the pattern that raises SettingWithCopyWarning).
gwas['RID'] = gwas['PTID'].str.split('_', expand=True)[2].astype('int64')
gcols = gwas.columns
# Keep the key, APOE4 and the selected SNPs; .copy() makes the subset an
# independent frame so any later column assignment is unambiguous.
gwas = gwas[[u'RID', u'APOE4',
       u'rs429358', u'rs7412', u'rs2075650', u'rs12449237', u'rs3851179',
       u'rs4420638', u'rs7561528', u'rs17817600', u'rs3748140', u'rs12808148',
       u'rs6856768', u'rs117383358', u'rs1357692']].copy()

# Inner join on RID: only participants present in both tables are kept.
# APOE4 exists in both frames, so the result carries APOE4_x (from gwas) and
# APOE4_y (from the clinical frame d).
clinbio = gwas.merge(d,on='RID',how='inner')
# class balance of the merged clinical + genetic set
clinbio['label_clin'].value_counts()
#clinbio = clinbio.drop('RID',axis=1)
#clinbio.to_csv('clinicalbio.csv',index=False)

EMCI    209
CN      126
LMCI     67
AD       29
Name: label_clin, dtype: int64

## Read in imaging measurements + Merging 2 or 3 data types

In [52]:
# NOTE(review): absolute, machine-specific path -- the notebook will not run on
# another machine unless this is changed to a relative/configurable location.
IMG_CSV = '/Users/ja/Documents/BigDataAnalytics/BigData_ADNI_project/Data/ProcessedImaging/MergedProcessedMRI_filtered.csv'
img = pd.read_csv(IMG_CSV)

In [53]:
# Clinical + imaging: join on RID (merge's default inner join).
merged2 = d.merge(img, on='RID')

# Keep only rows with no missing value in any column, then remove the
# imaging-side ICV duplicate and the diagnosis label before export.
# NOTE(review): the clinical ICV column keeps its 'ICV_x' suffix in this file,
# whereas the three-way merge renames it to 'ICV' -- confirm downstream readers
# expect that difference.
merged2_nona = (
    merged2
    .dropna(how='any')
    .drop(['ICV_y', 'label_clin'], axis=1)
)
merged2_nona.to_csv('merged_clinimg_NoNAs.csv', index=False)

In [54]:
# Clinical + genetic + imaging: join the clinical/genetic frame with the
# imaging frame on RID (merge's default inner join).
merged3 = (
    clinbio
    .merge(img, on='RID')
    # remove the suffixed duplicates (APOE4_y, ICV_y) and the diagnosis label
    .drop(['APOE4_y', 'ICV_y', 'label_clin'], axis=1)
    # restore the plain names of the retained copies
    .rename(columns={'APOE4_x': 'APOE4', 'ICV_x': 'ICV'})
    # keep only rows with no missing value in any remaining column
    .dropna(how='any')
)
merged3.to_csv('Merged_clinbioimg_nona.csv', index=False)