In [1]:
import pandas as pd
import numpy as np
import numpy.matlib
from numpy import linalg as LA
import matplotlib.pyplot as plt
%matplotlib inline

from scipy import stats, integrate
import seaborn as sns
sns.set(color_codes=True)

from sklearn import decomposition
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

In [2]:
# Load each raw MRI table and immediately keep only the rows whose VISCODE2
# (and, for the first table, MRSEQUENCE) is in the accepted set.

# FOXLABBSI: VISCODE2 'sc'/'scmri' and MRSEQUENCE 'ADNI1'/'Acc'
t1 = pd.read_csv('FOXLABBSI_11_06_17.csv')
t1 = t1[t1['VISCODE2'].isin(['sc', 'scmri']) & t1['MRSEQUENCE'].isin(['ADNI1', 'Acc'])]

# MAYOADIRL: VISCODE2 'sc'/'scmri'
t2 = pd.read_csv('MAYOADIRL_MRI_FMRI_05_02_14.csv')
t2 = t2[t2['VISCODE2'].isin(['sc', 'scmri'])]

# MRI_INFARCTS: this table additionally accepts the 'bl' visit code
t3 = pd.read_csv('MRI_INFARCTS_11_16_15.csv')
t3 = t3[t3['VISCODE2'].isin(['bl', 'sc', 'scmri'])]

# UCD WMH: only the 'scmri' visit code
t4 = pd.read_csv('UCD_ADNI2_WMH_10_26_15.csv')
t4 = t4[t4['VISCODE2'] == 'scmri']

In [3]:
# Label table taken from the clinical merge file: subject id (RID), visit code,
# collection protocol and the diagnosis label column 'DX.bl'.
# Only 'bl' visit rows collected under ADNI2 or ADNIGO are kept; exact duplicate
# rows are removed and the index is renumbered from 0.
c = (
    pd.read_csv('../Clinical/ADNIMERGE_May15.2014.csv')
    .loc[:, ['RID', 'VISCODE', 'COLPROT', 'DX.bl']]
    .loc[lambda frame: frame['VISCODE'].isin(['bl'])]
    .drop_duplicates()
    .loc[lambda frame: frame['COLPROT'].isin(['ADNI2', 'ADNIGO'])]
    .reset_index(drop=True)
)

In [4]:
# Number of rows per diagnosis label in the label table
c.loc[:, 'DX.bl'].value_counts()

EMCI    311
CN      187
LMCI    164
AD      148
SMC     106
Name: DX.bl, dtype: int64

## FOXLABBSI_11_06_17

In [5]:
# Attach the labels from `c` to the FOXLABBSI rows (matched on RID), drop exact
# duplicate rows, keep ADNI2 / ADNIGO rows only and renumber the index.
t1_2 = (
    t1.merge(c, left_on='RID', right_on='RID')
    .drop_duplicates()
    .loc[lambda frame: frame['COLPROT'].isin(['ADNI2', 'ADNIGO'])]
    .reset_index(drop=True)
)

# Rows per diagnosis label after the merge
t1_2['DX.bl'].value_counts()

EMCI    306
CN      184
LMCI    163
AD      146
SMC     107
Name: DX.bl, dtype: int64

In [6]:
# Remove the filter/label bookkeeping columns; the labels are merged back in from `c` later
t1_2 = t1_2.drop(
    labels=['MRSEQUENCE', 'VISCODE2', 'VISCODE', 'COLPROT', 'DX.bl'],
    axis='columns',
)

## MAYOADIRL_MRI_FMRI_05_02_14

In [7]:
# Attach the labels from `c` to the MAYOADIRL rows (matched on RID), drop exact
# duplicate rows, keep ADNI2 / ADNIGO rows only and renumber the index.
t2_2 = (
    t2.merge(c, on='RID')
    .drop_duplicates()
    .loc[lambda frame: frame['COLPROT'].isin(['ADNI2', 'ADNIGO'])]
    .reset_index(drop=True)
)

# Rows per diagnosis label after the merge
t2_2['DX.bl'].value_counts()

EMCI    51
LMCI    38
CN      36
AD      34
SMC     26
Name: DX.bl, dtype: int64

In [8]:
# Remove the filter/label bookkeeping columns; the labels are merged back in from `c` later
t2_2 = t2_2.drop(
    labels=['VISCODE2', 'VISCODE', 'COLPROT', 'DX.bl'],
    axis='columns',
)

## MRI_INFARCTS_11_16_15

In [7]:
# Attach the labels from `c` to the infarct rows (matched on RID), drop exact
# duplicate rows, keep ADNI2 / ADNIGO rows only and renumber the index.
t3_2 = pd.merge(t3, c, on='RID').drop_duplicates()
t3_2 = t3_2.loc[t3_2.COLPROT.isin(['ADNI2', 'ADNIGO'])].reset_index(drop=True)

# Replace the '-' placeholder with the string 'None' in the categorical columns.
# The result is assigned back on purpose: `t3_2[col].replace(..., inplace=True)`
# operates on a temporary Series and, with pandas Copy-on-Write, never updates t3_2.
cols = ['SIDE', 'SIZE', 'BRAIN_REGION', 'STROKE_TYPE']
t3_2[cols] = t3_2[cols].replace('-', 'None')

# Rows per diagnosis label after the merge
t3_2['DX.bl'].value_counts()

EMCI    308
CN      181
LMCI    165
AD      134
SMC      63
Name: DX.bl, dtype: int64

In [8]:
# Remove the filter/label bookkeeping columns; the labels are merged back in from `c` later
t3_2 = t3_2.drop(
    labels=['VISCODE2', 'VISCODE', 'COLPROT', 'DX.bl'],
    axis='columns',
)

In [10]:
#t3_2

## UCD_ADNI2_WMH_10_26_15

In [11]:
# Attach the labels from `c` to the UCD WMH rows (matched on RID), drop exact
# duplicate rows, keep ADNI2 / ADNIGO rows only and renumber the index.
t4_2 = (
    t4.merge(c, on='RID')
    .drop_duplicates()
    .loc[lambda frame: frame['COLPROT'].isin(['ADNI2', 'ADNIGO'])]
    .reset_index(drop=True)
)

# Rows per diagnosis label after the merge
t4_2['DX.bl'].value_counts()

EMCI    225
CN      187
LMCI    163
AD      146
SMC     106
Name: DX.bl, dtype: int64

In [12]:
# Remove the filter/label bookkeeping columns; the labels are merged back in from `c` later
t4_2 = t4_2.drop(
    labels=['VISCODE2', 'VISCODE', 'COLPROT', 'DX.bl'],
    axis='columns',
)

## Merge table

In [13]:
# Combine the four processed MRI tables on RID. The outer joins keep an RID even
# when it is absent from some of the tables; the final inner join with `c` brings
# the label columns back and keeps only RIDs present in the label table.
t = (
    t1_2
    .merge(t2_2, on='RID', how='outer')
    .merge(t3_2, on='RID', how='outer')
    .merge(t4_2, on='RID', how='outer')
    .merge(c, on='RID', how='inner')
)

In [14]:
# All diagnosis groups are kept. To restrict the analysis to CN (Cognitively Normal)
# and AD (Alzheimers Disease) only, start from t.loc[t['DX.bl'].isin(['CN','AD']),]
# instead of t.

# Columns with too many missing values, plus non-feature bookkeeping columns
COLS_DROP = ['VENTACCEPT','HPACCEPT_R','HPACCEPT_L','MEANTSNR','MEDTSNR','SDTSNR','PENCIL','VENETIAN','VISCODE','COLPROT']

# Drop those columns, discard every row that still contains a NaN, and renumber
# the index so that it runs 0..len(d)-1 without gaps.
d = (
    t.drop(COLS_DROP, axis=1)
    .dropna(how='any')
    .reset_index(drop=True)
)

In [16]:
# Rows per RID, most frequent first; an RID with more than one row is a duplicate
rows_per_rid = d['RID'].value_counts()
RID_counts = rows_per_rid.tolist()
RIDs = rows_per_rid.index
RIDs_duplicates = [RIDs[pos] for pos, n_rows in enumerate(RID_counts) if n_rows > 1]

In [None]:
# Encode each categorical column as integer codes 0, 1, 2, ... numbered in order of
# first appearance. These are label codes, not one-hot dummy variables.
COLS_CATEGO = ['SIDE','SIZE','BRAIN_REGION','STROKE_TYPE']
for m in COLS_CATEGO:
    # pd.factorize assigns the same first-appearance codes in a single vectorised
    # pass, replacing a row-by-row iterrows() loop with one .loc write per cell,
    # and it leaves a true integer column instead of ints stored as objects.
    d[m] = pd.factorize(d[m])[0]

In [None]:
# Write the processed table to disk without the index column
d.to_csv(path_or_buf='MergedProcessedMRI_filtered.csv', index=False)

## PCA

In [None]:
# Table dimensions: number of rows (samples) and number of columns.
# The spelling `n_sampels` is kept because a later cell refers to that name.
n_sampels, n_cols = d.shape

In [None]:
# Split the table into features and labels.
# Assumes the label column is the last one (it arrives through the final merge
# with `c`). `:-1` keeps every other column; a slice ending at `n_cols-2` is
# end-exclusive and would silently drop the last feature column as well.
d_x = d.iloc[:, :-1]
d_y = d.iloc[:, -1]

# NOTE(review): 'RID' is still one of the columns of d_x, so the subject identifier
# is treated as a feature by the PCA below - confirm whether it should be removed.

# Boolean masks selecting the rows labelled 'CN' and 'AD'
CN_index = d_y == 'CN'
AD_index = d_y == 'AD'

In [None]:
# Convert the features to a float numpy array for the PCA transformation.
# np.asarray is used because DataFrame.as_matrix() was removed in pandas 1.0.
d_x = np.asarray(d_x, dtype=float)

# Per-column statistics used to normalise features to mean = 0, variance = 1
x_mean = np.mean(d_x, axis=0)
x_std = np.std(d_x, axis=0)

# A constant column has std 0; dividing by it gives NaN (0/0), which would poison
# the covariance matrix. Dividing by 1 instead leaves that column at all zeros.
x_scale = np.where(x_std == 0, 1.0, x_std)

# Broadcasting applies the column statistics to every row (no repmat needed)
d_x = (d_x - x_mean) / x_scale

In [None]:
# PCA through the eigen-decomposition of the feature covariance matrix
row, col = np.shape(d_x)

# With column-centred d_x, X^T X / n is the covariance matrix (the sum of the
# per-sample outer products). float(row) avoids integer division on Python 2,
# where 1/row evaluates to 0.
cov = np.matmul(d_x.T, d_x) / float(row)

# cov is symmetric, so eigh applies: it returns real eigenvalues and eigenvectors,
# whereas eig gives no ordering guarantee and may return complex values.
w, v = LA.eigh(cov)

# eigh sorts eigenvalues in ascending order; reverse so that column k of v is the
# direction with the (k+1)-th largest variance.
order = np.argsort(w)[::-1]
w = w[order]
v = v[:, order]

# Eigenvector transformation: project the samples onto the principal axes
d_x = np.matmul(d_x, v)

In [None]:
# Scatter the CN and AD samples on two columns of the PCA-transformed data.
# The axis labels are derived from the plotted column indices so they always name
# the columns that are drawn (index 2 is the third column, not the second).
pc_x, pc_y = 0, 2

# Plain numpy boolean masks for indexing the transformed array
cn_mask = np.asarray(CN_index)
ad_mask = np.asarray(AD_index)

fig, ax = plt.subplots()
ax.scatter(d_x[cn_mask, pc_x], d_x[cn_mask, pc_y], alpha=0.5, label='CN')
ax.scatter(d_x[ad_mask, pc_x], d_x[ad_mask, pc_y], alpha=0.5, label='AD')
ax.set_xlabel('Feature %d' % (pc_x + 1))
ax.set_ylabel('Feature %d' % (pc_y + 1))
ax.legend()
ax.set_title('PCA of neuroimaging measurements data')
plt.show()