-
Notifications
You must be signed in to change notification settings - Fork 24
/
genetic_preprocess.py
126 lines (100 loc) · 4.08 KB
/
genetic_preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
####################################################################
# Preprocess the Genetic Data
####################################################################
import os
import pandas as pd
import numpy as np
import numpy.matlib
from numpy import linalg as LA
import math
import matplotlib.pyplot as plt
####################################################################
#Script Parameters
####################################################################
# Column whose values (CN / AD / LMCI / EMCI) are used as class labels.
COL_LABELS = ['DX_bl']
# Identifier column that is carried through to the saved CSV.
COL_IDS = ['PTID']
# Columns dropped up front because they are not used as features.
COL_REMOVE = ['PTID.bl', 'VISCODE', 'COLPROT', 'ORIGPROT']
# Input table read by the script, and the output file it writes.
CLI_FILEPATH = '../Data/Genetic/adni_clin_gwas.csv'
SAVE_FILEPATH = 'GENETIC_FILTERED.csv'
####################################################################
#Step 1-1: Grab the relevant data
####################################################################
# Load the input table and keep only the rows whose DX_bl value is one of
# the four diagnosis classes this script handles.
df_cli_data = pd.read_csv(CLI_FILEPATH)

# Vectorised membership test instead of a Python-level iterrows() loop.
# Rows with a missing or unexpected DX_bl value evaluate to False and are
# dropped, exactly as the explicit equality chain did.
KEPT_DIAGNOSES = ['CN', 'AD', 'LMCI', 'EMCI']
keep_mask = df_cli_data['DX_bl'].isin(KEPT_DIAGNOSES)

# Row positions (in the raw file) of the samples that are retained.
INDEX_TRAIN = np.flatnonzero(keep_mask.values).tolist()

# Create new processed dataframe with a fresh 0..N-1 index, so that row
# labels coincide with row positions in the later numpy-based steps.
df_cli_data = df_cli_data.loc[keep_mask].reset_index(drop=True)

### Remove the unused columns; DX_bl and PTID are split off later ###
df_features = df_cli_data.drop(COL_REMOVE, axis=1)

NUM_SAMPLES = len(df_features)
print('Number of Data Points: %d' % (NUM_SAMPLES))
####################################################################
#Step 1-2: Record index of CN and AD patients and encode the labels
####################################################################
# Row positions of each diagnostic group; these are used later to index
# the numpy feature matrix. LMCI and EMCI rows are pooled with AD.
dx_column = df_features['DX_bl']
INDEX_CN = np.flatnonzero((dx_column == 'CN').values).tolist()
INDEX_AD = np.flatnonzero(
    dx_column.isin(['AD', 'LMCI', 'EMCI']).values
).tolist()

### Split off the label column (DX_bl) ###
# .copy() gives df_labels its own data, so overwriting the column below
# does not raise pandas' SettingWithCopyWarning.
df_labels = df_features[COL_LABELS].copy()
df_features = df_features.drop(COL_LABELS, axis=1)

### Split off the identifier column (PTID) ###
df_id = df_features[COL_IDS]
df_features = df_features.drop(COL_IDS, axis=1)

# Encode the labels as integers. EMCI and LMCI are first folded into 'AD';
# each remaining class then gets the code of its first appearance in the
# data (factorize's default, unsorted order), and `categories` lists the
# class names in code order.
pooled_labels = df_labels['DX_bl'].mask(
    df_labels['DX_bl'].isin(['EMCI', 'LMCI']), 'AD'
)
label_codes, label_names = pd.factorize(pooled_labels)
categories = list(label_names)
df_labels['DX_bl'] = label_codes
####################################################################
#Step 1-3: Standardize the features and apply the PCA transformation
####################################################################
# Convert to a float numpy array for the PCA transformation.
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent
# accessor and exists in both old and current pandas releases.
features = df_features.values.astype(float)

### Standardise: each feature gets 0 mean and unit variance ###
feat_mean = np.mean(features, axis=0)
feat_std = np.std(features, axis=0)
# A constant column has zero spread. Dividing by 0 would fill it with NaN
# and make the eigen-decomposition fail, so such columns are divided by 1
# and simply stay at 0 after centring.
safe_std = np.where(feat_std == 0, 1.0, feat_std)
# Broadcasting applies the per-column statistics to every row.
features = (features - feat_mean) / safe_std

### Now that the features have similar statistics, perform PCA ###
row, col = np.shape(features)
# Covariance of the centred data as one matrix product; this equals the sum
# of the per-sample outer products divided by the sample count. float()
# keeps the division exact under Python 2 as well.
cov = np.dot(features.T, features) / float(row)

# cov is real and symmetric, so eigh is the appropriate solver: it always
# returns real eigenvalues and orthonormal real eigenvectors. It returns
# them in ascending order, so reorder to put the largest-variance
# direction in column 0.
w, v = LA.eigh(cov)
descending = np.argsort(w)[::-1]
w = w[descending]
v = v[:, descending]

# For now don't do any dimensionality reduction
# Apply the eigenvector transformation
features = np.matmul(features, v)
####################################################################
#Step 2: Visualize and Cluster/Classify
####################################################################
# Scatter transformed columns 0 and 2, one series per diagnostic group
# (CN first, then AD, matching the legend order).
# NOTE(review): the y axis is labelled 'Feature 2' but column index 2 is
# what gets plotted -- confirm whether index 1 was intended.
# LMCI / EMCI are not drawn as separate series here; that would need their
# own index lists and a four-entry legend.
for group_rows in (INDEX_CN, INDEX_AD):
    plt.scatter(features[group_rows, 0], features[group_rows, 2], alpha=0.5)
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend(['CN', 'AD'])
plt.show()

# Assemble the output table column-wise: IDs, then labels, then the
# transformed features.
transformed = pd.DataFrame(features)
df_processed_feat = pd.concat([df_id, df_labels, transformed], axis=1)

# Save processed data
df_processed_feat.to_csv(SAVE_FILEPATH, sep=',', index=False)
print('Saved processed features as: %s' % (SAVE_FILEPATH))