In [302]:
# open files

# one-hot encode peptide chains

# normalize in each peptide column

# one-hot encode visit months

In [303]:
import pandas as pd
sup_clin_df = pd.read_csv('data/supplemental_clinical_data.csv')

no_month_5 = sup_clin_df[sup_clin_df['visit_month'] != 5]
visits = no_month_5.visit_month.unique()

len(sup_clin_df['patient_id'].unique())

771

In [304]:
# NaN values per column in sup_clin_df
sup_clin_df.isna().sum()


visit_id                                  0
patient_id                                0
visit_month                               0
updrs_1                                 213
updrs_2                                 214
updrs_3                                   5
updrs_4                                 928
upd23b_clinical_state_on_medication    1101
dtype: int64

In [305]:
counts = no_month_5.groupby('patient_id').size()
remove = list(counts[counts<2].index)

mask = ~no_month_5['patient_id'].isin(remove)
ts_data = no_month_5[mask]

ts_data = ts_data.rename(columns={'upd23b_clinical_state_on_medication': 'on_Levodopa'})

ts_data.loc[:, 'on_Levodopa'] = ts_data['on_Levodopa'].fillna(0)
ts_data.loc[:, 'on_Levodopa'] = ts_data['on_Levodopa'].replace('On', 1)

ts_data.fillna(-1, inplace=True)

def examine(patient_id):
    data = ts_data[ts_data['patient_id']==patient_id]

    return data


In [306]:
ts_data

Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,on_Levodopa
0,35_0,35,0,5.0,3.0,16.0,0.0,0
1,35_36,35,36,6.0,4.0,20.0,0.0,0
2,75_0,75,0,4.0,6.0,26.0,0.0,0
3,75_36,75,36,1.0,8.0,38.0,0.0,1
5,337_0,337,0,5.0,7.0,6.0,0.0,1
...,...,...,...,...,...,...,...,...
2215,65290_30,65290,30,4.0,16.0,13.0,0.0,1
2216,65303_0,65303,0,0.0,2.0,20.0,0.0,0
2217,65303_36,65303,36,4.0,1.0,26.0,0.0,0
2221,65530_0,65530,0,10.0,6.0,24.0,0.0,0


In [307]:
len(ts_data.patient_id.unique())

507

In [308]:
ts_data.isna().sum()

visit_id       0
patient_id     0
visit_month    0
updrs_1        0
updrs_2        0
updrs_3        0
updrs_4        0
on_Levodopa    0
dtype: int64

In [309]:
# Select the columns containing UPDRS scores, patient_id, and visit_month
cols_to_select = ['patient_id', 'visit_month', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4', 'on_Levodopa']
updrs_data = ts_data.loc[:, cols_to_select]

# Create a boolean mask to identify the valid values
valid_mask = updrs_data != -1
updrs_data = updrs_data.where(valid_mask, np.nan)

# Normalize the valid UPDRS values using a MinMaxScaler
scaler = MinMaxScaler()
updrs_cols_only = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
updrs_data.loc[valid_mask[updrs_cols_only].all(axis=1), updrs_cols_only] = scaler.fit_transform(updrs_data[updrs_cols_only].dropna())

# Normalize the 'on_Levodopa' column separately
updrs_data.loc[valid_mask['on_Levodopa'], 'on_Levodopa'] = (updrs_data['on_Levodopa'] - updrs_data['on_Levodopa'].min()) / (updrs_data['on_Levodopa'].max() - updrs_data['on_Levodopa'].min())

# Convert the normalized data to a DataFrame with column names
normalized_df = updrs_data.rename(columns=lambda x: f"normalized_{x}")

# Replace NaN values with -1
normalized_df = normalized_df.fillna(-1)

In [310]:
normalized_df.isna().sum()

normalized_patient_id     0
normalized_visit_month    0
normalized_updrs_1        0
normalized_updrs_2        0
normalized_updrs_3        0
normalized_updrs_4        0
normalized_on_Levodopa    0
dtype: int64

In [311]:
import numpy as np
import pandas as pd

def extract_data(patient_data):
    visit_months = [0, 6, 12, 18, 24, 30, 36]
    columns_to_select = ['normalized_visit_month', 'normalized_updrs_1', 'normalized_updrs_2', 'normalized_updrs_3', 'normalized_updrs_4', 'normalized_on_Levodopa']
    data = pd.DataFrame(np.nan, columns=columns_to_select, index=visit_months)
    data.update(patient_data[columns_to_select].set_index('normalized_visit_month'))
    data.fillna(method='ffill', inplace=True)
    data.reset_index(inplace=True)
    data['normalized_visit_month'] = data.index * 6
    return data
    
patient_dict = {}
patients = normalized_df.normalized_patient_id
for id in patients.unique():
    patient_data = normalized_df[patients==id]
    patient_dict[id] = extract_data(patient_data)

# Combine the DataFrames into a single DataFrame
combined_patient_data = pd.concat(patient_dict.values(), keys=patient_dict.keys(), axis=0)

# Reset the index and add a new column for patient_id
combined_patient_data.reset_index(level=0, inplace=True)
combined_patient_data.rename(columns={'level_0': 'normalized_patient_id'}, inplace=True)


combined_patient_data = combined_patient_data.drop(columns='normalized_visit_month')
combined_patient_data = combined_patient_data.rename(columns={
    'normalized_patient_id': 'patient_id',
    'index': 'month'
})

# Print the combined DataFrame
print(combined_patient_data)


    patient_id  month  normalized_updrs_1  normalized_updrs_2   
0           35      0            0.185185            0.100000  \
1           35      6            0.185185            0.100000   
2           35     12            0.185185            0.100000   
3           35     18            0.185185            0.100000   
4           35     24            0.185185            0.100000   
..         ...    ...                 ...                 ...   
2        65530     12            0.370370            0.200000   
3        65530     18            0.370370            0.200000   
4        65530     24            0.370370            0.200000   
5        65530     30            0.370370            0.200000   
6        65530     36            0.296296            0.133333   

    normalized_updrs_3  normalized_updrs_4  normalized_on_Levodopa  
0             0.231884            0.000000                     0.0  
1             0.231884            0.000000                     0.0  
2           

In [313]:
import numpy as np
import pandas as pd

def extract_data(patient_data):
    # create an empty DataFrame with the desired structure
    visit_months = [0, 6, 12, 18, 24, 30, 36] #rows
    columns_to_select = ['visit_month', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4', 'on_Levodopa'] #columns
    new_column_names = ['normalized_visit_month', 'normalized_updrs_1', 'normalized_updrs_2', 'normalized_updrs_3', 'normalized_updrs_4', 'normalized_on_Levodopa']

    # create an empty DataFrame with NaN values
    data = pd.DataFrame(np.nan, columns=columns_to_select, index=visit_months)

    # update the DataFrame with selected data
    data.update(patient_data[columns_to_select].set_index('visit_month'))

    # fill forward to propagate the last observed value forward
    data.fillna(method='ffill', inplace=True)

    # reset the index and assign the index values to the 'visit_month' column
    data.reset_index(inplace=True)
    data['visit_month'] = data.index * 6

    # rename the columns to their normalized versions
    data.columns = ['normalized_patient_id'] + new_column_names

    return data

extract_data(ts_data[ts_data.patient_id==337])


Unnamed: 0,normalized_patient_id,normalized_visit_month,normalized_updrs_1,normalized_updrs_2,normalized_updrs_3,normalized_updrs_4,normalized_on_Levodopa
0,0,0,5.0,7.0,6.0,0.0,1
1,6,6,5.0,7.0,6.0,0.0,1
2,12,12,5.0,7.0,6.0,0.0,1
3,18,18,5.0,7.0,6.0,0.0,1
4,24,24,5.0,7.0,6.0,0.0,1
5,30,30,5.0,7.0,6.0,0.0,1
6,36,36,8.0,7.0,8.0,0.0,1


In [297]:
normalized_df

Unnamed: 0,normalized_patient_id,normalized_visit_month,normalized_updrs_1,normalized_updrs_2,normalized_updrs_3,normalized_updrs_4,normalized_on_Levodopa
0,35,0,0.185185,0.100000,0.231884,0.000000,0.0
1,35,36,0.222222,0.133333,0.289855,0.000000,0.0
2,75,0,0.148148,0.200000,0.376812,0.000000,0.0
3,75,36,0.037037,0.266667,0.550725,0.000000,1.0
5,337,0,0.185185,0.233333,0.086957,0.000000,1.0
...,...,...,...,...,...,...,...
2215,65290,30,0.148148,0.533333,0.188406,0.000000,1.0
2216,65303,0,0.000000,0.066667,0.289855,0.000000,0.0
2217,65303,36,0.148148,0.033333,0.376812,0.000000,0.0
2221,65530,0,0.370370,0.200000,0.347826,0.000000,0.0


In [301]:
patient_dict = {}
patients = normalized_df.normalized_patient_id
for id in patients.unique():
    patient_data = normalized_df[patients==id]
    patient_dict[id] = extract_data(patient_data)

print(patient_dict[35])

KeyError: "None of [Index(['visit_month', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4',\n       'on_Levodopa'],\n      dtype='object')] are in the [columns]"

In [293]:
patient_dict[337]

Unnamed: 0,index,normalized_visit_month,normalized_updrs_1,normalized_updrs_2,normalized_updrs_3,normalized_updrs_4,normalized_on_Levodopa
0,0,0,0.185185,0.233333,0.086957,0.0,1.0
1,6,6,0.185185,0.233333,0.086957,0.0,1.0
2,12,12,0.185185,0.233333,0.086957,0.0,1.0
3,18,18,0.185185,0.233333,0.086957,0.0,1.0
4,24,24,0.185185,0.233333,0.086957,0.0,1.0
5,30,30,0.185185,0.233333,0.086957,0.0,1.0
6,36,36,0.296296,0.233333,0.115942,0.0,1.0


In [292]:
import numpy as np
from sklearn.model_selection import train_test_split

# Convert the dictionary values into a list of 1D NumPy arrays
patient_data_list = list(patient_dict.values())

# Stack the list of 1D NumPy arrays along a new axis to create a 2D array
patient_data_array = np.stack(patient_data_list, axis=0)

# Split data into train and test sets
label_column = patient_data_array[:, 0]

X_train, X_test, y_train, y_test = train_test_split(patient_data_array, label_column, test_size=0.2, random_state=42)

# Split test set into test and validation sets
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Print the shapes of the resulting arrays
print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)
print('X_val shape:', X_val.shape)
print('y_val shape:', y_val.shape)

X_train shape: (405, 7, 7)
y_train shape: (405, 7)
X_test shape: (51, 7, 7)
y_test shape: (51, 7)
X_val shape: (51, 7, 7)
y_val shape: (51, 7)


In [106]:
from sklearn.model_selection import train_test_split

# Split data into train and test sets
label_column = patient_data_array[:, 0]

X_train, X_test, y_train, y_test = train_test_split(patient_data_array, label_column, test_size=0.2, random_state=42)

# Split test set into test and validation sets
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Print the shapes of the resulting arrays
print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)
print('X_val shape:', X_val.shape)
print('y_val shape:', y_val.shape)

X_train shape: (1472, 48, 5)
y_train shape: (1472, 5)
X_test shape: (184, 48, 5)
y_test shape: (184, 5)
X_val shape: (185, 48, 5)
y_val shape: (185, 5)


In [291]:
X_train

array([[[0.51851852, 0.51851852, 0.51851852, 0.51851852, 0.51851852],
        [0.06451613, 0.06451613, 0.06451613, 0.06451613, 0.06451613],
        [0.27142857, 0.27142857, 0.27142857, 0.27142857, 0.27142857],
        ...,
        [       nan,        nan,        nan,        nan,        nan],
        [       nan,        nan,        nan,        nan,        nan],
        [       nan,        nan,        nan,        nan,        nan]],

       [[0.48148148, 0.48148148, 0.48148148, 0.48148148, 0.48148148],
        [0.32258065, 0.32258065, 0.32258065, 0.32258065, 0.32258065],
        [0.47142857, 0.47142857, 0.47142857, 0.47142857, 0.47142857],
        ...,
        [       nan,        nan,        nan,        nan,        nan],
        [       nan,        nan,        nan,        nan,        nan],
        [       nan,        nan,        nan,        nan,        nan]],

       [[0.22222222, 0.22222222, 0.22222222, 0.22222222, 0.22222222],
        [0.5483871 , 0.5483871 , 0.5483871 , 0.5483871 , 0.5

In [78]:
# Reshape the train, test, and validation sets
X_train = X_train.reshape(-1, 35)
X_test = X_test.reshape(-1, 35)
X_val = X_val.reshape(-1, 35)

# Print the new shapes of the input arrays
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('X_val shape:', X_val.shape)



ValueError: cannot reshape array of size 353280 into shape (35)

In [61]:
X_train.shape

(405, 35)

In [63]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()

# Add layers to the model
model.add(Dense(64, activation='relu', input_shape=(35,)))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(5, activation='softmax'))

# Compile the model with appropriate loss function, optimizer and metrics
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 64)                2304      
                                                                 
 dense_5 (Dense)             (None, 64)                4160      
                                                                 
 dense_6 (Dense)             (None, 32)                2080      
                                                                 
 dense_7 (Dense)             (None, 5)                 165       
                                                                 
Total params: 8,709
Trainable params: 8,709
Non-trainable params: 0
_________________________________________________________________


In [64]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [65]:
import matplotlib.pyplot as plt

history.history

{'loss': [181.50848388671875,
  481.25030517578125,
  1202.8140869140625,
  2823.115966796875,
  6160.86474609375,
  12403.5048828125,
  22923.837890625,
  40327.828125,
  64808.20703125,
  101222.84375],
 'accuracy': [0.6716049313545227,
  0.9679012298583984,
  0.9679012298583984,
  0.9679012298583984,
  0.9679012298583984,
  0.9679012298583984,
  0.9679012298583984,
  0.9679012298583984,
  0.9679012298583984,
  0.9679012298583984],
 'val_loss': [294.9372253417969,
  794.1658935546875,
  1908.5364990234375,
  4321.4638671875,
  9160.662109375,
  17841.294921875,
  32169.029296875,
  53895.15625,
  85651.5234375,
  129292.28125],
 'val_accuracy': [0.9803921580314636,
  0.9803921580314636,
  0.9803921580314636,
  0.9803921580314636,
  0.9803921580314636,
  0.9803921580314636,
  0.9803921580314636,
  0.9803921580314636,
  0.9803921580314636,
  0.9803921580314636]}