In [1]:
# open files

# one-hot encode peptide chains

# normalize in each peptide column

# one-hot encode visit months

In [None]:
import pandas as pd
# Load the supplemental clinical table and report how many distinct patients it holds.
sup_clin_df = pd.read_csv('data/supplemental_clinical_data.csv')
# dropna=False counts a missing id as its own value, matching len(unique()).
sup_clin_df['patient_id'].nunique(dropna=False)

In [None]:
# Drop every visit recorded at month 5, then list the visit months that remain.
not_month_5 = sup_clin_df['visit_month'].ne(5)
no_month_5 = sup_clin_df.loc[not_month_5]
visits = no_month_5['visit_month'].unique()
visits

In [None]:
# Keep only patients that have at least two visits: a single visit gives no
# time series to work with.
counts = no_month_5.groupby('patient_id').size()
remove = list(counts[counts < 2].index)

mask = ~no_month_5['patient_id'].isin(remove)
ts_data = no_month_5[mask].rename(
    columns={'upd23b_clinical_state_on_medication': 'on_Levodopa'}
)

# Encode the medication state as a numeric flag: 'On' -> 1, 'Off' -> 0 and
# missing -> 0. Previously only 'On' was replaced, so 'Off' stayed a string and
# the column ended up with mixed str/int values that cannot be stacked into a
# numeric array later on.
# NOTE(review): assumes the raw column only holds 'On', 'Off' or NaN; any other
# value would also be encoded as 0 -- confirm against the data dictionary.
ts_data['on_Levodopa'] = (
    ts_data['on_Levodopa']
    .map({'On': 1, 'Off': 0})
    .fillna(0)
    .astype(int)
)

In [None]:
# Show the cleaned per-visit table as the cell output for a visual sanity check.
ts_data

In [None]:
import numpy as np
import pandas as pd

def extract_data(patient_data):
    """Lay one patient's visits out on a fixed grid of visit months.

    The grid is months 0, 6, 12, 18, 24, 30 and 36. Visits whose month is not
    on the grid are ignored. If a month occurs more than once, the last row
    for that month is used. Gaps are forward-filled from the most recent
    earlier row; rows before the first available value stay NaN because there
    is nothing to carry forward.

    Parameters
    ----------
    patient_data : pandas.DataFrame
        Visits of a single patient. Must contain the columns 'visit_month',
        'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4' and 'on_Levodopa'.

    Returns
    -------
    pandas.DataFrame
        Seven rows (one per grid month) with the columns 'visit_month',
        'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4', 'on_Levodopa'.
    """
    visit_months = [0, 6, 12, 18, 24, 30, 36]
    feature_cols = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4', 'on_Levodopa']

    # Keep only visits that fall on the grid; for repeated months the last
    # row wins, which also makes the index unique as reindex() requires.
    on_grid = patient_data[patient_data['visit_month'].isin(visit_months)]
    on_grid = on_grid.drop_duplicates(subset='visit_month', keep='last')

    data = (
        on_grid.set_index('visit_month')[feature_cols]
        .reindex(visit_months)   # insert all-NaN rows for missing months
        .ffill()                 # replaces deprecated fillna(method='ffill')
        .rename_axis('visit_month')
        .reset_index()
    )

    return data


In [None]:
# Spot check: print the expanded visit grid for patient 337.
is_patient_337 = ts_data['patient_id'].eq(337)
print(extract_data(ts_data.loc[is_patient_337]))

In [None]:
import numpy as np

# Assuming ts_data is a DataFrame with the time series data for all patients

# Build one expanded visit grid per patient, keyed by patient id.
# groupby splits the frame in a single pass instead of re-filtering ts_data for
# every patient; sort=False keeps patients in order of first appearance.
# The loop variable is named patient_id (not id) so the builtin id() is not
# shadowed for the rest of the kernel session.
patient_dict = {}
for patient_id, patient_data in ts_data.groupby('patient_id', sort=False):
    # Expand data to include all timesteps for this patient
    patient_dict[patient_id] = extract_data(patient_data)

# One 2D array per patient; 'visit_month' is dropped so only the feature
# columns remain (the frames returned by extract_data have no 'patient_id').
patient_data_list = [
    df.drop(columns=['visit_month']).values for df in patient_dict.values()
]

# Stack the per-patient arrays along a new leading axis to create a 3D array
patient_data_array = np.stack(patient_data_list, axis=0)

# The resulting array has the shape (number_of_patients, number_of_timesteps, number_of_features)
print(patient_data_array.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Labels: row 0 along the second axis of every patient's matrix.
# NOTE(review): that same row is still contained in the inputs passed below,
# so the targets are visible to the model -- confirm this is intended.
label_column = patient_data_array[:, 0]

# First cut: 80% train, 20% held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    patient_data_array,
    label_column,
    test_size=0.2,
    random_state=42,
)

# Second cut: halve the held-out part into test and validation sets.
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.5,
    random_state=42,
)

# Report the shape of every resulting array
for caption, array in (
    ('X_train shape:', X_train),
    ('y_train shape:', y_train),
    ('X_test shape:', X_test),
    ('y_test shape:', y_test),
    ('X_val shape:', X_val),
    ('y_val shape:', y_val),
):
    print(caption, array.shape)


In [None]:
# Flatten each sample's trailing axes into a single feature vector.
# The width is derived from the array itself rather than hard-coded as 35: a
# fixed width would silently redistribute values across rows (and change the
# number of rows) if the number of timesteps or features ever changed.
# Re-running this cell on already flattened arrays is a no-op.
n_inputs = int(np.prod(X_train.shape[1:]))

X_train = X_train.reshape(-1, n_inputs)
X_test = X_test.reshape(-1, n_inputs)
X_val = X_val.reshape(-1, n_inputs)

# Print the new shapes of the input arrays
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('X_val shape:', X_val.shape)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Fully connected network: 35 flattened inputs -> 5 outputs.
model = Sequential()

# Add layers to the model
model.add(Dense(64, activation='relu', input_shape=(35,)))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
# Linear output layer: the targets are one row of the patient matrix
# (updrs_1..updrs_4 and on_Levodopa values), not a one-hot class vector, so a
# softmax would force the five outputs to sum to 1.
model.add(Dense(5))

# Regression setup: mean squared error as the loss and mean absolute error as
# a readable metric. Categorical cross-entropy and accuracy assume one-hot
# class labels, which these targets are not.
model.compile(loss='mse', optimizer='adam', metrics=['mae'])

# Print the model summary
model.summary()


In [None]:
# Train for 10 epochs in mini-batches of 32, evaluating on the validation split
# after every epoch; the returned History object is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
)
