In [1]:
import pandas as pd
sup_clin_df = pd.read_csv('data/supplemental_clinical_data.csv')

# drop the visits recorded at month 5
no_month_5 = sup_clin_df[sup_clin_df['visit_month'] != 5]
visits = no_month_5.visit_month.unique()

# patients left with fewer than two visits are excluded
counts = no_month_5.groupby('patient_id').size()
remove = list(counts[counts < 2].index)

mask = ~no_month_5['patient_id'].isin(remove)
# explicit copy: ts_data is modified below, so it must be its own frame
# rather than a filtered slice of no_month_5 (avoids SettingWithCopyWarning)
ts_data = no_month_5[mask].copy()

ts_data = ts_data.rename(columns={'upd23b_clinical_state_on_medication': 'on_Levodopa'})

# encode the medication state as a number: 'On' -> 1, 'Off' or missing -> 0.
# mapping both labels keeps the column numeric; replacing only 'On' would
# leave any 'Off' entry behind as a string
ts_data['on_Levodopa'] = ts_data['on_Levodopa'].map({'On': 1.0, 'Off': 0.0}).fillna(0.0)

# every remaining missing value is flagged with the sentinel -1
ts_data = ts_data.fillna(-1)

In [2]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler


# Select the columns containing UPDRS scores, patient_id, and visit_month
cols_to_select = ['patient_id', 'visit_month', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4', 'on_Levodopa']
updrs_data = ts_data.loc[:, cols_to_select]

# Create a boolean mask to identify the valid values (-1 is the missing-value sentinel)
valid_mask = updrs_data != -1
updrs_data = updrs_data.where(valid_mask, np.nan)

# Normalize the UPDRS values using a MinMaxScaler.
# MinMaxScaler ignores NaN when fitting and keeps NaN in place when
# transforming, so every row is scaled. Scaling only the rows where all four
# scores are present would leave raw, unscaled scores in rows that miss just
# one of them.
scaler = MinMaxScaler()
updrs_cols_only = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']
updrs_data[updrs_cols_only] = scaler.fit_transform(updrs_data[updrs_cols_only])

# Normalize the 'on_Levodopa' column separately (min-max, NaN stays NaN)
levodopa = updrs_data['on_Levodopa'].astype(float)
levodopa_min = levodopa.min()
levodopa_range = levodopa.max() - levodopa_min
# Skip the rescaling when the column is constant or entirely missing:
# dividing by a zero range would turn every value into NaN.
if levodopa_range > 0:
    levodopa = (levodopa - levodopa_min) / levodopa_range
updrs_data['on_Levodopa'] = levodopa

In [3]:
import numpy as np
import pandas as pd

def extract_data(patient_data, visit_months=(0, 6, 12, 18, 24, 30, 36)):
    """Lay one patient's visits out on a fixed visit-month grid.

    Parameters
    ----------
    patient_data : pd.DataFrame
        Rows for a single patient. Must contain a 'visit_month' column;
        expected to hold at most one row per visit month.
    visit_months : sequence of int, optional
        Grid of visit months, one output row each. Defaults to every six
        months from 0 to 36.

    Returns
    -------
    pd.DataFrame
        One row per grid month. The columns are 'index' (the grid month,
        produced by resetting the index) followed by the columns of
        ``patient_data``, with 'visit_month' set to the grid month.
        Visits whose month is not on the grid are ignored. Months without a
        visit carry the last earlier observation forward; months before the
        first observed visit stay NaN.
    """
    grid = list(visit_months)

    # start from an all-NaN frame with one row per grid month
    data = pd.DataFrame(np.nan, columns=patient_data.columns, index=grid)

    # copy the observed visits onto the matching grid rows
    data.update(patient_data.set_index('visit_month'))

    # fill forward to propagate the last observed value forward
    data = data.ffill()

    # move the grid months into an 'index' column and mirror them into
    # 'visit_month' (which is still NaN at this point, since update() had no
    # such column to copy from)
    data = data.reset_index()
    data['visit_month'] = data['index']

    return data


#extract_data(ts_data[ts_data.patient_id==337])


In [48]:
# create an empty dictionary to store patient data
patient_dict = {}

# extract patient data and store in dictionary
patients = updrs_data.patient_id
# a single groupby pass replaces one boolean mask per patient; sort=False
# keeps the patients in order of first appearance, matching
# patients.unique(). rows without a patient_id are skipped by groupby.
# the loop variable is named patient_id so the builtin `id` is not shadowed.
for patient_id, patient_data in updrs_data.groupby('patient_id', sort=False):
    patient_dict[patient_id] = extract_data(patient_data)


#list(patient_dict.values())

[   index  patient_id  visit_month   updrs_1   updrs_2   updrs_3  updrs_4   
 0      0        35.0            0  0.185185  0.100000  0.231884      0.0  \
 1      6        35.0            6  0.185185  0.100000  0.231884      0.0   
 2     12        35.0           12  0.185185  0.100000  0.231884      0.0   
 3     18        35.0           18  0.185185  0.100000  0.231884      0.0   
 4     24        35.0           24  0.185185  0.100000  0.231884      0.0   
 5     30        35.0           30  0.185185  0.100000  0.231884      0.0   
 6     36        35.0           36  0.222222  0.133333  0.289855      0.0   
 
    on_Levodopa  
 0          0.0  
 1          0.0  
 2          0.0  
 3          0.0  
 4          0.0  
 5          0.0  
 6          0.0  ,
    index  patient_id  visit_month   updrs_1   updrs_2   updrs_3  updrs_4   
 0      0        75.0            0  0.148148  0.200000  0.376812      0.0  \
 1      6        75.0            6  0.148148  0.200000  0.376812      0.0   
 2    

In [41]:
import numpy as np

# stack the per-patient frames into one 3-D array, drop the first two
# columns of every frame, and substitute -1 for each NaN
patient_array = np.nan_to_num(
    np.array(list(patient_dict.values()))[:, :, 2:],
    nan=-1,
)

# count the NaN entries still present after the substitution above
nan_mask = np.isnan(patient_array)
num_nan_values = nan_mask.sum()

print(f"Found {num_nan_values} NaN values in patient_array.")


Found 0 NaN values in patient_array.


In [50]:
# dimensions: (entries of patient_dict, rows per patient frame, columns kept after the [:, :, 2:] slice)
patient_array.shape

(507, 7, 6)