In [57]:
import pandas as pd
import numpy as np
#PLOT & MATH LIBS
import seaborn as sns
import matplotlib.pyplot as plt
#pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'

import random

### Latest TO DO in light of info from Tristan
##### ON OFF States
1. Same idea as originally (one-hot) BUT:
   1. Keep Same Month Visits, then select randomly from total visits
   2. In the case with both ON & OFF for the same visit
      1. Record the scores in an extra column so, u_on_0, u_off_o oooooorrrr
      2. **Duplicate the row, keeping all other entries the same, but with the ON & OFF entry different for each**
      3. Make Sure both of either same month or ON & OFF are in the same training or validation set
   3. Compare One-to-one with completely random selection from fist analysis (but including same month visits)


##### De-noising
1. Use calendar approach:
   1. Divide the PATNO's interval into 3 non-overlapping >6mo periods
   2. Take the mean of each interval as the interval UPDRS score
   3. Take the mean(median Date) as the date for that interval (if >1 date within!)
2. (Tristan's) Slightly Different:
   1. Intervals defined as:
      1.  t_0 + 6mo
      2.  (median date by index) = (index(t_0) + index(t_f) / 2) +/- 3mo
      3.  t_f - 6mo 
  1.  Same interval date and mean logic
3.  Compare results vs calendar approach for random choice for both (i.e. random selection within same intervals)

In [51]:
def interpolate_same_month(df: pd.DataFrame, method = 'rand') -> pd.DataFrame:
    # Select maximum score from same month measurements
    temp_df = df.copy()
    temp_df['YEAR_MONTH'] = temp_df['INFODT'].dt.to_period('M')

    #takes random selctions/maximum/minimum/mean of values which share same month and year
    if method == 'rand':
        sample = temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].apply(lambda x: x.sample(1)).reset_index()
        result = pd.merge(temp_df, sample, on=['PATNO', 'YEAR_MONTH', 'NP3TOT'])

        result.drop(columns=['YEAR_MONTH', 'level_2'], inplace=True)
        return result.drop_duplicates(subset=['PATNO', 'INFODT']).reset_index(drop=True)

    elif method == 'min':
        #result = pd.merge(temp_df, temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].min(), on=['PATNO', 'YEAR_MONTH', 'NP3TOT'])
        temp_df['NP3TOT'] = temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].transform('min')
    elif method == 'max':
        #result = pd.merge(temp_df, temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].max(), on=['PATNO', 'YEAR_MONTH', 'NP3TOT'])
        temp_df['NP3TOT'] = temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].transform('max')
    else:
        #result = pd.merge(temp_df, temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].mean(), on=['PATNO', 'YEAR_MONTH', 'NP3TOT'])
        temp_df['NP3TOT'] = temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].transform('mean')

    
    temp_df.drop(columns=['YEAR_MONTH'], inplace=True)
    temp_df.reset_index(drop=True, inplace=True)
    return temp_df.drop_duplicates(subset=['PATNO', 'INFODT'])

def pivot_wide(input: pd.DataFrame, cols: int = 3) -> pd.DataFrame:
    
    time_df = input[['PATNO', 'time_delta', 'time_index']]

    score_wide = input.pivot(
        index='PATNO',
        columns='time_index',
        values='score'
    ).reset_index()

    time_wide = time_df.pivot(
        index='PATNO',
        columns="time_index",
        values='time_delta'
    ).reset_index()

    score_wide.columns = ['PATNO'] + [f'u{i}' for i in range(cols)] # resets score column names correctly
    time_wide.columns = ['PATNO'] + [f't{i}' for i in range(cols)] # resets time_index column names accordingly

    merged = pd.merge(score_wide, time_wide, on='PATNO') # use PATNO to merge
    #print(merged)
    #merged.drop(columns='PATNO', inplace=True) # drop PATNO, no longer needed

    new_cols = []
    for i in range(cols): # re order columns for ML piplines
        new_cols.append(f't{i}')
        new_cols.append(f'u{i}')

    merged = merged[new_cols]

    return merged


def process_dates_caller(input: pd.DataFrame, entries: int = 3, same_month_flag = False) -> pd.DataFrame:
    # helps maintain random states of dataframes to compare to baselines, default is to assume input is already interpolated
    df = interpolate_same_month(input, 'rand') if same_month_flag else input

    patnos = df['PATNO'].unique().tolist()
    df = input.copy()
    limit = pd.Timedelta(6*30, unit='D') # 180 days ~ 6mo
    date_chunks = []

    for id in patnos:

        visits = df[df['PATNO'] == id]['INFODT'].tolist()
        soln = []
        lim = 0

        if len(visits) < entries:
            #print(f"Not Enough Entries for ID: {id}")
            continue

        sub_df = df[df['PATNO'] == id]
        random.seed(10)
        while True:
            soln = np.random.choice(visits, entries, replace=False)
            if (soln[1] - soln[0]) > limit and (soln[2] - soln[1]) > limit:
                break
            if lim > 100:
                #print(f"No Possible Calendar Combination Found for ID: {id}")
                break
            lim += 1
        
        if lim < 100:
            for i in range(entries):
                date_chunks.append([id, sub_df.loc[sub_df['INFODT'] == soln[i], 'INFODT'].values[0], sub_df.loc[sub_df['INFODT'] == soln[i], 'NP3TOT'].values[0], sub_df.loc[sub_df['INFODT'] == soln[i], 'PDSTATE'].values[0]])     

    res = pd.DataFrame(date_chunks, columns=['PATNO', 'INFODT', 'score', 'PDSTATE'])

    # Prepare dataframe for pivoting

    res['time_delta'] = (
        res.groupby('PATNO')['INFODT']
        .transform(
            lambda x: (x - x.min()) / np.timedelta64(30, 'D')
        )
    )

    res['time_index'] = (
            res.groupby('PATNO')['INFODT']
            .rank(method='first')
            .astype(int)-1
        ) 

    return res


def pivot_wide_with_states(input: pd.DataFrame, cols: int = 3) -> pd.DataFrame:
    
    time_df = input[['PATNO', 'time_delta', 'time_index']]
    
    score_wide = input.pivot(
        index='PATNO',
        columns='time_index',
        values='score'
    ).reset_index()

    time_wide = time_df.pivot(
        index='PATNO',
        columns="time_index",
        values='time_delta'
    ).reset_index()

    off_wide = input.pivot(
        index='PATNO',
        columns='time_index',
        values='OFF'
    ).reset_index()

    on_wide = input.pivot(
        index='PATNO',
        columns='time_index',
        values='ON'
    ).reset_index()

    score_wide.columns = ['PATNO'] + [f'u{i}' for i in range(cols)] # resets score column names correctly
    time_wide.columns = ['PATNO'] + [f't{i}' for i in range(cols)] # resets time_index column names accordingly
    off_wide.columns = ['PATNO'] + [f'off_{i}' for i in range(cols)] # resets time_index column names accordingly
    on_wide.columns = ['PATNO'] + [f'on_{i}' for i in range(cols)] # resets time_index column names accordingly

    #merged = pd.merge(score_wide, time_wide, on='PATNO') # use PATNO to merge
    #print(merged)
    #merged.drop(columns='PATNO', inplace=True) # drop PATNO, no longer needed

    merged = pd.concat([score_wide, time_wide, off_wide, on_wide], axis=1)
    merged.drop(columns='PATNO', inplace=True)


    new_cols = []
    for i in range(cols): # re order columns for ML piplines
        new_cols.append(f't{i}')
        new_cols.append(f'off_{i}')
        new_cols.append(f'on_{i}')
        new_cols.append(f'u{i}')

    merged = merged[new_cols]

    return merged

In [3]:

UPDRS3 = "data/MDS-UPDRS_Part_III_10Jun2024.csv"
patient_status = "data/Participant_Status_03Jun2024.csv"

df3 = pd.read_csv(UPDRS3)
df_pat_stat = pd.read_csv(patient_status) #patient status data
df3 = df3.dropna(subset=['NP3TOT']).reset_index() # will keep for now, might need to include nans
df3['INFODT'] = pd.to_datetime(df3['INFODT'], format="%m/%Y") #reformat INFODT (Assesment Date) to date-time objects
df3['PDSTATE'] =  df3['PDSTATE'].fillna("None")
df3 = df3[["PATNO", "EVENT_ID", "INFODT", "PDSTATE", "PAG_NAME", "NP3TOT"]]

desired_cols_df_pat = {'PATNO', 'COHORT', 'ENROLL_STATUS'}
pat_filtered = df_pat_stat.drop(columns=set(df_pat_stat.columns) - desired_cols_df_pat)
df3_full = pd.merge(df3, pat_filtered, on="PATNO")
df3_full = df3_full[df3_full['ENROLL_STATUS'].isin(['Enrolled', 'Withdrew', 'Complete'])]
df3_full.drop(columns=['ENROLL_STATUS'], inplace=True)
df3_full = df3_full.sort_values(['PATNO', 'INFODT'])

# Partition our data sets
upd3_control = df3_full[df3_full['COHORT'] == 2]
upd3_PD = df3_full[df3_full['COHORT'] == 1]
upd3_PD_nan = upd3_PD[(upd3_PD['PDSTATE'] != 'ON') & (upd3_PD['PDSTATE'] != 'OFF') & (upd3_PD['PAG_NAME'] != 'NUPDR3OF') & (upd3_PD['PAG_NAME'] != 'NUPDR3ON')]
upd3_PD_off = upd3_PD[(upd3_PD['PDSTATE'] == 'OFF') | (upd3_PD['PAG_NAME'] == 'NUPDR3OF')]
upd3_PD_on = upd3_PD[(upd3_PD['PDSTATE'] == 'ON') | (upd3_PD['PAG_NAME'] == 'NUPDR3ON')]
# Data set of interest...
upd3_PD_off_on = upd3_PD[(upd3_PD['PDSTATE'] == 'ON') | (upd3_PD['PDSTATE'] == 'OFF') | (upd3_PD['PAG_NAME'] == 'NUPDR3OF') | (upd3_PD['PAG_NAME'] == 'NUPDR3ON')].reset_index(drop=True)

### TO DO 
2. Can we use PD States as conditional features in our model?
   1. Format would be similar to the random one, but with a state_flag included: 
   2. t_0 | u_0 | s_0 | t_1 | ...
   3. OR another way: t_0 | ON_0 | OFF_0 | u_0 | .... where ON,OFF are in [0,1]
3. Can we use the Error Term from same-month visits in our model? Contidional Regression: **Looks to be a Ridge regression on descrete variables**
4. **Find and adjust model params (definitely ask Tristan or others for guidance here)**

### Including states with One-Hot-encoding
1. Structure is time, OFF state, ON state, and score (all) at time i : t0 | off_0 | on_0 | u0 | ...

**Observations**
1. Performs worse than examining the states separately, compared to the Datasets NP3TOT std

In [53]:
upd3_PD_off_on = upd3_PD[(upd3_PD['PDSTATE'] == 'ON') | (upd3_PD['PDSTATE'] == 'OFF') | (upd3_PD['PAG_NAME'] == 'NUPDR3OF') | (upd3_PD['PAG_NAME'] == 'NUPDR3ON')].reset_index(drop=True)
PD_on_off = interpolate_same_month(upd3_PD_off_on, method='rand')

In [55]:
test_on_off = process_dates_caller(PD_on_off, entries=3, same_month_flag=True)
test_on_off

Unnamed: 0,PATNO,INFODT,score,PDSTATE,time_delta,time_index
0,3001,2013-09-01,25.0,ON,0.000000,0
1,3001,2014-11-01,42.0,ON,14.200000,1
2,3001,2016-06-01,26.0,ON,33.466667,2
3,3002,2013-04-01,22.0,ON,0.000000,0
4,3002,2015-03-01,29.0,ON,23.300000,1
...,...,...,...,...,...,...
2083,164491,2023-11-01,7.0,ON,6.133333,1
2084,164491,2024-06-01,9.0,ON,13.233333,2
2085,182341,2023-01-01,10.0,ON,0.000000,0
2086,182341,2023-07-01,14.0,ON,6.033333,1


In [56]:
on_hot = pd.get_dummies(test_on_off['PDSTATE'], dtype=int)
on_hot.drop(columns=['None'], inplace=True)
PD_on_off_encoded = pd.merge(test_on_off, on_hot, left_index=True, right_index=True)
res_on_off = pivot_wide_with_states(PD_on_off_encoded, 3)
res_on_off

Unnamed: 0,t0,off_0,on_0,u0,t1,off_1,on_1,u1,t2,off_2,on_2,u2
0,0.0,0,1,25.0,14.200000,0,1,42.0,33.466667,0,1,26.0
1,0.0,0,1,22.0,23.300000,0,1,29.0,36.533333,0,1,30.0
2,0.0,0,1,53.0,12.200000,0,1,39.0,98.400000,0,1,43.0
3,0.0,0,1,39.0,9.100000,0,1,18.0,58.800000,1,0,58.0
4,0.0,1,0,35.0,13.166667,0,1,39.0,28.433333,0,1,38.0
...,...,...,...,...,...,...,...,...,...,...,...,...
691,0.0,0,1,14.0,7.133333,1,0,13.0,13.233333,1,0,11.0
692,0.0,1,0,16.0,6.033333,0,1,9.0,12.166667,0,1,14.0
693,0.0,0,1,7.0,6.033333,1,0,10.0,12.166667,1,0,12.0
694,0.0,0,1,10.0,6.133333,0,1,7.0,13.233333,0,1,9.0


In [74]:
# Simple stratified Cross Validation testing
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

# Pipline Data Sets
PD_w_states = interpolate_same_month(upd3_PD_nan, 'rand')

#BASELINES
base_w_states = np.std(PD_w_states['NP3TOT'])

# Processing and pivoting dataset
PD_w_states = process_dates_caller(PD_on_off, entries=3, same_month_flag=True)
on_hot = pd.get_dummies(PD_w_states['PDSTATE'], dtype=int)
on_hot.drop(columns=['None'], inplace=True)
PD_on_off_encoded = pd.merge(PD_w_states, on_hot, left_index=True, right_index=True)
res_on_off = pivot_wide_with_states(PD_on_off_encoded, 3)

def test_model(X, y, folds = 5):
    pipe_rf = Pipeline(steps=[('model', RandomForestRegressor())])
    pipe_ridge = Pipeline(steps=[('model', Ridge())])
    # .values will give the values in a numpy array (shape: (n,1))
    # .ravel will convert that array shape to (n, ) (i.e. flatten it)
    score_test_rf = -1 * cross_val_score(pipe_rf, X, y.values.ravel(), cv=folds, scoring="neg_root_mean_squared_error")
    score_test_ridge = -1 * cross_val_score(pipe_ridge, X, y.values.ravel(), cv=folds, scoring="neg_root_mean_squared_error")

    print("Baselines")
    print("----------------------------------------------------------------")
    print(f"STD(NP3TOT) With States: {base_w_states}")
    print("----------------------------------------------------------------")

    print(f"Startings models, Simple Cross Validation, k = {folds}, VS std(UPDRS): \n")
    print(" Ridge Regression RMSE by fold: ", score_test_ridge)
    print(" Random Forest RMSE by fold: ", score_test_rf, "\n")
    print(f" Ridge Regression mean RMSE: {score_test_ridge.mean()}", '\n', f"Random Forest, default 5-nodes, mean RMSE: {score_test_rf.mean()}")

In [75]:
X = res_on_off.iloc[: , : len(res_on_off.columns)-1]
y = res_on_off.iloc[: , len(res_on_off.columns)-1 : ]

print(X.shape, y.shape, "\n")
print("1st Run: PD With States")
test_model(X = X, y = y)

(696, 11) (696, 1) 

1st Run: PD With States
Baselines
----------------------------------------------------------------
STD(NP3TOT) With States: 10.364208177895836
----------------------------------------------------------------
Startings models, Simple Cross Validation, k = 5, VS std(UPDRS): 

 Ridge Regression RMSE by fold:  [11.37335506 10.60562307 12.07617329 12.86087622  9.10953348]
 Random Forest RMSE by fold:  [12.98462156 11.25990815 13.03134889 13.41765521  9.99748148] 

 Ridge Regression mean RMSE: 11.205112221018421 
 Random Forest, default 5-nodes, mean RMSE: 12.13820305957007
