In [233]:
import pandas as pd
import numpy as np
#PLOT & MATH LIBS
import seaborn as sns
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None  # default='warn'
import random as random

In [234]:
UPDRS3 = "data/MDS-UPDRS_Part_III_10Jun2024.csv"
patient_status = "data/Participant_Status_03Jun2024.csv"

df3 = pd.read_csv(UPDRS3)
df_pat_stat = pd.read_csv(patient_status) #patient status data
df3 = df3.dropna(subset=['NP3TOT']).reset_index() # will keep for now, might need to include nans
df3['INFODT'] = pd.to_datetime(df3['INFODT'], format="%m/%Y") #reformat INFODT (Assesment Date) to date-time objects
df3['PDSTATE'] =  df3['PDSTATE'].fillna("None")
df3 = df3[["PATNO", "EVENT_ID", "INFODT", "PDSTATE", "PAG_NAME", "NP3TOT"]]

desired_cols_df_pat = {'PATNO', 'COHORT', 'ENROLL_STATUS'}
pat_filtered = df_pat_stat.drop(columns=set(df_pat_stat.columns) - desired_cols_df_pat)
df3_full = pd.merge(df3, pat_filtered, on="PATNO")
df3_full = df3_full[df3_full['ENROLL_STATUS'].isin(['Enrolled', 'Withdrew', 'Complete'])]
df3_full.drop(columns=['ENROLL_STATUS'], inplace=True)
df3_full = df3_full.sort_values(['PATNO', 'INFODT'])

# Partition our data sets
upd3_control = df3_full[df3_full['COHORT'] == 2]
upd3_PD = df3_full[df3_full['COHORT'] == 1]
# Data set of interest...
upd3_PD_off_on = upd3_PD[(upd3_PD['PDSTATE'] == 'ON') | (upd3_PD['PDSTATE'] == 'OFF') | (upd3_PD['PAG_NAME'] == 'NUPDR3OF') | (upd3_PD['PAG_NAME'] == 'NUPDR3ON')].reset_index(drop=True)

#### Logic for same month entries
- iterate through all dates selected per PATNO
- if one belongs to the set of dates which have duplicate entries in compliment states -> 
  - append new entries to new dataframe which has that INFODT entry swapped with the compliment
  - so maximum three new entry sets per patno
- pivot and concat this dataframe to the original

**Note:** This code is buggy, if errors continue please re-run the notebook (will fix on my own time in the future)

In [235]:
def interpolate_same_month(df: pd.DataFrame, method = 'rand') -> pd.DataFrame:
    # Select maximum score from same month measurements
    temp_df = df.copy()
    temp_df['YEAR_MONTH'] = temp_df['INFODT'].dt.to_period('M')

    #takes random selctions/maximum/minimum/mean of values which share same month and year
    if method == 'rand':
        sample = temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].apply(lambda x: x.sample(1)).reset_index()
        result = pd.merge(temp_df, sample, on=['PATNO', 'YEAR_MONTH', 'NP3TOT'])

        result.drop(columns=['YEAR_MONTH', 'level_2'], inplace=True)
        return result.drop_duplicates(subset=['PATNO', 'INFODT']).reset_index(drop=True)

    elif method == 'min':
        #result = pd.merge(temp_df, temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].min(), on=['PATNO', 'YEAR_MONTH', 'NP3TOT'])
        temp_df['NP3TOT'] = temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].transform('min')
    elif method == 'max':
        #result = pd.merge(temp_df, temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].max(), on=['PATNO', 'YEAR_MONTH', 'NP3TOT'])
        temp_df['NP3TOT'] = temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].transform('max')
    else:
        #result = pd.merge(temp_df, temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].mean(), on=['PATNO', 'YEAR_MONTH', 'NP3TOT'])
        temp_df['NP3TOT'] = temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].transform('mean')

    
    temp_df.drop(columns=['YEAR_MONTH'], inplace=True)
    temp_df.reset_index(drop=True, inplace=True)
    return temp_df.drop_duplicates(subset=['PATNO', 'INFODT'])


def process_dates_caller(input: pd.DataFrame, entries: int = 3, same_month_flag = False) -> pd.DataFrame:
    # helps maintain random states of dataframes to compare to baselines, default is to assume input is already interpolated
    df = interpolate_same_month(input, 'rand') if same_month_flag else input

    patnos = df['PATNO'].unique().tolist()
    df = input.copy()
    limit = pd.Timedelta(6*30, unit='D') # 180 days ~ 6mo
    date_chunks = []

    for id in patnos:

        visits = df[df['PATNO'] == id]['INFODT'].tolist()
        soln = []
        lim = 0

        if len(visits) < entries:
            #print(f"Not Enough Entries for ID: {id}")
            continue

        sub_df = df[df['PATNO'] == id]
        random.seed(10)
        """
            Include same month check and processor here 
        """
        
        while True:
            soln = np.random.choice(visits, entries, replace=False)

            if (soln[1] - soln[0]) > limit and (soln[2] - soln[1]) > limit:
                break
            if lim > 100:
                #print(f"No Possible Calendar Combination Found for ID: {id}")
                break
            lim += 1
        
        # add additional same_month flag here? then double append the two rows?
        if lim < 100:
            for i in range(entries):
                date_chunks.append([id, sub_df.loc[sub_df['INFODT'] == soln[i], 'INFODT'].values[0], sub_df.loc[sub_df['INFODT'] == soln[i], 'NP3TOT'].values[0], sub_df.loc[sub_df['INFODT'] == soln[i], 'PDSTATE'].values[0]])     

    res = pd.DataFrame(date_chunks, columns=['PATNO', 'INFODT', 'score', 'PDSTATE'])

    # Prepare dataframe for pivoting

    res['time_delta'] = (
        res.groupby('PATNO')['INFODT']
        .transform(
            lambda x: (x - x.min()) / np.timedelta64(30, 'D')
        )
    )

    res['time_index'] = (
            res.groupby('PATNO')['INFODT']
            .rank(method='first')
            .astype(int)-1
        ) 

    return res



def pivot_wide_with_states(input: pd.DataFrame, cols: int = 3) -> pd.DataFrame:
    
    time_df = input[['PATNO', 'time_delta', 'time_index']]
    
    score_wide = input.pivot(
        index='PATNO',
        columns='time_index',
        values='score'
    ).reset_index()

    time_wide = time_df.pivot(
        index='PATNO',
        columns="time_index",
        values='time_delta'
    ).reset_index()

    off_wide = input.pivot(
        index='PATNO',
        columns='time_index',
        values='OFF'
    ).reset_index()

    on_wide = input.pivot(
        index='PATNO',
        columns='time_index',
        values='ON'
    ).reset_index()

    score_wide.columns = ['PATNO'] + [f'u{i}' for i in range(cols)] # resets score column names correctly
    time_wide.columns = ['PATNO'] + [f't{i}' for i in range(cols)] # resets time_index column names accordingly
    off_wide.columns = ['PATNO'] + [f'off_{i}' for i in range(cols)] # resets time_index column names accordingly
    on_wide.columns = ['PATNO'] + [f'on_{i}' for i in range(cols)] # resets time_index column names accordingly

    #merged = pd.merge(score_wide, time_wide, on='PATNO') # use PATNO to merge
    #print(merged)
    #merged.drop(columns='PATNO', inplace=True) # drop PATNO, no longer needed

    merged = pd.concat([score_wide, time_wide, off_wide, on_wide], axis=1)
    merged.drop(columns='PATNO', inplace=True)


    new_cols = []
    for i in range(cols): # re order columns for ML piplines
        new_cols.append(f't{i}')
        new_cols.append(f'off_{i}')
        new_cols.append(f'on_{i}')
        new_cols.append(f'u{i}')

    merged = merged[new_cols]

    return merged

def fill_same_month(og_df: pd.DataFrame, pre_proc: pd.DataFrame) -> pd.DataFrame:
    #filters on duplicate dates within patient ids
    same_mo_rows = og_df.groupby(['PATNO', 'INFODT']).filter(lambda x: len(x) > 1).index
    same_mo_df = og_df.iloc[same_mo_rows].copy()
    duplicate_patnos = same_mo_df['PATNO'].unique().tolist()
    final_res = pd.DataFrame(columns=['PATNO', 'INFODT', 'score', 'PDSTATE', 'time_delta', 'time_index'])

    offset = 0
    for id in duplicate_patnos:

        selected_dates = pre_proc[pre_proc['PATNO'] == id]['INFODT'].tolist()
        duplicate_dates = same_mo_df[same_mo_df['PATNO'] == id]['INFODT'].tolist()

        res_slice = pre_proc[pre_proc['PATNO'] == id].copy()
        dup_slice = same_mo_df[same_mo_df['PATNO'] == id].copy()
        temp = []
        for date in selected_dates:
            if date in duplicate_dates:
                #selected date = date
                selected_state = res_slice['PDSTATE'].values[0]
                
                temp.append([
                    id, 
                    date, 
                    dup_slice[(dup_slice['INFODT'] == date) & (dup_slice['PDSTATE'] != selected_state)]['NP3TOT'].values[0],
                    dup_slice[(dup_slice['INFODT'] == date) & (dup_slice['PDSTATE'] != selected_state)]['PDSTATE'].values[0]
                    ])
                
                #print(temp)

        if len(temp) > 0:
            for entry in temp:
                #print(res_slice[res_slice['INFODT'] != entry[1]][['PATNO', 'INFODT', 'score', 'PDSTATE']], entry[1])
                temp_res = res_slice[res_slice['INFODT'] != entry[1]][['PATNO', 'INFODT', 'score', 'PDSTATE']].copy()
                temp_res.loc[3] = entry
                #sort by date to get back to the correct time index
                temp_res.sort_values('INFODT', inplace=True)

                #add in the necessary columns for future pivoting
                temp_res['time_delta'] = (
                temp_res.groupby('PATNO')['INFODT']
                .transform(
                    lambda x: (x - x.min()) / np.timedelta64(30, 'D')
                    )
                )

                temp_res['time_index'] = (
                        temp_res.groupby('PATNO')['INFODT']
                        .rank(method='first')
                        .astype(int)-1
                    ) 
                
                # needed to add this due to my own stupidity
                offset += 1 if len(temp) > 1 else offset

                temp_res['PATNO'] = offset

                #print(temp_res)

                final_res = pd.concat([final_res, temp_res], axis=0, join='outer', ignore_index=True)

    return final_res

In [237]:
test = process_dates_caller(upd3_PD_off_on, 3, True)
on_hot = pd.get_dummies(test['PDSTATE'], dtype=int)
on_hot.drop(columns=['None'], inplace=True)
PD_on_off_encoded = pd.merge(test, on_hot, left_index=True, right_index=True)
res_on_off = pivot_wide_with_states(PD_on_off_encoded, 3)

duplicates = fill_same_month(upd3_PD_off_on, test)
one_hot_dup = pd.get_dummies(duplicates['PDSTATE'], dtype=int)
one_hot_dup.drop(columns=['None'], inplace=True)
dup_encoded = pd.merge(duplicates, one_hot_dup, left_index=True, right_index=True)
dup_res = pivot_wide_with_states(dup_encoded, 3)
dup_res.dropna(inplace=True)

analytical_set = pd.concat([res_on_off, dup_res], axis=0, join='outer', ignore_index=True)

analytical_set

  final_res = pd.concat([final_res, temp_res], axis=0, join='outer', ignore_index=True)


Unnamed: 0,t0,off_0,on_0,u0,t1,off_1,on_1,u1,t2,off_2,on_2,u2
0,0.0,1,0,39.0,14.233333,1,0,34.0,96.366667,0,1,50.0
1,0.0,0,1,23.0,18.200000,0,1,29.0,24.333333,1,0,38.0
2,0.0,1,0,47.0,103.433333,0,1,43.0,115.633333,0,1,44.0
3,0.0,0,1,39.0,9.100000,1,0,48.0,58.800000,1,0,58.0
4,0.0,0,1,39.0,15.266667,0,1,38.0,40.633333,0,1,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1872,0.0,0,1,7.0,6.033333,0,1,5.0,12.166667,1,0,12.0
1873,0.0,0,1,10.0,6.133333,1,0,14.0,13.233333,0,1,9.0
1874,0.0,0,1,10.0,6.033333,1,0,13.0,12.166667,0,1,27.0
1875,0.0,1,0,8.0,6.033333,0,1,14.0,12.166667,0,1,27.0


In [238]:
# Simple stratified Cross Validation testing
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

base_std = upd3_PD_off_on['NP3TOT'].std()
folds = 5
pipe_rf = Pipeline(steps=[('model', RandomForestRegressor())])
pipe_ridge = Pipeline(steps=[('model', Ridge())])

X = analytical_set.loc[:, analytical_set.columns != 'u2']
y = analytical_set['u2']
score_test_rf = -1 * cross_val_score(pipe_rf, X, y.values.ravel(), cv=folds, scoring="neg_root_mean_squared_error")
score_test_ridge = -1 * cross_val_score(pipe_ridge, X, y.values.ravel(), cv=folds, scoring="neg_root_mean_squared_error")

In [239]:
print(f"STD(NP3TOT) : {base_std}")
print(" Ridge Regression RMSE by fold: ", score_test_ridge)
print(" Random Forest RMSE by fold: ", score_test_rf, "\n")
print(f" Ridge Regression mean RMSE: {score_test_ridge.mean()}", '\n', f"Random Forest, default 5-nodes, mean RMSE: {score_test_rf.mean()}")

STD(NP3TOT) : 13.651124300403424
 Ridge Regression RMSE by fold:  [10.92387054 10.51875047 11.35378206 11.70293454 10.40530289]
 Random Forest RMSE by fold:  [7.38287297 6.89445109 9.08069068 9.05595292 8.3397948 ] 

 Ridge Regression mean RMSE: 10.980928100329532 
 Random Forest, default 5-nodes, mean RMSE: 8.150752491417943
