In [1]:
import pandas as pd
import numpy as np
#PLOT & MATH LIBS
import seaborn as sns
import matplotlib.pyplot as plt
#pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'

import random

### Final Research wrap up
##### ON OFF States
1. Same idea as originally (one-hot) BUT:
   1. Keep Same Month Visits, then select randomly from total visits
   2. In the case with both ON & OFF for the same visit
      1. Record the scores in an extra column so, u_on_0, u_off_o oooooorrrr
      2. **Duplicate the row, keeping all other entries the same, but with the ON & OFF entry different for each**
      3. Make Sure both of either same month or ON & OFF are in the same training or validation set
   3. Compare One-to-one with completely random selection from fist analysis (but including same month visits)


##### De-noising
1. Use calendar approach:
   1. Divide the PATNO's interval into 3 non-overlapping >6mo periods
   2. Take the mean of each interval as the interval UPDRS score
   3. Take the mean(median Date) as the date for that interval (if >1 date within!)
3.  Compare results vs calendar approach for random choice for both (i.e. random selection within same intervals)

#### BOTH
- Have the underlying entry selection and structure be the same for baseline and model

In [19]:
def median_date_calc(date_group: pd.DataFrame, interval: pd.Timedelta) -> list:
    dates = date_group['INFODT'].tolist()
    date1 = dates[0] + interval / 2
    date2 = dates[1] + interval / 2
    date3 = dates[2] + interval / 2
    return [(date1-date1).days, (date2-date1).days, (date3-date1).days]

def interpolate_same_month(df: pd.DataFrame, method = 'rand') -> pd.DataFrame:
    # Select maximum score from same month measurements
    temp_df = df.copy()
    temp_df['YEAR_MONTH'] = temp_df['INFODT'].dt.to_period('M')

    #takes random selctions/maximum/minimum/mean of values which share same month and year
    if method == 'rand':
        sample = temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].apply(lambda x: x.sample(1)).reset_index()
        result = pd.merge(temp_df, sample, on=['PATNO', 'YEAR_MONTH', 'NP3TOT'])

        result.drop(columns=['YEAR_MONTH', 'level_2'], inplace=True)
        return result.drop_duplicates(subset=['PATNO', 'INFODT']).reset_index(drop=True)

    elif method == 'min':
        temp_df['NP3TOT'] = temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].transform('min')
    elif method == 'max':
        temp_df['NP3TOT'] = temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].transform('max')
    else:
        temp_df['NP3TOT'] = temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].transform('mean')

    
    temp_df.drop(columns=['YEAR_MONTH'], inplace=True)
    temp_df.reset_index(drop=True, inplace=True)
    return temp_df.drop_duplicates(subset=['PATNO', 'INFODT'])

def pivot_wide(df: pd.DataFrame, cols: int = 3) -> pd.DataFrame:

    df['time_index'] = (
            df.groupby('PATNO')['INFODT']
            .rank(method='first')
            .astype(int)-1
        ) 
    
    time_df = df[['PATNO', 'INFODT', 'time_index']]

    score_wide = df.pivot(
        index='PATNO',
        columns='time_index',
        values='NP3TOT'
    ).reset_index()

    time_wide = time_df.pivot(
        index='PATNO',
        columns="time_index",
        values='INFODT'
    ).reset_index()

    score_wide.columns = ['PATNO'] + [f'u{i}' for i in range(cols)] # resets score column names correctly
    time_wide.columns = ['PATNO'] + [f't{i}' for i in range(cols)] # resets time_index column names accordingly

    merged = pd.merge(score_wide, time_wide, on='PATNO')
    merged.drop(columns='PATNO', inplace=True)

    new_cols = []
    for i in range(cols): # re order columns for ML piplines
        new_cols.append(f't{i}')
        new_cols.append(f'u{i}')

    merged = merged[new_cols]

    return merged.drop(columns='t0')



# keep_small_intervals flag allows us to keep some entries with <6mo intervals
# assumes process_same_dates has already been calles
def process_mean_intervals(df: pd.DataFrame, blocks: int = 3, keep_small_intervals = True) -> pd.DataFrame:
    patnos = df['PATNO'].unique().tolist()
    result = pd.DataFrame(columns=['PATNO', 'INFODT', 'NP3TOT'])
    for patno in patnos:
        df_pat = df[df['PATNO'] == patno]
        start = df_pat['INFODT'].min()
        end = df_pat['INFODT'].max()
        diff = end - start

        if (diff < pd.Timedelta(days=180)) or (df_pat.index.size < 3):
            continue

        interval = diff // blocks

        # drop small intervals < 6mo
        if not keep_small_intervals and interval < pd.Timedelta(days=180):
            continue

        group_counts = df_pat.groupby(pd.Grouper(key='INFODT', freq= f'{interval.days+1}D'))['NP3TOT'].count()
        temp = df_pat.groupby(pd.Grouper(key='INFODT', freq= f'{interval.days+1}D'))['NP3TOT'].mean().reset_index()
        temp['group_counts'] = group_counts.values
        # checks for Failed Time Group Function 
        if temp['NP3TOT'].isnull().values.any():
            continue
        
        dates = median_date_calc(df_pat.groupby(pd.Grouper(key='INFODT', freq= f'{interval.days}D'))['NP3TOT'].mean().reset_index(), interval)

        # this logic will create some entries with < 6mo intervals, but this is only due to the abover grouper function using a slightly different calendar
        temp['INFODT'] = dates
        temp['PATNO'] = patno

        result = pd.concat([result, temp], ignore_index=True)

    return result

def process_random_intervals(df: pd.DataFrame, blocks: int = 3, keep_small_intervals = True) -> pd.DataFrame:
    patnos = df['PATNO'].unique().tolist()
    patnos = df['PATNO'].unique().tolist()
    result = pd.DataFrame(columns=['PATNO', 'INFODT', 'NP3TOT'])
    for patno in patnos:
        df_pat = df[df['PATNO'] == patno]
        start = df_pat['INFODT'].min()
        end = df_pat['INFODT'].max()
        diff = end - start

        if (diff < pd.Timedelta(days=180)) or (df_pat.index.size < 3):
            continue

        interval = diff // blocks

        # drop small intervals < 6mo
        if not keep_small_intervals and interval < pd.Timedelta(days=180):
            continue

        temp = (
            df_pat.groupby(pd.Grouper(key='INFODT', freq=f'{interval.days+1}D'))['NP3TOT']
            .apply(lambda x: x.sample(n=1, random_state=1) if len(x) > 0 else None)
            .dropna()  # Drop groups where no sample was taken (i.e., empty groups)
            .reset_index()
        )

        # checks for Failed Time Group Function // when the groupby function can't return 3 groups (like it should)
        if temp.index.size != 3:
            #print("Failed Time Group Function")
            continue

        dates = median_date_calc(df_pat.groupby(pd.Grouper(key='INFODT', freq= f'{interval.days}D'))['NP3TOT'].mean().reset_index(), interval)

        # this logic will create some entries with < 6mo intervals, but this is only due to the abover grouper function using a slightly different calendar
        temp['INFODT'] = dates
        temp['PATNO'] = patno
        temp.drop(columns=['level_1'], inplace=True)

        result = pd.concat([result, temp], ignore_index=True)

    return result

In [20]:

UPDRS3 = "data/MDS-UPDRS_Part_III_10Jun2024.csv"
patient_status = "data/Participant_Status_03Jun2024.csv"

df3 = pd.read_csv(UPDRS3)
df_pat_stat = pd.read_csv(patient_status) #patient status data
df3 = df3.dropna(subset=['NP3TOT']).reset_index() # will keep for now, might need to include nans
df3['INFODT'] = pd.to_datetime(df3['INFODT'], format="%m/%Y") #reformat INFODT (Assesment Date) to date-time objects
df3['PDSTATE'] =  df3['PDSTATE'].fillna("None")
df3 = df3[["PATNO", "EVENT_ID", "INFODT", "PDSTATE", "PAG_NAME", "NP3TOT"]]

desired_cols_df_pat = {'PATNO', 'COHORT', 'ENROLL_STATUS'}
pat_filtered = df_pat_stat.drop(columns=set(df_pat_stat.columns) - desired_cols_df_pat)
df3_full = pd.merge(df3, pat_filtered, on="PATNO")
df3_full = df3_full[df3_full['ENROLL_STATUS'].isin(['Enrolled', 'Withdrew', 'Complete'])]
df3_full.drop(columns=['ENROLL_STATUS'], inplace=True)
df3_full = df3_full.sort_values(['PATNO', 'INFODT'])

# Partition our data sets
upd3_control = df3_full[df3_full['COHORT'] == 2]
upd3_PD = df3_full[df3_full['COHORT'] == 1]
upd3_PD_nan = upd3_PD[(upd3_PD['PDSTATE'] != 'ON') & (upd3_PD['PDSTATE'] != 'OFF') & (upd3_PD['PAG_NAME'] != 'NUPDR3OF') & (upd3_PD['PAG_NAME'] != 'NUPDR3ON')].reset_index(drop=True)
upd3_PD_off = upd3_PD[(upd3_PD['PDSTATE'] == 'OFF') | (upd3_PD['PAG_NAME'] == 'NUPDR3OF')].reset_index(drop=True)
upd3_PD_on = upd3_PD[(upd3_PD['PDSTATE'] == 'ON') | (upd3_PD['PAG_NAME'] == 'NUPDR3ON')].reset_index(drop=True)

### Analysis on Median Time interpolation
- Results below will be the same for both, as they use exactly the same logic on the grouper function

In [21]:
res_mean = process_mean_intervals(interpolate_same_month(upd3_PD_nan, method='mean'), blocks=3, keep_small_intervals=True)
res_rand = process_random_intervals(interpolate_same_month(upd3_PD_nan, method='rand'), blocks=3, keep_small_intervals=True)

sub_optimal_time_deltas = {"res_mean": [res_mean[(res_mean['INFODT'] < 180) & (res_mean['INFODT'] > 0)]['INFODT'].mean(),
                           res_mean[(res_mean['INFODT'] < 180) & (res_mean['INFODT'] > 0)]['INFODT'].median()],
                           "res_rand": [res_rand[(res_rand['INFODT'] < 180) & (res_rand['INFODT'] > 0)]['INFODT'].mean(),
                           res_rand[(res_rand['INFODT'] < 180) & (res_rand['INFODT'] > 0)]['INFODT'].median()]}

print(sub_optimal_time_deltas)

  result = pd.concat([result, temp], ignore_index=True)
  result = pd.concat([result, temp], ignore_index=True)


{'res_mean': [116.46666666666667, 121.0], 'res_rand': [116.46666666666667, 121.0]}


### New Base-Lines

In [22]:
# Simple stratified Cross Validation testing
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

# Pipline Data Sets
nan = interpolate_same_month(upd3_PD_nan, 'rand')
off = interpolate_same_month(upd3_PD_off, 'rand')
on = interpolate_same_month(upd3_PD_on, 'rand')
control = interpolate_same_month(upd3_control, 'rand')

#BASELINES RANDOM
base_nan_rand = np.std(nan['NP3TOT'])
base_off_rand = np.std(off['NP3TOT'])
base_on_rand  = np.std(on['NP3TOT'])
base_control_rand = np.std(control['NP3TOT'])

PD_nan_rand = pivot_wide(process_random_intervals(nan, 3, keep_small_intervals=True))
PD_off_rand = pivot_wide(process_random_intervals(off, 3, keep_small_intervals=True))
PD_on_rand = pivot_wide(process_random_intervals(on, 3, keep_small_intervals=True))
control_rand = pivot_wide(process_random_intervals(control, 3, keep_small_intervals=True))

#DE-NOISED
nan_mean = interpolate_same_month(upd3_PD_nan, 'mean')
off_mean = interpolate_same_month(upd3_PD_off, 'mean')
on_mean = interpolate_same_month(upd3_PD_on, 'mean')
control_mean = interpolate_same_month(upd3_control, 'mean')

base_nan_mean = np.std(nan_mean['NP3TOT'])
base_off_mean = np.std(off_mean['NP3TOT'])
base_on_mean  = np.std(on_mean['NP3TOT'])
base_control_mean = np.std(control_mean['NP3TOT'])

PD_nan_mean = pivot_wide(process_mean_intervals(nan_mean, 3, keep_small_intervals=True))
PD_off_mean = pivot_wide(process_mean_intervals(off_mean, 3, keep_small_intervals=True))
PD_on_mean = pivot_wide(process_mean_intervals(on_mean, 3, keep_small_intervals=True))
control_mean = pivot_wide(process_mean_intervals(control_mean, 3, keep_small_intervals=True))

  result = pd.concat([result, temp], ignore_index=True)
  result = pd.concat([result, temp], ignore_index=True)
  result = pd.concat([result, temp], ignore_index=True)
  result = pd.concat([result, temp], ignore_index=True)
  result = pd.concat([result, temp], ignore_index=True)
  result = pd.concat([result, temp], ignore_index=True)
  result = pd.concat([result, temp], ignore_index=True)
  result = pd.concat([result, temp], ignore_index=True)


In [24]:
def test_models(df:pd.DataFrame, base_line: float, identifier: str = "DEFAULT", folds = 5):
    X = df.iloc[: , : len(df.columns)-1]
    y = df.iloc[: , len(df.columns)-1 : ]

    pipe_rf = Pipeline(steps=[('model', RandomForestRegressor())])
    pipe_ridge = Pipeline(steps=[('model', Ridge())])
    # .values will give the values in a numpy array (shape: (n,1))
    # .ravel will convert that array shape to (n, ) (i.e. flatten it)
    score_test_rf = -1 * cross_val_score(pipe_rf, X, y.values.ravel(), cv=folds, scoring="neg_root_mean_squared_error")
    score_test_ridge = -1 * cross_val_score(pipe_ridge, X, y.values.ravel(), cv=folds, scoring="neg_root_mean_squared_error")

    print(f"Testing {identifier} Model")
    print("----------------------------------------------------------------")
    print(f"STD(NP3TOT) {identifier}: {base_line}")
    print("----------------------------------------------------------------")

    print(f"Startings models, Simple Cross Validation, k = {folds}, VS std(UPDRS): \n")
    print(" Ridge Regression RMSE by fold: ", score_test_ridge)
    print(" Random Forest RMSE by fold: ", score_test_rf, "\n")
    print(f" Ridge Regression mean RMSE: {score_test_ridge.mean()}", '\n', f"Random Forest, default 5-nodes, mean RMSE: {score_test_rf.mean()}")

    return [score_test_rf.mean(), score_test_ridge.mean()]

def compare_models(df_mean: pd.DataFrame, df_rand: pd.DataFrame, base_mean: float, base_rand: float):
    mean = test_models(df_mean, base_mean, "MEAN")
    rand = test_models(df_rand, base_rand, "Random Selection")
    print("----------------------------------------------------------------")
    print("Comparing Models")
    print("----------------------------------------------------------------")
    print(f"Mean Model: {mean}")
    print(f"Random Model: {rand}")

#### Testing

In [25]:
compare_models(PD_nan_mean, PD_nan_rand, base_nan_mean, base_nan_rand)

Testing MEAN Model
----------------------------------------------------------------
STD(NP3TOT) MEAN: 10.364208177895836
----------------------------------------------------------------
Startings models, Simple Cross Validation, k = 5, VS std(UPDRS): 

 Ridge Regression RMSE by fold:  [6.91107141 5.58562419 6.08251447 6.20402827 5.90124459]
 Random Forest RMSE by fold:  [7.22851021 6.10489088 6.5504219  6.43666519 6.07639646] 

 Ridge Regression mean RMSE: 6.136896584608074 
 Random Forest, default 5-nodes, mean RMSE: 6.479376926535873
Testing Random Selection Model
----------------------------------------------------------------
STD(NP3TOT) Random Selection: 10.362838637802493
----------------------------------------------------------------
Startings models, Simple Cross Validation, k = 5, VS std(UPDRS): 

 Ridge Regression RMSE by fold:  [8.70071038 6.68757251 7.94589857 6.52360109 5.56118326]
 Random Forest RMSE by fold:  [9.0342922  6.86086849 8.54392526 6.76253569 6.41627607] 

 R

### Results of De-noising
- Have some improvement!

In [34]:
test_models(PD_nan_rand, base_nan_rand, "PD_nan - Random Selection")

Testing PD_nan - Random Selection Model
----------------------------------------------------------------
STD(NP3TOT) PD_nan - Random Selection: 10.362838637802493
----------------------------------------------------------------
Startings models, Simple Cross Validation, k = 5, VS std(UPDRS): 

 Ridge Regression RMSE by fold:  [8.70071038 6.68757251 7.94589857 6.52360109 5.56118326]
 Random Forest RMSE by fold:  [9.04437412 6.93225207 8.41778543 6.67907911 6.36091929] 

 Ridge Regression mean RMSE: 7.0837931632457725 
 Random Forest, default 5-nodes, mean RMSE: 7.486882004401413


[7.486882004401413, 7.0837931632457725]

In [26]:
test_models(PD_nan_mean, base_nan_mean, "PD_nan - mean")

Testing PD_nan - mean Model
----------------------------------------------------------------
STD(NP3TOT) PD_nan - mean: 10.364208177895836
----------------------------------------------------------------
Startings models, Simple Cross Validation, k = 5, VS std(UPDRS): 

 Ridge Regression RMSE by fold:  [6.91107141 5.58562419 6.08251447 6.20402827 5.90124459]
 Random Forest RMSE by fold:  [7.23937371 6.09191799 6.38802719 6.49733997 6.07188633] 

 Ridge Regression mean RMSE: 6.136896584608074 
 Random Forest, default 5-nodes, mean RMSE: 6.4577090381394395


[6.4577090381394395, 6.136896584608074]

In [29]:
test_models(PD_off_rand, base_control_rand, "PD_off - Random Selection")

Testing PD_off - Random Selection Model
----------------------------------------------------------------
STD(NP3TOT) PD_off - Random Selection: 3.6578525543510483
----------------------------------------------------------------
Startings models, Simple Cross Validation, k = 5, VS std(UPDRS): 

 Ridge Regression RMSE by fold:  [11.66845595 10.76481325 13.69169393 12.43510625  9.1594407 ]
 Random Forest RMSE by fold:  [12.16504084 12.64765307 15.00681165 13.60191722 10.97588654] 

 Ridge Regression mean RMSE: 11.54390201663226 
 Random Forest, default 5-nodes, mean RMSE: 12.87946186493274


[12.87946186493274, 11.54390201663226]

In [33]:
test_models(PD_off_mean, base_off_mean, "PD_off - Mean")

Testing PD_off - Mean Model
----------------------------------------------------------------
STD(NP3TOT) PD_off - Mean: 14.29971406609739
----------------------------------------------------------------
Startings models, Simple Cross Validation, k = 5, VS std(UPDRS): 

 Ridge Regression RMSE by fold:  [ 8.63199575  9.14592362 10.39531376 12.40404826  8.25601125]
 Random Forest RMSE by fold:  [ 9.26253158  9.88510082 11.24920481 12.33954202  8.90627716] 

 Ridge Regression mean RMSE: 9.766658527427321 
 Random Forest, default 5-nodes, mean RMSE: 10.328531276973072


[10.328531276973072, 9.766658527427321]

In [30]:
test_models(PD_on_rand, base_on_rand, "PD_on - Random Selection")

Testing PD_on - Random Selection Model
----------------------------------------------------------------
STD(NP3TOT) PD_on - Random Selection: 12.3650437267417
----------------------------------------------------------------
Startings models, Simple Cross Validation, k = 5, VS std(UPDRS): 

 Ridge Regression RMSE by fold:  [ 9.45682032 10.04298727  9.62934988 11.32655765  8.07995708]
 Random Forest RMSE by fold:  [11.14985652 10.49618616 11.30439734 12.07316683  8.57845384] 

 Ridge Regression mean RMSE: 9.70713444051464 
 Random Forest, default 5-nodes, mean RMSE: 10.720412137123907


[10.720412137123907, 9.70713444051464]

In [32]:
test_models(PD_on_mean, base_on_mean, "PD_on - mean")

Testing PD_on - mean Model
----------------------------------------------------------------
STD(NP3TOT) PD_on - mean: 12.36565687843349
----------------------------------------------------------------
Startings models, Simple Cross Validation, k = 5, VS std(UPDRS): 

 Ridge Regression RMSE by fold:  [8.76227789 9.10094415 8.36509702 8.55911554 7.67864504]
 Random Forest RMSE by fold:  [9.66598723 9.44188011 8.98107257 9.14247181 7.92152757] 

 Ridge Regression mean RMSE: 8.493215928304085 
 Random Forest, default 5-nodes, mean RMSE: 9.030587857706752


[9.030587857706752, 8.493215928304085]

In [31]:
test_models(control_rand, base_control_rand, "Control - Random Selection")

Testing Control - Random Selection Model
----------------------------------------------------------------
STD(NP3TOT) Control - Random Selection: 3.6578525543510483
----------------------------------------------------------------
Startings models, Simple Cross Validation, k = 5, VS std(UPDRS): 

 Ridge Regression RMSE by fold:  [4.55208934 6.26859076 2.75340235 3.10779985 1.80226811]
 Random Forest RMSE by fold:  [4.07982884 6.69047586 6.21206404 3.15122124 1.98719542] 

 Ridge Regression mean RMSE: 3.69683008370965 
 Random Forest, default 5-nodes, mean RMSE: 4.424157083317743


[4.424157083317743, 3.69683008370965]

In [36]:
test_models(control_mean, base_control_mean, "Control - mean")

Testing Control - mean Model
----------------------------------------------------------------
STD(NP3TOT) Control - mean: 3.6681035622266616
----------------------------------------------------------------
Startings models, Simple Cross Validation, k = 5, VS std(UPDRS): 

 Ridge Regression RMSE by fold:  [4.3711837  4.8588078  1.92876078 2.54946409 2.20902345]
 Random Forest RMSE by fold:  [4.37547551 4.78111847 1.87291086 2.50478465 2.17510759] 

 Ridge Regression mean RMSE: 3.1834479658368107 
 Random Forest, default 5-nodes, mean RMSE: 3.1418794166724355


[3.1418794166724355, 3.1834479658368107]