In [1]:
import pandas as pd
import numpy as np
#PLOT & MATH LIBS
import seaborn as sns
import matplotlib.pyplot as plt
#pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None  # default='warn'

import random
random.seed(1)

In [4]:

UPDRS3 = "data/MDS-UPDRS_Part_III_10Jun2024.csv"
patient_status = "data/Participant_Status_03Jun2024.csv"

df3 = pd.read_csv(UPDRS3)
df_pat_stat = pd.read_csv(patient_status) #patient status data
df3 = df3.dropna(subset=['NP3TOT']).reset_index() # will keep for now, might need to include nans
df3['INFODT'] = pd.to_datetime(df3['INFODT'], format="%m/%Y") #reformat INFODT (Assesment Date) to date-time objects
df3['PDSTATE'] =  df3['PDSTATE'].fillna("None")
df3 = df3[["PATNO", "EVENT_ID", "INFODT", "PDSTATE", "PAG_NAME", "NP3TOT"]]

desired_cols_df_pat = {'PATNO', 'COHORT', 'ENROLL_STATUS'}
pat_filtered = df_pat_stat.drop(columns=set(df_pat_stat.columns) - desired_cols_df_pat)
df3_full = pd.merge(df3, pat_filtered, on="PATNO")
df3_full = df3_full[df3_full['ENROLL_STATUS'].isin(['Enrolled', 'Withdrew', 'Complete'])]
df3_full.drop(columns=['ENROLL_STATUS'], inplace=True)
df3_full = df3_full.sort_values(['PATNO', 'INFODT'])

upd3_control = df3_full[df3_full['COHORT'] == 2]
upd3_PD = df3_full[df3_full['COHORT'] == 1]
upd3_PD_nan = upd3_PD[(upd3_PD['PDSTATE'] != 'ON') & (upd3_PD['PDSTATE'] != 'OFF') & (upd3_PD['PAG_NAME'] != 'NUPDR3OF') & (upd3_PD['PAG_NAME'] != 'NUPDR3ON')]
upd3_PD_off = upd3_PD[(upd3_PD['PDSTATE'] == 'OFF') | (upd3_PD['PAG_NAME'] == 'NUPDR3OF')]
upd3_PD_on = upd3_PD[(upd3_PD['PDSTATE'] == 'ON') | (upd3_PD['PAG_NAME'] == 'NUPDR3ON')]

In [5]:
# helper function for interpolating between same month scores

def interpolate_same_month(df: pd.DataFrame, method = 'max') -> pd.DataFrame:
    # Select maximum score from same month measurements
    temp_df = df.copy()
    temp_df['YEAR_MONTH'] = temp_df['INFODT'].dt.to_period('M')

    #takes maximum/minimum/mean of values which share same month and year
    if method == 'max':
        result = pd.merge(temp_df, temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].max(), on=['PATNO', 'YEAR_MONTH', 'NP3TOT'])
    elif method == 'min':
        result = pd.merge(temp_df, temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].min(), on=['PATNO', 'YEAR_MONTH', 'NP3TOT'])
    else:
        result = pd.merge(temp_df, temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].mean(), on=['PATNO', 'YEAR_MONTH', 'NP3TOT'])

    result = pd.merge(temp_df, temp_df.groupby(['PATNO', 'YEAR_MONTH'])['NP3TOT'].max(), on=['PATNO', 'YEAR_MONTH', 'NP3TOT'])
    result.drop(columns=['YEAR_MONTH'], inplace=True)

    return result

In [7]:
test = interpolate_same_month(upd3_PD_nan)
test = test[test['PATNO'] == 3003]

test

Unnamed: 0,PATNO,EVENT_ID,INFODT,PDSTATE,PAG_NAME,NP3TOT,COHORT
11,3003,SC,2011-03-01,,NUPDRS3,26.0,1
12,3003,BL,2011-04-01,,NUPDRS3,29.0,1
13,3003,V01,2011-07-01,,NUPDRS3,37.0,1
14,3003,V02,2011-10-01,,NUPDRS3,37.0,1
15,3003,V03,2012-02-01,,NUPDRS3,33.0,1
16,3003,V04,2012-04-01,,NUPDRS3,44.0,1
17,3003,ST,2012-10-01,,NUPDRS3,40.0,1
18,3003,V06,2013-04-01,,NUPDRS3,43.0,1
19,3003,V07,2013-11-01,,NUPDRS3,37.0,1
20,3003,V08,2014-04-01,,NUPDRS3,46.0,1


#### Final Approach, using the library dummy

In [8]:
patnos = interpolate_same_month(upd3_PD_nan)['PATNO'].unique().tolist()
df = test.copy()
limit = pd.Timedelta(6*30, unit='D') # 180 days ~ 6mo
date_chunks = []

for id in patnos:

    visits = df[df['PATNO'] == id]['INFODT'].tolist()
    soln = []
    lim = 0

    if len(visits) < 3:
        continue

    sub_df = df[df['PATNO'] == id]

    while True:
        soln = np.random.choice(visits, 3, replace=False)
        if (soln[1] - soln[0]) > limit and (soln[2] - soln[1]) > limit:
            break
        if lim > 100:
            print(f"No Possible Calendar Found: ID {id}")
            break
        lim += 1

    print(sub_df.loc[sub_df['INFODT'] == soln[0]]['NP3TOT'], '\n')
    
    if lim < 100:
        pass
        date_chunks.append([id, sub_df.loc[sub_df['INFODT'] == soln[0], 'INFODT'].values[0], sub_df.loc[sub_df['INFODT'] == soln[0], 'NP3TOT'].values[0]]) 
        date_chunks.append([id, sub_df.loc[sub_df['INFODT'] == soln[1], 'INFODT'].values[0], sub_df.loc[sub_df['INFODT'] == soln[1], 'NP3TOT'].values[0]]) 
        date_chunks.append([id, sub_df.loc[sub_df['INFODT'] == soln[2], 'INFODT'].values[0], sub_df.loc[sub_df['INFODT'] == soln[2], 'NP3TOT'].values[0]]) 
res = pd.DataFrame(date_chunks, columns=['PATNO', 'INFODT', 'score'])

12    29.0
Name: NP3TOT, dtype: float64 



#### Approach one, sliding window for random selection

In [None]:
#random.seed(1)
df = test.copy()
limit = pd.Timedelta(6*30, unit='D') # 180 days ~ 6mo
visits = df['INFODT'].tolist()

v1 = random.sample(visits[0 : len(visits) // 3], 1)[0] #select 1st visit at random from 1st 1/3 of visits

visits = visits[visits.index(v1) + 1: len(visits) + 1]

k = 0
while k < len(visits):
    if (visits[k] - v1) < limit:
        k += 1
        continue
    break

if (k == len(visits)):
    print("No other visit within 6 months")
    raise ValueError("No other visit within 6 months")

visits_left = visits[k : k + len(visits) // 2]

v2 = random.sample(visits_left, 1)[0]

k = 0
while k < len(visits):
    if (visits[k] - v2) < limit:
        k += 1
        continue
    break

if (k == len(visits)):
    print("No other visit within 6 months")
    raise ValueError("No other visit within 6 months")
visits_right = visits[k : k + len(visits)]

v3 = random.sample(visits_right, 1)[0]

print(v1, v2, v3)

#### Approach 2: Pick 1st and Last date, then randomly select internal dates

In [None]:
#random.seed(1)
df = test.copy()
limit = pd.Timedelta(6*30, unit='D') # 180 days ~ 6mo
visits = df['INFODT'].tolist()

v1 = visits[0]
v3 = visits[-1]

visits = visits[1 : len(visits) - 1]

i,j = 0, len(visits) - 1

print(visits)

while i < len(visits):
    if (visits[i] - v1) < limit:
        i += 1
        continue
    break

while j >= 0:
    if (v3 - visits[j]) < limit:
        j -= 1
        continue
    break

v2 = None

if i < j:
    v2 = random.sample(visits[i : j + 1], 1)[0]
elif i == j:
    v2 = visits[i]
else:
    print("No other visit within 6 months")

print(v1, v2, v3)