In [146]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [147]:
UPDRS3 = "data/MDS-UPDRS_Part_III_10Jun2024.csv"
patient_status = "data/Participant_Status_03Jun2024.csv"

df3 = pd.read_csv(UPDRS3)
df_pat_stat = pd.read_csv(patient_status) #patient status data
df3 = df3.dropna(subset=['NP3TOT']).reset_index()
df3['INFODT'] = pd.to_datetime(df3['INFODT'], format="%m/%Y") #reformat INFODT (Assesment Date) to date-time objects

#df3[['OFFEXAM', 'ONEXAM']] = df3[['OFFEXAM', 'ONEXAM']].fillna(0)

df3['year'] = pd.DatetimeIndex(df3['INFODT']).year

#### Accounting for ON/OFF states
- ONOFFORDER :: First Part III exam OFF or ON (1 = OFF)
- OFFEXAM :: OFF exam performed
- ONEXAM :: ON exam performed

PDSTATE :: Which functional state is the participant currently in :: 
  - OFF :: Off is the typical functional state when participants have a poor response in spite of taking medications 
  - ON :: ON is the typical functional state when patients are receiving medication and have a good response
  
  - NUPDR3OF :: "OFF STATE"
  - NUPDR3ON :: "ON STATE"
  - NUPDRDOSE3 :: PDS UPDRS Part 3 treatment determination and part 3 motor exam
  - NUPDRS3 :: Part 3 (no treatment)

Keys: OFFEXAM, ONEXAM, maybe ONOFFORDER

In [148]:
desired_cols_df3 = {'NP3TOT', 'PATNO', 'EVENT_ID', 'ORIG_ENTRY', 'INFODT', 'PDSTATE', 'PAG_NAME','year', 'ONEXAM', 'OFFEXAM'}
desired_cols_df_pat = {'PATNO', 'COHORT', 'ENROLL_STATUS', 'ENROLL_AGE'}

pat_filtered = df_pat_stat.drop(columns=set(df_pat_stat.columns) - desired_cols_df_pat)
df3_filtered = df3.drop(columns=set(df3.columns) - desired_cols_df3)

df3_full_filtered = pd.merge(df3_filtered, pat_filtered, on="PATNO")
df3_full_filtered = df3_full_filtered[df3_full_filtered['ENROLL_STATUS'].isin(['Enrolled', 'Withdrew', 'Complete'])]
full_sorted = df3_full_filtered.sort_values(['PATNO', 'INFODT']) #accounts for months as well
full_sorted[['OFFEXAM','ONEXAM']] = full_sorted[['OFFEXAM','ONEXAM']].fillna(0)

full_sorted

Unnamed: 0,PATNO,EVENT_ID,PAG_NAME,INFODT,PDSTATE,OFFEXAM,ONEXAM,NP3TOT,ORIG_ENTRY,year,COHORT,ENROLL_STATUS,ENROLL_AGE
0,3000,BL,NUPDRS3,2011-02-01,,0.0,0.0,4.0,02/2011,2011,2,Enrolled,69.1
1,3000,V04,NUPDRS3,2012-03-01,,0.0,0.0,1.0,03/2012,2012,2,Enrolled,69.1
2,3000,V06,NUPDRS3,2013-02-01,,0.0,0.0,4.0,02/2013,2013,2,Enrolled,69.1
3,3000,V08,NUPDRS3,2014-03-01,,0.0,0.0,2.0,05/2014,2014,2,Enrolled,69.1
4,3000,V10,NUPDRS3,2015-03-01,,0.0,0.0,19.0,03/2015,2015,2,Enrolled,69.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22915,324862,BL,NUPDRDOSE3,2024-04-01,,0.0,0.0,13.0,04/2024,2024,1,Enrolled,56.9
22917,325234,BL,NUPDRDOSE3,2024-04-01,,0.0,0.0,3.0,04/2024,2024,4,Enrolled,59.4
22918,325566,BL,NUPDRDOSE3,2024-04-01,,0.0,0.0,13.0,04/2024,2024,1,Enrolled,51.4
22919,329289,BL,NUPDRDOSE3,2024-04-01,,0.0,0.0,27.0,04/2024,2024,1,Enrolled,62.1


### ADJUSTMENTS
- the PDSTATE variable reflects the ONEXAM and OFFEXAM columns, so it’s redundant to include them
- Adjusted filtering on Patient_status, was excluding patients who'd completed or withdrawn, no large effects on outcome of analysis

In [92]:
group = full_sorted.groupby('PATNO')
#OG code, doesn't make a difference
#full_sorted['delta_NP3TOT'] = group['NP3TOT'].diff()
#full_sorted['delta_years'] = group['INFODT'].diff().dt.days / 365
#full_sorted['delta_norm'] = full_sorted['delta_NP3TOT'] / full_sorted['delta_years']
# Let's see how these behave
#full_sorted['delta_NP3TOT'] = group['NP3TOT'].diff()

full_sorted['delta_norm'] = group['NP3TOT'].diff() / (group['INFODT'].diff().dt.days / 365)
full_sorted['delta_norm_2yrs'] = group['NP3TOT'].diff(2) / (group['INFODT'].diff(2).dt.days / 365)
full_sorted['delta_norm_3yrs'] = group['NP3TOT'].diff(3) / (group['INFODT'].diff(3).dt.days / 365)
full_sorted_w_NaN = full_sorted
full_sorted.replace([np.inf, -np.inf], np.nan, inplace=True)
full_sorted = full_sorted.dropna(subset=['delta_norm']).reset_index(drop=True)

### Original Logic, includes NaNs

In [56]:

#PD_full_sorted = full_sorted[full_sorted['COHORT'] == 1]
#PD_on = PD_full_sorted[(PD_full_sorted['PDSTATE'] == 'ON') | (PD_full_sorted['PAG_NAME'] == 'NUPDR3ON')]['delta_norm']
#PD_off_or_nan = PD_full_sorted[(PD_full_sorted['PDSTATE'] != 'ON') & (PD_full_sorted['PAG_NAME'] != 'NUPDR3ON')]['delta_norm']
#control = full_sorted[full_sorted['COHORT']==2]['delta_norm']

### New Logic Partitioning into OFF Excluding NaNs, NaNs, and Control

In [6]:
PD_test = full_sorted[full_sorted['COHORT'] == 1]

PD_test_on = PD_test[(PD_test['PDSTATE'] == 'ON') | (PD_test['PAG_NAME'] == 'NUPDR3ON') | (PD_test['ONEXAM'] == 1)]['delta_norm']
PD_test_off = PD_test[(PD_test['PDSTATE'] == 'OFF') | (PD_test['PAG_NAME'] == 'NUPDR3OF') | (PD_test['OFFEXAM'] == 1)]['delta_norm']
PD_test_nan = PD_test[(PD_test["PDSTATE"].isna()) & ((PD_test['PAG_NAME'] != 'NUPDR3OF') | (PD_test['PAG_NAME'] != 'NUPDR3ON'))]['delta_norm']
PD_test_off_nan = PD_test[(PD_test['PDSTATE'] != 'ON') & (PD_test['PAG_NAME'] != 'NUPDR3ON')]['delta_norm']
control = full_sorted[full_sorted['COHORT'] == 2]['delta_norm']

#Check, both return Null
#PD_test_off[(PD_test_off['OFFEXAM'] == 1) & (PD_test_off['PDSTATE'] != "OFF")]
#PD_test_on[(PD_test_on['ONEXAM'] == 1) & (PD_test_on['PDSTATE'] != "ON")]

### Distributions

In [98]:
import plotly.express as px
import plotly.graph_objects as go

In [99]:
ZERO_LINE = 0

mean_on = np.mean(PD_test_on)
mean_off = np.mean(PD_test_off)
mean_nan = np.mean(PD_test_nan)
mean_off_nan = np.mean(PD_test_off_nan)
mean_control = np.mean(control)

In [39]:
"""
ON state vs
Flagged OFF vs
NaN's or Undefined
"""

hist1 = go.Histogram(
    x= PD_test_on,
    name='ON State',
    opacity=0.80
)

hist2 = go.Histogram(
    x=PD_test_off,
    name='OFF Ex NaNs',
    opacity=0.80
)

hist3 = go.Histogram(
    x=PD_test_nan,
    name='NaNs',
    opacity=0.80
)

fig = go.Figure()
fig.add_trace(hist1)
fig.add_trace(hist2)
fig.add_trace(hist3)

fig.add_vline(x = mean_on, line_width = 1, line_dash = 'dash', line_color = 'blue')
fig.add_vline(x = mean_off, line_width = 1, line_dash = 'dash', line_color = 'red')
fig.add_vline(x = mean_nan, line_width = 1, line_dash = 'dash', line_color = 'green')
fig.add_vline(x = ZERO_LINE, line_width = 1, line_dash = 'solid', line_color = 'black')

fig.update_layout(
    barmode='overlay', # Use 'stack' if you want stacked histograms
    title='ON vs OFF vs NaN',
    xaxis_title='Value (%)',
    yaxis_title='Count'
)

fig.show()

In [38]:
"""
Control vs
Flagged OFF
"""

hist1 = go.Histogram(
    x= control,
    name='Control',
    opacity=0.80
)

hist2 = go.Histogram(
    x=PD_test_off,
    name='OFF Ex NaNs',
    opacity=0.80
)

fig = go.Figure()
fig.add_trace(hist1)
fig.add_trace(hist2)

fig.add_vline(x = mean_control, line_width = 1, line_dash = 'dash', line_color = 'blue')
fig.add_vline(x = mean_off, line_width = 1, line_dash = 'dash', line_color = 'red')
fig.add_vline(x = ZERO_LINE, line_width = 1, line_dash = 'solid', line_color = 'black')

fig.update_layout(
    barmode='overlay', # Use 'stack' if you want stacked histograms
    title='Conrol vs OFF ex NaNs',
    xaxis_title='Value (%)',
    yaxis_title='Count'
)

fig.show()

In [40]:
"""
Control vs
NaNs (Undefined)
"""

hist1 = go.Histogram(
    x= control,
    name='Control',
    opacity=0.80
)

hist2 = go.Histogram(
    x=PD_test_nan,
    name='NaNs',
    opacity=0.80
)

fig = go.Figure()
fig.add_trace(hist1)
fig.add_trace(hist2)

fig.add_vline(x = mean_control, line_width = 1, line_dash = 'dash', line_color = 'blue')
fig.add_vline(x = mean_nan, line_width = 1, line_dash = 'dash', line_color = 'red')
fig.add_vline(x = ZERO_LINE, line_width = 1, line_dash = 'solid', line_color = 'black')

fig.update_layout(
    barmode='overlay', # Use 'stack' if you want stacked histograms
    title='Conrol vs NaNs (Undefined)',
    xaxis_title='Value (%)',
    yaxis_title='Count'
)

fig.show()

In [41]:
"""
Control vs
OFF with NaNs
"""

hist1 = go.Histogram(
    x= control,
    name='Control',
    opacity=0.80
)

hist2 = go.Histogram(
    x=PD_test_off_nan,
    name='OFF With NaNs',
    opacity=0.80
)

fig = go.Figure()
fig.add_trace(hist1)
fig.add_trace(hist2)

fig.add_vline(x = mean_control, line_width = 1, line_dash = 'dash', line_color = 'blue')
fig.add_vline(x = mean_off_nan, line_width = 1, line_dash = 'dash', line_color = 'red')
fig.add_vline(x = ZERO_LINE, line_width = 1, line_dash = 'solid', line_color = 'black')

fig.update_layout(
    barmode='overlay', # Use 'stack' if you want stacked histograms
    title='Conrol vs OFF Including NaNs',
    xaxis_title='Value (%)',
    yaxis_title='Count'
)

fig.show()

In [43]:
"""
OFF + NaNs vs
Flagged OFF
"""

hist1 = go.Histogram(
    x= PD_test_off_nan,
    name='OFF + NaNs',
    opacity=0.80
)

hist2 = go.Histogram(
    x=PD_test_off,
    name='OFF Ex NaNs',
    opacity=0.80
)

fig = go.Figure()
fig.add_trace(hist1)
fig.add_trace(hist2)

fig.add_vline(x = mean_off_nan  , line_width = 1, line_dash = 'dash', line_color = 'blue')
fig.add_vline(x = mean_off, line_width = 1, line_dash = 'dash', line_color = 'red')
fig.add_vline(x = ZERO_LINE, line_width = 1, line_dash = 'solid', line_color = 'black')

fig.update_layout(
    barmode='overlay', # Use 'stack' if you want stacked histograms
    title='OFF with NaNs vs OFF Ex NaNs',
    xaxis_title='Value (%)',
    yaxis_title='Count'
)

fig.show()

### One-Sample T-Test
- OFF With NaNs != 0 (mean)
- OFF Ex NaNs != 0 (mean)

In [8]:
from scipy.stats import t

In [45]:
"""ONE SAMPLE T-TEST FUNTION DEFINITION"""

def one_sample_t_test(sample, assumed_population_mean, alpha=0.05, tail="two"):
    # Calculate T-score
    sample_mean = np.mean(sample)
    print(f"Sample Mean: {sample_mean}")
    sample_std = np.std(sample, ddof=1)
    sample_size = len(sample)
 
    t_score = (sample_mean - assumed_population_mean) / (sample_std / np.sqrt(sample_size))
 
    df = sample_size - 1
 
    #Finding P-Value
    if tail.lower() == "two":
        p_value = t.sf(np.abs(t_score), df) * 2  
    elif tail.lower() == "left":
        p_value = t.sf(t_score, df)  
    elif tail.lower() == "right":
        p_value = t.sf(-t_score, df)  
    else:
        raise ValueError(
            "Invalid tail argument. Use 'two', 'left', or 'right'.")
 
    # Step 5: Interpret the p-value
    print("P-value:", p_value)
 
    if p_value < alpha:
        print(f"Null Rejected: P-value = {p_value}, Alpha = {0.05}")
    else:
        print(f"Failed to reject the null hypothesis: P-value = {p_value}, Alpha = {0.05}")

In [56]:
"""
H_0 : mean(OFF+NaN) = 0, or UPDRS doesn't grow with PD-progression
H_1 : mean(OFF+NaN) != 0, or UPDRS grows with PD-progression (in some manner)
"""
one_sample_t_test(PD_test_off_nan, 0)

one_sample_t_test(PD_test_off_nan, mean_control)

one_sample_t_test(PD_test_off_nan, 0, alpha=0.05, tail="left")

Sample Mean: 6.896014900065573
P-value: 6.603926345722062e-68
Null Rejected: P-value = 6.603926345722062e-68, Alpha = 0.05
Sample Mean: 6.896014900065573
P-value: 4.001286171828883e-61
Null Rejected: P-value = 4.001286171828883e-61, Alpha = 0.05
Sample Mean: 6.896014900065573
P-value: 3.301963172861031e-68
Null Rejected: P-value = 3.301963172861031e-68, Alpha = 0.05


In [65]:
"""
H_0 : mean(OFF+NaN) = 0, or UPDRS doesn't grow with PD-progression
H_1 : mean(OFF+NaN) > 0, or UPDRS higher on average than Control, or meaningful positive difference over control
"""

def two_sample_t_test(sample1, sample2, alpha = 0.05):
    mean1 = np.mean(sample1)
    mean2 = np.mean(sample2)
    std1 = np.std(sample1, ddof=1) 
    std2 = np.std(sample2, ddof=1)

    n1 = len(sample1)
    n2 = len(sample2)
    df = len(sample1) + len(sample2) - 2

    t_score = (mean1 - mean2) / (np.sqrt((std1**2 / n1) + (std2**2 / n2)))

    p_value = 2 * (1 - t.cdf(np.abs(t_score), df))

    if p_value < alpha:
        print(f"Reject the null hypothesis. P-Value: {p_value}, Alpha = {alpha}")
    else:
        print(f"Fail to reject the null hypothesis. P-Value: {p_value}, Alpha = {alpha}")


In [70]:
two_sample_t_test(PD_test_off_nan, control)

print("\nOFF Ex NaN vs NaN")
two_sample_t_test(PD_test_off, PD_test_nan)

Reject the null hypothesis. P-Value: 0.0, Alpha = 0.05

OFF Ex NaN vs NaN
Fail to reject the null hypothesis. P-Value: 0.6125442569228956, Alpha = 0.05


### Random Selection of 10 Participants

In [78]:
import random

In [87]:
sample_size = 10
patient_nos_control = list(full_sorted[full_sorted['COHORT'] == 2]["PATNO"].unique())
patient_nos_PD = list(PD_test["PATNO"].unique())

#rand_control_sample = random.sample(patient_nos_control, sample_size)
#rand_PD_sample = random.sample(patient_nos_PD, sample_size)
#print(control_sample)
#print(PD_sample)

In [141]:
PD_test_off_nan_sample= PD_test[(PD_test['PDSTATE'] != 'ON') & (PD_test['PAG_NAME'] != 'NUPDR3ON')]

control_sample = [3104, 3004, 3428, 3917, 3478, 3411, 3100, 3069, 3453, 3320]
PD_sample = [3307, 40555, 56744, 3056, 3793, 58141, 3378, 40703, 3757, 3451]
control_sample_vals = full_sorted[full_sorted['PATNO'].isin(control_sample)][['PATNO','INFODT', 'NP3TOT']]
PD_sample_vals = PD_test_off_nan_sample[PD_test_off_nan_sample['PATNO'].isin(PD_sample)][['PATNO','INFODT', 'NP3TOT']]

In [143]:
def plot_progression(df, sample, index):
    s = df[df['PATNO'] == sample[index]]
    fig = px.scatter(s, x='INFODT', y='NP3TOT', title='(PD-Patient) Delta Norm over Time for ' + str(s['PATNO'].unique()[0]),
                 labels={'INFODT': 'Date', 'NP3TOT': 'Delta Norm'})
    fig.show()

### Sample PD Patient Score Progression in OFF or NaN State

In [144]:
for i in range(len(PD_sample)):
    plot_progression(PD_sample_vals, PD_sample, i)

### Sample Control Score Progression

In [145]:
for i in range(len(PD_sample)):
    plot_progression(control_sample_vals, control_sample, i)

### Testing Flaggd OFFs vs Undefined NaNs
- should include Two-Sample T-test

In [55]:
hist1 = go.Histogram(
    x=PD_test_off,
    name='OFF Ex NaNs',
    opacity=0.80
)

hist2 = go.Histogram(
    x=PD_test_nan,
    name='NaNs',
    opacity=0.80
)

fig = go.Figure()
fig.add_trace(hist1)
fig.add_trace(hist2)

fig.add_vline(x = mean_off, line_width = 1, line_dash = 'dash', line_color = 'blue')
fig.add_vline(x = mean_nan, line_width = 1, line_dash = 'dash', line_color = 'red')
fig.add_vline(x = ZERO_LINE, line_width = 1, line_dash = 'solid', line_color = 'black')

fig.update_layout(
    barmode='overlay', # Use 'stack' if you want stacked histograms
    title='OFF Ex NaNs vs NaNs',
    xaxis_title='Value',
    yaxis_title='Count'
)

fig.show()