# PPMI CLINICAL DATA CONCATENATION

In [63]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.anova import AnovaRM
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrix, PatsyError
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)

## Step 1: Transposing Blood Chemistry

In [64]:
BloodChem = "/Volumes/PARK2023-Q5758/SAA Systematic Review (Human Data)/IDA_PPMI_database_12Nov2023/Blood/Blood_Chemistry___Hematology-Archived_12Nov2023.csv"  # File relative path
df_BloodChem = pd.read_csv(BloodChem)
print("BloodChem_InitialRows: ",len(df_BloodChem),"BloodChem_InitialColumns: ",len(df_BloodChem.columns))
columns_to_drop = ["PAG_NAME", "LCOLLDT","COLLTM","LRECDT","RECTM","LRPTDT","RPTTM","LABCODE","LGROUP","LTSTCODE","LVISTYPE","LSIUNIT","LSILORNG","LSIHIRNG","LUSRES","LUSUNIT","LUSLORNG","LUSHIRNG","LRESFLG"]
df_BloodChem = df_BloodChem.drop(columns=columns_to_drop)
# df_BloodChem = df_BloodChem.drop(["PAG_NAME", "LCOLLDT","COLLTM","LRECDT","RECTM","LRPTDT","RPTTM","LABCODE","LGROUP","LTSTCODE","LVISTYPE","LSIUNIT","LSILORNG","LSIHIRNG","LUSRES","LUSUNIT","LUSLORNG","LUSHIRNG","LRESFLG", axis="columns") #Alternative method
print("BloodChem_AfterDroppedRows: ",len(df_BloodChem),"BloodChem_AfterDroppedColumns: ",len(df_BloodChem.columns))

## Tranpspose the 'LTSTNAME' and 'LSIRES' columns and compress to fit within the same row as defined by corresponding 'PATNO' AND 'EVENT_ID'
BloodChem_Transposed = df_BloodChem.pivot_table(index=['PATNO', 'EVENT_ID'], columns='LTSTNAME', values='LSIRES', aggfunc='first')
BloodChem_Transposed.reset_index(inplace=True)
# pivoted_df_BloodChem.fillna(0, inplace=True) # Fill missing values with 0 (or any other appropriate value)
# pivoted_df = pivoted_df_BloodChem.loc[:, ~pivoted_df_BloodChem.columns.duplicated()] # Remove the duplicate 'PATNO' and 'EVENT_ID' columns
BloodChem_Transposed = pd.concat([df_BloodChem, BloodChem_Transposed], axis=1) # Merge the pivoted DataFrame with the original DataFrame based on the index
BloodChem_Transposed= BloodChem_Transposed.drop(["LTSTNAME", "LSIRES"], axis="columns") #Drop the unneccessary cols

#Drop the PATNO and EVENT_ID cols that were duplicated in the merge step
BloodChem_Transposed.columns.values[0] = "DROP_THIS"
BloodChem_Transposed.columns.values[1] = "DROP_THIS"
BloodChem_Transposed = BloodChem_Transposed.drop("DROP_THIS", axis=1)

# Replace 'SC' with 'BL' to pool for consistency
BloodChem_Transposed['EVENT_ID'] = BloodChem_Transposed['EVENT_ID'].replace('SC', 'BL') 
BloodChem_Transposed.to_csv("/Volumes/PARK2023-Q5758/SAA Systematic Review (Human Data)/Code_Output/BloodChem_Transposed.csv")
print("BloodChem_AfterTranspose_Rows: ",len(BloodChem_Transposed),"BloodChem_AfterTranspose_Columns: ",len(BloodChem_Transposed.columns))


BloodChem_InitialRows:  217477 BloodChem_InitialColumns:  23
BloodChem_AfterDroppedRows:  217477 BloodChem_AfterDroppedColumns:  4
BloodChem_AfterTranspose_Rows:  217477 BloodChem_AfterTranspose_Columns:  45


## Step 2: Merge all datasets together

In [65]:
SAA = "/Volumes/PARK2023-Q5758/SAA Systematic Review (Human Data)/IDA_PPMI_database_12Nov2023/Biospecimen/Biospecimen_Analysis/SAA_Biospecimen_Analysis_Results_12Nov2023.csv"
SAA_prodromal = "/Volumes/PARK2023-Q5758/SAA Systematic Review (Human Data)/Tiers3 Data/SAA_Internal_20231220.csv"
Curated_Data = "/Volumes/PARK2023-Q5758/SAA Systematic Review (Human Data)/IDA_PPMI_database_12Nov2023/Curated_Data_Cuts/PPMI_Curated_Data_Cut_Public_20230612_rev.csv"
BloodChem_Transposed = "/Volumes/PARK2023-Q5758/SAA Systematic Review (Human Data)/Code_Output/BloodChem_Transposed.csv"
DATSCAN_prodromal = "/Volumes/PARK2023-Q5758/SAA Systematic Review (Human Data)/Tiers3 Data/PPMI_SEQUESTERED_SPECT&PET_CSVs_20231121/inv_spect_sbr_sequestered_merge_20231121.csv"
#DATSCAN_CAUDATE_R, DATSCAN_CAUDATE_L, DATSCAN_PUTAMEN_R, DATSCAN_PUTAMEN_L

# inv_spect_vi_sequestered_merge_20231121
# DATSCAN_VISINTRP = positive or negative

#Merge prodromal SAA with other SAA
df_SAA = pd.read_csv(SAA)
print("SAA_Rows: ",len(df_SAA),"SAA_Columns: ",len(df_SAA.columns))
df_SAA_prodromal = pd.read_csv(SAA_prodromal)
print("SAA_prodromal_Rows: ",len(df_SAA_prodromal),"SAA_prodromal_Columns: ",len(df_SAA_prodromal.columns))
if list(df_SAA.columns) == list(df_SAA_prodromal.columns):
    unique_rows = df_SAA_prodromal[~df_SAA_prodromal.set_index(['PATNO', 'CLINICAL_EVENT']).index.isin(df_SAA.set_index(['PATNO', 'CLINICAL_EVENT']).index)]
    df_SAA_final = pd.concat([df_SAA, unique_rows]) #TODO: n.b. different functions... pd.merge instead 
    df_SAA_final.to_csv("/Volumes/PARK2023-Q5758/SAA Systematic Review (Human Data)/Code_Output/Final_SAA.csv")
else:
    print("Error: The columns of the original and new dataframes do not match.")
print("SAA_final_Rows: ",len(df_SAA_final),"SAA_final_Columns: ",len(df_SAA_final.columns))

# Merge BloodChem with Curated Dataframe,also based on duplicated values in "PATNO" and "EVENT_ID"
df_Curated = pd.read_csv(Curated_Data)
print("CuratedData_Rows: ",len(df_Curated),"CuratedData_Columns: ",len(df_Curated.columns))
BloodChem_Transposed = pd.read_csv(BloodChem_Transposed)
df_FinalConcat = pd.merge(df_Curated, BloodChem_Transposed, on=["PATNO","EVENT_ID"], how='outer')

# Merge the asyn CSF data with Curated Daframe,also based on duplicated values in "PATNO" and "EVENT_ID"
df_SAA_final.columns = ['EVENT_ID' if col == 'CLINICAL_EVENT' else col for col in df_SAA.columns] 
df_FinalConcat = pd.merge(df_FinalConcat, df_SAA_final, on=["PATNO","EVENT_ID"], how='outer')
print("FinalConcat_Rows: ",len(df_FinalConcat),"FinalConcat_Columns: ",len(df_FinalConcat.columns))

# Merge in (existing) neuro data with prodromal patient data
df_DATSCAN_prodromal = pd.read_csv(DATSCAN_prodromal)
df_DATSCAN_prodromal['DATSCAN_PUTAMEN_R'] = pd.to_numeric(df_DATSCAN_prodromal['DATSCAN_PUTAMEN_R'], errors='coerce').astype(float)
df_DATSCAN_prodromal['DATSCAN_PUTAMEN_L'] = pd.to_numeric(df_DATSCAN_prodromal['DATSCAN_PUTAMEN_L'], errors='coerce').astype(float)
df_DATSCAN_prodromal['mean_putamen'] = df_DATSCAN_prodromal[['DATSCAN_PUTAMEN_R', 'DATSCAN_PUTAMEN_L']].mean(axis=1)
df_DATSCAN_selected = df_DATSCAN_prodromal[['PATNO', 'EVENT_ID', 'mean_putamen','DATSCAN_CAUDATE_R', 'DATSCAN_CAUDATE_L', 'DATSCAN_PUTAMEN_R', 'DATSCAN_PUTAMEN_L']]
df_FinalConcat = pd.merge(df_FinalConcat, df_DATSCAN_selected, on=['PATNO', 'EVENT_ID'], how='left', suffixes=('', '_new'))

# Replacing existing columns with the new ones where values are not NaN
for column in ['mean_putamen','DATSCAN_CAUDATE_R', 'DATSCAN_CAUDATE_L','DATSCAN_PUTAMEN_R', 'DATSCAN_PUTAMEN_L']:
    df_FinalConcat[column] = df_FinalConcat[column].combine_first(df_FinalConcat[column + '_new'])
    df_FinalConcat.drop(columns=[column + '_new'], inplace=True)
print("FinalConcat_Rows: ",len(df_FinalConcat),"FinalConcat_Columns: ",len(df_FinalConcat.columns))

SAA_Rows:  1362 SAA_Columns:  47
SAA_prodromal_Rows:  2823 SAA_prodromal_Columns:  47
SAA_final_Rows:  2819 SAA_final_Columns:  47


  df_Curated = pd.read_csv(Curated_Data)


CuratedData_Rows:  10152 CuratedData_Columns:  155


  BloodChem_Transposed = pd.read_csv(BloodChem_Transposed)


FinalConcat_Rows:  222285 FinalConcat_Columns:  244
FinalConcat_Rows:  222285 FinalConcat_Columns:  244


### QC step: checking primdiag is consistent across patients

In [66]:
# Merge the dataframes on 'PATNO' and 'EVENT_ID', and keep the 'PRIMDIAG' columns
Prodromal_Elligibility = "/Volumes/PARK2023-Q5758/SAA Systematic Review (Human Data)/IDA_PPMI_database_12Nov2023/Prodromal/Prodromal_Diagnostic_Questionnaire-Archived_12Nov2023.csv"
df_Prodromal_Elligibility = pd.read_csv(Prodromal_Elligibility)
merged_df = pd.merge(
    df_Prodromal_Elligibility[['PATNO', 'EVENT_ID', 'PRIMDIAG']], 
    df_FinalConcat[['PATNO', 'EVENT_ID', 'PRIMDIAG']], 
    on=['PATNO', 'EVENT_ID'], 
    suffixes=('_eligibility', '_final')
)
# Exclude rows with NaN values in either PRIMDIAG column
merged_df = merged_df.dropna(subset=['PRIMDIAG_eligibility', 'PRIMDIAG_final'])
# Identify rows where the PRIMDIAG values differ
discrepancies = merged_df[merged_df['PRIMDIAG_eligibility'] != merged_df['PRIMDIAG_final']]
# Determine the amount of discrepancies out of the total comparisons
total_comparisons = merged_df.shape[0]
discrepancy_count = discrepancies.shape[0]

print(f"Total comparisons: {total_comparisons}")
print(f"Number of discrepancies: {discrepancy_count}")

if not discrepancies.empty:
    print("There are discrepancies in PRIMDIAG values between the dataframes:")
    print(discrepancies.to_string())  # Ensures all rows are printed
else:
    print("All PRIMDIAG values match between the dataframes.")

Total comparisons: 1335
Number of discrepancies: 0
All PRIMDIAG values match between the dataframes.


## Step 3: Cohort categorisations

In [67]:
# Define the mapping for subgroups to broader categories
df_FinalConcat['subgroup_simplified'] = df_FinalConcat['subgroup']
subgroup_mapping = {
    #Genetic
    'PRKN': 'Genetic', 
    'LRRK2': 'Genetic',
    'LRRK2 + GBA': 'Genetic',  
    'GBA': 'Genetic',
    'SNCA': 'Genetic', 
    'PINK1': 'Genetic', 
    'LRRK2 + Hyposmia': 'Genetic',
    'SNCA + Hyposmia': 'Genetic', 
    'GBA + Hyposmia': 'Genetic',
    'LRRK2 + GBA + Hyposmia': 'Genetic', 
    'GBA + RBD + Hyposmia': 'Genetic',
    #Prodromal
    'RBD': 'RBD +/- Hyposomia',
    'RBD + Hyposmia': 'RBD +/- Hyposomia'
    # 'Hyposmia': 'Hyposomia +/- RBD'

}
color_dict = {"Genetic":"green",
              "Hyposomia":"purple",
              "RBD +/- Hyposomia": "pink",
              "Sporadic":"blue",
              "Healthy Control":"black"
              }
# Apply mapping to the 'subgroup' column
df_FinalConcat['subgroup_simplified'] = df_FinalConcat['subgroup_simplified'].replace(subgroup_mapping)
print(df_FinalConcat['subgroup'].unique())
print(df_FinalConcat['subgroup_simplified'].unique())

['Sporadic' 'Hyposmia' 'Healthy Control' 'PRKN' 'LRRK2' 'GBA'
 'RBD + Hyposmia' 'RBD' 'LRRK2 + Hyposmia' 'PINK1' 'SNCA + Hyposmia'
 'LRRK2 + GBA' 'SNCA' 'GBA + Hyposmia' 'LRRK2 + GBA + Hyposmia'
 'GBA + RBD + Hyposmia' nan]
['Sporadic' 'Hyposmia' 'Healthy Control' 'Genetic' 'RBD +/- Hyposomia' nan]


## Step 4: Specific wrangling and creation of additional parameters

In [68]:
#Dealing with the updrs behavioural data
df = df_FinalConcat.dropna(subset=['updrs1_score']) # remove rows that contain a NaN in updrs1_score
df_FinalConcat['updrs2_score'].fillna(0, inplace=True) # make NaNs in updrs2_score ==0
df_FinalConcat['updrs4_score'].fillna(0, inplace=True) # make NaNs in updrs4_score ==0
# df_FinalConcat['updrs3_score_on'].fillna(df_FinalConcat['updrs3_score'], inplace=True) # If updrs3_score_on contains a NaN, then merge the value in column updrs3_score from the same row
# df_FinalConcat= df_FinalConcat.drop(["updrs3_score", "updrs_totscore", "updrs_totscore_on"], axis="columns") # remove updrs3_score
df_FinalConcat['updrs3_score_on'].fillna(0, inplace=True) # make NaNs in updrs3_score ==0
df_FinalConcat['updrs1_score'] = pd.to_numeric(df_FinalConcat['updrs1_score'], errors='coerce') # 'coerce' converts non-numeric values to NaN
df_FinalConcat['updrs2_score'] = pd.to_numeric(df_FinalConcat['updrs2_score'], errors='coerce')  
df_FinalConcat['updrs3_score_on'] = pd.to_numeric(df_FinalConcat['updrs3_score_on'], errors='coerce')
df_FinalConcat['updrs4_score'] = pd.to_numeric(df_FinalConcat['updrs4_score'], errors='coerce')
df_FinalConcat['updrs_totscore_I-III'] = df_FinalConcat[['updrs1_score', 'updrs2_score', 'updrs3_score_on']].sum(axis=1) #Sum the first 3 updrs score categories in new column
df_FinalConcat['updrs_totscore_I-IV'] = df_FinalConcat[['updrs1_score', 'updrs2_score', 'updrs3_score_on','updrs4_score']].sum(axis=1) #Sum all the updrs score categories in new column

#Dealing with the FmaxRep asynuclein assay data
df_FinalConcat['FmaxRep_av'] = df_FinalConcat[['FmaxRep1', 'FmaxRep2', 'FmaxRep3']].mean(axis=1) 
print("Concat_Rows: ",len(df_FinalConcat),"Concat_Columns: ",len(df_FinalConcat.columns))

# Make NLR column
df_FinalConcat['Neutrophils'] = pd.to_numeric(df_FinalConcat['Neutrophils'], errors='coerce')
df_FinalConcat['Lymphocytes'] = pd.to_numeric(df_FinalConcat['Lymphocytes'], errors='coerce')
df_FinalConcat['NLR'] = df_FinalConcat['Neutrophils'] / df_FinalConcat['Lymphocytes']
# print("1FinalConcat_Rows: ",len(df_FinalConcat),"1FinalConcat_Columns: ",len(df_FinalConcat.columns))
# df_FinalConcat.to_csv("/Volumes/PARK2023-Q5758/SAA Systematic Review (Human Data)/Code_Output/Final_Merge.csv")

#quantify EVENTIDS in number of months
unwanted_visits = ['ST','PW','RETEST','RS1','U01']
df_FinalConcat = df_FinalConcat[~df_FinalConcat['EVENT_ID'].isin(unwanted_visits)]
event_id_map = {
    'BL': 0,
    'V01': 3,
    'V02': 6,
    'V03': 9,
    'V04': 12,
    'V05': 15,
    'V06': 18,
    'V07': 21,
    'V08': 24,
    'V09': 27,
    'V10': 30,
    'V12': 36,
    'V13': 39,
    'V14': 42,
    'V15': 45,
    'V16': 48,
    'V17': 51,
    'V18': 54,
    'V19': 57
}

df_FinalConcat['EVENT_NUM'] = df_FinalConcat['EVENT_ID'].map(event_id_map)
print("Concat_Rows: ",len(df_FinalConcat),"Concat_Columns: ",len(df_FinalConcat.columns))

df_FinalConcat.to_csv("/Volumes/PARK2023-Q5758/SAA Systematic Review (Human Data)/Code_Output/FinalConcat.csv")

####PHENOCONVERTED PX
PRIMDIAG_1 = df_FinalConcat[df_FinalConcat['PRIMDIAG'] == 1.0]
unique_patnos = PRIMDIAG_1['PATNO'].unique()
Phenoconverted = df_FinalConcat[df_FinalConcat['PATNO'].isin(unique_patnos)]
Phenoconverted.to_csv("/Volumes/PARK2023-Q5758/SAA Systematic Review (Human Data)/Code_Output/Phenoconverted.csv")

#######Quantify Time Since Diagnosis (v1)
# df_FinalConcat['age_at_visit'] = pd.to_numeric(df_FinalConcat['age_at_visit'], errors='coerce').astype(float)
# df_FinalConcat['agediag'] = pd.to_numeric(df_FinalConcat['agediag'], errors='coerce').astype(float)
# df_FinalConcat['ageonset'] = pd.to_numeric(df_FinalConcat['ageonset'], errors='coerce').astype(float)

# df_FinalConcat['years_since_diagnosis'] = df_FinalConcat['age_at_visit'] - df_FinalConcat['agediag']
# df_FinalConcat['years_since_onset'] = df_FinalConcat['age_at_visit'] - df_FinalConcat['ageonset']

Concat_Rows:  222285 Concat_Columns:  248
Concat_Rows:  222230 Concat_Columns:  250
      PATNO mean_putamen
0  100001.0        1.080
1  100001.0        1.040
2  100001.0        0.825
3  100002.0        0.530
4  100002.0        0.590


In [96]:

######Normalise the putamen values by age and sex
#mean_putamen is getting affected
#still not working...
# Create a summary DataFrame with one row per patient
# Convert mean_putamen to numeric, coerce errors to NaN
# Convert mean_putamen to numeric, coerce errors to NaN
df_FinalConcat['mean_putamen'] = pd.to_numeric(df_FinalConcat['mean_putamen'], errors='coerce')

# Define age bins
age_bins = list(range(0, 105, 5))  # 0-5, 5-10, ..., 100-105 (Define age ranges by 5 years)
df_FinalConcat['age_group'] = pd.cut(df_FinalConcat['age_at_visit'], bins=age_bins, right=False)

# Create a summary DataFrame with one row per patient
df_summary = df_FinalConcat.sort_values(by=['PATNO', 'age_at_visit']).drop_duplicates(subset=['PATNO'], keep='last')

# Filter Healthy Control subgroup
healthy_controls = df_summary[df_summary['subgroup'] == 'Healthy Control']

# Calculate mean and std for each age group and sex in Healthy Control
age_sex_group_stats = healthy_controls.groupby(['age_group', 'SEX_x'])['mean_putamen'].agg(['mean', 'std']).reset_index()
age_sex_group_stats.rename(columns={'mean': 'mean_putamen_healthy', 'std': 'std_putamen_healthy'}, inplace=True)

# Merge statistics back into the summary DataFrame
df_summary = df_summary.merge(age_sex_group_stats, on=['age_group', 'SEX_x'], how='left')


# Function for normalizing mean_putamen
def normalise_putamen(row):
    if pd.isnull(row['mean_putamen']) or pd.isnull(row['mean_putamen_healthy']) or pd.isnull(row['std_putamen_healthy']):
        return np.nan
    return (row['mean_putamen'] - row['mean_putamen_healthy']) / row['std_putamen_healthy']

# Apply normalization
df_summary['normalised_mean_putamen'] = df_summary.apply(normalise_putamen, axis=1)

# Drop irrelevant columns
df_summary.drop(columns=['age_group', 'mean_putamen_healthy', 'std_putamen_healthy'], inplace=True)


# Check the resulting DataFrame
print(df_FinalConcat[['PATNO', 'mean_putamen']].head())

      PATNO  mean_putamen
0  100001.0         1.080
1  100001.0         1.040
2  100001.0         0.825
3  100002.0         0.530
4  100002.0         0.590


In [107]:

######Normalise the putamen values by age and sex
#mean_putamen is getting affected
#still not working...
# Create a summary DataFrame with one row per patient
# Convert mean_putamen to numeric, coerce errors to NaN
df_FinalConcat['mean_putamen'] = pd.to_numeric(df_FinalConcat['mean_putamen'], errors='coerce')

# Define age bins
age_bins = list(range(20, 90, 10))  # 0-5, 5-10, ..., 100-105 (Define age ranges by 5 years)
df_FinalConcat['age_group'] = pd.cut(df_FinalConcat['age_at_visit'], bins=age_bins, right=False)

# Create a summary DataFrame with one row per patient
df_summary = df_FinalConcat.sort_values(by=['PATNO', 'age_at_visit']).drop_duplicates(subset=['PATNO'], keep='last')

# Filter Healthy Control subgroup
healthy_controls = df_summary[df_summary['subgroup'] == 'Healthy Control']

# Calculate mean and std for each age group and sex in Healthy Control
age_sex_group_stats = healthy_controls.groupby(['age_group', 'SEX_x'])['mean_putamen'].agg(['mean', 'std']).reset_index()
age_sex_group_stats.rename(columns={'mean': 'mean_putamen_healthy', 'std': 'std_putamen_healthy'}, inplace=True)

# Merge statistics back into the summary DataFrame
df_summary = df_summary.merge(age_sex_group_stats, on=['age_group', 'SEX_x'], how='left')

# Function for normalizing mean_putamen
def normalise_putamen(row):
    if pd.isnull(row['mean_putamen']) or pd.isnull(row['mean_putamen_healthy']) or pd.isnull(row['std_putamen_healthy']):
        return np.nan
    return (row['mean_putamen'] - row['mean_putamen_healthy']) / row['std_putamen_healthy']

# Apply normalization
df_summary['normalised_mean_putamen'] = df_summary.apply(normalise_putamen, axis=1)

# Drop irrelevant columns
df_summary.drop(columns=['age_group', 'mean_putamen_healthy', 'std_putamen_healthy'], inplace=True)

# Normalize between 0-1
min_normalised = df_summary['normalised_mean_putamen'].min()
max_normalised = df_summary['normalised_mean_putamen'].max()
df_summary['percent_mean_putamen'] = (df_summary['normalised_mean_putamen'] - min_normalised) / (max_normalised - min_normalised)


# Merge normalized data back into the original DataFrame
df_FinalConcat = df_FinalConcat.merge(df_summary[['PATNO', 'normalised_mean_putamen', 'percent_mean_putamen']], on='PATNO', how='left')

# Check the resulting DataFrame
print(df_summary[['PATNO', 'mean_putamen']].head())

MergeError: Passing 'suffixes' which cause duplicate columns {'normalised_mean_putamen_x', 'percent_mean_putamen_x'} is not allowed.

In [70]:
# ########Quantify Time Since Diagnosis (v2 - not final yet)
# df_FinalConcat['age_at_visit'] = pd.to_numeric(df_FinalConcat['age_at_visit'], errors='coerce').astype(float)
# df_FinalConcat['agediag'] = pd.to_numeric(df_FinalConcat['agediag'], errors='coerce').astype(float)
# df_FinalConcat['ageonset'] = pd.to_numeric(df_FinalConcat['ageonset'], errors='coerce').astype(float)

# df_FinalConcat['years_since_diagnosis'] = df_FinalConcat['age_at_visit'] - df_FinalConcat['agediag']
# df_FinalConcat['years_since_onset'] = df_FinalConcat['age_at_visit'] - df_FinalConcat['ageonset']


# #For sporadic patients
# df_FinalConcat['years_since_diagnosis'] = df_FinalConcat.apply(
#     lambda row: row['age_at_visit'] - row['agediag'] if row['subgroup'] == 'Sporadic' or 'Genetic' else None,
#     axis=1
# )
# # For prodromal patients, calculate the diagnosis point based on first 'PRIMDIAG' of 01
# def calculate_years_since_diagnosis(row, primdiag_dates):
#     if row['subgroup'] in ['Hyposomia', 'RBD +/- Hyposomia']:
#         diagnosis_age = primdiag_dates.get(row['PATNO'], None)
#         if diagnosis_age:
#             return row['age_at_visit'] - diagnosis_age
#     return None
# # Find the first occurrence of 'PRIMDIAG' == 01 for each patient
# prodromal_patients = df_FinalConcat[df_FinalConcat['PRIMDIAG'] == 1]
# primdiag_dates = prodromal_patients.groupby('PATNO')['age_at_visit'].min().to_dict()
# # Apply the function to calculate years since diagnosis for prodromal patients
# df_FinalConcat['years_since_diagnosis'] = df_FinalConcat.apply(
#     lambda row: calculate_years_since_diagnosis(row, primdiag_dates) if pd.isna(row['years_since_diagnosis']) else row['years_since_diagnosis'],
#     axis=1
# )
# df_FinalConcat['years_since_diagnosis'] = pd.to_numeric(df_FinalConcat['years_since_diagnosis'], errors='coerce')
# print("FinalConcat_Rows: ",len(df_FinalConcat),"FinalConcat_Columns: ",len(df_FinalConcat.columns))