In [1]:
import pandas as pd 
import numpy as np 

In [2]:
metadata = pd.read_csv('../../data/metadata_demographic.csv')
metadata = metadata.drop_duplicates(subset='Participant_ID')
metadata = metadata.dropna(subset=['pd'])
metadata

Unnamed: 0,Protocol,Participant_ID,Task,gender,age,race,pd
0,SuperPD,NIHFT628PHTAY,ahhhh,female,70.0,White,no
1,SuperPD,NIHNT179KNNF4,ahhhh,female,70.0,White,yes
2,SuperPD,NIHYM875FLXFF,ahhhh,female,73.0,White,no
3,SuperPD,NIHBV117HUCTC,ahhhh,female,60.0,White,no
4,SuperPD,NIHZY217YWJA8,ahhhh,female,69.0,White,yes
...,...,...,...,...,...,...,...
1860,ValorPD,MOyJjyLX9hPvP3FSlLNYaG28xA23,,M,78.0,White,yes
1861,ValorPD,zn6iI7uiq0U0xWTG5fuwR7yX9IH3,,F,62.0,White,no
1862,ValorPD,XFJkSNMEpNgUnWg5Ouc6AWMOnQ82,,F,62.0,White,yes
1863,ValorPD,j4GZI8ZFugZHRPxCKLBHC3DFwZE2,,M,70.0,White,yes


In [3]:
def create_distribution_table_with_order(df, diagnosis_col, target_col, first_row_text, value_order=None):
    """
    Create a table showing counts and percentages for a specified column split by a diagnosis column.
    
    Parameters:
        df (pd.DataFrame): The input dataset.
        diagnosis_col (str): The column that indicates diagnosis (e.g., "Diagnosis").
        target_col (str): The column for which distribution is calculated (e.g., "Sex").
        first_row_text (str): The first row text for the table header (e.g., "Sex, n (%)").
        value_order (list): The preferred order of column values (e.g., ['Male', 'Female', ...]).
        
    Returns:
        pd.DataFrame: The distribution table.
    """
    # Calculate counts within each group (With PD and Without PD)
    with_pd_total = df[df[diagnosis_col] == 1].shape[0]
    without_pd_total = df[df[diagnosis_col] != 1].shape[0]

    # Counts for each target value in the groups
    with_pd_counts = df[df[diagnosis_col] == 1][target_col].value_counts()
    without_pd_counts = df[df[diagnosis_col] != 1][target_col].value_counts()
    total_counts = df[target_col].value_counts()

    # Use value_order or infer from the data
    if value_order is None:
        value_order = total_counts.index.tolist()

    # Create the rows for each unique value in the preferred order
    rows = []
    for value in value_order:
        # Get counts for each group
        with_pd_count = with_pd_counts.get(value, 0)
        without_pd_count = without_pd_counts.get(value, 0)
        total_count = total_counts.get(value, 0)

        # Calculate percentages within each group
        with_pd_pct = (with_pd_count / with_pd_total) * 100 if with_pd_total > 0 else 0
        without_pd_pct = (without_pd_count / without_pd_total) * 100 if without_pd_total > 0 else 0
        total_pct = (total_count / df.shape[0]) * 100

        # Add row to the table
        rows.append([
            first_row_text,
            value,
            f"{with_pd_count} ({with_pd_pct:.1f}%)",
            f"{without_pd_count} ({without_pd_pct:.1f}%)",
            f"{total_count} ({total_pct:.1f}%)"
        ])

    # Create the table DataFrame
    table = pd.DataFrame(rows, columns=["Demographic Property", "Attribute", "With PD", "Without PD", "Total"])

    # # Add the first row text
    # first_row = pd.DataFrame([{
    #     "Demographic Property": first_row_text,
    #     "Attribute": "",
    #     "With PD": "",
    #     "Without PD": "",
    #     "Total": ""
    # }])

    # # Concatenate the header and the data rows
    # table = pd.concat([first_row, table], ignore_index=True)

    for index in range(1, len(table)):
        table.loc[index, 'Demographic Property'] = ''
    
    return table


In [4]:
metadata['pd'].value_counts(dropna=False)

pd
no          1121
yes          613
Unlikely      64
PD            38
Possible      18
Control       10
Probable       1
Name: count, dtype: int64

In [5]:
# Mapping the 'pd' column to binary values (0 and 1)
# Assuming 'no', 'Control', and 'Unlikely' are mapped to 0 (non-PD)
# and the rest are mapped to 1 (PD)

pd_map = {
    'no': 0,
    'Control': 0,
    'Unlikely': 0,
    'yes': 1,
    'Possible': 1,
    'PD': 1,
    'Probable': 1
}

metadata['Diagnosis'] = metadata['pd'].map(pd_map)


In [6]:
metadata['Diagnosis'].value_counts(dropna=False)

Diagnosis
0    1195
1     670
Name: count, dtype: int64

In [7]:
# Calculate counts and percentages for 'With PD' and 'Without PD'
with_pd_count = (metadata['Diagnosis'] == 1).sum()  # Assuming '1' indicates 'With PD'
without_pd_count = (metadata['Diagnosis'] != 1).sum()
total_count = with_pd_count + without_pd_count

with_pd_percentage = (with_pd_count / total_count) * 100
without_pd_percentage = (without_pd_count / total_count) * 100

# Create the final table structure
table_dict = {
    "Demographic Property": "",
    "Attribute": ["Number of Participants, n (%)"],
    "With PD": [f"{with_pd_count} ({with_pd_percentage:.1f}%)"],
    "Without PD": [f"{without_pd_count} ({without_pd_percentage:.1f}%)"],
    "Total": [f"{total_count} (100%)"]
}

# Convert to a DataFrame for display
pd_count_table = pd.DataFrame(table_dict)

pd_count_table

Unnamed: 0,Demographic Property,Attribute,With PD,Without PD,Total
0,,"Number of Participants, n (%)",670 (35.9%),1195 (64.1%),1865 (100%)


In [8]:
metadata['gender'].value_counts()

gender
female                   777
male                     708
Female                   111
Male                      99
F                         88
M                         79
Prefer not to respond      2
Nonbinary                  1
Name: count, dtype: int64

In [9]:
gender_map = {
    "female": "Female",
    "Female": "Female",
    'F': 'Female',
    "male": "Male",
    "Male": "Male",
    'M': 'Male',
    "Prefer not to respond": "Unknown",
    "Nonbinary": "Non-Binary"
}
metadata['gender_normalized'] = metadata['gender'].map(gender_map)
metadata['gender_normalized'].value_counts(dropna=False)

gender_normalized
Female        976
Male          886
Unknown         2
Non-Binary      1
Name: count, dtype: int64

In [10]:
# Define the preferred order
preferred_order = ['Female', 'Male', 'Non-Binary', 'Unknown']

# Call the function with the preferred order
sex_table_with_order = create_distribution_table_with_order(
    df=metadata,
    diagnosis_col="Diagnosis",
    target_col="gender_normalized",
    first_row_text="Sex",
    value_order=preferred_order
)


sex_table_with_order

Unnamed: 0,Demographic Property,Attribute,With PD,Without PD,Total
0,Sex,Female,287 (42.8%),689 (57.7%),976 (52.3%)
1,,Male,380 (56.7%),506 (42.3%),886 (47.5%)
2,,Non-Binary,1 (0.1%),0 (0.0%),1 (0.1%)
3,,Unknown,2 (0.3%),0 (0.0%),2 (0.1%)


In [11]:
metadata['age'].value_counts(dropna=False)

age
60.000000    90
NaN          85
66.000000    81
67.000000    78
62.000000    75
             ..
72.365620     1
76.100125     1
78.816129     1
75.169237     1
68.653018     1
Name: count, Length: 124, dtype: int64

In [12]:
metadata['age'] = metadata['age'].apply(lambda x: np.nan if x < 15 or x > 100 else x)
# Display the value counts of the 'age' column sorted by ascending order of the age values
metadata['age'].value_counts(dropna=False).sort_index()


age
16.0     2
17.0     3
18.0     2
19.0     8
20.0    10
        ..
87.0     4
89.0     2
91.0     1
93.0     1
NaN     86
Name: count, Length: 123, dtype: int64

In [13]:
# Initialize 'age_normalized' column with NaN
metadata['age_normalized'] = np.nan

# Ensure 'age' is numeric where possible
def safe_numeric(x):
    try:
        return float(x)
    except (ValueError, TypeError):
        return np.nan

metadata['age_numeric'] = metadata['age'].apply(safe_numeric)

# Define conditions for numeric age ranges
conditions = [
    metadata['age_numeric'] < 20,
    (metadata['age_numeric'] >= 20) & (metadata['age_numeric'] <= 39),
    (metadata['age_numeric'] >= 40) & (metadata['age_numeric'] <= 59),
    (metadata['age_numeric'] >= 60) & (metadata['age_numeric'] <= 79),
    metadata['age_numeric'] >= 80
]

# Define labels for the conditions
age_labels = [
    '< 20',
    '20 - 39',
    '40 - 59',
    '60 - 79',
    '>= 80'
]

# Apply conditions to normalize 'age'
metadata['age_normalized'] = np.select(
    conditions,
    age_labels,
    default='Not Mentioned'
)


In [14]:
import numpy as np

def process_age(age):
    try:
        # If the age is already numeric, return it
        if isinstance(age, (int, float)):
            return float(age)
        # If the age is a range like "50-60", calculate the mean
        if isinstance(age, str) and '-' in age:
            start, end = map(float, age.split('-'))
            return (start + end) / 2
    except:
        pass
    # Return NaN for invalid entries
    return np.nan

# Apply the processing function to handle all cases
metadata['age_processed'] = metadata['age'].apply(process_age)

# Calculate the mean and range (ignoring NaN values)
mean_age = metadata['age_processed'].mean()
min_age = metadata['age_processed'].min()
max_age = metadata['age_processed'].max()

# Print the results
print(f"Mean age: {mean_age:.2f}")
print(f"Age range: {min_age:.2f} - {max_age:.2f}")


Mean age: 60.80
Age range: 16.00 - 93.00


In [15]:
metadata['age_normalized'].value_counts(dropna=False)

age_normalized
60 - 79          1124
40 - 59           428
20 - 39           162
Not Mentioned      89
>= 80              47
< 20               15
Name: count, dtype: int64

In [16]:
# Define the preferred order
preferred_order = [
    '< 20',
    '20 - 39',
    '40 - 59',
    '60 - 79',
    '>= 80',
    'Not Mentioned'
]



# Call the function with the preferred order
age_table_with_order = create_distribution_table_with_order(
    df=metadata,
    diagnosis_col="Diagnosis",
    target_col="age_normalized",
    first_row_text=f"Age in years (range: {min_age:.1f} - {max_age:.1f}, mean: {mean_age:.1f}), n (%)",
    value_order=preferred_order
)


age_table_with_order

Unnamed: 0,Demographic Property,Attribute,With PD,Without PD,Total
0,"Age in years (range: 16.0 - 93.0, mean: 60.8),...",< 20,0 (0.0%),15 (1.3%),15 (0.8%)
1,,20 - 39,9 (1.3%),153 (12.8%),162 (8.7%)
2,,40 - 59,119 (17.8%),309 (25.9%),428 (22.9%)
3,,60 - 79,481 (71.8%),643 (53.8%),1124 (60.3%)
4,,>= 80,38 (5.7%),9 (0.8%),47 (2.5%)
5,,Not Mentioned,23 (3.4%),66 (5.5%),89 (4.8%)


In [17]:
metadata['race'].value_counts(dropna=False)

race
white,                                           478
white                                            342
White                                            340
['White']                                        219
NaN                                              163
white,race                                       108
asian,race                                        41
black,                                            39
['Black or African American']                     17
black                                             15
asian                                             15
asian,                                            11
['Asian']                                          7
on,                                                7
asian,white,                                       7
Prefer Not to Answer                               7
other,race                                         6
black,race                                         5
['Other']                                

In [18]:
def map_race_simplified(value):
    if isinstance(value, str):
        value = value.lower()  # Make case-insensitive
        if 'asian' in value:
            return "Asian"
        elif 'nativeamerican' in value or 'american indian' in value:
            return "American Indian or Alaska Native"
        elif 'black' in value:
            return "Black or African American"
        elif 'nativepacific' in value or 'hawaiian' in value:
            return "Native Hawaiian or Other Pacific Islander"
        elif 'white' in value:
            return "White"
        elif 'other' in value or 'on' in value:
            return "Others"
        elif 'prefer not to' in value or '[]' in value:
            return "Not Mentioned"
    return "Not Mentioned"  # Default to Unknown if value is not a string or doesn't match

# Apply the function to the 'race' column
metadata['race_normalized'] = metadata['race'].apply(map_race_simplified)

# Display the cleaned race column counts
print(metadata['race_normalized'].value_counts(dropna=False))

race_normalized
White                                        1487
Not Mentioned                                 175
Asian                                          88
Black or African American                      80
Others                                         24
American Indian or Alaska Native               10
Native Hawaiian or Other Pacific Islander       1
Name: count, dtype: int64


In [19]:
# Define the preferred order
preferred_order = [
    "White",
    "Asian",
    "Black or African American",
    "American Indian or Alaska Native",
    "Others",
    "Not Mentioned"
]

# Call the function with the preferred order
race_table_with_order = create_distribution_table_with_order(
    df=metadata,
    diagnosis_col="Diagnosis",
    target_col="race_normalized",
    first_row_text="Race, n (%)",
    value_order=preferred_order
)


race_table_with_order

Unnamed: 0,Demographic Property,Attribute,With PD,Without PD,Total
0,"Race, n (%)",White,503 (75.1%),984 (82.3%),1487 (79.7%)
1,,Asian,11 (1.6%),77 (6.4%),88 (4.7%)
2,,Black or African American,12 (1.8%),68 (5.7%),80 (4.3%)
3,,American Indian or Alaska Native,3 (0.4%),7 (0.6%),10 (0.5%)
4,,Others,8 (1.2%),16 (1.3%),24 (1.3%)
5,,Not Mentioned,133 (19.9%),42 (3.5%),175 (9.4%)


In [20]:
metadata['Protocol'].value_counts(dropna=False)

Protocol
ParkTest           1102
ValidationStudy     213
ValorPD             167
InMotion            149
ClusterPD            83
SuperPD_old          64
RoutePD              48
SuperPD              39
Name: count, dtype: int64

In [21]:
env_map = {
    'ParkTest':'Home-Global',
    'ValorPD': 'Clinic',
    'InMotion': 'PD Care Facility',
    'ValidationStudy': 'Clinic',
    'SuperPD': 'Clinic',
    'ClusterPD': 'Clinic',
    'SuperPD_old': 'Clinic',
    'SuperPD': 'Clinic',
    'RoutePD': 'PD Care Facility'
    
}

metadata['env'] = metadata['Protocol'].map(env_map)
metadata['env'].value_counts(dropna=False)

env
Home-Global         1102
Clinic               566
PD Care Facility     197
Name: count, dtype: int64

In [22]:
# Define the preferred order
preferred_order = [
    "Home-Global",
    "PD Care Facility",
    "Clinic",
]

# Call the function with the preferred order
env_table_with_order = create_distribution_table_with_order(
    df=metadata,
    diagnosis_col="Diagnosis",
    target_col="env",
    first_row_text="Recording Environment, n (%)",
    value_order=preferred_order
)
env_table_with_order

Unnamed: 0,Demographic Property,Attribute,With PD,Without PD,Total
0,"Recording Environment, n (%)",Home-Global,290 (43.3%),812 (67.9%),1102 (59.1%)
1,,PD Care Facility,162 (24.2%),35 (2.9%),197 (10.6%)
2,,Clinic,218 (32.5%),348 (29.1%),566 (30.3%)


In [23]:
total_table = pd.concat([
    pd_count_table, 
    sex_table_with_order, 
    age_table_with_order, 
    race_table_with_order, 
    env_table_with_order
], ignore_index=True)
total_table

Unnamed: 0,Demographic Property,Attribute,With PD,Without PD,Total
0,,"Number of Participants, n (%)",670 (35.9%),1195 (64.1%),1865 (100%)
1,Sex,Female,287 (42.8%),689 (57.7%),976 (52.3%)
2,,Male,380 (56.7%),506 (42.3%),886 (47.5%)
3,,Non-Binary,1 (0.1%),0 (0.0%),1 (0.1%)
4,,Unknown,2 (0.3%),0 (0.0%),2 (0.1%)
5,"Age in years (range: 16.0 - 93.0, mean: 60.8),...",< 20,0 (0.0%),15 (1.3%),15 (0.8%)
6,,20 - 39,9 (1.3%),153 (12.8%),162 (8.7%)
7,,40 - 59,119 (17.8%),309 (25.9%),428 (22.9%)
8,,60 - 79,481 (71.8%),643 (53.8%),1124 (60.3%)
9,,>= 80,38 (5.7%),9 (0.8%),47 (2.5%)


In [24]:
total_table.to_csv('../../data/demographic_table.csv', index=False)  