# RARAS - Descriptive analysis of the Retrospective study

## Config and load data

In [1]:
import pandas as pd
from scripts.functions import custom_value_counts, create_descriptive_table, create_disease_count

# Terminal colors
R = '\033[0m'
C = '\033[1;36m'
Y = '\033[1m\033[38;5;220m'

# Pandas options
pd.options.display.float_format = '{:.2f}'.format

# Apply the decorator to the original value_counts method
pd.Series.value_counts = custom_value_counts(pd.Series.value_counts)
pd.DataFrame.value_counts = custom_value_counts(pd.DataFrame.value_counts)

# Load files
id_df = pd.read_csv('data/identification.csv')
diag_df = pd.read_csv('data/diagnostic.csv')
trat_df = pd.read_csv('data/treatment.csv')
trat_type_df = pd.read_csv('data/treatment_type.csv')
followup_df = pd.read_csv('data/follow_up.csv')

# Create masks
confirmed_diag_mask = diag_df['diagnostic_status'] == 'Confirmed diagnosis'
suspect_diag_mask = diag_df['diagnostic_status'] == 'Suspected diagnosis'
undiagnosed_mask = diag_df['diagnostic_status'] == 'Undiagnosed'

id_df.shape, diag_df.shape, trat_df.shape, trat_type_df.shape, followup_df.shape

((12269, 18), (12261, 20), (11871, 9), (9561, 9), (12269, 10))

## Identification features ✔

In [2]:
print(f'{Y}Identification form preview:{R}')
display(id_df.head())

[1m[38;5;220mIdentification form preview:[0m


Unnamed: 0,record_id,center_id,revised_consultation_date,first_consultation_date_center,first_consultation_date_specialty,birth_date,age,color_or_race,sex,birth_city,birth_region,birth_country,residence_region,residence_city,age_at_first_evaluation_at_center,age_at_first_evaluation_at_specialty,length_of_follow_up_at_center,length_of_follow_up_at_specialty
0,1,1,2018-01-04,2010-05-27,2010-05-27,1993-10-07,24.24,White,Female,Sapiranga - RS,South,,South,Nova Hartz - RS,16.64,16.64,7.61,7.61
1,2,1,2018-01-09,2000-05-18,2000-08-30,1956-07-01,61.52,White,Female,Porto Alegre - RS,South,,South,Porto Alegre - RS,43.88,44.16,17.65,17.36
2,3,1,2018-01-09,2007-09-24,2007-09-24,1984-03-04,33.85,White,Female,São Francisco de Paula - RS,South,,South,Parobé - RS,23.56,23.56,10.29,10.29
3,4,1,2018-01-11,2004-08-20,2004-08-20,1984-05-10,33.67,White,Female,Ibiraiaras - RS,South,,South,Porto Alegre - RS,20.28,20.28,13.39,13.39
4,5,1,2018-01-11,2014-10-20,,2001-11-01,16.19,White,Female,Bento Gonçalves - RS,South,,South,Bento Gonçalves - RS,12.97,,3.23,


In [3]:
print(f'{Y}Total participants:{R}', id_df['record_id'].nunique())
print(f'{Y}Total of centers:{R}', id_df['center_id'].nunique())
print(f'{Y}Total of cities of birth:{R}', id_df['birth_city'].nunique())
print(f'{Y}Number of record_ids in more than one center_id:{R}', 86)
# df = pd.read_csv('scripts/original_data/identificacao.csv')
# df = df[df['project_id']==17]
# df.groupby('nome')['id_centro'].nunique().value_counts()

[1m[38;5;220mTotal participants:[0m 12269
[1m[38;5;220mTotal of centers:[0m 34
[1m[38;5;220mTotal of cities of birth:[0m 1723
[1m[38;5;220mNumber of record_ids in more than one center_id:[0m 86


## Diagnostic features ✔

In [4]:
print(f'{Y}Diagnostic form preview:{R}')
display(diag_df.head())

print(f"{Y}Number of symptoms mean:{R} {diag_df['symptoms'].dropna().str.split(', ').apply(len).mean():.2f}")
print(f'{Y}Number of confirmed diagnoses, grouped by participant{R}')
print(
    diag_df
    [confirmed_diag_mask]
    .groupby('record_id')
    ['diagnostic_status'].count().value_counts()
    .reset_index()
    .rename(columns={
        'index': 'N diagnoses',
        'diagnostic_status': 'N participants'
    })
    .to_string(index=False)
)

print(f'{Y}Type of diagnostic distribution{R}')
print(diag_df[confirmed_diag_mask]['diagnostic_type']
 .dropna().apply(lambda x: x.split(' ')[0])
 .value_counts(total=True, ascending=False).to_string())

print(f'{Y}Etiological diagnostic distribution, among confirmed diagnoses{R}')
_mask = diag_df['diagnostic_type'].str.contains('Etiologic').fillna(False)
print(diag_df[_mask & confirmed_diag_mask]['diagnostic_type'].value_counts(total=True, ascending=False).to_string())
print(f'{Y}Most common disorders, by ORPHA code:{R}')
print(diag_df[confirmed_diag_mask]['disease_orpha'].value_counts().head().to_string())

diseases_columns = ['disease_orpha', 'disease_omim', 'disease_cid10']
total = diag_df[diseases_columns].notna().sum().sum()
print(f'{Y}Terminologies{R}')
for name, col in zip(['ORPHA', 'OMIM', 'CID10'], diseases_columns):
    print(name, '\t', disease_sum := diag_df[col].notna().sum(), '\t', f"({disease_sum / total * 100:.2f}%)")
    
print('Most common disorders, excluding newborn screening')
_mask = diag_df['diagnostic_moment'] != 'Newborn screening'
s = create_disease_count(diag_df[_mask][['disease_orpha', 'disease_omim', 'disease_cid10']])
display(s.to_frame())

[1m[38;5;220mDiagnostic form preview:[0m


Unnamed: 0,record_id,center_id,instance_id,diagnostic_status,disease_cid10,disease_orpha,disease_omim,diagnostic_payer_source,diagnostic_moment,diagnostic_date,symptoms,family_recurrence,consanguinity,age_at_symptoms_onset_days,diagnostic_type,maternal_age_at_birth,paternal_age_at_birth,age_at_symptoms_onset,age_at_diagnosis,diagnostic_odyssey
0,1,1,1,Confirmed diagnosis,,"110, Síndrome de Bardet-Biedl",,,Postnatal,2010-05-27,"HP:0001830, HP:0001956, HP:0002514",Yes,No,0.0,Clinical,,,0.0,16.64,16.64
1,2,1,1,Confirmed diagnosis,,"355, Doença de Gaucher (termo genérico)",,Pesquisa clínica,Postnatal,,"HP:0004859, HP:0001744, HP:0001903",No,No,10950.0,Molecular (Etiological),,,29.98,,
2,3,1,1,Confirmed diagnosis,,"273, Distrofia miotónica de Steinert",,,Postnatal,,"HP:0002058, HP:0020034",Yes,No,7300.0,Molecular (Etiological),,26.0,19.99,,
3,4,1,1,Confirmed diagnosis,,"579, Mucopolissacaridose tipo 1 (MPS1)",,SUS,Postnatal,2004-10-01,"HP:0010535, HP:0007957, HP:0002828, HP:0000280...",No,No,2190.0,Biochemical (Etiological),24.0,27.0,6.0,20.39,20.38
4,5,1,1,Confirmed diagnosis,,"315306, Hiperplasia adrenal congénita classica...",,SUS,Newborn screening,,"HP:0001007, HP:0000062",No,No,0.0,Biochemical (Etiological),,,0.0,,


[1m[38;5;220mNumber of symptoms mean:[0m 2.83
[1m[38;5;220mNumber of confirmed diagnoses, grouped by participant[0m
 N diagnoses  N participants
           1            7667
           2              65
           3               2
[1m[38;5;220mType of diagnostic distribution[0m
                        n     %
Clinical             2767 36.34
Biochemical          2067 27.14
Molecular            1497 19.66
Cytogenetic           613  8.05
Anatomopathological   588  7.72
Etiological            83  1.09
[1m[38;5;220mEtiological diagnostic distribution, among confirmed diagnoses[0m
                                      n     %
Biochemical (Etiological)          2067 42.64
Molecular (Etiological)            1496 30.86
Cytogenetic (Etiological)           613 12.65
Anatomopathological (Etiological)   588 12.13
Etiological                          83  1.71
[1m[38;5;220mMost common disorders, by ORPHA code:[0m
716, Fenilcetonúria                         530
98896, Distrofia muscula

Unnamed: 0,Unnamed: 1,n
Phenylketonuria (n=166),orpha 716,135
Phenylketonuria (n=166),omim 261600,24
Phenylketonuria (n=166),cid10 E70.0,7
Cystic Fibrosis (n=383),cid10 E84.9,150
Cystic Fibrosis (n=383),cid10 E84.0,81
Cystic Fibrosis (n=383),orpha 586,74
Cystic Fibrosis (n=383),cid10 E84.8,66
Cystic Fibrosis (n=383),cid10 E84.1,9
Cystic Fibrosis (n=383),omim 219700,3
Acromegaly (n=375),cid10 E22.0,277


## Treatment features


In [5]:
print(f'{Y}Treatment form preview:{R}')
display(trat_df.head())
display(trat_type_df.head())

[1m[38;5;220mTreatment form preview:[0m


Unnamed: 0,record_id,center_id,instance_id,treatment_related_to_rare_disease,diag_instance_id,follows_other_specialty,other_specialty,medical_specialty,all_specialties
0,1,1,1,Related to rare disease,1.0,Yes,,nefrologia,nefrologia
1,2,1,1,Related to rare disease,1.0,No,,,
2,3,1,1,Related to rare disease,1.0,Yes,"nutricao, ginecologia, pneumologia, enfermagem...","neurologia, oftalmologia","neurologia, oftalmologia, nutricao, ginecologi..."
3,4,1,1,Related to rare disease,1.0,Yes,,"cardiologia, dermatologia, oftalmologia, otorr...","cardiologia, dermatologia, oftalmologia, otorr..."
4,5,1,1,Related to rare disease,1.0,Yes,cirurgia pediatrica,endocrinologia,"endocrinologia, cirurgia pediatrica"


Unnamed: 0,record_id,center_id,instance_id,treatment_start_date,treatment_payer_source,treatment_type,description,data_nascimento,age_at_treatment_start
0,7,1,1,,sus,Diet therapy,,2010-02-22,
1,8,1,1,,sus,Diet therapy,,1997-10-12,
2,9,1,1,,sus,Diet therapy,,1995-10-20,
3,11,1,1,2017-09-01,sus,Diet therapy,,2016-08-10,1.06
4,18,1,1,2013-07-30,sus,Diet therapy,,2013-07-16,0.04


## Hospitalization and deaths ✔

In [6]:
print(f'{Y}Follow-up form preview:{R}')
display(followup_df.head())

death_count = followup_df['death'].map({'Yes': True, 'No': False}).sum()
hospitalization_count = followup_df['previous_hospitalization'].map({'Yes': True, 'No': False}).sum()
autopsy_count = followup_df['autopsy_performed'].map({'Yes': True, 'No': False}).sum()
print(f'{Y}Number of deaths:{R}', death_count, 
      f"({death_count / followup_df['record_id'].nunique() * 100:.2f}%)")

print(f'{Y}Number of hospitalizations:{R}', hospitalization_count, 
      f"({hospitalization_count / followup_df['record_id'].nunique() * 100:.2f}%)")

print(f'{Y}Number of autopsies:{R}', autopsy_count,
        f"({autopsy_count / followup_df['autopsy_performed'].dropna().size * 100:.2f}%)")

[1m[38;5;220mFollow-up form preview:[0m


Unnamed: 0,record_id,center_id,number_of_hospitalizations,hospitalization_date,cid10_hospitalization,previous_hospitalization,death_date,autopsy_performed,cid10_death,death
0,1,1,,,,No,,,,No
1,2,1,,,,No,,,,No
2,3,1,7.0,2019-10-11,J18.0,Yes,,,,No
3,4,1,6.0,2016-04-03,,Yes,,,,No
4,5,1,1.0,2019-08-01,N82.1,Yes,,,,No


[1m[38;5;220mNumber of deaths:[0m 172 (1.40%)
[1m[38;5;220mNumber of hospitalizations:[0m 4885 (39.82%)
[1m[38;5;220mNumber of autopsies:[0m 18 (14.17%)


--- 
## Categorical features

In [7]:
# Select categorical features
id_categorical_columns = ['color_or_race', 'sex', 'birth_region', 'residence_region']
diag_categorical_columns = ['diagnostic_status', 'diagnostic_payer_source', 
                            'diagnostic_moment', 'family_recurrence', 'consanguinity', 
                            'diagnostic_type']
trat_categorical_columns = ['treatment_related_to_rare_disease', 'follows_other_specialty']
trat_type_categorical_columns = ['treatment_type', 'treatment_payer_source']
followup_categorical_columns = ['previous_hospitalization', 'death', 'autopsy_performed']

pd.concat([
    create_descriptive_table(id_df, id_categorical_columns),
    create_descriptive_table(diag_df, diag_categorical_columns),
    create_descriptive_table(trat_df, trat_categorical_columns),
    create_descriptive_table(trat_type_df, trat_type_categorical_columns),
    create_descriptive_table(followup_df, followup_categorical_columns),
])

Unnamed: 0_level_0,Unnamed: 1_level_0,n,%
feature,value,Unnamed: 2_level_1,Unnamed: 3_level_1
Color or race,Brown,5013,47.45
Color or race,White,4847,45.88
Color or race,Black,606,5.74
Color or race,Yellow,68,0.64
Color or race,Indigenous,30,0.28
Sex,Female,6198,50.54
Sex,Male,6053,49.36
Sex,Undetermined,13,0.11
Birth region,Southeast,3731,38.15
Birth region,Northeast,3713,37.97


--- 
## Numerical features

In [8]:
# Select numeric features
id_numeric_columns = ['age', 'age_at_first_evaluation_at_center', 'age_at_first_evaluation_at_specialty', 
                      'length_of_follow_up_at_center', 'length_of_follow_up_at_specialty']
diag_numeric_columns = ['diagnostic_odyssey', 'age_at_symptoms_onset', 'maternal_age_at_birth', 'paternal_age_at_birth']
type_trat_numeric_columns_map = {
    'Diet therapy': 'diet_age_treatment_start',
    'Drug therapy': 'drug_age_treatment_start',
    'Other': 'other_age_treatment_start',
    'Rehabilitation': 'rehab_age_treatment_start',
}

In [9]:
print(f'{Y}All diagnostic status:{R}')

pd.concat([
    id_df[id_numeric_columns].describe(),
    diag_df[diag_numeric_columns].describe(),
    followup_df.dropna(subset='number_of_hospitalizations').groupby('record_id')['number_of_hospitalizations'].sum().describe(),
    trat_type_df.groupby('treatment_type')['age_at_treatment_start'].describe().T.rename(columns=type_trat_numeric_columns_map)
], axis=1).T

[1m[38;5;220mAll diagnostic status:[0m


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,12222.0,19.6,19.33,0.0,4.89,12.74,28.77,95.76
age_at_first_evaluation_at_center,11595.0,14.59,18.72,0.0,0.9,6.26,20.81,95.68
age_at_first_evaluation_at_specialty,11328.0,15.28,18.85,0.0,1.39,7.29,21.47,95.07
length_of_follow_up_at_center,11619.0,5.33,6.33,0.0,0.58,2.76,7.93,47.93
length_of_follow_up_at_specialty,11252.0,4.07,5.68,0.0,0.15,1.73,5.36,47.93
diagnostic_odyssey,5030.0,17.75,19.81,0.0,1.99,8.99,30.02,88.73
age_at_symptoms_onset,9272.0,9.19,16.63,0.0,0.0,0.82,8.99,87.94
maternal_age_at_birth,4812.0,27.71,7.06,12.0,22.0,27.0,33.0,63.0
paternal_age_at_birth,3975.0,31.72,8.39,12.0,25.0,31.0,37.0,79.0
number_of_hospitalizations,4303.0,4.15,14.2,0.0,1.0,2.0,3.0,379.0


In [10]:
print(f'{Y}Only confirmed diagnoses:{R}')

_record_id_mask = diag_df[confirmed_diag_mask]['record_id'].unique()

pd.concat([
    id_df[id_df['record_id'].isin(_record_id_mask)][id_numeric_columns].describe(),
    diag_df[diag_df['record_id'].isin(_record_id_mask)][diag_numeric_columns].describe(),
    followup_df[followup_df['record_id'].isin(_record_id_mask)]
        .groupby('record_id')['number_of_hospitalizations'].sum().describe(),
    trat_type_df[trat_type_df['record_id'].isin(_record_id_mask)]
        .groupby('treatment_type')['age_at_treatment_start'].describe().T.rename(columns=type_trat_numeric_columns_map),
], axis=1).T

[1m[38;5;220mOnly confirmed diagnoses:[0m


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,7725.0,22.16,20.08,0.0,6.07,15.16,34.44,88.97
age_at_first_evaluation_at_center,7388.0,16.19,19.64,0.0,0.75,7.33,26.85,86.85
age_at_first_evaluation_at_specialty,7059.0,16.72,19.76,0.0,1.11,8.15,27.66,88.97
length_of_follow_up_at_center,7388.0,6.19,6.7,0.0,1.0,3.76,9.47,47.93
length_of_follow_up_at_specialty,7002.0,5.1,6.23,0.0,0.61,2.76,7.3,47.93
diagnostic_odyssey,4129.0,18.44,20.1,0.0,2.0,9.56,31.96,88.73
age_at_symptoms_onset,5945.0,10.99,17.89,0.0,0.0,1.0,13.99,87.94
maternal_age_at_birth,2595.0,27.7,7.11,12.0,22.0,27.0,33.0,63.0
paternal_age_at_birth,2144.0,31.72,8.37,12.0,25.0,31.0,37.0,68.0
number_of_hospitalizations,7734.0,1.87,10.67,0.0,0.0,0.0,1.0,379.0


--- 
# Annex

## Figure 1.a
Histogram of Age and Sex Distribution of Participants (n=12,502) and (b) diagnostic status. (N= 12,279)

## Figure 1.b

## Table 1 ✔
Sample characterization

In [11]:
df = create_descriptive_table(id_df, ['color_or_race', 'sex', 'birth_region', 'residence_region'])
display(df)

Unnamed: 0_level_0,Unnamed: 1_level_0,n,%
feature,value,Unnamed: 2_level_1,Unnamed: 3_level_1
Color or race,Brown,5013,47.45
Color or race,White,4847,45.88
Color or race,Black,606,5.74
Color or race,Yellow,68,0.64
Color or race,Indigenous,30,0.28
Sex,Female,6198,50.54
Sex,Male,6053,49.36
Sex,Undetermined,13,0.11
Birth region,Southeast,3731,38.15
Birth region,Northeast,3713,37.97


## Table 2
Most frequent diagnoses in RARAS and applied coding

In [12]:
s = create_disease_count(diag_df[_mask][['disease_orpha', 'disease_omim', 'disease_cid10']])
display(s.to_frame())

Unnamed: 0,Unnamed: 1,n
Phenylketonuria (n=166),orpha 716,135
Phenylketonuria (n=166),omim 261600,24
Phenylketonuria (n=166),cid10 E70.0,7
Cystic Fibrosis (n=383),cid10 E84.9,150
Cystic Fibrosis (n=383),cid10 E84.0,81
Cystic Fibrosis (n=383),orpha 586,74
Cystic Fibrosis (n=383),cid10 E84.8,66
Cystic Fibrosis (n=383),cid10 E84.1,9
Cystic Fibrosis (n=383),omim 219700,3
Acromegaly (n=375),cid10 E22.0,277


## Table 3 ✔
Most Frequent Signs and Symptoms, Causes of Hospitalization, and Causes of Death

In [13]:
print(f'{Y}Most Frequent Signs and Symptoms{R}')
display(diag_df['symptoms'].str.split(', ').explode().str.strip().value_counts().head(10))

print(f'{Y}Most Frequent Causes of Hospitalization{R}')
display(followup_df['cid10_hospitalization'].value_counts().head(10))

print(f'{Y}Most Frequent Causes of Death{R}')
display(followup_df['cid10_death'].value_counts().head(10))

[1m[38;5;220mMost Frequent Signs and Symptoms[0m


HP:0001263    1234
HP:0001250     728
HP:0004322     668
HP:0001249     507
HP:0001252     447
HP:0005982     391
HP:0001324     386
HP:0002315     329
HP:0000252     319
HP:0002015     298
Name: symptoms, dtype: int64

[1m[38;5;220mMost Frequent Causes of Hospitalization[0m


E22.0    188
Q78.0    160
E84      123
G12.2     85
J18       65
J18.9     54
E25       50
E84.0     46
R56       37
G71.0     33
Name: cid10_hospitalization, dtype: int64

[1m[38;5;220mMost Frequent Causes of Death[0m


G12.2    28
E84      10
I46       6
J96       3
J96.0     3
R09.2     3
J96.9     2
A41.9     2
A41       2
J38.4     2
Name: cid10_death, dtype: int64

## Table 4 
Comparative analysis based on diagnostic status

In [14]:
# Create masks to filter by record_id
status_ids = {
    'Confirmed': diag_df['record_id'][confirmed_diag_mask],
    'Suspect': diag_df['record_id'][suspect_diag_mask],
    'Undiagnosed': diag_df['record_id'][undiagnosed_mask],
}

# Setup variables to group categorical features
id_numeric_columns = [
    'age', 'age_at_first_evaluation_at_center', 'age_at_first_evaluation_at_specialty', 
    'length_of_follow_up_at_center', 'length_of_follow_up_at_specialty']
diag_numeric_columns = ['age_at_symptoms_onset', 'maternal_age_at_birth', 'paternal_age_at_birth']
followup_numeric_columns = ['number_of_hospitalizations']
join_mean_std_columns = lambda x: x['mean'].round(2).astype(str) + ' (±' + x['std'].round(2).astype(str) + ')'
joined_column_name = 'Mean (±SD)'
describe_columns = ['mean', 'std']

numerical_grouped_df = pd.DataFrame()
for status_name, status_mask in status_ids.items():
    partial_df = (
        pd.concat([
            df[df['record_id'].isin(status_mask)][columns].describe().T[describe_columns]
            for df, columns in [
                (id_df, id_numeric_columns),
                (diag_df, diag_numeric_columns),
                (followup_df, followup_numeric_columns),
            ]
        ])
        .assign(joined_column=join_mean_std_columns)
        .drop(columns=describe_columns)
        .rename(columns={'joined_column': joined_column_name})
        )
    numerical_grouped_df = pd.concat([numerical_grouped_df, partial_df], axis=1)

# Add Mulindex
numerical_grouped_df.columns = pd.MultiIndex.from_product([status_ids.keys(), [joined_column_name]])

In [15]:
# Setup variables to group categorical features
id_categorical_columns = ['color_or_race', 'sex', 'birth_region', 'residence_region']
diag_categorical_columns = ['family_recurrence', 'consanguinity']
followup_categorical_columns = ['previous_hospitalization', 'death']
trat_categorical_columns = ['treatment_related_to_rare_disease']
join_categorical_columns = lambda x: x['n'].round(2).astype(str) + ' (' + x['%'].round(2).astype(str) + '%)'
joined_column_name = 'N (%)'
value_count_columns = ['n', '%']

categorical_grouped_df = pd.DataFrame()
for status_name, status_mask in status_ids.items():
    partial_df = (
        pd.concat([
            create_descriptive_table(df[df['record_id'].isin(status_mask)], columns)
            for df, columns in [
                (id_df, id_categorical_columns),
                (diag_df, diag_categorical_columns),
                (followup_df, followup_categorical_columns),
                (trat_df, trat_categorical_columns),
            ]
        ])
        .assign(joined_column=join_categorical_columns)
        .drop(columns=value_count_columns)
        .rename(columns={'joined_column': joined_column_name})
        )
    categorical_grouped_df = pd.concat([categorical_grouped_df, partial_df], axis=1)

# Add Mulindex
categorical_grouped_df.columns = pd.MultiIndex.from_product([status_ids.keys(), [joined_column_name]])

In [16]:
print(f'{Y}Confirmed diagnosis:{R} {confirmed_diag_mask.sum()}')
print(f'{Y}Suspected diagnosis:{R} {suspect_diag_mask.sum()}')
print(f'{Y}Undiagnosed:{R} {undiagnosed_mask.sum()}')

display(numerical_grouped_df)
display(categorical_grouped_df)

print(f'{C}* For this analysis, each diagnosis was evaluated independently, considering that a participant may have more than one RD diagnosis.{R}')

[1m[38;5;220mConfirmed diagnosis:[0m 7803
[1m[38;5;220mSuspected diagnosis:[0m 2299
[1m[38;5;220mUndiagnosed:[0m 2159


Unnamed: 0_level_0,Confirmed,Suspect,Undiagnosed
Unnamed: 0_level_1,Mean (±SD),Mean (±SD),Mean (±SD)
age,22.16 (±20.08),17.15 (±18.03),13.36 (±15.93)
age_at_first_evaluation_at_center,16.19 (±19.64),13.77 (±17.52),9.84 (±15.55)
age_at_first_evaluation_at_specialty,16.72 (±19.76),14.71 (±17.92),11.2 (±15.94)
length_of_follow_up_at_center,6.19 (±6.7),3.83 (±5.53),3.87 (±5.04)
length_of_follow_up_at_specialty,5.1 (±6.23),2.51 (±4.44),2.25 (±3.79)
age_at_symptoms_onset,10.99 (±17.89),8.62 (±16.27),3.44 (±9.48)
maternal_age_at_birth,27.7 (±7.11),27.45 (±6.92),27.92 (±7.05)
paternal_age_at_birth,31.72 (±8.37),31.71 (±8.27),31.74 (±8.52)
number_of_hospitalizations,4.77 (±16.64),2.89 (±5.03),2.45 (±3.59)


Unnamed: 0_level_0,Unnamed: 1_level_0,Confirmed,Suspect,Undiagnosed
Unnamed: 0_level_1,Unnamed: 1_level_1,N (%),N (%),N (%)
feature,value,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Color or race,White,3156 (47.33%),761 (39.09%),909 (48.48%)
Color or race,Brown,3036 (45.53%),1068 (54.85%),862 (45.97%)
Color or race,Black,415 (6.22%),102 (5.24%),86 (4.59%)
Color or race,Yellow,40 (0.6%),11 (0.56%),14 (0.75%)
Color or race,Indigenous,21 (0.31%),5 (0.26%),4 (0.21%)
Sex,Female,4108 (53.13%),1074 (47.08%),971 (45.02%)
Sex,Male,3617 (46.78%),1202 (52.7%),1185 (54.94%)
Sex,Undetermined,7 (0.09%),5 (0.22%),1 (0.05%)
Birth region,Southeast,2412 (39.52%),592 (33.15%),703 (38.5%)
Birth region,Northeast,2190 (35.88%),739 (41.38%),753 (41.24%)


[1;36m* For this analysis, each diagnosis was evaluated independently, considering that a participant may have more than one RD diagnosis.[0m
