In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
#| echo: false
import pandas as pd
import os
import numpy as np
import redcap
from IPython.display import Markdown, display
import papermill as pm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\langhe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\langhe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
#| tags: [parameters]
root    = r"C:\Users\langhe\switchdrive\Private\Collaborations\2023 EDCPT ultrasound\WP3\dictionaries"
in_csv1 = os.path.join(root, r"IMCIPLUSRCTPilot_DataDictionary_2024-06-07.csv")
in_csv2 = os.path.join(root, r"IMCIPLUSRCTPilot_DataDictionary_2024-06-10.csv")

In [5]:
d1 = redcap.DataDic(in_csv1,
                    dtype = "RCT")
d2 = redcap.DataDic(in_csv2,
                    dtype = "RCT")

# Characteristics of the REDCap data dictionary

In [6]:
s1 = d2.printNumForms()

In [7]:
display(Markdown(f'{s1}'))

The REDCap data dictionary contains 14 forms.

In [8]:
#| label: tbl-forms
#| tbl-cap: "Number of variables by forms"
d2.getForms()

Unnamed: 0,Forms,N
0,a_enrollment,4
1,a_exit_interview,66
2,a_phone_followup_d8,20
3,admitted_in_ward_follow_up,26
4,b1_screening_registration,19
5,b2_screening_antibiotic_use,3
6,b3_screening_vitalsigns,53
7,b_day_8_inperson_followup,34
8,b_patient_information,15
9,b_spontaneous_follow_up,18


In [9]:
q2 = d2.getVariables()

In [10]:
#| tbl-cap: "Extract of the data dictionary variables (first 10 variables)"
q2[["Variables", "Forms", "Field Type", "Field Label"]].head(10)

Unnamed: 0,Variables,Forms,Field Type,Field Label
0,record_id,b1_screening_registration,text,Screening /Study ID
1,screening_date,b1_screening_registration,text,Screening date
2,q2_prev_enrolled,b1_screening_registration,radio,Was the child previously enrolled in this stud...
3,q2_b_enrollment_date,b1_screening_registration,yesno,Is date of enrolment known ?
4,q2_a,b1_screening_registration,text,Date of enrolment
5,q2i_unknown_date,b1_screening_registration,radio,"If unknown, was enrolment >28days ago"
6,scr_prevenrollment28d,b1_screening_registration,calc,Enrolment into IMCI-PLUS study in the past 28...
7,please_exit_screening_and,b1_screening_registration,descriptive,"<div class=""rich-text-field-label""><p><span st..."
8,child_dob,b1_screening_registration,text,Child DoB
9,unknown_dob,b1_screening_registration,yesno,Are you sure the date of birth is unknown?


In [11]:
#| tbl-cap: "List of personally identifiable information variables"
d2.getIdentifiers()

Unnamed: 0,Variables,Forms,Field Type,Field Label
81,l_name,b_patient_information,text,Last name:
82,f_name,b_patient_information,text,First Name
85,id_num,b_patient_information,text,Patient's National ID number
86,caregiver_name,b_patient_information,text,Primary Caregiver's Last name:
87,caregiver_surname,b_patient_information,text,Primary Caregiver's First name:
88,alt_contact_person,b_patient_information,text,Alternate Contact person (Name & Surname):
89,alt_contact_num,b_patient_information,text,Alternate Contact person (Cell phone number)
90,legal_rep,b_patient_information,text,Legal representative of child in the event of ...
91,legal_cell,b_patient_information,text,Cell phone number of the legal representative\...


# Comparison with a reference REDCap data dictionary

In [12]:
d1 = redcap.DataDic(in_csv1,
                    dtype = "RCT")

In [13]:
s2 = d2.compareNumForms(d1)

In [14]:
display(Markdown(f'{s2}'))

Increased number of forms: 14 vs. 11

In [15]:
_, identical_df, samename_df, renamed_df, added_df = d2.detectIdenticalVariables(d1)

## Same forms

In [16]:
same_forms_df = d2.detectModificationsInSameForms(d1)

In [17]:
#| tbl-cap: "Overview of modifications in same forms"
same_forms_df

Unnamed: 0,Forms,N,N0,No changes,Changes
0,admitted_in_ward_follow_up,26,31,8,5 Variables deleted
1,phone_followup_d29,13,13,13,Same number of variables


In [18]:
form_list = same_forms_df["Forms"].unique().tolist()
cols = [i for i in q2.columns.tolist() if i not in ["Forms", "index"]]

In [19]:
identicals = dict()
samenames = dict()
renamed = dict()
for k in form_list:
    identicals[k] = identical_df[identical_df["Forms"] == k]
    samenames[k] = samename_df[samename_df["Forms"] == k]
    renamed[k] = renamed_df[renamed_df["Forms"] == k]

### Identical variables

In [20]:
# Display results dynamically
for k, v in identicals.items():
    display(Markdown(f'#### Form: {k}'))
    display(Markdown(f'{len(v.index)} variables'))
    v = v[["Variables", "Field Type", "Field Label", "Choices, Calculations, OR Slider Labels"]]
    display(v)

#### Form: admitted_in_ward_follow_up

8 variables

Unnamed: 0,Variables,Field Type,Field Label,"Choices, Calculations, OR Slider Labels"
101,convulsions,radio,Convulsions with this illness,"1, Yes | 0, No"
149,desc_findings,descriptive,"<div class=""rich-text-field-label""><p><span st...",
211,follow_up_date,text,Date,
214,follow_up_desc,descriptive,"<div class=""rich-text-field-label""><p><span st...",
228,general_status,radio,General Status of Patient,"1, Improving | 2, Unchanged | 3, Worsening"
529,stridor_present,yesno,Stridor present,
556,vomiting,radio,Vomiting everything,"1, Yes | 0, No"
577,working_diagnosis,text,Working Diagnoses:,


#### Form: phone_followup_d29

13 variables

Unnamed: 0,Variables,Field Type,Field Label,"Choices, Calculations, OR Slider Labels"
6,admission_date_dd3afa,text,When admitted/re-admitted:,
108,d8_date_155715,text,Date,
114,d8_no_reason_c9c481,text,If No: Reason,
119,d8_phone_call_bc9be7,yesno,Did the D8 phone call visit F/up take place:,
131,death_cause_63ef7e,text,Suspected cause of death:,
134,death_date_eca754,text,"if death, when:",
160,discharged_date_b507cc,text,Discharged Date,
240,hospi_related_illness,yesno,Hospitalization related to initial illness?,
249,hospitalization_4d4ecb,radio,"Hospitalization (Defined as ""Slept overnight i...","1, Not Admitted | 2, Admitted on D1 (on day of..."
254,hospitalization_place_278cb2,text,Where:,


### Modified variables (name unchanged)

In [21]:
# Display results dynamically
for k, v in samenames.items():
    display(Markdown(f'#### Form: {k}'))
    display(Markdown(f'{len(v.index)} variables'))
    v = v[cols].dropna(axis=1, how="all")
    display(v)

#### Form: admitted_in_ward_follow_up

11 variables

Unnamed: 0,Variables,Section Header,Field Type,Field Label,"Choices, Calculations, OR Slider Labels",Text Validation Type OR Show Slider Number,Text Validation Min,Text Validation Max,Branching Logic (Show field only if...),Required Field?,Matrix Group Name
56,day,,text,Study Day,,integer,1.0,28.0,,y,
80,drink_inability,"If yes, which ones:",radio,Inability to drink/breastfeed (assessed by pro...,"1, Yes | 0, No",,,,[who_danger_signs] = '1',y,who_imci_danger_sign_matrix
133,high_care_reason,,text,Reason for hospital transfer:,,,,,[level_of_care_change] = '3',y,
137,hospital_name,,text,Name of hospital to which patient was referred...,,,,,[level_of_care_change] = '3',y,
157,lethargy,,radio,Lethargy/unconscious (assessed by provider),"1, Yes | 0, No",,,,[who_danger_signs] = '1',y,who_imci_danger_sign_matrix
160,level_of_care_change,,radio,Was the patient's level of care changed since ...,"0, no | 1, admission to high care unit | 2, ad...",,,,,y,
210,patient_discharged,,yesno,Patient discharged home:,,,,,,y,
264,respi_need_support,,radio,Is patient currently receiving respiratory sup...,"0, none | 1, HFNC | 2, bubble CPAP | 3, CPAP |...",,,,,y,
266,respiratory_distress,,radio,Severe respiratory distress (please provide ov...,"0, no | 1, new since last study encounter | 2,...",,,,,y,
306,wheeze_present,,checkbox,Clinical signs of bronchiolitis/ reactive airw...,"0, no | 1, audible wheezing | 2, wheezing (aus...",,,,,y,


#### Form: phone_followup_d29

0 variables

### Renamed variables

In [22]:
# Display results dynamically
cols1 = cols.append("Variables0")
cols1

## Renamed forms

In [23]:
renamed_forms_df = d2.detectModificationsInRenamedForms(d1)

In [24]:
#| tbl-cap: "Overview of modifications in renamed forms"
renamed_forms_df

Unnamed: 0,Forms,N,Forms0,N0,No changes,Changes
0,a_enrollment,4,enrollment,3,0,1 Variables added
1,a_phone_followup_d8,20,phone_followup_d8,20,0,Same number of variables
2,b1_screening_registration,19,screening_registration,14,0,5 Variables added
3,b2_screening_antibiotic_use,3,screening_antibiotic_use,2,0,1 Variables added
4,b3_screening_vitalsigns,53,screening_vitalsigns,48,0,5 Variables added
5,b_day_8_inperson_followup,34,day_8_inperson_followup,30,0,4 Variables added
6,b_patient_information,15,patient_information,15,0,Same number of variables
7,c_lung_ultrasound,31,lung_ultrasound,27,0,4 Variables added


In [25]:
form_list = renamed_forms_df["Forms"].unique().tolist()
cols = [i for i in q2.columns.tolist() if i != "Forms"]
outs = dict()
for k in form_list:
    outs[k] = q2[q2["Forms"] == k][cols]
# Display results dynamically
for k, v in outs.items():
    display(Markdown(f'### Form: {k}'))
    v = v.dropna(axis=1, how="all")
    display(v)

### Form: a_enrollment

Unnamed: 0,index,Variables,Section Header,Field Type,Field Label,Field Note,Branching Logic (Show field only if...),Required Field?
75,75,consent_rct,"<div class=""rich-text-field-label""><div class=...",yesno,Did the caregiver sign the informed consent fo...,if No - [END FORM],[eligibility_rct] = '1',y
76,76,consent_observational,,yesno,Did the caregiver sign the informed consent fo...,if No - [END FORM],[eligibility_observational] = '1',y
77,77,consent_biobanking,,yesno,Did the caregiver sign the informed consent fo...,if No - [END FORM],[eligibility_general] = '1',y
78,78,please_randomize_patient,,descriptive,"<div class=""rich-text-field-label""><p><span st...",,[consent_rct] = '1',


### Form: a_phone_followup_d8

Unnamed: 0,index,Variables,Section Header,Field Type,Field Label,"Choices, Calculations, OR Slider Labels",Text Validation Type OR Show Slider Number,Branching Logic (Show field only if...),Required Field?
266,266,d8_date,,text,Date,,date_dmy,,y
267,267,d8_phone_call,,yesno,Did the D8 phone call visit F/up take place:,,,,y
268,268,d8_no_reason,,text,If No: Reason,,,[d8_phone_call] = '0',y
269,269,patient_outcome,,radio,Patient Outcome:,"1, Resolved Completely (Cured) | 2, Improved |...",,,y
270,270,death_date,,text,"if so, when",,date_dmy,[patient_outcome] = '5',y
271,271,death_cause,,text,Suspected cause:,,,[patient_outcome] = '5',y
272,272,attened_clinic,Further health seeking,yesno,Attended a clinic/GP/OPD/ER,,,,y
273,273,date_attended_clinic,,yesno,Date attended a clinic/GP/OPD/ER,,,[attened_clinic] = '1',y
274,274,medication_prescribed,,notes,Clinic/OPD/ER Medication / treatment prescribed:,,,[attened_clinic] = '1',y
275,275,gp_antibiotic,,text,What: Antibiotic,,,[attened_clinic] = '1',y


### Form: b1_screening_registration

Unnamed: 0,index,Variables,Section Header,Field Type,Field Label,"Choices, Calculations, OR Slider Labels",Field Note,Text Validation Type OR Show Slider Number,Text Validation Max,Branching Logic (Show field only if...),Required Field?,Custom Alignment,Matrix Group Name,Field Annotation
0,0,record_id,,text,Screening /Study ID,,,,,,,,,
1,1,screening_date,,text,Screening date,,,,,,,,,@NOW @HIDDEN
2,2,q2_prev_enrolled,,radio,Was the child previously enrolled in this stud...,"1, Yes | 0, No",,,,,y,RH,,
3,3,q2_b_enrollment_date,,yesno,Is date of enrolment known ?,,,,,[q2_prev_enrolled] = '1',y,RH,,
4,4,q2_a,,text,Date of enrolment,,,date_dmy,,[q2_b_enrollment_date] = '1',y,RH,,
5,5,q2i_unknown_date,,radio,"If unknown, was enrolment >28days ago","1, Yes | 0, No | 98, Unknown",,,,[q2_b_enrollment_date] = '0',y,RH,,
6,6,scr_prevenrollment28d,,calc,Enrolment into IMCI-PLUS study in the past 28...,,,,,,,,,
7,7,please_exit_screening_and,,descriptive,"<div class=""rich-text-field-label""><p><span st...",,,,,[scr_prevenrollment28d]=1,,,,
8,8,child_dob,,text,Child DoB,,,date_ymd,now,,,,,
9,9,unknown_dob,,yesno,Are you sure the date of birth is unknown?,,,,,[child_dob]='',y,,,


### Form: b2_screening_antibiotic_use

Unnamed: 0,index,Variables,Field Type,Field Label,"Choices, Calculations, OR Slider Labels",Branching Logic (Show field only if...),Required Field?
19,19,q4_meds,text,What medications has the child received in the...,BIOPORTAL:RXNORM,[medication_48hrs_yn]=1,y
20,20,scr_atb,yesno,Is this medication an antibiotic?,,,
21,21,q4_meds_duration,radio,Duration of Use,"1, Less than 48 hours | 2, More than 48 hours",[medication_48hrs_yn]=1,y


### Form: b3_screening_vitalsigns

Unnamed: 0,index,Variables,Section Header,Field Type,Field Label,"Choices, Calculations, OR Slider Labels",Field Note,Text Validation Type OR Show Slider Number,Text Validation Min,Text Validation Max,Branching Logic (Show field only if...),Required Field?,Custom Alignment,Matrix Group Name,Field Annotation
22,22,q5_imci_danger_sings,,yesno,WHO IMCI danger signs,,,,,,,,RH,,
23,23,q5_imci_sign_desc,,descriptive,"<div class=""rich-text-field-label""><p><span st...",,,,,,[q5_imci_danger_sings]=1,,,,
24,24,q5_a_drinking_inability,,radio,Inability to drink/breastfeed (assessed by stu...,"1, Yes | 0, No",,,,,[q5_imci_danger_sings]=1,y,,q5_danger_signs_matrix,"If yes, ask study clinician to check/confirm"
25,25,q5_vommiting,,radio,Vomiting everything,"1, Yes | 0, No",,,,,,y,,q5_danger_signs_matrix,
26,26,q5_convulsions,,radio,Convulsions with this illness,"1, Yes | 0, No",,,,,,y,,q5_danger_signs_matrix,
27,27,q5_lethargy,,radio,Lethargy/unconscious (assessed by study clinic...,"1, Yes | 0, No",,,,,,y,,q5_danger_signs_matrix,
28,28,q6_chronic_disease,,yesno,Does your child have any known chronic disease...,,"(like HIV, active TB, heart problems, kidney p...",,,,,y,RH,,
29,29,q6_chronic_type,,checkbox,"If yes, which chronic disease","1, HIV | 2, Active TB | 3, Heart problem | 4, ...",,,,,[q6_chronic_disease] = '1',y,,,
30,30,q7_other_chronic,,text,Other chronic disease,,,,,,[q6_chronic_type] = '99',y,RH,,
31,31,q7_hiv_status,,dropdown,What is your child's HIV status:,"1, Positive | 2, Negative | 3, Unknown | 98, R...",,,,,,y,RH,,


### Form: b_day_8_inperson_followup

Unnamed: 0,index,Variables,Section Header,Field Type,Field Label,"Choices, Calculations, OR Slider Labels",Field Note,Text Validation Type OR Show Slider Number,Branching Logic (Show field only if...),Required Field?,Matrix Group Name,Field Annotation
286,286,actual_follow_up_date,,text,Date,,,date_dmy,,y,,
287,287,d8_phone_call_7ed0e4,,yesno,Did the D8 in-person visit f/up take place:,,,,,y,,
288,288,d8_no_reason_c17839,,text,If No: Reason,,,,[d8_phone_call_7ed0e4] = '0',y,,
289,289,desc_caregiver_history,,descriptive,"<div class=""rich-text-field-label""><p><span st...",,,,,,,
290,290,patient_outcome_510e6b,,radio,Patient Outcome (According to caregiver):,"1, Resolved Completely (Cured) | 2, Improved |...",,,,y,,
291,291,in_person_visit,Further health seeking,yesno,Attended today due to study staff phone call a...,,,,,y,,
292,292,attened_clinic_ea407f,,radio,Between initial encounter and today: attended ...,"1, yes | 0, no | 98, unknown",,,,y,,
293,293,fu_addvisit_studyd8,Additional health seeking encounters between i...,radio,Today due to study staff phone call advising a...,"1, yes | 2, no | 98, unknown",,,,,fu_addvisit,
294,294,fu_addvisit_study,,radio,Spontaneous follow-up with study team,"1, yes | 2, no | 98, unknown",,,,,fu_addvisit,
295,295,fu_addvisit_opd,,radio,"Outpatient provider (clinic, GP, ED...)","1, yes | 2, no | 98, unknown",,,,,fu_addvisit,


### Form: b_patient_information

Unnamed: 0,index,Variables,Field Type,Field Label,"Choices, Calculations, OR Slider Labels",Text Validation Type OR Show Slider Number,Identifier?,Branching Logic (Show field only if...),Required Field?
79,79,p_folder_num,text,Patient's Folder Number:,,,,,y
80,80,p_folder_confirm,text,Patient's Folder Number confirmation:,,,,,
81,81,l_name,text,Last name:,,,y,,y
82,82,f_name,text,First Name,,,y,,y
83,83,sex,radio,Sex,"1, Male | 2, Female",,,,y
84,84,dob_p,text,DOB:,,date_dmy,,,y
85,85,id_num,text,Patient's National ID number,,identification_za,y,,y
86,86,caregiver_name,text,Primary Caregiver's Last name:,,,y,,
87,87,caregiver_surname,text,Primary Caregiver's First name:,,,y,,
88,88,alt_contact_person,text,Alternate Contact person (Name & Surname):,,,y,,


### Form: c_lung_ultrasound

Unnamed: 0,index,Variables,Section Header,Field Type,Field Label,"Choices, Calculations, OR Slider Labels",Text Validation Type OR Show Slider Number,Branching Logic (Show field only if...),Required Field?,Matrix Group Name
94,94,ultrasound,,yesno,Lung Ultrasound Collected:,,,,y,
95,95,exit_form,,descriptive,"<div class=""rich-text-field-label""><p><span st...",,,[ultrasound] = '0',,
96,96,lus_nolusreasonyn,,checkbox,Reason why LUS was not collected:,"1, patient refused | 2, patient deteriorated c...",,[ultrasound] = '0',,
97,97,lus_nolusreasonyn_2,,text,Other reason why LUS was not collected:,,,[lus_nolusreasonyn(3)] = '1',,
98,98,lus_id,,text,LUS identifier on device,,,,,
99,99,lus_date,,text,When was this LUS performed:,,date_dmy,,,
100,100,lus_visittype,,radio,Type of visit for which LUS was performed,"1, D1 | 2, D2-7 admitted patient | 3, D2-7 spo...",,,,
101,101,patience_postition,,radio,Patient Position:,"1, Sitting | 2, Lying Down",,,,
102,102,exam_problem,,yesno,Problem during exam?,,,,,
103,103,lus_problem_type,,checkbox,"If yes, what kind?:","1, patient crying | 2, patient moved a lot | 3...",,[exam_problem] = '1',,


## New forms

New forms did not exist in the reference version of the REDCap data dictionary.

In [26]:
#| tbl-cap: "Overview of modifications in new forms"
d2.detectModificationsInRenamedForms(d1)

Unnamed: 0,Forms,N,Forms0,N0,No changes,Changes
0,a_enrollment,4,enrollment,3,0,1 Variables added
1,a_phone_followup_d8,20,phone_followup_d8,20,0,Same number of variables
2,b1_screening_registration,19,screening_registration,14,0,5 Variables added
3,b2_screening_antibiotic_use,3,screening_antibiotic_use,2,0,1 Variables added
4,b3_screening_vitalsigns,53,screening_vitalsigns,48,0,5 Variables added
5,b_day_8_inperson_followup,34,day_8_inperson_followup,30,0,4 Variables added
6,b_patient_information,15,patient_information,15,0,Same number of variables
7,c_lung_ultrasound,31,lung_ultrasound,27,0,4 Variables added


## Deleted forms

Deleted forms existed in the reference version of the REDCap data dictionary, but no longer exist.