# Preprocessing

## Setup

#### Imports

In [756]:
import xmltodict as xd
import pandas as pd

from pandas import DataFrame

#### Methods

In [757]:
def attr_to_df(dicts: list[dict], attr: str, parent_pk: str, parent_prefix: str, attr_pk) -> DataFrame:
    """Flatten the nested ``attr`` records of ``dicts`` into a single DataFrame.

    Every dict that contains ``attr`` contributes one row per nested record; a
    value that is not already a list is treated as a single record. Each row
    also carries the parent's ``parent_pk`` value in a column named
    ``parent_prefix + parent_pk``, which is moved to the first position.

    The input dicts are left untouched.

    Args:
        dicts: Parent records; entries without ``attr`` are skipped.
        attr: Key under which the nested record(s) are stored.
        parent_pk: Key of the parent's identifier, copied onto every row.
        parent_prefix: Prefix used for the copied identifier's column name.
        attr_pk: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        DataFrame with the parent key column first, followed by the columns
        of the nested records.
    """
    # Work on shallow copies so that wrapping a lone record in a list does not
    # leak back into the caller's data structures.
    records = [
        {**d, attr: d[attr] if isinstance(d[attr], list) else [d[attr]]}
        for d in dicts
        if attr in d
    ]

    attr_meta_df = pd.json_normalize(
        records,
        record_path=attr,
        meta=[parent_pk],
        meta_prefix=parent_prefix
    )

    # json_normalize appends meta columns at the end; move the parent key to the front.
    parent_pk_col_name = parent_prefix + parent_pk
    parent_pk_col = attr_meta_df.pop(parent_pk_col_name)
    attr_meta_df.insert(0, parent_pk_col_name, parent_pk_col)

    return attr_meta_df

In [758]:
def del_attr(dicts: list[dict], attr: str):
    """Return copies of ``dicts`` with the key ``attr`` removed.

    The input dicts are not modified; dicts that never had ``attr`` are
    copied unchanged.
    """
    pruned = []
    for original in dicts:
        remaining = dict(original)
        remaining.pop(attr, None)
        pruned.append(remaining)
    return pruned

In [759]:
def standardize_df(df: DataFrame, pk_col_name: str) -> DataFrame:
    """Return ``df`` with its key column first, sorted by that key.

    The frame passed in is not modified; all changes are made on a copy.

    Args:
        df: Frame to standardize.
        pk_col_name: Label of the key column. When it is ``'id'`` the column
            is cast to ``int`` before sorting.

    Returns:
        A new DataFrame whose first column is ``pk_col_name``, sorted by it,
        with a fresh 0..n-1 index.
    """
    # Copy first so the cast and the column shuffle below do not alter the caller's frame.
    result = df.copy()

    if pk_col_name == 'id':
        result['id'] = result['id'].astype(int)

    pk_col = result.pop(pk_col_name)
    result.insert(0, pk_col_name, pk_col)

    return result.sort_values(pk_col_name).reset_index(drop=True)

In [760]:
def extract_attr_as_df(dicts: list[dict], attr: str, parent_pk: str, parent_prefix: str,
                       attr_pk) -> tuple[list[dict], DataFrame]:
    """Split the nested ``attr`` records out of ``dicts`` into their own table.

    Args:
        dicts: Parent records.
        attr: Key holding the nested record(s) to extract.
        parent_pk: Key of the parent's identifier, carried onto each extracted row.
        parent_prefix: Prefix for the carried identifier's column name.
        attr_pk: Column of the extracted table to move first and sort by
            (handed to ``standardize_df``).

    Returns:
        A pair ``(pruned_dicts, attr_df)``: copies of ``dicts`` without
        ``attr``, and the standardized DataFrame of the extracted records.
    """
    attr_meta_df = attr_to_df(dicts, attr, parent_pk, parent_prefix, attr_pk)

    pruned_dicts = del_attr(dicts, attr)

    attr_meta_df = standardize_df(attr_meta_df, attr_pk)

    return pruned_dicts, attr_meta_df

#### Constants

In [761]:
# Key of a health topic's identifier, and the prefix put in front of it when
# the identifier is carried into an extracted table (giving 'health_topic_id').
HT_PK = 'id'
HT_META_PREFIX = 'health_topic_'

# Source column label -> final column label, applied to every DataFrame that
# has the source column.
RENAMINGS = {
    'date-created': 'date_created',
    'meta-desc': 'description',
    'language-mapped-url': 'language_mapped_url',
    # Integer label 0, not the string '0'. Presumably the column pandas creates
    # when the extracted records are plain strings — TODO confirm.
    0: 'name'
}

## Parsing

In [762]:
# Read the file as bytes rather than text: the XML parser then decodes it
# according to the document's own encoding declaration, instead of whatever
# default text encoding the current platform happens to use.
with open("../data/mplus_topics_2024-08-10(2).xml", 'rb') as file:
    # attr_prefix='' stores XML attributes under their bare names;
    # cdata_key='text' stores element text under the key 'text'.
    health_topics_dict = xd.parse(file.read(), attr_prefix='', cdata_key='text')['health-topics']
health_topics = health_topics_dict['health-topic']

In [763]:
# Number of parsed health-topic entries, taken before any pruning or de-duplication.
dataset_size = len(health_topics)
# Value stored under 'date-generated' on the parsed root element.
dataset_timestamp = health_topics_dict['date-generated']

## Data preparation

#### Pruning unused attributes

In [764]:
# Attributes that are not needed further down and are dropped from every topic.
attrs_to_prune = [
    'also-called',
    'see-reference',
    'full-summary',
    'mesh-heading',
    'language-mapped-topic',
    'other-language',
]

# Cast each topic's key to int first, then drop the unused attributes.
health_topics = [{**topic, HT_PK: int(topic[HT_PK])} for topic in health_topics]
health_topics = [
    {key: value for key, value in topic.items() if key not in attrs_to_prune}
    for topic in health_topics
]

#### Extracting nested objects

In [765]:
# A topic with a single site holds it as a bare record rather than a list;
# wrap it so every topic that has sites exposes them as a list.
# Topics without a 'site' entry are skipped, matching the guard used below.
for ht in health_topics:
    if 'site' in ht and not isinstance(ht['site'], list):
        ht['site'] = [ht['site']]

# Flat list of every site record across all topics.
sites = [site for ht in health_topics if 'site' in ht for site in ht['site']]

In [766]:
# Each call returns the parent records without the extracted attribute, plus a
# table of the extracted records carrying the parent's key.
sites, info_cat_site_df = extract_attr_as_df(
    sites, attr='information-category', parent_pk='url', parent_prefix='site_', attr_pk=0)
health_topics, site_ht_df = extract_attr_as_df(
    health_topics, attr='site', parent_pk=HT_PK, parent_prefix=HT_META_PREFIX, attr_pk='url')
health_topics, prim_inst_ht_df = extract_attr_as_df(
    health_topics, attr='primary-institute', parent_pk=HT_PK, parent_prefix=HT_META_PREFIX, attr_pk='url')
health_topics, group_ht_df = extract_attr_as_df(
    health_topics, attr='group', parent_pk=HT_PK, parent_prefix=HT_META_PREFIX, attr_pk='id')
health_topics, related_topic_ht_df = extract_attr_as_df(
    health_topics, attr='related-topic', parent_pk=HT_PK, parent_prefix=HT_META_PREFIX, attr_pk='id')

# What is left of each topic after the extractions becomes the topic table itself.
health_topic_df = standardize_df(pd.json_normalize(health_topics), HT_PK)

#### Normalizing extracted DataFrames

In [767]:
# --- Entity tables: one row per distinct entity -----------------------------
# These are derived from the *_ht_df frames BEFORE those frames are trimmed
# further down, so the order of the statements in this cell matters.

# NOTE(review): unlike the other entity tables, the parent key column
# ('site_url') is not dropped here, so each category keeps the URL of whichever
# site happened to come first — confirm whether that is intended.
info_cat_df = (
    info_cat_site_df
    .drop_duplicates(subset=[0])
    .reset_index(drop=True)
)

site_df = (
    site_ht_df
    .drop(columns=['health_topic_id', 'information-category'])
    .drop_duplicates(subset=['url'])
    .reset_index(drop=True)
)

prim_inst_df = (
    prim_inst_ht_df
    .drop(columns=['health_topic_id'])
    .drop_duplicates(subset=['url'])
    .reset_index(drop=True)
)

group_df = (
    group_ht_df
    .drop(columns=['health_topic_id'])
    .drop_duplicates(subset=['id'])
    .reset_index(drop=True)
)

health_topic_df = (
    health_topic_df
    .drop_duplicates(subset=['id'])
    .reset_index(drop=True)
)

# --- Link tables: entity key + health topic key -----------------------------
# NOTE(review): 'title' is not in the drop list, so site_ht_df keeps it while
# the other link tables drop their descriptive columns — confirm intent.
site_ht_df = site_ht_df.drop(
    columns=['information-category', 'language-mapped-url', 'organization', 'standard-description']
)

prim_inst_ht_df = prim_inst_ht_df.drop(columns=['text'])

group_ht_df = group_ht_df.drop(columns=['url', 'text'])

related_topic_ht_df = related_topic_ht_df.drop(columns=['url', 'text'])

#### Renaming columns

In [768]:
# Every frame produced above; each is renamed in place so the existing
# variable names keep pointing at the renamed frames.
dfs = [
    info_cat_df,
    site_df,
    prim_inst_df,
    group_df,
    health_topic_df,
    site_ht_df,
    prim_inst_ht_df,
    group_ht_df,
    related_topic_ht_df,
]
for df in dfs:
    # Restrict the mapping to the labels this particular frame actually has.
    present_labels = df.columns.tolist()
    renamings = {old: new for old, new in RENAMINGS.items() if old in present_labels}
    df.rename(columns=renamings, inplace=True)

## Display

In [769]:
dataset_size

2044

In [770]:
dataset_timestamp

'08/10/2024 02:30:32'

In [771]:
info_cat_df

Unnamed: 0,name,site_url
0,Adolescentes,https://kidshealth.org/es/teens/care-sickle-ce...
1,Adultos,https://medlineplus.gov/spanish/ency/patientin...
2,Adults,https://tourette.org/about-tourette/overview/l...
3,Asuntos específicos,https://www.nidcr.nih.gov/sites/default/files/...
4,Asuntos relacionados,https://medlineplus.gov/spanish/ency/article/0...
...,...,...
56,Tratamientos y terapias,https://www.cancer.gov/espanol/tipos/higado/ca...
57,Treatments and Therapies,https://www.nccih.nih.gov/health/probiotics-wh...
58,Videos and Tutorials,https://www.endocrine.org/patient-engagement/e...
59,Viviendo con...,https://medlineplus.gov/spanish/ency/patientin...


In [772]:
site_df

Unnamed: 0,url,title,language_mapped_url,organization,standard-description
0,http://depts.washington.edu/learncpr/spanish/,"Aprenda RCP ¡Sí, se puede!",http://depts.washington.edu/learncpr/,"Universidad de Washington, Facultad de Medicina",
1,http://npic.orst.edu/capro/exposure.es.html,¿Es tóxico si lo como o lo toco? ¿La ruta de e...,,Centro Nacional de Información de Pesticidas,
2,http://npic.orst.edu/capro/fruitwash.es.html,¿Cómo reducir los residuos de pesticidas en fr...,,Centro Nacional de Información de Pesticidas,
3,http://npic.orst.edu/envir/index.es.html,Pesticidas y el medio ambiente,,Centro Nacional de Información de Pesticidas,
4,http://npic.orst.edu/factsheets/MinimizingExpo...,Pesticidas: Minimizando la exposición,,Centro Nacional de Información de Pesticidas,
...,...,...,...,...,...
33839,https://youngwomenshealth.org/parents/labial-a...,Labial Adhesions: A Guide for Parents,,Boston Children's Hospital,
33840,https://youngwomenshealth.org/parents/pcos-par...,Polycystic Ovary Syndrome (PCOS) in Teens: A G...,https://youngwomenshealth.org/parents/sindrome...,Boston Children's Hospital,
33841,https://youngwomenshealth.org/parents/sindrome...,Síndrome de ovario poliquístico (SOPQ): Una gu...,https://youngwomenshealth.org/parents/pcos-par...,Hospital de Niños de Boston,
33842,https://youth.gov/youth-topics/lgbt,Youth Topics: LGBT,,Department of Health and Human Services,


In [773]:
prim_inst_df

Unnamed: 0,url,text
0,http://orwh.od.nih.gov/,NIH Office of Research on Women's Health
1,http://www.cancer.gov/,National Cancer Institute
2,http://www.genome.gov/,National Human Genome Research Institute
3,http://www.nhlbi.nih.gov/,"National Heart, Lung, and Blood Institute"
4,http://www.niaaa.nih.gov/,National Institute on Alcohol Abuse and Alcoho...
5,http://www.niaid.nih.gov/,National Institute of Allergy and Infectious D...
6,http://www.niams.nih.gov/,National Institute of Arthritis and Musculoske...
7,http://www.nidcr.nih.gov/,National Institute of Dental and Craniofacial ...
8,http://www.niehs.nih.gov/,National Institute of Environmental Health Sci...
9,http://www.nimh.nih.gov/,National Institute of Mental Health


In [774]:
group_df

Unnamed: 0,id,url,text
0,1,https://medlineplus.gov/spanish/cancers.html,Cánceres
1,2,https://medlineplus.gov/digestivesystem.html,Digestive System
2,3,https://medlineplus.gov/childrenandteenagers.html,Children and Teenagers
3,4,https://medlineplus.gov/spanish/women.html,Mujeres
4,5,https://medlineplus.gov/mentalhealthandbehavio...,Mental Health and Behavior
5,6,https://medlineplus.gov/olderadults.html,Older Adults
6,7,https://medlineplus.gov/spanish/bloodheartandc...,"Sangre, corazón y circulación"
7,8,https://medlineplus.gov/substanceuseanddisorde...,Substance Use and Disorders
8,9,https://medlineplus.gov/spanish/eyesandvision....,Ojos y visión
9,10,https://medlineplus.gov/bonesjointsandmuscles....,"Bones, Joints and Muscles"


In [775]:
health_topic_df

Unnamed: 0,id,description,title,url,language,date_created
0,1,HIV (human immunodeficiency virus) attacks the...,HIV,https://medlineplus.gov/hiv.html,English,10/22/1998
1,2,Asthma causes one's airways to become sore and...,Asthma,https://medlineplus.gov/asthma.html,English,10/22/1998
2,3,Breast cancer is the second most common type o...,Breast Cancer,https://medlineplus.gov/breastcancer.html,English,10/22/1998
3,4,Diabetes is a chronic health condition in whic...,Diabetes,https://medlineplus.gov/diabetes.html,English,10/22/1998
4,5,Over a million people in the U.S. have a heart...,Heart Attack,https://medlineplus.gov/heartattack.html,English,10/22/1998
...,...,...,...,...,...,...
2039,7648,Infórmese sobre las vacunas aprobadas para el ...,Vacunas contra el COVID-19,https://medlineplus.gov/spanish/covid19vaccine...,Spanish,11/09/2020
2040,7667,Vaccines protect you and your family from dise...,Vaccine Safety,https://medlineplus.gov/vaccinesafety.html,English,10/20/2020
2041,7668,Las vacunas le protegen a usted y a su familia...,Seguridad de las vacunas,https://medlineplus.gov/spanish/vaccinesafety....,Spanish,10/20/2020
2042,7807,"Anyone who had COVID-19, whether is was severe...",Post-COVID Conditions (Long COVID),https://medlineplus.gov/postcovidconditionslon...,English,09/20/2022


In [776]:
site_ht_df

Unnamed: 0,url,health_topic_id,title
0,http://depts.washington.edu/learncpr/spanish/,3640,"Aprenda RCP ¡Sí, se puede!"
1,http://depts.washington.edu/learncpr/spanish/,1818,"Aprenda RCP ¡Sí, se puede!"
2,http://depts.washington.edu/learncpr/spanish/,1906,"Aprenda RCP ¡Sí, se puede!"
3,http://npic.orst.edu/capro/exposure.es.html,2085,¿Es tóxico si lo como o lo toco? ¿La ruta de e...
4,http://npic.orst.edu/capro/fruitwash.es.html,1910,¿Cómo reducir los residuos de pesticidas en fr...
...,...,...,...
54136,https://youngwomenshealth.org/parents/labial-a...,4550,Labial Adhesions: A Guide for Parents
54137,https://youngwomenshealth.org/parents/pcos-par...,5912,Polycystic Ovary Syndrome (PCOS) in Teens: A G...
54138,https://youngwomenshealth.org/parents/sindrome...,5913,Síndrome de ovario poliquístico (SOPQ): Una gu...
54139,https://youth.gov/youth-topics/lgbt,261,Youth Topics: LGBT


In [777]:
prim_inst_ht_df

Unnamed: 0,url,health_topic_id
0,http://orwh.od.nih.gov/,4550
1,http://orwh.od.nih.gov/,462
2,http://www.cancer.gov/,330
3,http://www.cancer.gov/,1263
4,http://www.cancer.gov/,3752
...,...,...
1625,https://www.nih.gov/institutes-nih/nih-office-...,7467
1626,https://www.nih.gov/institutes-nih/nih-office-...,3073
1627,https://www.nih.gov/institutes-nih/nih-office-...,4565
1628,https://www.ninr.nih.gov/,496


In [778]:
group_ht_df

Unnamed: 0,id,health_topic_id
0,1,5329
1,1,4015
2,1,1831
3,1,405
4,1,4356
...,...,...
3653,45,3528
3654,45,1851
3655,45,6309
3656,45,4


In [779]:
related_topic_ht_df

Unnamed: 0,id,health_topic_id
0,1,7347
1,1,5737
2,1,1470
3,1,1503
4,1,481
...,...,...
6065,7807,7627
6066,7807,3181
6067,7808,7648
6068,7808,3182
