# Preprocessing

## Setup

#### Imports

In [None]:
import xmltodict as xd
import pandas as pd

from pandas import DataFrame

#### Methods

In [None]:
def _attr_to_df(dicts: list[dict], attr: str, parent_pk: str, parent_prefix: str, attr_pk) -> DataFrame:
    """Flatten the nested ``attr`` records of ``dicts`` into one DataFrame.

    Every dict that contains ``attr`` contributes one row per nested record;
    a single (non-list) value is treated as a one-element list. Each row also
    carries its parent's ``parent_pk`` value in a column named
    ``parent_prefix + parent_pk``, which is moved to the first position.

    The caller's dicts are not modified: the list wrapping is applied to
    shallow copies.

    Args:
        dicts: Parent records; those without ``attr`` are ignored.
        attr: Key holding the nested record(s) to flatten.
        parent_pk: Key of the parent's primary key, carried along as metadata.
        parent_prefix: Prefix put in front of ``parent_pk`` in the output
            column name.
        attr_pk: Primary-key column label of the nested records. Only used to
            shape the empty result returned when no dict contains ``attr``.

    Returns:
        A DataFrame with the parent key column first, followed by the columns
        of the flattened records.
    """
    parent_pk_col_name = parent_prefix + parent_pk

    # Build shallow copies in which ``attr`` is always a list, so the input
    # records keep their original shape.
    dicts_with_attr = [
        {**d, attr: d[attr] if isinstance(d[attr], list) else [d[attr]]}
        for d in dicts
        if attr in d
    ]

    if not dicts_with_attr:
        # Nothing to normalize: return an empty frame that already has the
        # key columns, instead of failing on the column pop below.
        return DataFrame(columns=[parent_pk_col_name, attr_pk])

    attr_meta_df = pd.json_normalize(
        dicts_with_attr,
        record_path=attr,
        meta=[parent_pk],
        meta_prefix=parent_prefix
    )

    # json_normalize appends the meta column last; move it to the front.
    parent_pk_col = attr_meta_df.pop(parent_pk_col_name)
    attr_meta_df.insert(0, parent_pk_col_name, parent_pk_col)

    return attr_meta_df

In [None]:
def _del_attr(dicts: list[dict], attr: str):
    """Return copies of ``dicts`` with the key ``attr`` left out.

    The input dicts are not modified; each result is a new plain dict that
    keeps the remaining keys in their original order.
    """
    pruned = []
    for record in dicts:
        kept = {}
        for key, value in record.items():
            if key != attr:
                kept[key] = value
        pruned.append(kept)
    return pruned

In [None]:
def _standardize_df(df: DataFrame, pk_col_name: str) -> DataFrame:
    """Put the primary-key column first and order the rows by it.

    ``df`` itself is modified: the key column is moved to position 0 and,
    when the key column is named ``'id'``, its values are cast to ``int``.
    The returned frame is a sorted copy with a fresh 0..n-1 index.

    Args:
        df: Frame to standardize (changed in place as described above).
        pk_col_name: Label of the primary-key column.

    Returns:
        The rows of ``df`` sorted by ``pk_col_name`` with the index reset.
    """
    needs_int_cast = pk_col_name == 'id'
    if needs_int_cast:
        df['id'] = df['id'].astype(int)

    # pop() runs first (argument evaluation), then the column is re-inserted
    # at the front.
    df.insert(loc=0, column=pk_col_name, value=df.pop(pk_col_name))

    ordered = df.sort_values(by=pk_col_name)
    return ordered.reset_index(drop=True)

In [None]:
def _extract_attr_as_df(
        dicts: list[dict], attr: str, parent_pk: str, parent_prefix: str, attr_pk
) -> tuple[list[dict], DataFrame]:
    """Move the nested ``attr`` records out of ``dicts`` into a DataFrame.

    Args:
        dicts: Parent records.
        attr: Key of the nested record(s) to extract.
        parent_pk: Key of the parent's primary key.
        parent_prefix: Prefix for the parent key column in the extracted frame.
        attr_pk: Label of the extracted records' primary-key column; the
            extracted frame is ordered by it.

    Returns:
        A pair ``(pruned_dicts, attr_df)``: copies of ``dicts`` without the
        ``attr`` key, and the standardized frame of the extracted records.
    """
    attr_meta_df = _attr_to_df(dicts, attr, parent_pk, parent_prefix, attr_pk)

    pruned_dicts = _del_attr(dicts, attr)

    attr_meta_df = _standardize_df(attr_meta_df, attr_pk)

    return pruned_dicts, attr_meta_df

In [None]:
def convert(
        dicts: list[dict], root_name: str, attr_pks: list[tuple[str, object]], root_pk: str = 'id'
) -> dict[str, DataFrame]:
    """Split records with nested attributes into one DataFrame per attribute.

    For every ``(attr, attr_pk)`` pair, the records nested under ``attr`` are
    extracted into their own DataFrame, which carries a
    ``<root_name>_<root_pk>`` column referring back to the parent, and
    ``attr`` is removed from the records. What is left of the records is
    normalized into the root DataFrame.

    Args:
        dicts: Root records.
        root_name: Name of the root entity; used as the result key of the
            root frame and, followed by ``'_'``, as the parent-key prefix.
        attr_pks: Pairs of nested attribute name and the column label that
            serves as that attribute's primary key (a string, or an integer
            label such as ``0``).
        root_pk: Key of the root records' primary key.

    Returns:
        A dict mapping each attribute name, and ``root_name``, to its
        DataFrame. The root frame is stored last, so it replaces an attribute
        frame that happens to share its name.
    """
    meta_prefix = root_name + '_'
    dataframes = {}
    for attr, attr_pk in attr_pks:
        (dicts, attr_meta_df) = _extract_attr_as_df(dicts, attr, root_pk, meta_prefix, attr_pk)
        dataframes[attr] = attr_meta_df

    root_df = _standardize_df(pd.json_normalize(dicts), root_pk)
    dataframes[root_name] = root_df
    return dataframes

In [None]:
def replace_keys(d):
    """Recursively replace ``'-'`` with ``'_'`` in all dict keys of ``d``.

    Dicts and lists are rebuilt (the input is not modified); any other value
    is returned as is. Only keys are rewritten, never values.
    """
    if isinstance(d, list):
        return [replace_keys(element) for element in d]
    if not isinstance(d, dict):
        return d

    renamed = {}
    for key, value in d.items():
        new_key = key.replace('-', '_')
        renamed[new_key] = replace_keys(value)
    return renamed

#### Constants

In [None]:
# Key under which a health-topic record stores its primary key.
HT_PK = 'id'
# Prefix of the health-topic foreign-key column ('health_topic_id').
# NOTE(review): not referenced in the visible code; convert() builds the same
# prefix from its root name.
HT_META_PREFIX = 'health_topic_'

# Column renamings applied to every resulting DataFrame (old label -> new label).
# Only the labels actually present in a frame are renamed.
RENAMINGS = {
    'meta_desc': 'description',
    'standard_description': 'description',
    'text': 'name',
    # Integer label 0: presumably the unnamed column produced when the nested
    # records are plain values rather than dicts — TODO confirm.
    0: 'name'
}

## Parsing

In [None]:
with open("../data/mplus_topics.xml", 'r') as file:
    health_topics_dict = xd.parse(file.read(), attr_prefix='', cdata_key='text')['health-topics']
health_topics = health_topics_dict['health-topic']

In [None]:
# Number of parsed health-topic entries.
# NOTE(review): assumes 'health-topic' parsed to a list (more than one topic) — confirm.
dataset_size = len(health_topics)
# 'date-generated' entry of the root element, kept for display further down.
dataset_timestamp = health_topics_dict['date-generated']

## Data preparation

#### Pruning unused attributes

In [None]:
# Attributes that are not needed downstream. They are listed with their
# original hyphenated names because pruning happens before the keys are
# rewritten to use underscores.
attrs_to_prune = ['also-called', 'see-reference', 'full-summary', 'mesh-heading', 'language-mapped-topic',
                  'other-language']
health_topics = [
    {key: value for key, value in topic.items() if key not in attrs_to_prune}
    for topic in health_topics
]

# Hyphens in keys become underscores at every nesting level.
health_topics = replace_keys(health_topics)

# The primary key is converted to an integer.
health_topics = [{**topic, HT_PK: int(topic[HT_PK])} for topic in health_topics]

#### Extracting nested objects

In [None]:
# Make every topic's 'site' entry a list, even when it holds a single site.
# Topics without a 'site' entry are skipped, matching the guard used when the
# sites are collected below.
for ht in health_topics:
    if 'site' in ht and not isinstance(ht['site'], list):
        ht['site'] = [ht['site']]

# Flat list of all site records across all topics (duplicates included).
sites = [site for ht in health_topics if 'site' in ht for site in ht['site']]

In [None]:
# Site -> information-category links; sites are keyed by their URL and the
# category frame is ordered by its column 0.
info_cat_site_df = convert(
    sites,
    root_name='site',
    attr_pks=[('information_category', 0)],
    root_pk='url',
)['information_category']

# Nested attributes of a health topic, each paired with the column used as
# its primary key.
attr_pks = [
    ('site', 'url'),
    ('primary_institute', 'url'),
    ('group', 'id'),
    ('related_topic', 'id'),
]

dfs = convert(health_topics, root_name='health_topic', attr_pks=attr_pks)

# One frame per extracted attribute, plus the frame of the topics themselves.
site_ht_df, prim_inst_ht_df, group_ht_df, related_topic_ht_df, health_topic_df = (
    dfs['site'],
    dfs['primary_institute'],
    dfs['group'],
    dfs['related_topic'],
    dfs['health_topic'],
)

#### Normalizing extracted DataFrames

In [None]:
# --- Entity frames: drop the parent reference and keep one row per key. ---
# These must be built before the *_ht_df frames are reassigned further down,
# because they read columns that are dropped there.

# Distinct values of column 0 (renamed to 'name' later via RENAMINGS).
info_cat_df = (info_cat_site_df.drop(columns=['site_url'])
               .drop_duplicates([0])
               .reset_index(drop=True))

# Distinct sites, identified by URL.
site_df = (site_ht_df.drop(columns=['health_topic_id', 'information_category'])
           .drop_duplicates(['url'])
           .reset_index(drop=True))

# Distinct primary institutes, identified by URL.
prim_inst_df = (prim_inst_ht_df.drop(columns=['health_topic_id'])
                .drop_duplicates(['url'])
                .reset_index(drop=True))

# Distinct groups, identified by id.
group_df = (group_ht_df.drop(columns=['health_topic_id'])
            .drop_duplicates(['id'])
            .reset_index(drop=True))

# One row per health-topic id.
health_topic_df = health_topic_df.drop_duplicates(['id']).reset_index(drop=True)

# --- Link frames: remove the listed descriptive columns from the frames that ---
# --- tie a health topic to its sites, institutes, groups and related topics. ---
# NOTE(review): which columns remain depends on the XML schema — presumably
# the parent key plus the entity key; confirm.
site_ht_df = (site_ht_df
              .drop(columns=['information_category', 'language_mapped_url', 'organization', 'standard_description']))

prim_inst_ht_df = prim_inst_ht_df.drop(columns=['text'])

group_ht_df = group_ht_df.drop(columns=['url', 'text'])

related_topic_ht_df = related_topic_ht_df.drop(columns=['url', 'text'])

#### Renaming columns

In [None]:
# Apply the shared RENAMINGS to every frame in place, restricted to the
# labels each frame actually has.
dfs = [info_cat_df, site_df, prim_inst_df, group_df, info_cat_site_df, health_topic_df, site_ht_df, prim_inst_ht_df,
       group_ht_df, related_topic_ht_df]
for df in dfs:
    renamings = {
        old_name: new_name
        for old_name, new_name in RENAMINGS.items()
        if old_name in df.columns.tolist()
    }
    df.rename(columns=renamings, inplace=True)

## Display

In [None]:
# Number of parsed health-topic entries.
dataset_size

In [None]:
# 'date-generated' value read from the XML root.
dataset_timestamp

In [None]:
# Distinct information categories.
info_cat_df

In [None]:
# Distinct sites (one row per URL).
site_df

In [None]:
# Distinct primary institutes (one row per URL).
prim_inst_df

In [None]:
# Distinct groups (one row per id).
group_df

In [None]:
# Site / information-category links.
info_cat_site_df

In [None]:
# Health topics (one row per id).
health_topic_df

In [None]:
# Health-topic / site links.
site_ht_df

In [None]:
# Health-topic / primary-institute links.
prim_inst_ht_df

In [None]:
# Health-topic / group links.
group_ht_df

In [None]:
# Health-topic / related-topic links.
related_topic_ht_df