# In [None]:
import pandas as pd
import xmltodict
from tqdm import tqdm




import plotly.graph_objects as go



def exportImage(results, outputFile):
    """
    Draw a results mapping as a plotly table, write it to an image file and show it
    :param: results (data accepted by pd.DataFrame, one entry per table column)
    :param: outputFile (path of the image to write)
    :return: None
    """
    ## fig.write_image relies on the kaleido package: pip install -U kaleido
    table_df = pd.DataFrame(results)
    column_names = list(table_df.columns)

    def longest_text(column_name):
        ## length of the longest stringified value found in the column
        return max(table_df[column_name].astype(str).map(len))

    ## the first column is weighted x2, every other column x10
    widths = [longest_text(column_names[0]) * 2]
    widths += [longest_text(name) * 10 for name in column_names[1:]]

    text_font = dict(size=12, color="black")
    header_style = dict(
        values=column_names,
        font=text_font,
        fill_color='#b8b8b8',
        align='center',
    )
    cells_style = dict(
        values=[table_df[name] for name in column_names],
        ## alternate two greys from one row to the next
        fill_color=[['#ececec', '#dcdcdc'] * len(table_df)],
        align='center',
        font=dict(size=12, color="black"),
    )

    table = go.Table(header=header_style, cells=cells_style, columnwidth=widths)
    fig = go.Figure(data=[table])

    ## save as image first, then display the figure
    fig.write_image(outputFile)
    fig.show()



def runProcess():
    """
    Entry point of the number_of_Orphanet_active_entities notebook:
    parse product1, count the entities, then write one XLSX workbook
    and one PNG table per result set
    :return: None
    """
    print('--> Computing data ...')

    ## Orphadata "Rare diseases and alignment" product (product1), split by activity status
    product1 = getData('../xml/en_product1.xml')
    n_total, actives, inactives = getActivesEntities(product1)

    ## Nomenclature metrics, one column-oriented dict per table
    (results_actives,
     results_inactives,
     results_groups,
     results_disorders,
     results_subtypes) = countEntities(n_total, inactives, actives)

    ## Outputs
    ## one worksheet per table, in this order
    worksheets = [
        ('Orphanet active entities', results_actives),
        ('Orphanet inactive entities', results_inactives),
        ('Orphanet active gp of disorders', results_groups),
        ('Orphanet active disorders', results_disorders),
        ('Orphanet active sb of disorders', results_subtypes),
    ]
    print("Generating XLSX output file...")
    workbook_path = '../output_tables/1_number_of_Orphanet_clinical_entities.xlsx'
    with pd.ExcelWriter(workbook_path, engine='xlsxwriter') as writer:
        for sheet_name, table in worksheets:
            ## in a notebook, a frame can also be previewed with
            ## display(HTML(frame.to_html(index=False)))
            frame = pd.DataFrame(table)
            frame.to_excel(writer, index=False, sheet_name=sheet_name)

    ## one PNG per table, in this order
    images = [
        (results_actives, "../output_images/1_number_of_Orphanet_active_entities.png"),
        (results_inactives, "../output_images/1_number_of_Orphanet_inactive_entities.png"),
        (results_groups, "../output_images/1_number_of_Orphanet_active_groups_of_disorder.png"),
        (results_disorders, "../output_images/1_number_of_Orphanet_active_disorders.png"),
        (results_subtypes, "../output_images/1_number_of_Orphanet_active_subtypes_of_disorder.png"),
    ]
    print("Generating PNG output file...")
    for table, png_path in images:
        exportImage(table, png_path)

def getData(xmlfile):
    """
    Load an XML file and turn it into nested dicts with the xmltodict package
    :param: xmlfile (path of the XML file, read as UTF-8 text)
    :return: xml_dict (xml parsed as dict)
    """
    with open(xmlfile, mode="r", encoding='UTF-8') as handle:
        raw_xml = handle.read()
    return xmltodict.parse(raw_xml)


def getActivesEntities(xml_data):
    """
    Split the disorders of a parsed product into active and inactive entities.
    An entity is inactive when one of its DisorderFlag values is '8192'.
    :param: xml_data (dict from xmltodict, read at JDBOR > DisorderList > Disorder)
    :return: n_full_entities, actives_entities, inactives_entities
    """
    disorders = xml_data["JDBOR"]["DisorderList"]["Disorder"]
    ## xmltodict yields a bare dict instead of a one-item list when an element
    ## occurs only once; normalise so a single Disorder is not iterated key by key
    if isinstance(disorders, dict):
        disorders = [disorders]

    actives_entities = []
    inactives_entities = []
    for entity in tqdm(disorders):
        flags = entity['DisorderFlagList']['DisorderFlag']
        ## same single-element normalisation for the flags of one entity
        if isinstance(flags, dict):
            flags = [flags]
        flag_values = [flag['Value'] for flag in flags]
        if '8192' in flag_values:
            inactives_entities.append(entity)
        else:
            actives_entities.append(entity)

    ## every entity lands in exactly one of the two lists
    n_full_entities = len(actives_entities) + len(inactives_entities)
    return n_full_entities, actives_entities, inactives_entities


def _percent(part, total):
    """
    Express part as a percentage of total, rounded to 2 decimals
    :params: part, total
    :return: the percentage as a string, or '-' when total is 0 (no ratio can be computed)
    """
    if not total:
        return '-'
    return str(round((part * 100) / total, 2))


def _tallyEntities(entities, with_types):
    """
    Count entities per DisorderGroup id and, on request, per (group id, DisorderType id)
    :params: entities (list of entity dicts), with_types (bool, also read DisorderType)
    :return: group_counts, type_counts (Counters; type_counts stays empty without with_types)
    """
    from collections import Counter

    ## group of disorders, disorder, subtype of disorder
    known_groups = ('36540', '36547', '36554')
    group_counts = Counter()
    type_counts = Counter()
    for entity in tqdm(entities):
        group_id = entity['DisorderGroup']['@id']
        ## entities of any other group are not counted anywhere
        if group_id not in known_groups:
            continue
        group_counts[group_id] += 1
        if with_types:
            type_counts[(group_id, entity['DisorderType']['@id'])] += 1
    return group_counts, type_counts


def _resultsTable(title, rows):
    """
    Turn (label, amount, percent) rows into a column-oriented dict for pd.DataFrame
    :params: title (name of the label column), rows (list of 3-tuples)
    :return: dict with the columns title, 'Amount' and '%', in that order
    """
    return {
        title: [label for label, _, _ in rows],
        'Amount': [amount for _, amount, _ in rows],
        '%': [percent for _, _, percent in rows],
    }


def countEntities(n_entities, inactives_entities, actives_entities):
    """
    Count entities per disorder group (group of disorders, disorder, subtype of
    disorder) and, for active entities, per disorder type inside each group.
    Active counts are given as a percentage of the active entities, inactive
    counts as a percentage of the inactive entities, and the active / inactive
    totals as a percentage of n_entities. A percentage whose denominator is 0
    is reported as '-'.
    :params: n_entities, inactives_entities, actives_entities
    :return: dict_results_actives, dict_results_inactives, dict_results_groups, dict_results_disorders, dict_results_subtypes
    """
    ## DisorderGroup ids
    group, disorder, subtype = '36540', '36547', '36554'

    n_actives = len(actives_entities)
    n_inactives = len(inactives_entities)

    ## actives first, then inactives; only actives are broken down by type
    active_groups, active_types = _tallyEntities(actives_entities, with_types=True)
    inactive_groups, _ = _tallyEntities(inactives_entities, with_types=False)

    def active_row(label, amount):
        return label, amount, _percent(amount, n_actives)

    def inactive_row(label, amount):
        return label, amount, _percent(amount, n_inactives)

    def typed_row(label, group_id, type_id):
        ## a type is only counted inside its own group
        return active_row(label, active_types[(group_id, type_id)])

    dict_results_actives = _resultsTable('Orphanet actives clinical entities', [
        ('Number of Orphanet clinical entities', n_entities, '-'),
        ('Number of Orphanet active clinical entities', n_actives, _percent(n_actives, n_entities)),
        active_row('Number of Orphanet active group of disorders', active_groups[group]),
        active_row('Number of Orphanet active disorders', active_groups[disorder]),
        active_row('Number of Orphanet active subtype of disorders', active_groups[subtype]),
    ])

    dict_results_inactives = _resultsTable('Orphanet inactives clinical entities', [
        ('Number of Orphanet clinical entities', n_entities, '-'),
        ('Number of Orphanet inactives clinical entities', n_inactives, _percent(n_inactives, n_entities)),
        inactive_row('Number of Orphanet inactives group of disorders', inactive_groups[group]),
        inactive_row('Number of Orphanet inactives disorders', inactive_groups[disorder]),
        inactive_row('Number of Orphanet inactives subtype of disorders', inactive_groups[subtype]),
    ])

    dict_results_groups = _resultsTable('Orphanet active group of disorders', [
        active_row('Number of Orphanet active group of disorders', active_groups[group]),
        ## 36561: category, 21436: clinical group
        typed_row('Number of Orphanet active categories', group, '36561'),
        typed_row('Number of Orphanet active clinical groups', group, '21436'),
    ])

    dict_results_disorders = _resultsTable('Orphanet active disorders', [
        active_row('Number of Orphanet RD', active_groups[disorder]),
        ## 21408: biological anomaly, 21422: clinical syndrome, 21394: disease,
        ## 21401: malformation syndrome, 21415: morphological anomaly,
        ## 21429: particular clinical situation in a disease or syndrome
        typed_row('Number of Orphanet active biological anomalies', disorder, '21408'),
        typed_row('Number of Orphanet active clinical syndromes', disorder, '21422'),
        typed_row('Number of Orphanet active diseases', disorder, '21394'),
        typed_row('Number of Orphanet active malformation syndromes', disorder, '21401'),
        typed_row('Number of Orphanet active morphological anomalies', disorder, '21415'),
        typed_row('Number of Orphanet active particular clinical situations in a disease or syndrome anomalies',
                  disorder, '21429'),
    ])

    dict_results_subtypes = _resultsTable('Orphanet active subtype of disorders', [
        active_row('Number of Orphanet active subtypes of disorder', active_groups[subtype]),
        ## 21450: clinical subtype, 21443: etiological subtype, 21457: histopathological subtype
        typed_row('Number of Orphanet active clinical subtype of disorders', subtype, '21450'),
        typed_row('Number of Orphanet active etiological subtype of disorders', subtype, '21443'),
        typed_row('Number of Orphanet active histopathological subtype of disorders', subtype, '21457'),
    ])

    return dict_results_actives, dict_results_inactives, dict_results_groups, dict_results_disorders, dict_results_subtypes




## Script entry point: run the whole pipeline only when this file is executed
## directly, not when it is imported as a module
if __name__ == '__main__':
    runProcess()
