<a href="https://colab.research.google.com/github/NikitiusIvanov/gbd-life-extension-dashboard/blob/main/life_expectancy_extension_estimation_all_countries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook contains calculation estimation impact of risk factors into life expectancy (https://github.com/NikitiusIvanov/gbd-life-extension-dashboard)

All data was taken from "Global Burden of Disease Study 2019" results tool (https://vizhub.healthdata.org/gbd-results/)

Jupyter notebook with data preprocessing: https://drive.google.com/file/d/1o5NwBdJ5_NS11u_YAwWpsAWGMFc_xe8H/view?usp=sharing

For reproduce data manipulation you can load the same data from gbd site or fork it from my google-drive by link: https://drive.google.com/drive/folders/1B7BqeCXWVJunX0cBBFT4d25W_Yaw62Sh?usp=sharing

## Imports

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
cd '/content/drive/MyDrive/gbd-life-expectancy-risk-factors-impact-estimation'

/content/drive/MyDrive/gbd-life-expectancy-risk-factors-impact-estimation


In [4]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.colors import n_colors
from tqdm import tqdm

## Load the code book

In [6]:
code_book = pd.read_csv(
    os.path.join('data', 'coode_book', 'IHME_GBD_2019_CODEBOOK_Y2022M06D29.CSV')
).iloc[1:, 1:]

code_book.head(3)

Unnamed: 0,measure_id,measure_name,location_id,location_name,sex_id,sex_label,age_group_id,age_group_name,cause_id,cause_name,rei_id,rei_name,metric_id,metric_name,year_id,val,upper,lower
1,1,Deaths,1,Global,Sex ID,Sex,1,Under 5,294,All causes,169,All risk factors,1,Number,1990,,,
2,2,DALYs (Disability-Adjusted Life Years),44637,Low SDI,1,Male,2,Early Neonatal,295,"Communicable, maternal, neonatal, and nutritio...",202,Environmental/occupational risks,2,Percent,1991,,,
3,3,YLDs (Years Lived with Disability),44636,Low-middle SDI,2,Female,3,Late Neonatal,955,HIV/AIDS and sexually transmitted infections,203,Behavioral risks,3,Rate,1992,,,


In [None]:
measure_name_to_id = { # create mappings
    key: int(value) 
    for key, value in code_book[['measure_name', 'measure_id']].dropna().values
}

metric_name_to_id = {
    key: int(value) 
    for key, value in code_book[['metric_name', 'metric_id']].dropna().values
}

sex_name_to_id = {
    key: int(value) 
    for key, value in code_book[['sex_label', 'sex_id']].dropna().values[1:]
}

age_name_to_id = {
    key: int(value) 
    for key, value in code_book[['age_group_name', 'age_group_id']].dropna().values
}

cause_name_to_id = {
    key: int(value) 
    for key, value in code_book[['cause_name', 'cause_id']].dropna().values
}

risks_name_to_id = {
    key: int(value) 
    for key, value in code_book[['rei_name', 'rei_id']].dropna().values
}

location_name_to_id = {
    key: int(value) 
    for key, value in code_book[['location_name', 'location_id']].dropna().values
}

name_to_id_mappings: dict = {
    'measure_id' :measure_name_to_id,
    'metric_id' :metric_name_to_id,
    'sex_id' :sex_name_to_id,
    'age_id' :age_name_to_id,
    'cause_id' :cause_name_to_id,
    'risks_id' :risks_name_to_id,
    'location_id' :location_name_to_id,
}


age_group_name_to_start_age_int = {
    '<1 year': 0,
    '1 to 4': 1,
    '5 to 9': 5,
    '10 to 14': 10,
    '15 to 19': 15,
    '20 to 24': 20,
    '25 to 29': 25,
    '30 to 34': 30,
    '35 to 39': 35,
    '40 to 44': 40,
    '45 to 49': 45,
    '50 to 54': 50,
    '55 to 59': 55,
    '60 to 64': 60,
    '65 to 69': 65,
    '70 to 74': 70,
    '75 to 79': 75,    
    '80 to 84': 80,
    '85 to 89': 85,
    '90 to 94': 90,
    '95 plus': 95,
}

age_group_id_to_start_age_int = {
    '28': 0,
    '5': 1,
    '6': 5,
    '7': 10,
    '8': 15,
    '9': 20,
    '10': 25,
    '11': 30,
    '12': 35,
    '13': 40,
    '14': 45,
    '15': 50,
    '16': 55,
    '17': 60,
    '18': 65,
    '19': 70,
    '20': 75,
    '30': 80,
    '31': 85,
    '32': 90,
    '235': 95
 }

## Define the utilites and functions to estimation impact of risk factors into life expectancy

In [None]:
def find_name_by_fragment_in_id_mapping(
    name_to_id_dict: dict,
    fragment: str
) -> list:
    result = [x for x in name_to_id_dict.keys() if fragment.lower() in x.lower()]
    return result

In [None]:
def add_names_columns_by_ids(
    df: pd.DataFrame,
    name_to_id_mappings: dict = {
        'measure_id' :measure_name_to_id,
        'metric_id' :metric_name_to_id,
        'sex_id' :sex_name_to_id,
        'age_id' :age_name_to_id,
        'cause_id' :cause_name_to_id,
        'risks_id' :risks_name_to_id,
        'location_id' :location_name_to_id,
    },
) -> pd.DataFrame:
    result = df.copy()
    for column in set(result.columns).intersection(set(name_to_id_mappings.keys())):
        result[column.replace('_id', '_name')] = (
            df[column]
            .astype('str')
            .map({key: value for value, key in name_to_id_mappings[column].items()})
        )
    return result

In [None]:
def age_group_name_to_integer(
    data: pd.Series
) -> pd.Series:
    """
    return first year's number of age group
    WARNING: working only with GBD tables age group names format
    i.e. start and end year of age group separator 
    might be only ' ' or '-' and '<1' hardcoded replace to 0
    and  '95+' replace to 95
    example: 
        age_group_name_to_integer(
            pd.Series(['<1 year' ,'5 to 9', '10-14 years', '95+']
        ) -> pd.Series([0, 5, 10, 95])
    """
    first_term = pd.Series(
        [x.split(' ')[0] for x in data.copy()],
        index=data.copy().index
    )

    if np.any(first_term.str.contains('-')):

        result = first_term.copy().str.replace('<1', '0').str.replace('+', '').str.split('-').str[0]

        result = pd.to_numeric(result.copy(), errors='coerce')
        
    else:
        result = first_term.copy().str.replace('<1', '0')

        result = pd.to_numeric(result.copy(), errors='coerce')

    return result

In [None]:
def data_interpolate_inside_age_groups(
    data: pd.DataFrame,
    start_age_column: str,
    columns_to_interpolate: list,
    columns_for_subsample: list,
    columns_for_subsample_is_string: bool=True,
    age_max: int=110,
    age_max_value: int=0,
    method='cubic',
    divide=True,
) -> pd.DataFrame:
    """
    Linear interpolate data inside age groups 
    return data with values converted from many years 
    bins to single year bins
    """ 
    variants_filters = pd.DataFrame(
        index=pd.MultiIndex.from_product(
            [
                data[column].unique()
                for column in columns_for_subsample
            ],
            names=columns_for_subsample
        )
    ).reset_index()

    full_index = list(np.sort(data[start_age_column].unique())) + [age_max]

    age_min = min(full_index)
   
    result = []

    for variant in variants_filters.iterrows():

        if columns_for_subsample_is_string == True:
            query_text = ' and '.join(
                [f'{column} == "{variant[1][column]}"' for column in columns_for_subsample]
            )
        else:
            query_text = ' and '.join(
                [f'{column} == {variant[1][column]}' for column in columns_for_subsample]
            )


        subsample_for_interpolation = (
            data.query(query_text)
            [[start_age_column] + columns_to_interpolate]
            .set_index(start_age_column)
            .sort_index()
        )
                        
        if len(subsample_for_interpolation) > 0:

            subsample_age_max = subsample_for_interpolation.index.max()

            subsample_age_min = subsample_for_interpolation[
                (subsample_for_interpolation[columns_to_interpolate] != 0).any(axis=1) == True
            ].index.min()
        
            next_age = full_index[
                np.where(full_index == subsample_age_max)[0][0] + 1
            ]# - 1

            subsample_for_interpolation.loc[next_age, :] = [age_max_value for _ in columns_to_interpolate]

            subsample_for_interpolation = subsample_for_interpolation.join(
                pd.DataFrame(
                    index=full_index
                ),
                how='outer'
            ).fillna(age_max_value)

            subsample_for_interpolation['group_mean_element'] = (
                [0]
                +
                list(
                    (
                        (
                            subsample_for_interpolation.index[:-1]
                            -
                            subsample_for_interpolation.index[1:]
                        )
                        //
                        2
                    )
                    + 
                    subsample_for_interpolation.index[1:]
                )[1:]
                +
                [age_max]
            )

            if divide == True:
                    subsample_for_interpolation.loc[:subsample_age_max, columns_to_interpolate] = pd.concat(
                        [
                            subsample_for_interpolation.loc[
                                [x], columns_to_interpolate
                            ] / (y - x) 
                            for x, y in zip(
                                subsample_for_interpolation.index[:-1],
                                subsample_for_interpolation.index[1:]
                            )
                        ] + [
                                subsample_for_interpolation.iloc[[-1], :] 
                            ]
                    )
                    
            interpolated_data_subsample = subsample_for_interpolation.set_index('group_mean_element').join(
                pd.DataFrame(
                    index=list(
                        range(
                            age_min,
                            age_max + 1,
                            1
                        )
                    )
                ),
                how='outer',
            )

            interpolated_data_subsample = pd.concat(
                [
                    interpolated_data_subsample[column].interpolate(method=method)
                    for column in columns_to_interpolate
                ],
                axis=1
            )

            interpolated_data_subsample[variant[1].index] = variant[1].values

            result.append(interpolated_data_subsample)
    
    return pd.concat(result, axis=0)


In [None]:
def life_expectancy_from_population_and_deats(
    data: pd.DataFrame,
    age_group_start_column: str,
    deaths_column: str,
    population_column: str,
    sex_id_column: str,
    suffix: str='',
    radix: int=100_000,
    last_age_group_years: int=5,
) -> pd.DataFrame:
    """
    calculating life expectancy by population and deaths
    based on guideline  
    https://www.ons.gov.uk/peoplepopulationandcommunity/healthandsocialcare/healthandlifeexpectancies/methodologies/guidetocalculatingnationallifetables
    """

    life_table = data.copy()

    life_table['m_x'] = ( # Specific mortality rate at age x, central death rate
        life_table[deaths_column] / life_table[population_column]
    )

    life_table['q_x'] = ( # Likelihood or risk of death having reached age x
        life_table[deaths_column] 
        / 
        (
            life_table[population_column]
            +
            (0.5 * life_table[deaths_column])
        )
    )

    l_x = [] # number of people surviving to age x

    for sex in life_table[sex_id_column].unique():

        subsample = life_table.query(f'{sex_id_column} == @sex').sort_values(by=age_group_start_column)

        subsample['l_x'] = np.nan

        subsample.loc[subsample.index[0], 'l_x'] = radix

        for i in range(1, len(subsample.index), 1):

            subsample.loc[subsample.index[i], 'l_x'] = subsample.loc[subsample.index[i - 1], 'l_x'] - (
                subsample.loc[subsample.index[i - 1], 'l_x']
                *
                subsample.loc[subsample.index[i - 1], 'q_x']
            )
        
        l_x.append(subsample)

    life_table = pd.concat(l_x, axis=0).sort_values(by=[age_group_start_column, sex_id_column])

    L_x = [] # the number of person years lived at each individual age group

    for sex in life_table[sex_id_column].unique():

        subsample = life_table.query(f'{sex_id_column} == @sex').sort_values(by=age_group_start_column)

        subsample['L_x'] = np.nan

        for i in range(0, len(subsample.index) - 1, 1):

            curr_age_group_years = (
                subsample.loc[subsample.index[i + 1], age_group_start_column]
                -
                subsample.loc[subsample.index[i], age_group_start_column]
            )

            subsample.loc[subsample.index[i], 'L_x'] = curr_age_group_years * (
                (
                    subsample.loc[subsample.index[i], 'l_x']
                    +
                    subsample.loc[subsample.index[i + 1], 'l_x']
                ) / 2
            )
        
        subsample.loc[subsample.index[-1], 'L_x'] = (
            last_age_group_years * (subsample.loc[subsample.index[-1], 'l_x'] / 2)
        )
        
        L_x.append(subsample)

    life_table = pd.concat(L_x, axis=0).sort_values(by=[age_group_start_column, sex_id_column])

    T_x = [] # total number of years lived

    for sex in life_table[sex_id_column].unique():

        subsample = life_table.query(f'{sex_id_column} == @sex').sort_values(by=age_group_start_column)

        subsample['T_x'] = np.nan

        for i in range(0, len(subsample.index), 1):

            subsample.loc[subsample.index[i], 'T_x'] = subsample.loc[subsample.index[i]:, 'L_x'].sum()
        
        T_x.append(subsample)

    life_table = pd.concat(T_x, axis=0).sort_values(by=[age_group_start_column, sex_id_column])

    life_table['E_x'] = ( # period expectation of life at exact age x
        life_table.T_x 
        / 
        life_table.l_x
    )
    for column in [
        'm_x', 'q_x', 'l_x',
       'L_x', 'T_x', 'E_x'
    ]:
        life_table.columns = life_table.columns.str.replace(column, f'{column}{suffix}')
    
    life_table.sort_values(by=[age_group_start_column, sex_id_column], inplace=True)

    return life_table
    

In [None]:
def risk_factors_impact_calculator(
    deaths_by_risks: pd.DataFrame,
    deaths_by_causes: pd.DataFrame,
    all_risk_factors_name: str='All risk factors',
    estimations_list: list=['val', 'upper', 'lower'],
    risk_impact_deaths_prefix: str='rei_impact_deaths'
) -> pd.DataFrame:

    # calculate share of deaths attributed to the all risks factors
    deaths_share_risks = {
        estimate: (
            deaths_by_risks[deaths_by_risks['rei_name'] == all_risk_factors_name][estimate].sum()
            /
            deaths_by_causes[estimate].sum()
        )
        for estimate in estimations_list
    }

    deaths_by_risk_most_detailed = deaths_by_risks[deaths_by_risks['rei_name'] != all_risk_factors_name].copy()

    # calculate the share of most detailed deaths attributed to each risk
    for estimate in estimations_list:

        estimate_col = deaths_by_risk_most_detailed[estimate].copy()

        deaths_by_risk_most_detailed[f'rei_impact_{estimate}'] = (
            estimate_col
            /
            estimate_col.sum()
        )

        # estimate how many deaths might be caused by each risk
        deaths_by_risk_most_detailed.loc[:, f'{risk_impact_deaths_prefix}_{estimate}'] = (
            deaths_by_risk_most_detailed[f'rei_impact_{estimate}'] 
            *
            (
                deaths_by_causes[estimate].sum()
                *
                deaths_share_risks[estimate]
            )
        )

    return deaths_by_risk_most_detailed

In [None]:
def life_expectancy_with_risk_impact_removing_from_deaths(
    deaths_by_causes: pd.DataFrame,
    deaths_by_risks: pd.DataFrame,
    population: pd.DataFrame,
    all_risk_factors_name: str,
    risks_to_remove: list=None,    
    age_column: str='age_group_start',
    sex_column: str='sex_id',
    estimations_list: list=['val', 'upper', 'lower'],
    risk_impact_deaths_prefix: str='rei_impact_deaths',
    mortality_suffix: str='_deaths',
    population_suffix: str='_population',
    interpolation_method='linear',
    interpolation_age_max: int=110,
    interpolation_age_max_value: int=0,
    radix: int=100_000,
    le_calculation_last_age_group_years: int=1,
) -> pd.DataFrame:

    mortality = (
        deaths_by_causes
        .copy()
        .groupby(by=[age_column, sex_column,])
        .sum()
        [estimations_list]
        .sort_index()
    )

    mortality_columns_names = [x + mortality_suffix for x in estimations_list]
    population_columns_names = [x + population_suffix for x in estimations_list]

    life_table = (
        population
        .copy()
        .set_index([age_column, sex_column])
        [estimations_list]
        .join(
            mortality[estimations_list].copy(),
            lsuffix=population_suffix,
            rsuffix=mortality_suffix,            
        )
        [mortality_columns_names + population_columns_names]
        .sort_index()
        .reset_index()
    )

    if risks_to_remove is not None:
        risk_impact_columns = [
            f'{risk_impact_deaths_prefix}_{estimate}'
            for estimate in estimations_list
        ]

        deaths_by_risks = risk_factors_impact_calculator(
            deaths_by_risks=deaths_by_risks.copy(),
            deaths_by_causes=deaths_by_causes,
            all_risk_factors_name=all_risk_factors_name,
            estimations_list=estimations_list,
            risk_impact_deaths_prefix=risk_impact_deaths_prefix,
        )

        risks_impact = (
            deaths_by_risks
            .copy().query('rei_name in @risks_to_remove')
            .groupby(by=[age_column, sex_column,])
            .sum()[risk_impact_columns]
            .sort_index()
        )

        life_table_risk_excluded = life_table.copy()

        life_table_risk_excluded = (
            life_table_risk_excluded
            .copy()
            .set_index([age_column, sex_column])
            .join(
                risks_impact[risk_impact_columns],
                how='left'
            )
        )

        life_table_risk_excluded.loc[:, mortality_columns_names] = (
            life_table_risk_excluded[mortality_columns_names]
            -
            life_table_risk_excluded[risk_impact_columns].values
        )

        life_table_risk_excluded = (
            life_table_risk_excluded
            [mortality_columns_names + population_columns_names]
            .reset_index()
        )
    
    else:
        life_table_risk_excluded = life_table.copy()

    life_table_interpolated = data_interpolate_inside_age_groups(
        data=life_table_risk_excluded,
        start_age_column=age_column,
        columns_to_interpolate=mortality_columns_names + population_columns_names,
        columns_for_subsample=[sex_column],
        columns_for_subsample_is_string=False,
        age_max=interpolation_age_max + 2,
        age_max_value=interpolation_age_max_value,
        method=interpolation_method,
        divide=True,
    )

    life_table_interpolated.reset_index(inplace=True)

    life_table_interpolated.columns = (
        [age_column]
        +
        mortality_columns_names
        +
        population_columns_names
        +
        [sex_column]
    )

    life_table_interpolated = life_table_interpolated[
        [age_column]
        +
        [sex_column]
        +
        mortality_columns_names
        +
        population_columns_names
    ].query(f'{age_column} <= {interpolation_age_max}')

    result = pd.concat([
        life_expectancy_from_population_and_deats(
                data=life_table_interpolated,
                age_group_start_column=age_column,
                deaths_column=deaths_column,
                population_column=population_column,
                sex_id_column=sex_column,
                suffix=suffix,
                radix=radix,
                last_age_group_years=le_calculation_last_age_group_years,
        )
        for deaths_column, population_column, suffix in zip(
            mortality_columns_names,
            population_columns_names,
            [f'_{estimate}' for estimate in estimations_list],
        )
        ], axis=1
    )

    result = result.iloc[:, np.where(result.columns.duplicated(keep='first') == False)[0]]

    result.columns = result.columns.str.replace(age_column, 'age')

    result = (
        result
        .copy()
        [
            ['age', sex_column]
            +
            ['E_x' + x for x in [f'_{estimate}' for estimate in estimations_list]]
            +
            mortality_columns_names
            +
            population_columns_names
        ]
    ).reset_index(drop=True)

    return result

In [None]:
def plotly_compaire_scatter(
    column_orig: pd.Series,
    column_calculated: pd.Series,
    orig_name: str='original',
    calculated_name: str='calculated',
) -> None:

    fig = make_subplots()

    for data, name in zip(
        [column_orig, column_calculated,],
        [orig_name, calculated_name,]
    ):

        fig.add_trace(
            go.Scatter(
                x=data.index,
                y=data.values,
                mode='lines+markers',
                line=dict(width=1),
                marker=dict(size=1.5),
                name=f'{name} {column_orig.name}',
            )
        )

    fig.update_layout(template='plotly_dark')

    fig.show()


## Preprocessed data loading

### load causes and risks ierarchy

In [None]:
causes_ierarchy = pd.read_csv(
    os.path.join(
        'data',
        'coode_book',
        'IHME_GBD_2019_CAUSE_HIERARCHY_Y2020M11D25.csv'
    )
)

causes_ierarchy.columns = [x.lower().replace(' ', '_') for x in causes_ierarchy.columns]

In [None]:
causes_ierarchy.head(3)

Unnamed: 0,cause_id,cause_name,parent_id,parent_name,level,cause_outline,sort_order,yll_only,yld_only
0,294,All causes,294,All causes,0,Total,1,,
1,295,"Communicable, maternal, neonatal, and nutritio...",294,All causes,1,A,2,,
2,955,HIV/AIDS and sexually transmitted infections,295,"Communicable, maternal, neonatal, and nutritio...",2,A.1,3,,


In [None]:
risks_ierarchy = pd.read_csv(
    os.path.join(
        'data',
        'coode_book',
        'IHME_GBD_2019_REI_HIERARCHY_Y2020M10D15.csv'
    )
)

risks_ierarchy.columns = [x.lower().replace(' ', '_') for x in risks_ierarchy.columns]

risks_ierarchy.head(3)

Unnamed: 0,rei_id,rei_name,parent_id,parent_name,level,sort_order
0,169,All risk factors,169,All risk factors,0,1.0
1,202,Environmental/occupational risks,169,All risk factors,1,2.0
2,203,Behavioral risks,169,All risk factors,1,3.0


In [None]:
risks_ierarchy.query('rei_name.str.contains("Diet")')

Unnamed: 0,rei_id,rei_name,parent_id,parent_name,level,sort_order
62,110,Dietary risks,203,Behavioral risks,2,63.0
63,111,Diet low in fruits,110,Dietary risks,3,64.0
64,112,Diet low in vegetables,110,Dietary risks,3,65.0
65,333,Diet low in legumes,110,Dietary risks,3,65.5
66,113,Diet low in whole grains,110,Dietary risks,3,66.0
67,114,Diet low in nuts and seeds,110,Dietary risks,3,67.0
68,115,Diet low in milk,110,Dietary risks,3,68.0
69,116,Diet high in red meat,110,Dietary risks,3,69.0
70,117,Diet high in processed meat,110,Dietary risks,3,70.0
71,118,Diet high in sugar-sweetened beverages,110,Dietary risks,3,71.0


### load mortality (age, sex, cause specific)

In [None]:
deaths_by_causes = pd.read_csv(
    os.path.join(
        'data',
        'population_causes_risks_preprocessed',
        'causes.csv'
    )
)

deaths_by_causes

Unnamed: 0,measure_id,measure_name,location_id,location_name,sex_id,sex_name,age_id,age_name,cause_id,cause_name,metric_id,metric_name,year,val,upper,lower,age_group_start
0,1,Deaths,30,Vanuatu,1,Male,5,1-4 years,704,Exposure to mechanical forces,1,Number,2019,0.059426,0.152398,0.018754,1
1,1,Deaths,30,Vanuatu,2,Female,5,1-4 years,704,Exposure to mechanical forces,1,Number,2019,0.132070,0.386744,0.036372,1
2,1,Deaths,30,Vanuatu,1,Male,6,5-9 years,704,Exposure to mechanical forces,1,Number,2019,0.165406,0.318827,0.085764,5
3,1,Deaths,30,Vanuatu,2,Female,6,5-9 years,704,Exposure to mechanical forces,1,Number,2019,0.114914,0.229480,0.053363,5
4,1,Deaths,30,Vanuatu,1,Male,7,10-14 years,704,Exposure to mechanical forces,1,Number,2019,0.169386,0.324510,0.094666,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1015915,1,Deaths,184,Mozambique,2,Female,31,85-89 years,709,Animal contact,1,Number,2019,2.699511,5.095214,1.225879,85
1015916,1,Deaths,184,Mozambique,1,Male,32,90-94 years,709,Animal contact,1,Number,2019,0.822983,1.325274,0.458083,90
1015917,1,Deaths,184,Mozambique,2,Female,32,90-94 years,709,Animal contact,1,Number,2019,0.985076,1.853808,0.477237,90
1015918,1,Deaths,184,Mozambique,1,Male,235,95+ years,709,Animal contact,1,Number,2019,0.259206,0.437644,0.125232,95


### load risks factors attributed mortality (age, sex, cause, risk specific)

In [None]:
risk_factors = pd.read_csv(
    os.path.join(
        'data',
        'population_causes_risks_preprocessed',
        'risk_factors.csv'
    )
)

risk_factors

Unnamed: 0,location_id,age_group_start,sex_id,cause_id,rei_id,val,upper,lower
0,6,0,1,504,91,0.000000,0.000000,0.000000
1,6,0,2,504,91,0.000000,0.000000,0.000000
2,6,1,1,504,91,0.000000,0.000000,0.000000
3,6,1,2,504,91,0.000000,0.000000,0.000000
4,6,5,1,504,91,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...
3581419,522,85,2,499,124,0.052622,0.187631,0.012720
3581420,522,90,1,499,124,0.061035,0.292564,0.009096
3581421,522,90,2,499,124,0.027368,0.100173,0.006522
3581422,522,95,1,499,124,0.022894,0.103315,0.003498


In [None]:
len(risk_factors.location_id.unique())

204

In [None]:
risks_name_to_id["All risk factors"]

169

In [None]:
risk_factors.query('rei_id == @risks_name_to_id["All risk factors"]')[['val', 'upper', 'lower']].sum()

val      3.498040e+07
upper    4.819105e+07
lower    2.475025e+07
dtype: float64

In [None]:
deaths_by_causes[['val', 'upper', 'lower']].sum()

val      5.649470e+07
upper    7.918967e+07
lower    3.953605e+07
dtype: float64

In [None]:
np.concatenate(risk_factors_manageable.values)

array(['Alcohol use', 'Drug use', 'Smoking', 'Secondhand smoke',
       'Chewing tobacco', 'Unsafe sex', 'Low physical activity',
       'High fasting plasma glucose', 'High LDL cholesterol',
       'High systolic blood pressure', 'High body-mass index',
       'Diet high in processed meat', 'Diet high in red meat',
       'Diet high in sodium', 'Diet high in sugar-sweetened beverages',
       'Diet high in trans fatty acids', 'Diet low in calcium',
       'Diet low in fiber', 'Diet low in fruits', 'Diet low in legumes',
       'Diet low in milk', 'Diet low in nuts and seeds',
       'Diet low in polyunsaturated fatty acids',
       'Diet low in seafood omega-3 fatty acids',
       'Diet low in vegetables', 'Diet low in whole grains'], dtype=object)

In [None]:
risk_factors_manageable = pd.read_csv(
    os.path.join(
        'data',
        'population_causes_risks_preprocessed',
        'risks_names_manageable.csv'
    )
)

risk_factors_manageable = [x for x in np.concatenate(risk_factors_manageable.values)]

risk_factors_manageable

['Alcohol use',
 'Drug use',
 'Smoking',
 'Secondhand smoke',
 'Chewing tobacco',
 'Unsafe sex',
 'Low physical activity',
 'High fasting plasma glucose',
 'High LDL cholesterol',
 'High systolic blood pressure',
 'High body-mass index',
 'Diet high in processed meat',
 'Diet high in red meat',
 'Diet high in sodium',
 'Diet high in sugar-sweetened beverages',
 'Diet high in trans fatty acids',
 'Diet low in calcium',
 'Diet low in fiber',
 'Diet low in fruits',
 'Diet low in legumes',
 'Diet low in milk',
 'Diet low in nuts and seeds',
 'Diet low in polyunsaturated fatty acids',
 'Diet low in seafood omega-3 fatty acids',
 'Diet low in vegetables',
 'Diet low in whole grains']

### load population

In [None]:
population = pd.read_csv(
    os.path.join(
        'data',
        'population_causes_risks_preprocessed',
        'population.csv'
    ),
)

population.head()

Unnamed: 0,location_id,location_name,sex_id,sex_name,age_group_id,age_group_name,year_id,measure_id,measure_name,metric_id,metric_name,val,upper,lower,age_group_start
0,6,China,1,male,5,1 to 4,2019,44,Population,1,Number,35756780.0,40160350.0,31148440.0,1
1,6,China,1,male,6,5 to 9,2019,44,Population,1,Number,39266150.0,44097410.0,34208170.0,5
2,6,China,1,male,7,10 to 14,2019,44,Population,1,Number,38321130.0,43036630.0,33384350.0,10
3,6,China,1,male,8,15 to 19,2019,44,Population,1,Number,40095740.0,45030470.0,34929820.0,15
4,6,China,1,male,9,20 to 24,2019,44,Population,1,Number,42766860.0,48022160.0,37261990.0,20


### load life expectancy (age sex specific)

In [None]:
life_expectancy = pd.read_csv(
    os.path.join(
        'data',
        'population_causes_risks_preprocessed',
        'life_expectancy.csv'
    )
)

life_expectancy

Unnamed: 0,location_id,location_name,sex_id,sex_name,age_group_id,age_group_name,year_id,measure_id,measure_name,metric_id,metric_name,val,upper,lower,age_group_start
0,6,China,1,male,10,25 to 29,2019,26,Life expectancy,5,Years,50.962995,52.878710,49.057086,25
1,6,China,2,female,10,25 to 29,2019,26,Life expectancy,5,Years,56.687422,58.356527,55.012004,25
2,7,Democratic People's Republic of Korea,1,male,10,25 to 29,2019,26,Life expectancy,5,Years,46.650743,48.358684,44.909324,25
3,7,Democratic People's Republic of Korea,2,female,10,25 to 29,2019,26,Life expectancy,5,Years,52.564331,54.271523,50.312662,25
4,8,Taiwan (Province of China),1,male,10,25 to 29,2019,26,Life expectancy,5,Years,52.949729,55.567294,50.053236,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8155,422,United States Virgin Islands,2,female,9,20 to 24,2019,26,Life expectancy,5,Years,59.743758,61.102169,58.311953,20
8156,435,South Sudan,1,male,9,20 to 24,2019,26,Life expectancy,5,Years,49.370010,52.606344,46.281317,20
8157,435,South Sudan,2,female,9,20 to 24,2019,26,Life expectancy,5,Years,53.051489,56.020234,49.892046,20
8158,522,Sudan,1,male,9,20 to 24,2019,26,Life expectancy,5,Years,52.962875,55.256215,50.475401,20


## Estimation risk factors impact to the deaths and recalculate life expectancy with exclude each of them

In [None]:
print(len(risk_factors.location_name.unique()))
print(len(deaths_by_causes.location_name.unique()))
print(len(population.location_name.unique()))

204
204
204


In [None]:
risk_factors

Unnamed: 0,location_name,age_group_start,sex_id,cause_name,rei_name,val,upper,lower
0,Afghanistan,0,1,Non-rheumatic valvular heart disease,Lead exposure,0.000000,0.000000,0.000000
1,Afghanistan,0,2,Non-rheumatic valvular heart disease,Lead exposure,0.000000,0.000000,0.000000
2,Afghanistan,1,1,Non-rheumatic valvular heart disease,Lead exposure,0.000000,0.000000,0.000000
3,Afghanistan,1,2,Non-rheumatic valvular heart disease,Lead exposure,0.000000,0.000000,0.000000
4,Afghanistan,5,1,Non-rheumatic valvular heart disease,Lead exposure,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...
3581419,Zimbabwe,85,2,Cardiomyopathy and myocarditis,Diet high in sodium,0.660982,3.248675,0.039776
3581420,Zimbabwe,90,1,Cardiomyopathy and myocarditis,Diet high in sodium,0.235667,1.071680,0.013310
3581421,Zimbabwe,90,2,Cardiomyopathy and myocarditis,Diet high in sodium,0.372526,1.880443,0.022270
3581422,Zimbabwe,95,1,Cardiomyopathy and myocarditis,Diet high in sodium,0.047071,0.225502,0.002520


In [None]:
risk_factors['location_name'] = risk_factors['location_id'].map({v:k for k,v in location_name_to_id.items()})
risk_factors['sex_name'] = risk_factors['sex_id'].map({v:k for k,v in sex_name_to_id.items()})
risk_factors['cause_name'] = risk_factors['cause_id'].map({v:k for k,v in cause_name_to_id.items()})
risk_factors['rei_name'] = risk_factors['rei_id'].map({v:k for k,v in risks_name_to_id.items()})

risk_factors

Unnamed: 0,location_id,age_group_start,sex_id,cause_id,rei_id,val,upper,lower,location_name,sex_name,cause_name,rei_name
0,6,0,1,504,91,0.000000,0.000000,0.000000,China,Male,Non-rheumatic valvular heart disease,Lead exposure
1,6,0,2,504,91,0.000000,0.000000,0.000000,China,Female,Non-rheumatic valvular heart disease,Lead exposure
2,6,1,1,504,91,0.000000,0.000000,0.000000,China,Male,Non-rheumatic valvular heart disease,Lead exposure
3,6,1,2,504,91,0.000000,0.000000,0.000000,China,Female,Non-rheumatic valvular heart disease,Lead exposure
4,6,5,1,504,91,0.000000,0.000000,0.000000,China,Male,Non-rheumatic valvular heart disease,Lead exposure
...,...,...,...,...,...,...,...,...,...,...,...,...
3581419,522,85,2,499,124,0.052622,0.187631,0.012720,Sudan,Female,Cardiomyopathy and myocarditis,Diet high in sodium
3581420,522,90,1,499,124,0.061035,0.292564,0.009096,Sudan,Male,Cardiomyopathy and myocarditis,Diet high in sodium
3581421,522,90,2,499,124,0.027368,0.100173,0.006522,Sudan,Female,Cardiomyopathy and myocarditis,Diet high in sodium
3581422,522,95,1,499,124,0.022894,0.103315,0.003498,Sudan,Male,Cardiomyopathy and myocarditis,Diet high in sodium


In [None]:
results = []

for location_name in tqdm(risk_factors.location_id.unique()):

    deaths_by_causes_located = deaths_by_causes.query('location_id == @location_name')
    risk_factors_located = risk_factors.query('location_id == @location_name')
    population_located = population.query('location_id == @location_name')

    le_calculated = life_expectancy_with_risk_impact_removing_from_deaths(
        deaths_by_causes=deaths_by_causes_located,
        deaths_by_risks=risk_factors_located,
        population=population_located,
        risks_to_remove=None,
        all_risk_factors_name='All risk factors',
        age_column='age_group_start',
        sex_column='sex_id',
        estimations_list=['val', 'upper', 'lower'],
        risk_impact_deaths_prefix='rei_impact_deaths',
        mortality_suffix='_deaths',
        population_suffix='_population',
        interpolation_method='linear',
        interpolation_age_max=110,
        interpolation_age_max_value=0,
        radix=100_000,
        le_calculation_last_age_group_years=1,
    )


    for risk_factor in list(
        (
            set(risk_factors_located.rei_name)
            .difference(set(['All risk factors']))
        )
    ):

        result = life_expectancy_with_risk_impact_removing_from_deaths(
            deaths_by_causes=deaths_by_causes_located,
            deaths_by_risks=risk_factors_located,
            population=population_located,
            risks_to_remove=[risk_factor],
            all_risk_factors_name='All risk factors',
            age_column='age_group_start',
            sex_column='sex_id',
            estimations_list=['val', 'upper', 'lower'],
            risk_impact_deaths_prefix='rei_impact_deaths',
            mortality_suffix='_deaths',
            population_suffix='_population',
            interpolation_method='linear',
            interpolation_age_max=110,
            interpolation_age_max_value=0,
            radix=100_000,
            le_calculation_last_age_group_years=1,
        )

        result = (
            result.copy()
            [['age', 'sex_id', 'E_x_val', 'E_x_upper', 'E_x_lower', ]]
            .set_index(['age', 'sex_id']).sort_index()
            ) - (
            le_calculated
            [['age', 'sex_id', 'E_x_val', 'E_x_upper', 'E_x_lower', ]]
            .set_index(['age', 'sex_id']).sort_index()
        )

        result.columns = ['val', 'lower', 'upper']

        result.reset_index(inplace=True)

        result['E_x_val'] = le_calculated.sort_values(by=['age', 'sex_id'])['E_x_val']

        result['E_x_upper'] = le_calculated.sort_values(by=['age', 'sex_id'])['E_x_lower']

        result['E_x_lower'] = le_calculated.sort_values(by=['age', 'sex_id'])['E_x_upper']

        result['rei_name'] = risk_factor

        result['location_name'] = location_name

        results.append(result)

100%|██████████| 204/204 [1:37:41<00:00, 28.74s/it]


In [None]:
result = pd.concat(results, axis=0)

In [None]:
result.columns = list(result.columns[:-1]) + ['location_id']

In [None]:
result

Unnamed: 0,age,sex_id,val,lower,upper,E_x_val,E_x_upper,E_x_lower,rei_name,location_id
0,0,1,3.871769e-01,3.815749e-01,4.020134e-01,74.390988,75.806042,72.957224,High LDL cholesterol,6
1,0,2,4.263880e-01,4.045684e-01,4.289166e-01,80.625011,82.315388,78.988120,High LDL cholesterol,6
2,1,1,3.900383e-01,3.848020e-01,4.046695e-01,73.937070,75.303588,72.570016,High LDL cholesterol,6
3,1,2,4.289830e-01,4.072627e-01,4.313471e-01,80.112659,81.778994,78.510812,High LDL cholesterol,6
4,2,1,3.919575e-01,3.869754e-01,4.064447e-01,73.298415,74.631729,71.977076,High LDL cholesterol,6
...,...,...,...,...,...,...,...,...,...,...
217,108,2,-6.661338e-16,0.000000e+00,4.440892e-16,1.791660,2.020867,1.520489,Child stunting,522
218,109,1,0.000000e+00,-2.220446e-16,-2.220446e-16,1.212100,1.317661,1.083061,Child stunting,522
219,109,2,0.000000e+00,2.220446e-16,-2.220446e-16,1.241636,1.330739,1.127160,Child stunting,522
220,110,1,0.000000e+00,0.000000e+00,0.000000e+00,0.500000,0.500000,0.500000,Child stunting,522


In [None]:
result['rei_id'] = result.rei_name.map(risks_name_to_id)

In [None]:
result.columns

Index(['age', 'sex_id', 'val', 'lower', 'upper', 'E_x_val', 'E_x_upper',
       'E_x_lower', 'rei_name', 'location_id', 'rei_id'],
      dtype='object')

In [None]:
result = result[[
    'age', 'sex_id', 'location_id',
    'val', 'lower', 'upper',
    'E_x_val', 'E_x_upper', 'E_x_lower', 
    'rei_id'
]]

In [None]:
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2989008 entries, 0 to 221
Data columns (total 10 columns):
 #   Column       Dtype  
---  ------       -----  
 0   age          int64  
 1   sex_id       int64  
 2   location_id  int64  
 3   val          float64
 4   lower        float64
 5   upper        float64
 6   E_x_val      float64
 7   E_x_upper    float64
 8   E_x_lower    float64
 9   rei_id       int64  
dtypes: float64(6), int64(4)
memory usage: 250.8 MB


In [None]:
calculated_le = []

for location_id in tqdm(result.location_id.unique()):

    filtered_le = (
        result
        .query(f'location_id == {location_id}')
        [['age', 'sex_id', 'E_x_val', 'E_x_upper', 'E_x_lower']]
    ).drop_duplicates()

    filtered_le['location_id'] = location_id

    calculated_le.append(filtered_le)

100%|██████████| 204/204 [00:05<00:00, 38.36it/s]


In [None]:
calculated_le_df = pd.concat(calculated_le, axis=0)

In [None]:
len(calculated_le_df.location_id.unique())

204

In [None]:
calculated_le_df.columns = ['age', 'sex_id', 'val', 'upper', 'lower', 'location_id']

In [None]:
result.columns

Index(['age', 'sex_id', 'location_id', 'val', 'lower', 'upper', 'E_x_val',
       'E_x_upper', 'E_x_lower', 'rei_id'],
      dtype='object')

In [None]:
result = result[[
    'age', 'sex_id', 'location_id', 'rei_id',
    'val', 'lower', 'upper',
]]

In [None]:
result.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2989008 entries, 0 to 221
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   age          int64  
 1   sex_id       int64  
 2   location_id  int64  
 3   rei_id       int64  
 4   val          float64
 5   lower        float64
 6   upper        float64
dtypes: float64(3), int64(4)
memory usage: 182.4 MB


In [None]:
result

Unnamed: 0,age,sex_id,location_id,rei_id,val,lower,upper
0,0,1,6,367,3.871769e-01,3.815749e-01,4.020134e-01
1,0,2,6,367,4.263880e-01,4.045684e-01,4.289166e-01
2,1,1,6,367,3.900383e-01,3.848020e-01,4.046695e-01
3,1,2,6,367,4.289830e-01,4.072627e-01,4.313471e-01
4,2,1,6,367,3.919575e-01,3.869754e-01,4.064447e-01
...,...,...,...,...,...,...,...
217,108,2,522,241,-6.661338e-16,0.000000e+00,4.440892e-16
218,109,1,522,241,0.000000e+00,-2.220446e-16,-2.220446e-16
219,109,2,522,241,0.000000e+00,2.220446e-16,-2.220446e-16
220,110,1,522,241,0.000000e+00,0.000000e+00,0.000000e+00


In [None]:
len(result.rei_id.unique())

66

Save the results

In [None]:
result.to_csv(
    os.path.join(
        'data',
        'risk_impact_calculated',
        'risk_impact.csv'
    ),
    index=False
)

In [None]:
calculated_le_df.to_csv(
    os.path.join(
        'data',
        'risk_impact_calculated',
        'life_expectancy_calculated.csv'
    ),
    index=False
)

Save mapping id to name

In [None]:
def color_map_from_categories(
    categories: list,
    plotly_colors_seq_names: str,
) -> dict:
    colors = []
    n_terms = len(categories)
    n_color_scales = len(plotly_colors_seq_names)
    share = (n_terms // n_color_scales) + 10
    for plotly_colors_seq_name in plotly_colors_seq_names:
        colors += px.colors.sample_colorscale(plotly_colors_seq_name, [n/(share - 1) for n in range(share)])
    np.random.shuffle(colors)

    return {
        k: v 
        for k, v in zip(
            categories,
            colors
        )
    }

In [None]:
rei_color_map = color_map_from_categories(
    rei_ierarchy.rei_name.values,
    ['ylgn', 'blackbody', 'rainbow']
)

In [None]:
rei_color_map = color_map_from_categories(
    risk_factors.rei_name.unique(),
    'turbo'
)

causes_color_map = color_map_from_categories(
    deaths_by_causes.cause_name.unique(),
    'Portland'
)

In [None]:
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=['Male', 'Female'],
    x_title='Deaths',
    shared_yaxes=True,
    shared_xaxes=True,
    vertical_spacing=0.01,
    horizontal_spacing=0.01
)

for sex_name, sex_id in zip(['Male', 'Female'], [1,2]):

    deaths_no_risk_attributed = (
        deaths_by_causes
        .query('sex_id == @sex_id')
        .groupby(by=['age_group_start'])
        ['val'].sum()
        .sort_index()
    ) - (
        risk_factors
        .query('sex_id == @sex_id and rei_name == "All risk factors"')
        .groupby(by=['age_group_start'])
        [['val']].sum()
        .sort_index()
        ['val'].values
    )

    fig.add_trace(
        go.Bar(
            y=deaths_no_risk_attributed.index,
            x=deaths_no_risk_attributed.values,
            name='No risk attributed',
            showlegend=True if sex_name == 'Female' else False,
            legendgroup='No risk attributed',
            marker=dict(color='gray'),
            width=2.9,
            orientation='h',
            hovertemplate=("Age: %{y}<br>No risk attributed<br>Deaths:%{x}<extra></extra>")
        ),
        col=2 if sex_name == 'Female' else 1,
        row=1
    )

    fig.add_trace(
        go.Scatter(
            y=[0],
            x=[170000],
            marker=dict(opacity=0.00001, size=0.0001),
            showlegend=False,
        )
    )

    for rei_name in (
        risk_factors_impact
        .query('sex_id == @sex_id')
        .groupby(by='rei_name')
        .val.sum()
        .sort_values(ascending=False).index
    ):
        data_to_plot = (
            risk_factors_impact
            .query('sex_id == @sex_id and rei_name == @rei_name')
            .sort_values(by='age_group_start')
        )

        fig.add_trace(
            go.Bar(
                y=data_to_plot['age_group_start'],
                x=data_to_plot['val'],
                name=rei_name,
                showlegend=True if sex_name == 'Female' else False,
                legendgroup=rei_name,
                marker=dict(color=rei_color_map[rei_name]),
                width=2.9,
                orientation='h',
                hovertemplate=("Age: %{y}<br>"f"Risk:{rei_name}<br>""Deaths:%{x}<extra></extra>")
            ),
            col=2 if sex_name == 'Female' else 1,
            row=1
        )

    fig.update_layout(xaxis_range=[0, 170000])


fig.update_layout(
    height=800,
    width=1400,
    barmode='stack',
    template='plotly_dark',
    title=dict(
        text=f'Deaths by risk facrots, sex and age groups. {country} {year}',
        x=0.5
    ),
    xaxis_autorange='reversed',    
    legend=dict(
        title=dict(text='Causes:')
    ),
    bargap=0.01,bargroupgap=0.0
)
fig.show()

In [None]:
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=deaths_by_causes.sex_name.unique(),
    x_title='Deaths',
    shared_yaxes=True,
    shared_xaxes=True,
    vertical_spacing=0.1,
    horizontal_spacing=0.01
)

for sex_name in deaths_by_causes.sex_name.unique():
    for cause_name in deaths_by_causes.cause_name.unique():
        data_to_plot = deaths_by_causes.query(
            "cause_name == @cause_name"
            " and sex_name == @sex_name"
        )

        fig.add_trace(
            go.Scatter(
                y=['<1 year'],
                x=[170000],
                marker=dict(opacity=0.00001, size=0.0001),
                showlegend=False,
            )
        )

        fig.add_trace(
            go.Bar(
                y=data_to_plot['age_name'],
                x=data_to_plot['val'],
                name=cause_name,
                showlegend=True if sex_name == 'female' else False,
                legendgroup=cause_name,
                marker=dict(color=causes_color_map[cause_name]),
                width=0.9,
                orientation='h',
                hovertemplate=("Age: %{y}<br>"f"Cause:{cause_name}<br>""Deaths:%{x}<extra></extra>")
            ),
            col=2 if sex_name == 'female' else 1,
            row=1
        )

        fig.update_layout(xaxis_range=[0, 170000])


fig.update_layout(
    height=800,
    width=1400,
    barmode='stack',
    template='plotly_dark',
    title=dict(
        text=f'Deaths by causes, sex and age groups. {country} {year}',
        x=0.5
    ),
    xaxis_autorange='reversed',    
    legend=dict(
        title=dict(text='Causes:')
    )
)
fig.show()