In [14]:
import pandas as pd
import os
import plotly.express as px

## Building the Risk Factors Barplot 
Sex codes used in the data: `a` = all (females and males), `f` = females, `m` = males.


#### Notes:
1. We want to display the 'all' category whenever no toggle for a specific sex or age_group is selected.


In [2]:
# Step up one directory so the relative "data/..." paths used by later cells resolve.
# Guarded on the presence of a "data" folder so that re-running this cell (or starting
# the kernel in a directory that already contains "data") does not climb a second level.
if not os.path.isdir("data"):
    os.chdir('..')

In [3]:
riskfacs = pd.read_csv("data/raw/TB_burden_age_sex_2024-03-26.csv")

In [35]:
# Keep only the rows for the country of interest.
riskfacs_pakistan = riskfacs[riskfacs["country"].eq('Pakistan')]

# Total the 'best' values for every (age_group, sex) pair.
rfs_pak_age_sex = (
    riskfacs_pakistan
    .groupby(['age_group', 'sex'], as_index=False)['best']
    .sum()
)

# Default view: drop the aggregate rows (sex 'a' and age_group 'all');
# those are only meant to be shown when explicitly selected.
rfs_pak_age_sex_w_all = rfs_pak_age_sex.loc[
    ~((rfs_pak_age_sex["sex"] == 'a') | (rfs_pak_age_sex["age_group"] == 'all'))
]
rfs_pak_age_sex_w_all

Unnamed: 0,age_group,sex,best
1,0-14,f,40000
2,0-14,m,42000
3,0-4,f,17000
4,0-4,m,20000
5,15-24,f,43000
6,15-24,m,60000
8,15plus,f,230730
9,15plus,m,341100
11,18plus,f,13000
12,18plus,m,18000


In [36]:
# Display order for the non-overlapping age bands.
order_age = ['0-4', '5-14', '15-24', '25-34', '35-44', '45-54', '55-64', '65plus']

# Drop the redundant bands ('0-14', '15plus', '18plus'), turn 'age_group' into an
# ordered categorical following `order_age`, and sort the rows by that order.
rfs_pak_age_sex_w_all = (
    rfs_pak_age_sex_w_all
    .loc[lambda frame: ~frame['age_group'].isin(['0-14', '15plus', '18plus'])]
    .assign(
        age_group=lambda frame: pd.Categorical(
            frame['age_group'], categories=order_age, ordered=True
        )
    )
    .sort_values('age_group')
)

rfs_pak_age_sex_w_all


Unnamed: 0,age_group,sex,best
3,0-4,f,17000
4,0-4,m,20000
19,5-14,f,23000
20,5-14,m,22000
5,15-24,f,43000
6,15-24,m,60000
13,25-34,f,38000
14,25-34,m,45000
15,35-44,f,36000
16,35-44,m,49000


In [37]:
# Bar chart of the summed 'best' values per age group, coloured by sex code.
px.bar(rfs_pak_age_sex_w_all, x = 'age_group', y = 'best', color='sex')

In [None]:
# Load the per-country, per-year TB burden table.
# NOTE(review): unlike the age/sex file read earlier in this notebook, this path has no
# "data/raw/" prefix — confirm the file is reachable from the current working directory.
tb = pd.read_csv("TB_burden_countries_2024-03-25.csv")

In [None]:
len(tb.loc[tb.isna().any(axis=1)]["country"].unique())

72

In [None]:
tb[tb.isna().any(axis=1)]

Unnamed: 0,country,iso2,iso3,iso_numeric,g_whoregion,year,e_pop_num,e_inc_100k,e_inc_100k_lo,e_inc_100k_hi,...,cfr,cfr_lo,cfr_hi,cfr_pct,cfr_pct_lo,cfr_pct_hi,c_newinc_100k,c_cdr,c_cdr_lo,c_cdr_hi
69,American Samoa,AS,ASM,16,WPR,2000,58230,5.9,5.1,6.8,...,0.08,0.05,0.13,8.0,5.0,13.0,5.2,87.0,75.0,100.0
70,American Samoa,AS,ASM,16,WPR,2001,58324,5.9,5.1,6.8,...,0.08,0.05,0.12,8.0,5.0,12.0,5.1,87.0,75.0,100.0
71,American Samoa,AS,ASM,16,WPR,2002,58177,4.0,3.4,4.6,...,0.08,0.05,0.12,8.0,5.0,12.0,3.4,87.0,75.0,100.0
72,American Samoa,AS,ASM,16,WPR,2003,57941,6.0,5.1,6.9,...,0.08,0.05,0.12,8.0,5.0,12.0,5.2,87.0,75.0,100.0
73,American Samoa,AS,ASM,16,WPR,2004,57626,10.0,8.5,12.0,...,0.08,0.05,0.12,8.0,5.0,12.0,8.7,87.0,75.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4843,Wallis and Futuna,WF,WLF,876,WPR,2018,11816,0.0,0.0,0.0,...,,,,,,,0.0,,,
4844,Wallis and Futuna,WF,WLF,876,WPR,2019,11714,9.8,8.4,11.0,...,0.09,0.05,0.13,9.0,5.0,13.0,8.5,87.0,75.0,100.0
4845,Wallis and Futuna,WF,WLF,876,WPR,2020,11655,0.0,0.0,0.0,...,,,,,,,0.0,,,
4846,Wallis and Futuna,WF,WLF,876,WPR,2021,11627,1.9,1.6,2.1,...,0.08,0.05,0.12,8.0,5.0,12.0,,,,


In [None]:
len(tb["year"].unique())

23

In [None]:
tb.loc[tb["country"] == "Afghanistan"]

Unnamed: 0,country,iso2,iso3,iso_numeric,g_whoregion,year,e_pop_num,e_inc_100k,e_inc_100k_lo,e_inc_100k_hi,...,cfr,cfr_lo,cfr_hi,cfr_pct,cfr_pct_lo,cfr_pct_hi,c_newinc_100k,c_cdr,c_cdr_lo,c_cdr_hi
0,Afghanistan,AF,AFG,4,EMR,2000,19542982,190.0,122.0,271.0,...,0.37,0.17,0.61,37.0,17.0,61.0,36.0,19.0,13.0,30.0
1,Afghanistan,AF,AFG,4,EMR,2001,19688632,189.0,122.0,271.0,...,0.35,0.16,0.56,35.0,16.0,56.0,51.0,27.0,19.0,42.0
2,Afghanistan,AF,AFG,4,EMR,2002,21000256,189.0,122.0,270.0,...,0.31,0.15,0.51,31.0,15.0,51.0,66.0,35.0,24.0,54.0
3,Afghanistan,AF,AFG,4,EMR,2003,22645130,189.0,122.0,270.0,...,0.32,0.15,0.52,32.0,15.0,52.0,61.0,32.0,23.0,50.0
4,Afghanistan,AF,AFG,4,EMR,2004,23553551,189.0,122.0,270.0,...,0.28,0.13,0.46,28.0,13.0,46.0,78.0,41.0,29.0,64.0
5,Afghanistan,AF,AFG,4,EMR,2005,24411191,189.0,122.0,270.0,...,0.26,0.12,0.43,26.0,12.0,43.0,89.0,47.0,33.0,73.0
6,Afghanistan,AF,AFG,4,EMR,2006,25442944,189.0,122.0,270.0,...,0.24,0.11,0.39,24.0,11.0,39.0,100.0,53.0,37.0,82.0
7,Afghanistan,AF,AFG,4,EMR,2007,25903301,189.0,122.0,270.0,...,0.21,0.1,0.35,21.0,10.0,35.0,111.0,59.0,41.0,91.0
8,Afghanistan,AF,AFG,4,EMR,2008,26427199,189.0,122.0,271.0,...,0.22,0.11,0.37,22.0,11.0,37.0,107.0,57.0,40.0,88.0
9,Afghanistan,AF,AFG,4,EMR,2009,27385307,189.0,122.0,271.0,...,0.25,0.12,0.4,25.0,12.0,40.0,95.0,50.0,35.0,78.0


In [None]:
# Column names that contain "_num" but neither "_hi" nor "_lo".
# The substring test also matches "iso_numeric".
cols = [
    col
    for col in tb.columns
    if "_num" in col and "_hi" not in col and "_lo" not in col
]

cols

['iso_numeric',
 'e_pop_num',
 'e_inc_num',
 'e_inc_tbhiv_num',
 'e_mort_exc_tbhiv_num',
 'e_mort_tbhiv_num',
 'e_mort_num']

In [None]:
tb_agg = pd.read_csv("TB_burden_age_sex_2024-03-25.csv")
tb_agg

Unnamed: 0,country,iso2,iso3,iso_numeric,year,measure,unit,age_group,sex,risk_factor,best,lo,hi
0,Afghanistan,AF,AFG,4,2022,inc,num,0-14,a,all,15000,8300,22000
1,Afghanistan,AF,AFG,4,2022,inc,num,0-14,f,all,7400,2700,12000
2,Afghanistan,AF,AFG,4,2022,inc,num,0-14,m,all,7900,2800,13000
3,Afghanistan,AF,AFG,4,2022,inc,num,0-4,f,all,3400,0,8300
4,Afghanistan,AF,AFG,4,2022,inc,num,0-4,m,all,4000,0,9900
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7300,Zimbabwe,ZW,ZWE,716,2022,inc,num,65plus,m,all,1700,0,3800
7301,Zimbabwe,ZW,ZWE,716,2022,inc,num,all,a,all,33000,22000,45000
7302,Zimbabwe,ZW,ZWE,716,2022,inc,num,all,a,hiv,20000,11000,31000
7303,Zimbabwe,ZW,ZWE,716,2022,inc,num,all,f,all,14000,7400,21000


In [None]:
len(tb_agg["country"].unique())

215

In [None]:
# Distinct country names in each of the two tables.
agg_cols = set(pd.unique(tb_agg["country"]))
col_set = set(pd.unique(tb["country"]))

# Names that occur in exactly one of the two tables.
agg_cols.symmetric_difference(col_set)

{'Netherlands Antilles', 'Serbia & Montenegro'}

In [None]:
# Age groups reported for the first country in the grouped result.
# The grouped Series is indexed by country name, so the first entry must be taken by
# position with .iloc; plain [0] on a label index is deprecated positional access.
tb_agg.groupby(["country"])["age_group"].unique().iloc[0]

  tb_agg.groupby(["country"])["age_group"].unique()[0]


array(['0-14', '0-4', '15-24', '15plus', '18plus', '25-34', '35-44',
       '45-54', '5-14', '55-64', '65plus', 'all'], dtype=object)

In [None]:
tb_agg["age_group"].unique()

array(['0-14', '0-4', '15-24', '15plus', '18plus', '25-34', '35-44',
       '45-54', '5-14', '55-64', '65plus', 'all'], dtype=object)

In [None]:
tb_agg["sex"].unique()

array(['a', 'f', 'm'], dtype=object)

In [None]:
# Show the Afghanistan row carrying index label 0 as a field-by-field Series.
tb[tb["country"].eq("Afghanistan")].loc[0]

country                     Afghanistan
iso2                                 AF
iso3                                AFG
iso_numeric                           4
g_whoregion                         EMR
year                               2000
e_pop_num                      19542982
e_inc_100k                        190.0
e_inc_100k_lo                     122.0
e_inc_100k_hi                     271.0
e_inc_num                         37000
e_inc_num_lo                      24000
e_inc_num_hi                      53000
e_tbhiv_prct                       0.02
e_tbhiv_prct_lo                     0.0
e_tbhiv_prct_hi                    0.05
e_inc_tbhiv_100k                   0.03
e_inc_tbhiv_100k_lo                 0.0
e_inc_tbhiv_100k_hi                 0.1
e_inc_tbhiv_num                     6.0
e_inc_tbhiv_num_lo                  1.0
e_inc_tbhiv_num_hi                 19.0
e_mort_exc_tbhiv_100k              68.0
e_mort_exc_tbhiv_100k_lo           39.0
e_mort_exc_tbhiv_100k_hi          104.0


## Building Cards to display global statistics

In [15]:
import pandas as pd
import os
import plotly.express as px

In [18]:
# NOTE(review): absolute, machine-specific path — this cell only works where this exact
# directory exists; consider a path relative to the repository root instead.
os.chdir('/Users/hinabandukwala/Documents/MDS/DSCI-532_2024_1_TBtracker')

In [19]:
tb_data = pd.read_csv("data/preprocessing/tb_data.csv")

In [59]:
# Sum of the "mortality_rate" column over every row whose year is 2022.
global_stat = tb_data.loc[tb_data["year"].eq(2022), "mortality_rate"].sum()
global_stat

0.030523628687045154

In [60]:
def update_global_stats(selected_year, selected_type, selected_value):
    """Build the card strings for one year's total and its change versus adjacent years.

    Reads the notebook-level ``tb_data`` frame (its ``year`` column and one of the
    four metric columns listed below) and does not modify it.

    NOTE(review): for the ``*_rate`` columns this adds up the per-row rates; confirm
    that a plain sum is the intended worldwide figure.

    Parameters
    ----------
    selected_year : int
        Year to summarise; matched against the ``year`` column.
    selected_type : str
        ``"absolute"`` or ``"relative"``.
    selected_value : str
        ``"incidence"`` or ``"mortality"``. Any unrecognised combination of
        ``selected_type`` / ``selected_value`` falls back to ``"incidence_total"``.

    Returns
    -------
    tuple of str
        ``(stat, diff_previous_text, diff_next_text, diff_previous_color,
        diff_next_color)``. ``stat`` is the selected year's sum formatted with two
        decimals. Each text is a signed percentage such as ``"+3.4%"`` or
        ``"data not available"``; the matching colour is ``"blue"`` for a positive
        change, ``"red"`` otherwise and ``"black"`` when no change can be computed.
    """
    y_column = {
        ("absolute", "incidence"): "incidence_total",
        ("relative", "incidence"): "incidence_rate",
        ("absolute", "mortality"): "mortality_total",
        ("relative", "mortality"): "mortality_rate",
    }.get((selected_type, selected_value), "incidence_total")

    # Boundary years for which no previous / next comparison is attempted.
    first_year, last_year = 2000, 2022

    def total_for(year):
        # Sum of the chosen metric over every row of the given year (0 if none).
        return tb_data.loc[tb_data["year"] == year, y_column].sum()

    global_stat = total_for(selected_year)

    def describe_change(neighbour_year, at_boundary):
        # Returns (text, colour) for the change of global_stat relative to neighbour_year.
        if at_boundary:
            return 'data not available', 'black'
        baseline = total_for(neighbour_year)
        if not baseline:
            # A zero baseline (e.g. no rows for that year) would otherwise yield inf/nan.
            return 'data not available', 'black'
        change = round(float((global_stat - baseline) / baseline) * 100, 1)
        if change > 0:
            return f"+{change}%", "blue"
        return f"{change}%", "red"

    diff_previous_text, diff_previous_color = describe_change(
        selected_year - 1, selected_year == first_year
    )
    diff_next_text, diff_next_color = describe_change(
        selected_year + 1, selected_year == last_year
    )

    return f"{global_stat:.2f}", diff_previous_text, diff_next_text, diff_previous_color, diff_next_color

In [50]:
def update_global_stats2(selected_year, selected_type, selected_value):
    """Return one year's summed metric plus its change versus the adjacent years.

    The metric column is chosen from ``(selected_type, selected_value)`` and
    defaults to ``"incidence_total"`` for unknown combinations. As a side effect
    the notebook-level ``tb_data`` frame gets (or has refreshed) a ``year_dt``
    datetime column derived from ``year``.

    Returns ``(stat, diff_previous_text, diff_next_text, diff_previous_color,
    diff_next_color)``: ``stat`` is the sum formatted with two decimals, each text
    is a signed one-decimal percentage or ``"data not available"``, and each
    colour is ``"blue"`` (positive change), ``"red"`` (otherwise) or ``"black"``
    (no change computed). No comparison is made before 2000, after 2022, or when
    the neighbouring year's sum is zero.
    """
    column_for_choice = {
        ("absolute", "incidence"): "incidence_total",
        ("relative", "incidence"): "incidence_rate",
        ("absolute", "mortality"): "mortality_total",
        ("relative", "mortality"): "mortality_rate",
    }
    y_column = column_for_choice.get((selected_type, selected_value), "incidence_total")

    # Datetime view of the year column, used to look up the neighbouring years.
    tb_data["year_dt"] = pd.to_datetime(tb_data["year"], format='%Y')
    anchor_dt = pd.Timestamp(str(selected_year))
    one_year = pd.DateOffset(years=1)
    before_dt = anchor_dt - one_year
    after_dt = anchor_dt + one_year

    global_stat = tb_data.loc[tb_data["year"] == selected_year, y_column].sum()

    def change_against(neighbour_dt):
        # (percentage, colour) relative to the neighbouring year; (None, 'black')
        # when that year's sum is zero.
        baseline = tb_data.loc[tb_data["year_dt"] == neighbour_dt, y_column].sum()
        if not baseline:
            return None, 'black'
        pct = round(((global_stat - baseline) / baseline) * 100, 1)
        return pct, ("blue" if pct > 0 else "red")

    def as_text(pct):
        return "data not available" if pct is None else f"{pct:+.1f}%"

    if selected_year > 2000:
        diff_previous, diff_previous_color = change_against(before_dt)
    else:
        diff_previous, diff_previous_color = None, 'black'

    if selected_year < 2022:
        diff_next, diff_next_color = change_against(after_dt)
    else:
        diff_next, diff_next_color = None, 'black'

    return (
        f"{global_stat:.2f}",
        as_text(diff_previous),
        as_text(diff_next),
        diff_previous_color,
        diff_next_color,
    )



In [56]:
update_global_stats2(2019, "relative", "mortality")

(0.032867554104100624, '-3.9%', '-4.9%', 'red', 'red')

In [57]:
update_global_stats2(2022, "relative", "incidence")

(0.45882556390807117, '-0.7%', 'data not available', 'red', 'black')

In [53]:
update_global_stats2(2000, "absolute", "incidence")

(29815063, 'data not available', '-0.9%', 'black', 'red')

In [38]:
update_global_stats(2019, "absolute", "incidence")

(22778707, '-1.4%', '+0.7%', 'red', 'blue')

In [39]:
update_global_stats(2022, "absolute", "incidence")

(24101148, '+3.4%', 'data not available', 'blue', 'black')

In [40]:
update_global_stats(2000, "absolute", "incidence")

(29815063, 'data not available', '-0.9%', 'black', 'red')

In [None]:
def update_global_stats(selected_year, selected_type, selected_value):
    """Return one year's summed metric and its percentage change versus adjacent years.

    Reads only the ``year`` column and the chosen metric column of the notebook-level
    ``tb_data`` frame, so it does not depend on a ``year_dt`` column having been
    created by another cell.

    NOTE(review): for the ``*_rate`` columns this adds up the per-row rates; confirm
    that a plain sum is the intended worldwide figure.

    Parameters
    ----------
    selected_year : int
        Year to summarise; matched against the ``year`` column.
    selected_type : str
        ``"absolute"`` or ``"relative"``.
    selected_value : str
        ``"incidence"`` or ``"mortality"``. Any unrecognised combination of
        ``selected_type`` / ``selected_value`` falls back to ``"incidence_total"``.

    Returns
    -------
    tuple
        ``(global_stat, diff_previous, diff_next)``. The two differences are
        percentages rounded to one decimal, or ``None`` for the boundary years
        (2000 has no previous year, 2022 has no next year) and whenever the
        neighbouring year's sum is zero.
    """
    y_column = {
        ("absolute", "incidence"): "incidence_total",
        ("relative", "incidence"): "incidence_rate",
        ("absolute", "mortality"): "mortality_total",
        ("relative", "mortality"): "mortality_rate",
    }.get((selected_type, selected_value), "incidence_total")

    def total_for(year):
        # Sum of the chosen metric over every row of the given year (0 if none).
        return tb_data.loc[tb_data["year"] == year, y_column].sum()

    global_stat = total_for(selected_year)

    def pct_change_from(neighbour_year):
        baseline = total_for(neighbour_year)
        if not baseline:
            # Avoid dividing by zero when the neighbouring year has no data.
            return None
        return round(((global_stat - baseline) / baseline) * 100, 1)

    # Compare the integer year itself: the earlier version compared a pd.Timestamp
    # against 2000 / 2022, which is never equal, so the boundary branches never ran.
    diff_previous = None if selected_year == 2000 else pct_change_from(selected_year - 1)
    diff_next = None if selected_year == 2022 else pct_change_from(selected_year + 1)

    return global_stat, diff_previous, diff_next
