In [2]:
# import libraries
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the WHO TB burden table (2023 incidence numbers + severity level), already
# enriched with external data and carrying the corrections made during EDA.
TB_burden_enriched = "df_TB_burden_enriched_new.csv"
df_TB_burden_enriched = pd.read_csv(filepath_or_buffer=TB_burden_enriched)
df_TB_burden_enriched

Unnamed: 0,country_name,iso2,iso3,iso_numeric,g_whoregion,year,e_pop_num,e_inc_num,e_tbhiv_prct,e_inc_tbhiv_num,...,targets_number,doses,BCG_coverage,population_density,MPI_value,pop_in_MP_percent,total_smokers_2022_percent,avg_air_pollution_PM2-5_in_2023,e_tb_inc_prct,tb_severity
0,Afghanistan,AF,AFG,4,EMR,2023,41454760,75000,0.03,19.0,...,2057002.0,1969540.0,95.75,63.8457,0.271721,55.910239,22.7,,0.180920,Moderate
1,Albania,AL,ALB,8,EUR,2023,2811661,430,1.10,4.0,...,22887.0,22703.0,99.20,102.6226,0.002748,0.703561,21.9,16.7,0.015293,Very Low
2,Algeria,DZ,DZA,12,AFR,2023,46164214,22000,0.83,180.0,...,936973.0,924725.0,98.69,19.3826,0.005409,1.380835,21.2,13.8,0.047656,Very Low
3,American Samoa,AS,ASM,16,WPR,2023,47521,2,0.37,0.0,...,,,,237.6050,,,,,0.004209,Very Low
4,Andorra,AD,AND,20,EUR,2023,80854,5,0.47,0.0,...,,,,172.0340,,,36.3,7.9,0.006184,Very Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,Viet Nam,VN,VNM,704,WPR,2023,100352197,182000,2.50,4600.0,...,1267834.0,1205427.0,95.08,320.1956,0.007729,1.919121,22.5,29.6,0.181361,Moderate
210,Wallis and Futuna,WF,WLF,876,WPR,2023,11377,0,0.15,0.0,...,,,,81.2179,,,,,0.000000,Very Low
211,Yemen,YE,YEM,887,EMR,2023,39390797,19000,0.72,140.0,...,1254398.0,802445.0,63.97,74.6080,0.245166,48.466429,21.4,,0.048235,Very Low
212,Zambia,ZM,ZMB,894,AFR,2023,20723959,59000,32.00,19000.0,...,877840.0,707320.0,80.58,27.8776,0.231685,47.906131,14.6,24.1,0.284695,High


In [6]:
# Per-column overview of the enriched frame: dtype, non-null / null counts,
# share of nulls in percent, and the number of distinct values reported by nunique().
# Every key is listed exactly once: a repeated key in a dict literal is silently
# overwritten by its last occurrence, which only hides copy-paste mistakes.
column_summary_df_TB_burden_enriched = pd.DataFrame({
    'Column Name': df_TB_burden_enriched.columns,
    'Data Type': df_TB_burden_enriched.dtypes.values,
    'Non-Null Count': df_TB_burden_enriched.notnull().sum().values,
    'Null Count': df_TB_burden_enriched.isnull().sum().values,
    'Percentage Null': df_TB_burden_enriched.isnull().mean().values * 100,
    'Unique Values': df_TB_burden_enriched.nunique().values
})

print(column_summary_df_TB_burden_enriched)

                        Column Name Data Type  Non-Null Count  Null Count  \
0                      country_name    object             214           0   
1                              iso2    object             213           1   
2                              iso3    object             214           0   
3                       iso_numeric     int64             214           0   
4                       g_whoregion    object             214           0   
5                              year     int64             214           0   
6                         e_pop_num     int64             214           0   
7                         e_inc_num     int64             214           0   
8                      e_tbhiv_prct   float64             214           0   
9                   e_inc_tbhiv_num   float64             214           0   
10             e_mort_exc_tbhiv_num   float64             213           1   
11                 e_mort_tbhiv_num   float64             211           3   

In [8]:
# Encode the tb_severity label as an integer grade (1 = "Very Low" ... 5 = "Critical")
# and store it in a new tb_severity_level column.
TB_severity_mapping = {"Very Low": 1, "Low": 2, "Moderate": 3, "High": 4, "Critical": 5}
severity_levels = df_TB_burden_enriched["tb_severity"].map(TB_severity_mapping)

# Series.map() converts any label that is absent from the mapping into NaN without
# complaint, so fail loudly on unknown labels (e.g. a typo or different casing).
# Rows whose label is itself missing are left as NaN, exactly as before.
unmapped_labels = df_TB_burden_enriched.loc[
    severity_levels.isna() & df_TB_burden_enriched["tb_severity"].notna(),
    "tb_severity",
].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unmapped tb_severity labels: {list(unmapped_labels)}")

df_TB_burden_enriched["tb_severity_level"] = severity_levels
df_TB_burden_enriched.head(50)

Unnamed: 0,country_name,iso2,iso3,iso_numeric,g_whoregion,year,e_pop_num,e_inc_num,e_tbhiv_prct,e_inc_tbhiv_num,...,doses,BCG_coverage,population_density,MPI_value,pop_in_MP_percent,total_smokers_2022_percent,avg_air_pollution_PM2-5_in_2023,e_tb_inc_prct,tb_severity,tb_severity_level
0,Afghanistan,AF,AFG,4,EMR,2023,41454760,75000,0.03,19.0,...,1969540.0,95.75,63.8457,0.271721,55.910239,22.7,,0.18092,Moderate,3
1,Albania,AL,ALB,8,EUR,2023,2811661,430,1.1,4.0,...,22703.0,99.2,102.6226,0.002748,0.703561,21.9,16.7,0.015293,Very Low,1
2,Algeria,DZ,DZA,12,AFR,2023,46164214,22000,0.83,180.0,...,924725.0,98.69,19.3826,0.005409,1.380835,21.2,13.8,0.047656,Very Low,1
3,American Samoa,AS,ASM,16,WPR,2023,47521,2,0.37,0.0,...,,,237.605,,,,,0.004209,Very Low,1
4,Andorra,AD,AND,20,EUR,2023,80854,5,0.47,0.0,...,,,172.034,,,36.3,7.9,0.006184,Very Low,1
5,Angola,AO,AGO,24,AFR,2023,36749909,125000,7.9,9800.0,...,1163980.0,95.52,29.4777,0.282435,51.104112,,7.8,0.340137,High,4
6,Anguilla,AI,AIA,660,AMR,2023,14413,3,8.2,0.0,...,167.0,109.87,163.75,,,,7.4,0.020815,Very Low,1
7,Antigua and Barbuda,AG,ATG,28,AMR,2023,93317,1,1.2,0.0,...,,,212.0807,,,,,0.001072,Very Low,1
8,Argentina,AR,ARG,32,AMR,2023,45538407,16000,6.1,980.0,...,339846.0,68.77,16.3114,0.001469,0.432323,23.8,9.2,0.035135,Very Low,1
9,Armenia,AM,ARM,51,EUR,2023,2943390,720,11.0,83.0,...,32287.0,98.54,103.4076,0.00069,0.190553,24.9,26.4,0.024462,Very Low,1


In [10]:
# Keep only the feature and target columns (13 columns in total) and
# print their dtypes and non-null counts.
df_TB_burden_selected = df_TB_burden_enriched[
    [
        "e_tbhiv_prct",
        "cfr_pct",
        "c_cdr",
        "e_rr_pct_new",
        "BCG_coverage",
        "population_density",
        "MPI_value",
        "pop_in_MP_percent",
        "total_smokers_2022_percent",
        "avg_air_pollution_PM2-5_in_2023",
        "e_tb_inc_prct",
        "tb_severity",
        "tb_severity_level",
    ]
]
df_TB_burden_selected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 13 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   e_tbhiv_prct                     214 non-null    float64
 1   cfr_pct                          196 non-null    float64
 2   c_cdr                            192 non-null    float64
 3   e_rr_pct_new                     214 non-null    float64
 4   BCG_coverage                     156 non-null    float64
 5   population_density               214 non-null    float64
 6   MPI_value                        109 non-null    float64
 7   pop_in_MP_percent                109 non-null    float64
 8   total_smokers_2022_percent       164 non-null    float64
 9   avg_air_pollution_PM2-5_in_2023  131 non-null    float64
 10  e_tb_inc_prct                    214 non-null    float64
 11  tb_severity                      214 non-null    object 
 12  tb_severity_level     

In [None]:
# Drop pop_in_MP_percent from the selection: 9 features selected, 1 dropped.
# This cell rebinds df_TB_burden_selected to its own result, so a plain drop()
# would raise KeyError on a second run (the column is already gone).
# errors="ignore" keeps the cell idempotent and safe to re-run.
df_TB_burden_selected = df_TB_burden_selected.drop(columns=["pop_in_MP_percent"], errors="ignore")
df_TB_burden_selected.info()