#### Notes

This script includes

- Selecting countries
- Labelling data
- Checking data types

**Don't run if raw data not existed**

##### See **README.md** for data sources

In [1]:
library(tidyverse)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.1     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
countries <- c("India", "South Africa", "Kenya", "Brazil")

# Risk profile

In [3]:
raw <- read_csv("data_raw/Table-6.3.2.csv")
head(raw)

[1mRows: [22m[34m33[39m [1mColumns: [22m[34m9[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (5): Country, Und_all_a, HIV_1549_a, Smoking_15_f, Smoking_15_m
[32mdbl[39m (4): Dia_18_f, Dia-18_m, Alc_15_f, Alc_15_m

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Country,Und_all_a,HIV_1549_a,Smoking_15_f,Smoking_15_m,Dia_18_f,Dia-18_m,Alc_15_f,Alc_15_m
<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
Angola,19,1.9,—,—,7.8,8.5,1.7,11.0
Bangladesh,13,—,1,41,9.3,10.0,0.3,1.4
Brazil,2.5,0.5,9.5,17,8.7,7.8,1.6,6.9
Cambodia,14,0.5,2,32,6.9,7.4,1.8,8.7
Central African Republic,—,3.5,—,—,7.6,8.0,0.9,6.8
China,2.5,—,1.8,48,7.6,9.9,0.2,8.4


In [4]:
risk <- raw %>% 
    filter(Country %in% countries) %>%
    mutate(
        Und_all_a = as.numeric(Und_all_a),
        HIV_1549_a = as.numeric(HIV_1549_a),
        HIV_1549_a = ifelse(Country == "India", 0.2, HIV_1549_a),
        Smoking_15_f = as.numeric(Smoking_15_f),
        Smoking_15_m = as.numeric(Smoking_15_m),
        across(Und_all_a:Alc_15_m, function(x) x / 100)
    )
risk

[1m[22m[36mℹ[39m In argument: `HIV_1549_a = as.numeric(HIV_1549_a)`.
[33m![39m NAs introduced by coercion”


Country,Und_all_a,HIV_1549_a,Smoking_15_f,Smoking_15_m,Dia_18_f,Dia-18_m,Alc_15_f,Alc_15_m
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Brazil,0.025,0.005,0.095,0.17,0.087,0.078,0.016,0.069
India,0.14,0.002,0.014,0.22,0.083,0.091,0.005,0.091
Kenya,0.23,0.045,0.01,0.2,0.062,0.058,0.009,0.071
South Africa,0.057,0.19,0.071,0.34,0.13,0.097,0.018,0.12


In [5]:
risk %>% write_csv("data/gho_risk.csv")

### TB incidence disaggregated

In [6]:
raw <- read_csv("data_raw/TB_burden_age_sex_2023-03-02.csv")

head(raw)

[1mRows: [22m[34m7333[39m [1mColumns: [22m[34m13[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (9): country, iso2, iso3, iso_numeric, measure, unit, age_group, sex, ri...
[32mdbl[39m (4): year, best, lo, hi

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


country,iso2,iso3,iso_numeric,year,measure,unit,age_group,sex,risk_factor,best,lo,hi
<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
Afghanistan,AF,AFG,4,2021,inc,num,0-14,a,all,16000,8000,24000
Afghanistan,AF,AFG,4,2021,inc,num,0-14,f,all,7600,2300,13000
Afghanistan,AF,AFG,4,2021,inc,num,0-14,m,all,8200,2500,14000
Afghanistan,AF,AFG,4,2021,inc,num,0-4,f,all,3400,0,8600
Afghanistan,AF,AFG,4,2021,inc,num,0-4,m,all,4000,0,10000
Afghanistan,AF,AFG,4,2021,inc,num,15-24,f,all,7400,0,19000


In [7]:
inc_by_risk <- raw %>% 
    filter(country %in% countries) %>%
    rename(Country = country, M = best, L = lo, H = hi)

head(inc_by_risk)

Country,iso2,iso3,iso_numeric,year,measure,unit,age_group,sex,risk_factor,M,L,H
<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
Brazil,BR,BRA,76,2021,inc,num,0-14,a,all,5100,4300,5900
Brazil,BR,BRA,76,2021,inc,num,0-14,f,all,2500,2000,3000
Brazil,BR,BRA,76,2021,inc,num,0-14,m,all,2600,2100,3200
Brazil,BR,BRA,76,2021,inc,num,0-4,f,all,1100,570,1700
Brazil,BR,BRA,76,2021,inc,num,0-4,m,all,1300,680,2000
Brazil,BR,BRA,76,2021,inc,num,15-24,f,all,5900,3000,8800


In [8]:
inc_by_risk %>% write_csv("data/who_inc.csv")

### Population size

In [9]:
raw <- read_csv("data_raw/WPP2022.csv")

[1mRows: [22m[34m2072520[39m [1mColumns: [22m[34m20[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (7): Notes, ISO3_code, ISO2_code, LocTypeName, Location, Variant, AgeGrp
[32mdbl[39m (13): SortOrder, LocID, SDMX_code, LocTypeID, ParentID, VarID, Time, Mid...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [10]:
pop <- raw %>%
    filter(Location %in% countries) %>%
    filter(Time %in% 2020:2030) %>%
    filter(Variant == "Medium") %>%
    select(
        iso3 = ISO3_code, iso2 = ISO2_code, Country = Location, Year = Time,
        Age = AgeGrp, PopMale, PopFemale, PopTotal      
    ) %>%
    mutate(
        PopMale = PopMale * 1000,
        PopFemale = PopFemale * 1000,
        PopTotal = PopTotal * 1000
    )

head(pop)

iso3,iso2,Country,Year,Age,PopMale,PopFemale,PopTotal
<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>
KEN,KE,Kenya,2020,0,707808,697235,1405043
KEN,KE,Kenya,2020,1,695585,686771,1382357
KEN,KE,Kenya,2020,2,689356,681644,1371000
KEN,KE,Kenya,2020,3,687634,680136,1367771
KEN,KE,Kenya,2020,4,681460,675277,1356736
KEN,KE,Kenya,2020,5,670933,666410,1337343


In [11]:
pop %>% write_csv("data/wpp_pop.csv")