# DATA WRANGLING: GLOBAL BURDEN OF DISEASE DATASET

## Sid Gurajala
## Last Updated: 11/18/2024

### Read in Files/Import Libraries

In [3]:
import pandas as pd
import os 

# Anchor every path on the current working directory.
cwd = os.getcwd()
data_dir = os.path.join(cwd, "../../data/")

# Full path to the raw GBD export inside the data directory.
gbd_raw_file_path = os.path.join(
    data_dir,
    "raw/IHME-GBD_2021_DATA-5c7069d7-1.csv",
)

# Load the raw export into a DataFrame.
gbd_raw = pd.read_csv(filepath_or_buffer=gbd_raw_file_path)

### Filter Data to Number

GBD data contains several metric types, including the raw number, percentage, and rate. Here we home in on the rows whose metric is the raw number (`Number`).

In [2]:
# Keep only the rows whose "metric" column equals "Number".
gbd_filtered = gbd_raw.loc[gbd_raw["metric"] == "Number"]

### Aggregate by Year & Split Out Measure Types

GBD data presents separate entries per disease type per year. The dataset has been pre-filtered to chronic respiratory diseases, and here we just want the raw number summed over all of those disease types. Therefore we aggregate over diseases to produce a single number per year per measure (e.g. incidence, deaths).

In [3]:
# Sum "val" within each (year, measure) pair; as_index=False keeps the
# grouping keys as ordinary columns in the result.
gbd_aggregated = gbd_filtered.groupby(["year", "measure"], as_index=False)["val"].sum()

In [4]:
# One table per value of the "measure" column: incidence rows and death rows.
gbd_aggregated_incidence = gbd_aggregated.loc[gbd_aggregated["measure"].eq("Incidence")]
gbd_aggregated_death = gbd_aggregated.loc[gbd_aggregated["measure"].eq("Deaths")]

### Write to CSVs

Finally we write our wrangled data files out.

In [5]:
# Write each per-year table to <data_dir>/final/ as CSV, without the index column.
# to_csv does not create missing parent directories, so make sure the output
# directory exists first (a no-op when it is already there).
os.makedirs(os.path.join(data_dir, "final"), exist_ok=True)
gbd_aggregated_death.to_csv(
    os.path.join(data_dir, "final/gbd_aggregated_death_metric_per_year.csv"),
    index=False,
)
gbd_aggregated_incidence.to_csv(
    os.path.join(data_dir, "final/gbd_aggregated_incidence_metric_per_year.csv"),
    index=False,
)