### Define all packages and modules needed for the notebook.

In [1]:
import pandas as pd

### Read in raw datasets

In [2]:
# Locations of the raw input CSVs.
#   impacts : "Distribution of COVID-19 deaths and populations, by jurisdiction,
#             age, and race and Hispanic origin"
#   janssen / moderna / pfizer : "COVID-19 Vaccine Distribution Allocations by
#             Jurisdiction", one file per vaccine
impacts_file = '../Data/Input/raw_covid_impacts.csv'
janssen_file = '../Data/Input/raw_janssen.csv'
moderna_file = '../Data/Input/raw_moderna.csv'
pfizer_file = '../Data/Input/raw_pfizer.csv'

# Load each raw file into its own dataframe, using pandas' default parsing
impacts_df = pd.read_csv(impacts_file)
janssen_df = pd.read_csv(janssen_file)
moderna_df = pd.read_csv(moderna_file)
pfizer_df = pd.read_csv(pfizer_file)

## Cleaning the COVID-19 Impacts dataframe

### Remove 'United States' data shown in the 'State' field
- Given the way we intend to integrate supplementary data, the 'United States' rows are an unusable total across all states
- Keeping them only obscures how the datasets fit together, and the total can still be queried if truly desired

In [3]:
# Keep every row whose 'State' value is anything other than 'United States'
impacts_df1 = impacts_df.loc[impacts_df["State"].ne("United States")]

### Remove 'All ages, unadjusted' and 'All ages, standardized' data shown in the 'AgeGroup' field
- The way we are intending to integrate supplementary data, these would complicate querying
- The raw unadjusted values can be queried if deemed necessary
- The standardized values can be added back if they prove particularly useful

In [4]:
# Filter out both 'All ages' categories in a single pass; rows with any other
# 'AgeGroup' value keep their original index labels
impacts_df2 = impacts_df1[
    ~impacts_df1["AgeGroup"].isin(["All ages, unadjusted", "All ages, standardized"])
]

### Drop the 'Data as of', 'Start Date', and 'End Date' columns
- As these columns contain the same data point, we can safely remove them
- We will specify the details of this dataframe in the README

In [5]:
# Discard the three date columns; all remaining columns are carried over as-is
impacts_df3 = impacts_df2.drop(
    ['Data as of', 'Start Date', 'End Date'],
    axis="columns",
)

### Rename columns to be more SQL friendly
- Remove all spaces, replace with underscores
- Limit use of any special characters
- Reduce length where appropriate

In [6]:
# Mapping of raw column header -> SQL-friendly name
# (lowercase, underscores instead of spaces, no special characters)
column_rename = {
    "State": "state",
    "Race/Hispanic origin": "race_hispanic",
    "Count of COVID-19 deaths": "death_count",
    "Distribution of COVID-19 deaths (%)": "death_percent",
    "Unweighted distribution of population (%)": "unweighted_population_percent",
    "Weighted distribution of population (%)": "weighted_population_percent",
    "Difference between COVID-19 and unweighted population %": "diff_death_unweighted_population_percent",
    "Difference between COVID-19 and weighted population %": "diff_death_weighted_population_percent",
    "AgeGroup": "age_group",
    "Suppression": "suppression",
}

In [7]:
# Apply the column_rename mapping to the column labels; labels that are not
# keys of the mapping are left untouched
impacts_df4 = impacts_df3.rename(column_rename, axis="columns")

### Reorder data to improve visibility
- Column priority will be the following: state, age_group, race_hispanic

In [8]:
# Order rows by state, then age group, then race/Hispanic origin
# (sort_values sorts ascending unless told otherwise)
impacts_df5 = impacts_df4.sort_values(
    by=[
        'state',
        'age_group',
        'race_hispanic',
    ]
)

### Reset index values
- Since we've removed rows and reordered data we'll have to reset the index
- This will make the migration into SQL much better

In [9]:
# Replace the index left over from the earlier filtering and sorting with a
# fresh one; drop=True discards the old index instead of keeping it as a column
impacts_df6 = impacts_df5.reset_index(drop=True)

### View sample of the finished dataframe

In [10]:
# Display up to the first 50 rows of the cleaned dataframe as the cell output
impacts_df6.head(50)

Unnamed: 0,state,race_hispanic,death_count,death_percent,unweighted_population_percent,weighted_population_percent,diff_death_unweighted_population_percent,diff_death_weighted_population_percent,age_group,suppression
0,Alabama,Hispanic,,,7.3,6.6,,,0-24 years,Suppressed (counts <10)
1,Alabama,Non-Hispanic American Indian or Alaska Native,0.0,0.0,0.5,0.3,-0.5,-0.3,0-24 years,
2,Alabama,Non-Hispanic Asian,0.0,0.0,1.5,1.9,-1.5,-1.9,0-24 years,
3,Alabama,Non-Hispanic Black,,,29.5,40.0,,,0-24 years,Suppressed (counts <10)
4,Alabama,Non-Hispanic Native Hawaiian or Other Pacific ...,0.0,0.0,0.1,0.1,-0.1,-0.1,0-24 years,
5,Alabama,Non-Hispanic White,,,58.1,48.4,,,0-24 years,Suppressed (counts <10)
6,Alabama,Other,0.0,0.0,3.0,2.8,-3.0,-2.8,0-24 years,
7,Alabama,Hispanic,,,5.0,4.4,,,25-34 years,Suppressed (counts <10)
8,Alabama,Non-Hispanic American Indian or Alaska Native,0.0,0.0,0.6,0.4,-0.6,-0.4,25-34 years,
9,Alabama,Non-Hispanic Asian,0.0,0.0,1.8,2.3,-1.8,-2.3,25-34 years,
