# HealthData: Top 10 Causes of Death by State

We will begin by importing our datasets and loading each one into a dataframe.

The COVID-19 deaths dataset was cleaned for a previous project, so we can reuse the cleaned file and combine it with our top 10 causes dataset.

In [1]:
import pandas as pd
import datetime as dt
import numpy as np

# Input files: the raw NCHS leading-causes table and the COVID-19 deaths file
# that was prepared in an earlier project.
nchs_path = r"C:\Users\Basil\Documents\Data Science\Projects\20200506 Coronavirus\1. Original Data\NCHS_-_Leading_Causes_of_Death__United_States.csv"
covid_path = r"C:\Users\Basil\Documents\Data Science\Projects\20200506 Coronavirus\2. Prepared Data\covid19_deaths.csv"

df = pd.read_csv(nchs_path)
df2 = pd.read_csv(covid_path)

# `path` keeps pointing at the last file loaded, exactly as before.
path = covid_path

In [2]:
# Peek at the raw NCHS table (head() shows the first five rows by default).
df.head()

Unnamed: 0,Year,113 Cause Name,Cause Name,State,Deaths,Age-adjusted Death Rate
0,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,United States,169936,49.4
1,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Alabama,2703,53.8
2,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Alaska,436,63.7
3,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Arizona,4184,56.2
4,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Arkansas,1625,51.8


First, we will remove the "United States" total rows that appear in the State column.

In [3]:
# Keep only the rows whose State is not the 'United States' aggregate.
is_national_row = df["State"] == 'United States'
clean_df = df[~is_national_row]

In [4]:
# Distinct cause categories, in order of first appearance.
pd.unique(clean_df['Cause Name'])

array(['Unintentional injuries', 'All causes', "Alzheimer's disease",
       'Stroke', 'CLRD', 'Diabetes', 'Heart disease',
       'Influenza and pneumonia', 'Suicide', 'Cancer', 'Kidney disease'],
      dtype=object)

There appears to be an "All causes" category. Because Tableau will compute totals automatically, we can remove these rows.

In [5]:
# First five rows after filtering on State.
clean_df.head()

Unnamed: 0,Year,113 Cause Name,Cause Name,State,Deaths,Age-adjusted Death Rate
1,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Alabama,2703,53.8
2,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Alaska,436,63.7
3,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Arizona,4184,56.2
4,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,Arkansas,1625,51.8
5,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,California,13840,33.2


In [6]:
# Drop the 'All causes' rows so only individual causes remain.
is_all_causes = clean_df["Cause Name"] == 'All causes'
clean_df = clean_df[~is_all_causes]

Next, we will remove all years except 2017.

In [7]:
# Restrict the table to rows whose Year is 2017.
clean_df = clean_df.loc[clean_df["Year"] == 2017]

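The COVID-19 deaths dataset identifies states by their two-letter abbreviation, so we replace the full state names in the State column with abbreviations to make the two datasets line up.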

In [8]:
# Full state name -> two-letter abbreviation (50 states plus the District of Columbia).
state_abbreviations = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "District of Columbia": "DC",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
}

# Only the State column is rewritten; values absent from the mapping are left as they are.
clean_df = clean_df.replace({"State": state_abbreviations})

In [9]:
# Check that State now holds abbreviations.
clean_df.head()

Unnamed: 0,Year,113 Cause Name,Cause Name,State,Deaths,Age-adjusted Death Rate
1,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,AL,2703,53.8
2,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,AK,436,63.7
3,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,AZ,4184,56.2
4,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,AR,1625,51.8
5,2017,"Accidents (unintentional injuries) (V01-X59,Y8...",Unintentional injuries,CA,13840,33.2


We don't need the following columns:

* 113 Cause Name
* Age-adjusted Death Rate

In [10]:
# Remove the two columns that are not used further in this notebook.
unused_columns = ['113 Cause Name', 'Age-adjusted Death Rate']
clean_df = clean_df.drop(columns=unused_columns)

In [11]:
# Remaining columns after the drop.
clean_df.head()

Unnamed: 0,Year,Cause Name,State,Deaths
1,2017,Unintentional injuries,AL,2703
2,2017,Unintentional injuries,AK,436
3,2017,Unintentional injuries,AZ,4184
4,2017,Unintentional injuries,AR,1625
5,2017,Unintentional injuries,CA,13840


We will want to be able to view the top 10 categories by number of deaths per month. Since we do not have this level of detail we will want to show average deaths per month.

We will begin by creating columns for each month with the total deaths/12 as the value for each. We will then melt those monthly categories and create a monthly average deaths column with those values.

In [12]:
# One column per calendar month, each holding the annual total divided by 12
# (the source has no monthly breakdown, so every month gets the same average).
month_names = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
for month_name in month_names:
    clean_df[month_name] = clean_df['Deaths'] / 12

In [13]:
# Go from wide (one column per month) to long (one row per month): every column
# not listed as an identifier is unpivoted into Month / Monthly Deaths.
# The annual Deaths total is only needed as an identifier during the melt and is
# dropped afterwards.
id_columns = ["Year", "Cause Name", "State", "Deaths"]
clean_df = (
    clean_df
    .melt(id_vars=id_columns, var_name="Month", value_name="Monthly Deaths")
    .drop(columns="Deaths")
)

In [14]:
# Long-format result: one row per cause, state and month.
clean_df.head()

Unnamed: 0,Year,Cause Name,State,Month,Monthly Deaths
0,2017,Unintentional injuries,AL,January,225.25
1,2017,Unintentional injuries,AK,January,36.333333
2,2017,Unintentional injuries,AZ,January,348.666667
3,2017,Unintentional injuries,AR,January,135.416667
4,2017,Unintentional injuries,CA,January,1153.333333


We will now take a look at our COVID-19 data and prep it for merging with our averages file.

In [15]:
# Peek at the COVID-19 deaths table loaded at the top of the notebook.
df2.head()

Unnamed: 0.1,Unnamed: 0,countyFIPS,County Name,State,stateFIPS,Date,Deaths
0,0,0,Statewide Unallocated,AL,1,1/22/20,0
1,1,1001,Autauga County,AL,1,1/22/20,0
2,2,1003,Baldwin County,AL,1,1/22/20,0
3,3,1005,Barbour County,AL,1,1/22/20,0
4,4,1007,Bibb County,AL,1,1/22/20,0


We will first aggregate the deaths by state and date. Then we will create year and month columns and aggregate the deaths by state, year and month.

In [16]:
# Collapse the county rows: one Deaths total per (State, Date).
df3 = (
    df2
    .groupby(['State', 'Date'])['Deaths']
    .sum()
    .reset_index()
)

In [17]:
# Parse the date strings once, then derive the month name and year from the result.
parsed_dates = pd.to_datetime(df3['Date'])
df3['Date'] = parsed_dates
df3['Month'] = parsed_dates.dt.month_name()
df3['Year'] = parsed_dates.dt.year

In [18]:
# Sum the per-date totals into one figure per (State, Month, Year).
# NOTE(review): this adds up every date's Deaths value within the month. If the
# COVID file stores a running (cumulative) count per date rather than new deaths
# per day, this sum overstates the monthly figure — confirm against the source
# file; if it is cumulative, aggregate from day-over-day differences instead.
df3 = (
    df3
    .groupby(['State', 'Month', 'Year'])['Deaths']
    .sum()
    .rename('Monthly Deaths')
    .reset_index()
)

In [19]:
# Monthly totals per state.
df3.head()

Unnamed: 0,State,Month,Year,Monthly Deaths
0,AK,April,2020,232
1,AK,February,2020,0
2,AK,January,2020,0
3,AK,March,2020,26
4,AK,May,2020,285


Now we will add a Cause Name column and populate it with "COVID-19".

In [20]:
# Label every row with the cause so it lines up with the Cause Name column of clean_df.
df3 = df3.assign(**{'Cause Name': "COVID-19"})

In [21]:
# Same rows, now with the Cause Name column.
df3.head()

Unnamed: 0,State,Month,Year,Monthly Deaths,Cause Name
0,AK,April,2020,232,COVID-19
1,AK,February,2020,0,COVID-19
2,AK,January,2020,0,COVID-19
3,AK,March,2020,26,COVID-19
4,AK,May,2020,285,COVID-19


In [22]:
# Reminder of the column layout of clean_df, for comparison with df3.
clean_df.head()

Unnamed: 0,Year,Cause Name,State,Month,Monthly Deaths
0,2017,Unintentional injuries,AL,January,225.25
1,2017,Unintentional injuries,AK,January,36.333333
2,2017,Unintentional injuries,AZ,January,348.666667
3,2017,Unintentional injuries,AR,January,135.416667
4,2017,Unintentional injuries,CA,January,1153.333333


In [23]:
# Stack the two tables row-wise. sort=True orders the columns alphabetically and
# ignore_index=True renumbers the rows from 0.
frames_to_stack = [clean_df, df3]
combined_data = pd.concat(frames_to_stack, sort=True, ignore_index=True)

In [24]:
# Top of the combined table.
combined_data.head()

Unnamed: 0,Cause Name,Month,Monthly Deaths,State,Year
0,Unintentional injuries,January,225.25,AL,2017
1,Unintentional injuries,January,36.333333,AK,2017
2,Unintentional injuries,January,348.666667,AZ,2017
3,Unintentional injuries,January,135.416667,AR,2017
4,Unintentional injuries,January,1153.333333,CA,2017


In [25]:
# Bottom of the combined table (tail() shows the last five rows by default).
combined_data.tail()

Unnamed: 0,Cause Name,Month,Monthly Deaths,State,Year
6370,COVID-19,April,79.0,WY,2020
6371,COVID-19,February,0.0,WY,2020
6372,COVID-19,January,0.0,WY,2020
6373,COVID-19,March,0.0,WY,2020
6374,COVID-19,May,298.0,WY,2020


Now that the data is clean, we will save the file.



In [26]:
# Destination for the combined dataset.
save_path = r"C:\Users\Basil\Documents\Data Science\Projects\20200506 Coronavirus\2. Prepared Data\Top 10 Deaths by State.csv"

# index=False: the row index here is just the 0..n-1 numbering produced by
# concat(ignore_index=True). Writing it would add an unnamed first column that
# comes back as "Unnamed: 0" the next time the file is read with read_csv.
combined_data.to_csv(save_path, index=False)