# Re-clean the overdose deaths data

In [1]:
import pandas as pd 
import numpy as np

In [2]:
# import deaths data
deaths = pd.read_parquet("/Users/yangshining/Desktop/pds2021-opioids-pds6/10_modified_data/Vital.parquet", engine='fastparquet')

In [3]:
# missing value are set to zero previously
deaths[deaths["Deaths"]==0]

Unnamed: 0,County,County Code,Year,Deaths
385,"Bedford city, VA",51515,2015,0
1496,"Clifton Forge city, VA",51560,2015,0
5724,"Prince of Wales-Outer Ketchikan Census Area, AK",2201,2015,0
6547,"Skagway-Hoonah-Angoon Census Area, AK",2232,2015,0
7804,"Wrangell-Petersburg Census Area, AK",2280,2015,0


In [4]:
# import population data
population = pd.read_parquet("/Users/yangshining/Desktop/pds2021-opioids-pds6/10_modified_data/pop_final.parquet", engine='fastparquet')

population.sample(10)

#population["Year"].value_counts()

Unnamed: 0,STATE,COUNTY,STNAME,CTYNAME,Year,Population,STATE1,COUNTY1,fips
19973,19,45,Iowa,Clinton County,2009,49147,19,45,19045
28972,8,11,Colorado,Bent County,2012,5848,8,11,8011
35944,19,77,Iowa,Guthrie County,2014,10686,19,77,19077
3226,1,69,Alabama,Houston County,2004,92593,1,69,1069
30878,39,163,Ohio,Vinton County,2012,13221,39,163,39163
21152,37,167,North Carolina,Stanly County,2009,60496,37,167,37167
21071,37,5,North Carolina,Alleghany County,2009,11125,37,5,37005
16950,20,181,Kansas,Sherman County,2008,6017,20,181,20181
17860,36,91,New York,Saratoga County,2008,217282,36,91,36091
19385,6,101,California,Sutter County,2009,94366,6,101,6101


In [5]:
# create rows for La salle, LA


# create a list of Series

listofSeries = [
    pd.Series(
        [22, 59, "Louisiana", "La Salle Parish", 2003, 14356, "22", "059", "22059"],
        index=population.columns,
    ),
    pd.Series(
        [22, 59, "Louisiana", "La Salle Parish", 2004, 14366, "22", "059", "22059"],
        index=population.columns,
    ),
    pd.Series(
        [22, 59, "Louisiana", "La Salle Parish", 2005, 14313, "22", "059", "22059"],
        index=population.columns,
    ),
    pd.Series(
        [22, 59, "Louisiana", "La Salle Parish", 2006, 14519, "22", "059", "22059"],
        index=population.columns,
    ),
    pd.Series(
        [22, 59, "Louisiana", "La Salle Parish", 2007, 14570, "22", "059", "22059"],
        index=population.columns,
    ),
    pd.Series(
        [22, 59, "Louisiana", "La Salle Parish", 2008, 14667, "22", "059", "22059"],
        index=population.columns,
    ),
    pd.Series(
        [22, 59, "Louisiana", "La Salle Parish", 2009, 14717, "22", "059", "22059"],
        index=population.columns,
    ),
    pd.Series(
        [22, 59, "Louisiana", "La Salle Parish", 2010, 14908, "22", "059", "22059"],
        index=population.columns,
    ),
    pd.Series(
        [22, 59, "Louisiana", "La Salle Parish", 2011, 14941, "22", "059", "22059"],
        index=population.columns,
    ),
    pd.Series(
        [22, 59, "Louisiana", "La Salle Parish", 2012, 14862, "22", "059", "22059"],
        index=population.columns,
    ),
    pd.Series(
        [22, 59, "Louisiana", "La Salle Parish", 2013, 14821, "22", "059", "22059"],
        index=population.columns,
    ),
    pd.Series(
        [22, 59, "Louisiana", "La Salle Parish", 2014, 14875, "22", "059", "22059"],
        index=population.columns,
    ),
    pd.Series(
        [22, 59, "Louisiana", "La Salle Parish", 2015, 14979, "22", "059", "22059"],
        index=population.columns,
    )
]


# append these rows to population dataframe
population = population.append(listofSeries, ignore_index=True)


# check if the rows are correctly appended
population[
    (population["STNAME"] == "Louisiana")
    & (population["CTYNAME"] == "La Salle Parish")
]

Unnamed: 0,STATE,COUNTY,STNAME,CTYNAME,Year,Population,STATE1,COUNTY1,fips
41483,22,59,Louisiana,La Salle Parish,2003,14356,22,59,22059
41484,22,59,Louisiana,La Salle Parish,2004,14366,22,59,22059
41485,22,59,Louisiana,La Salle Parish,2005,14313,22,59,22059
41486,22,59,Louisiana,La Salle Parish,2006,14519,22,59,22059
41487,22,59,Louisiana,La Salle Parish,2007,14570,22,59,22059
41488,22,59,Louisiana,La Salle Parish,2008,14667,22,59,22059
41489,22,59,Louisiana,La Salle Parish,2009,14717,22,59,22059
41490,22,59,Louisiana,La Salle Parish,2010,14908,22,59,22059
41491,22,59,Louisiana,La Salle Parish,2011,14941,22,59,22059
41492,22,59,Louisiana,La Salle Parish,2012,14862,22,59,22059


## Merging

In [6]:
# Fix some fips inconsistencies in the dataset
# Fix Bedford, VA
population["fips"] = np.where((population['fips']== '51019') & (population['Year'] == 2015), '51515', population["fips"])

# Fix Clifton Forge, VA
deaths["County Code"] = np.where((deaths['County Code']== '51560') & (deaths['Year'] == 2015), '51005', deaths['County Code'])

# Drop Alaska
deaths = deaths[deaths["County"].str[-2:] != "AK"]

# Test if Alaska is correctly dropped
assert len(deaths[deaths["County"].str[-2:] == "AK"])==0

In [7]:
# Modify data types of columns - prepare for merging
deaths["County Code"] = deaths["County Code"].astype(str)
deaths["County Code"] = deaths["County Code"].apply(lambda x: x.zfill(5))


population["Year"] = population["Year"].astype(int)

# subset for population to keep only the states we need
state = [
    "Florida",
    "Georgia",
    "South Carolina",
    "Mississippi",
    "Alabama",
    "Texas",
    "Louisiana",
    "Oklahoma",
    "New Mexico",
    "Washington",
    "Montana",
    "Oregon",
    "Idaho",
]

pop2 = population[population["STNAME"].isin(state)]

# check for states

pop2["STNAME"].unique()


array(['Alabama', 'Florida', 'Georgia', 'Idaho', 'Louisiana',
       'Mississippi', 'Montana', 'New Mexico', 'Oklahoma', 'Oregon',
       'South Carolina', 'Texas', 'Washington'], dtype=object)

In [8]:
# outer merging
merge2= pd.merge(deaths, pop2, left_on=["County Code", "Year"], right_on=["fips", "Year"], how='outer') #Merging based on deaths deaths dataset

# keep states of our interests
merge2 = merge2[merge2["STNAME"].notnull()]

# keep columns that we need
merge2 = merge2[["STNAME", "CTYNAME", "Year", "Deaths", "Population", "fips"]]

# test for any missing deaths value
merge2[merge2["Deaths"].isnull()][["STNAME", "CTYNAME", "Year", "fips"]]

Unnamed: 0,STNAME,CTYNAME,Year,fips
7890,Alabama,Alabama,2003,01000
7891,Alabama,Autauga County,2003,01001
7892,Alabama,Barbour County,2003,01005
7893,Alabama,Bibb County,2003,01007
7894,Alabama,Blount County,2003,01009
...,...,...,...,...
18926,Louisiana,La Salle Parish,2011,22059
18927,Louisiana,La Salle Parish,2012,22059
18928,Louisiana,La Salle Parish,2013,22059
18929,Louisiana,La Salle Parish,2014,22059


In [9]:
# Generate deaths per capita

# generate deaths per capita
merge2["deaths_per_cap"] = merge2["Deaths"]/merge2["Population"]

# as we are missing some counties, we need to fill them with state-year level drug-related mortality rate:

# 1. accumulate total population of each state
merge2["state_pop_total"] = merge2.groupby(["STNAME", "Year"])["Population"].transform("sum")

# 2. accumulate total drug-related deaths of each state
merge2["state_deaths_total"] = merge2.groupby(["STNAME", "Year"])["Deaths"].transform("sum")

# 3. calculate state-year level drug-related mortality rate
merge2["state_mortality_rate"] = merge2["state_deaths_total"]/merge2["state_pop_total"]

# 4. fill the missing value in Deaths with the state-year level mortality rate
merge2["deaths_per_cap"] = np.where(merge2["deaths_per_cap"].isnull(), merge2["state_mortality_rate"], merge2["deaths_per_cap"])

# test if there's any missing value for deaths_per_cap
merge2[merge2["deaths_per_cap"].isnull()]

Unnamed: 0,STNAME,CTYNAME,Year,Deaths,Population,fips,deaths_per_cap,state_pop_total,state_deaths_total,state_mortality_rate


In [10]:
# test for the number of obs for each year
merge2["Year"].value_counts()

2003    1037
2005    1037
2006    1037
2007    1037
2009    1037
2010    1037
2011    1037
2004    1037
2008    1037
2012    1037
2013    1037
2014    1037
2015    1037
Name: Year, dtype: int64

In [11]:
merge2["deaths_per_cap"].describe()

count    13481.000000
mean         0.000056
std          0.000055
min          0.000000
25%          0.000029
50%          0.000038
75%          0.000061
max          0.000981
Name: deaths_per_cap, dtype: float64

In [12]:
# store the data
merge2.to_parquet("/Users/yangshining/Desktop/pds2021-opioids-pds6/15_re-modified_data/deaths_pop.parquet", engine='fastparquet')