# Merging opioid&population and vital deaths&population

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read data 

# opioid
opioid = pd.read_parquet("/Users/yangshining/Desktop/pds2021-opioids-pds6/10_modified_data/opi_merge_final.parquet", engine='fastparquet')

# vital deaths
vital = pd.read_parquet("/Users/yangshining/Desktop/pds2021-opioids-pds6/10_modified_data/Vital.parquet", engine='fastparquet')

# population
population = pd.read_parquet("/Users/yangshining/Desktop/pds2021-opioids-pds6/10_modified_data/pop_final.parquet", engine='fastparquet')

In [3]:
# check for years in population data
population["Year"].value_counts()

2003    3191
2004    3191
2005    3191
2006    3191
2007    3191
2008    3191
2009    3191
2010    3191
2011    3191
2012    3191
2013    3191
2014    3191
2015    3191
Name: Year, dtype: int64

In [4]:
# Check for number of rows in each dataset

len_opi = len(opioid)
len_vital = len(vital)
len_pop = len(population)

print(f"Opioid dataset has {len_opi} rows; vital deaths dataset has {len_vital} rows; population dataset has {len_pop} rows.")

Opioid dataset has 8736 rows; vital deaths dataset has 7925 rows; population dataset has 41483 rows.


In [5]:
vital

Unnamed: 0,County,County Code,Year,Deaths
0,"Acadia Parish, LA",22001,2003,11
1,"Acadia Parish, LA",22001,2005,23
2,"Acadia Parish, LA",22001,2006,19
3,"Acadia Parish, LA",22001,2007,19
4,"Acadia Parish, LA",22001,2009,11
...,...,...,...,...
7920,"Yuma County, AZ",4027,2011,20
7921,"Yuma County, AZ",4027,2012,32
7922,"Yuma County, AZ",4027,2013,22
7923,"Yuma County, AZ",4027,2014,35


In [6]:
population.sample(10)

Unnamed: 0,STATE,COUNTY,STNAME,CTYNAME,Year,Population,STATE1,COUNTY1,fips
11809,40,139,Oklahoma,Texas County,2006,19361,40,139,40139
40463,40,9,Oklahoma,Beckham County,2015,23554,40,9,40009
14558,33,3,New Hampshire,Carroll County,2007,47829,33,3,33003
24422,39,15,Ohio,Brown County,2010,44846,39,15,39015
25678,5,75,Arkansas,Lawrence County,2011,17282,5,75,5075
26292,18,105,Indiana,Monroe County,2011,140233,18,105,18105
4935,31,127,Nebraska,Nemaha County,2004,7182,31,127,31127
24561,40,115,Oklahoma,Ottawa County,2010,31848,40,115,40115
3607,13,37,Georgia,Calhoun County,2004,6306,13,37,13037
28622,55,0,Wisconsin,Wisconsin,2011,5705288,55,0,55000


## Merge opioid dataset with population

In [7]:
# Change the population FIPS code to strings
population["fips"] = population["fips"].astype(str)

In [8]:
# Modify data types of columns - prepare for merging

population["Year"] = population["Year"].astype(int)

opioid["fips"] = opioid["fips"].astype(str)

opioid["fips"] = opioid["fips"].apply(lambda x: x.zfill(5)) # fill fips code up to 5 digits

merge1 = pd.merge(opioid, population, left_on=["fips", "year"], right_on=["fips", "Year"], how='left') #Merging based on opioid dataset

merge1

# Check if there's any missing value in population

merge1[merge1["Population"].isnull()]  # La Salle is NOT considered to be a county in Louisiana


Unnamed: 0,county_name,state_abbr,fips,BUYER_STATE,BUYER_COUNTY,year,MME,STATE,COUNTY,STNAME,CTYNAME,Year,Population,STATE1,COUNTY1
3171,LA SALLE,LA,22059,LA,LA SALLE,2006,3570544.0,,,,,,,,
3172,LA SALLE,LA,22059,LA,LA SALLE,2007,4503112.0,,,,,,,,
3173,LA SALLE,LA,22059,LA,LA SALLE,2008,4978754.0,,,,,,,,
3174,LA SALLE,LA,22059,LA,LA SALLE,2009,5705937.0,,,,,,,,
3175,LA SALLE,LA,22059,LA,LA SALLE,2010,6206684.0,,,,,,,,
3176,LA SALLE,LA,22059,LA,LA SALLE,2011,6813357.0,,,,,,,,
3177,LA SALLE,LA,22059,LA,LA SALLE,2012,7597359.0,,,,,,,,
3178,LA SALLE,LA,22059,LA,LA SALLE,2013,7459239.0,,,,,,,,
3179,LA SALLE,LA,22059,LA,LA SALLE,2014,8371265.0,,,,,,,,


In [9]:

# Drop La Salle in LA
merge1 = merge1[merge1["fips"] != "22059"]

# Check if there's any missing value in population after dropping
merge1[merge1["Population"].isnull()] 


Unnamed: 0,county_name,state_abbr,fips,BUYER_STATE,BUYER_COUNTY,year,MME,STATE,COUNTY,STNAME,CTYNAME,Year,Population,STATE1,COUNTY1


## Generate additional columns

In [10]:
# MME per cap
merge1["MME per cap"] = merge1["MME"]/merge1["Population"]

# Add indicator
merge1["treatment"] = np.where(merge1["BUYER_STATE"]=="FL", 1, 0)


# Testing
#merge1[merge1["BUYER_STATE"]=="FL"]


In [11]:
# Subset the columns that we need
merge1 = merge1[["BUYER_STATE", "BUYER_COUNTY", "fips", "year", "MME", "Population", "MME per cap", "treatment"]]

# Save the data
merge1.to_parquet("/Users/yangshining/Desktop/pds2021-opioids-pds6/10_modified_data/opi_pop.parquet", engine='fastparquet')

## Merge vital deaths dataset with population

In [12]:
vital

Unnamed: 0,County,County Code,Year,Deaths
0,"Acadia Parish, LA",22001,2003,11
1,"Acadia Parish, LA",22001,2005,23
2,"Acadia Parish, LA",22001,2006,19
3,"Acadia Parish, LA",22001,2007,19
4,"Acadia Parish, LA",22001,2009,11
...,...,...,...,...
7920,"Yuma County, AZ",4027,2011,20
7921,"Yuma County, AZ",4027,2012,32
7922,"Yuma County, AZ",4027,2013,22
7923,"Yuma County, AZ",4027,2014,35


In [13]:
# Fix some fips inconsistencies in the dataset
# Fix Bedford, VA
population["fips"] = np.where((population['fips']== '51019') & (population['Year'] == 2015), '51515', population["fips"])

# Fix Clifton Forge, VA
vital["County Code"] = np.where((vital['County Code']== '51560') & (vital['Year'] == 2015), '51005', vital['County Code'])

# Drop Alaska
vital = vital[vital["County"].str[-2:] != "AK"]

# Test if Alaska is correctly dropped
assert len(vital[vital["County"].str[-2:] == "AK"])==0

In [14]:
#population[population["fips"]=="51005"]

In [15]:
# Modify data types of columns - prepare for merging
vital["County Code"] = vital["County Code"].astype(str)
vital["County Code"] = vital["County Code"].apply(lambda x: x.zfill(5))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vital["County Code"] = vital["County Code"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vital["County Code"] = vital["County Code"].apply(lambda x: x.zfill(5))


In [16]:
# Merging
merge2= pd.merge(vital, population, left_on=["County Code", "Year"], right_on=["fips", "Year"], how='left') #Merging based on vital deaths dataset


# testing for any missing population value
merge2[merge2["Population"].isnull()]

Unnamed: 0,County,County Code,Year,Deaths,STATE,COUNTY,STNAME,CTYNAME,Population,STATE1,COUNTY1,fips
1483,"Clifton Forge city, VA",51560,2015,0,,,,,,,,


## Generate vital deaths per cap

In [17]:
# vital deaths per cap
merge2["vital deaths per cap"] = merge2["Deaths"]/ merge2["Population"]

# Add indicator
merge2["treatment"] = np.where(((merge2["STNAME"]=="Florida" )| (merge2["STNAME"]=="Texas") | (merge2["STNAME"]=="Washington")), 1, 0)

#merge2[merge2["STNAME"]=="Texas"]

merge2["Year"].value_counts()



2015    794
2014    749
2013    704
2011    693
2012    680
2010    652
2009    596
2008    596
2007    571
2006    528
2005    480
2004    452
2003    408
Name: Year, dtype: int64

In [18]:
# keep useful columns
merge2 = merge2[["STNAME", "CTYNAME", "fips", "Year", "Deaths", "vital deaths per cap", "treatment"]]

# save the data
merge2.to_parquet("/Users/yangshining/Desktop/pds2021-opioids-pds6/10_modified_data/vital_pop.parquet", engine='fastparquet')