In [16]:
import pandas as pd
import numpy as np

In [17]:
# Read data
#
# All three inputs are read from the same prepared-data folder. The folder is
# kept in a single constant so the location only has to be edited in one place
# when the notebook is run on another machine.
DATA_DIR = "/Users/yangshining/Desktop/pds2021-opioids-pds6/10_modified_data"

# opioid
opioid = pd.read_parquet(f"{DATA_DIR}/opi_merge_final.parquet", engine='fastparquet')

# vital deaths
vital = pd.read_parquet(f"{DATA_DIR}/Vital.parquet", engine='fastparquet')

# population
population = pd.read_parquet(f"{DATA_DIR}/pop_final.parquet", engine='fastparquet')

In [18]:
# Row counts of the three input tables, reported in one summary line

len_opi, len_vital, len_pop = (len(table) for table in (opioid, vital, population))

print(
    f"Opioid dataset has {len_opi} rows; "
    f"vital deaths dataset has {len_vital} rows; "
    f"population dataset has {len_pop} rows."
)

Opioid dataset has 8736 rows; vital deaths dataset has 7925 rows; population dataset has 31910 rows.


In [19]:
# Preview the vital deaths table (columns: County, County Code, Year, Deaths)
vital

Unnamed: 0,County,County Code,Year,Deaths
0,"Acadia Parish, LA",22001,2003,11
1,"Acadia Parish, LA",22001,2005,23
2,"Acadia Parish, LA",22001,2006,19
3,"Acadia Parish, LA",22001,2007,19
4,"Acadia Parish, LA",22001,2009,11
...,...,...,...,...
7920,"Yuma County, AZ",4027,2011,20
7921,"Yuma County, AZ",4027,2012,32
7922,"Yuma County, AZ",4027,2013,22
7923,"Yuma County, AZ",4027,2014,35


In [20]:
# Peek at 10 randomly chosen population rows. The seed is fixed so that a
# Restart & Run All displays the same rows every time.
population.sample(10, random_state=0)

Unnamed: 0,STATE,COUNTY,STNAME,CTYNAME,Year,Population,STATE1,COUNTY1,FIPS
18661,48,283,Texas,La Salle County,2011,7002,48,283,48283
31627,51,89,Virginia,Henry County,2015,51940,51,89,51089
733,18,43,Indiana,Floyd County,2006,72358,18,43,18043
11298,31,89,Nebraska,Holt County,2009,10406,31,89,31089
22547,6,43,California,Mariposa County,2013,17809,6,43,6043
28904,5,145,Arkansas,White County,2015,78915,5,145,5145
22212,54,55,West Virginia,Mercer County,2012,62419,54,55,54055
1520,29,25,Missouri,Caldwell County,2006,9343,29,25,29025
28874,5,85,Arkansas,Lonoke County,2015,71359,5,85,5085
13433,17,121,Illinois,Marion County,2010,39437,17,121,17121


## Merge opioid dataset with population

In [21]:
# Modify data types of columns - prepare for merging

population["Year"] = population["Year"].astype(int)

# County FIPS codes are compared as 5-character strings with a leading zero.
# Pad BOTH join keys the same way: padding only the opioid side would make any
# code below 10000 fail to match, because the population FIPS appears without
# the leading zero in the sample output (e.g. 6043, 5145).
opioid["fips"] = opioid["fips"].astype(str).str.zfill(5)
population["FIPS"] = population["FIPS"].astype(str).str.zfill(5)

# Left join: every opioid row is kept, whether or not a population row matches
merge1 = pd.merge(opioid, population, left_on=["fips", "year"], right_on=["FIPS", "Year"], how='left') #Merging based on opioid dataset

# Check if there's any missing value in population
#
# NOTE(review): in the saved output the only unmatched county is LA SALLE, LA
# (fips 22059). The original note says La Salle is not considered a county in
# Louisiana - confirm whether the population table simply has no row for this
# FIPS code.
merge1[merge1["Population"].isnull()]


Unnamed: 0,county_name,state_abbr,fips,BUYER_STATE,BUYER_COUNTY,year,MME,STATE,COUNTY,STNAME,CTYNAME,Year,Population,STATE1,COUNTY1,FIPS
3171,LA SALLE,LA,22059,LA,LA SALLE,2006,3570544.0,,,,,,,,,
3172,LA SALLE,LA,22059,LA,LA SALLE,2007,4503112.0,,,,,,,,,
3173,LA SALLE,LA,22059,LA,LA SALLE,2008,4978754.0,,,,,,,,,
3174,LA SALLE,LA,22059,LA,LA SALLE,2009,5705937.0,,,,,,,,,
3175,LA SALLE,LA,22059,LA,LA SALLE,2010,6206684.0,,,,,,,,,
3176,LA SALLE,LA,22059,LA,LA SALLE,2011,6813357.0,,,,,,,,,
3177,LA SALLE,LA,22059,LA,LA SALLE,2012,7597359.0,,,,,,,,,
3178,LA SALLE,LA,22059,LA,LA SALLE,2013,7459239.0,,,,,,,,,
3179,LA SALLE,LA,22059,LA,LA SALLE,2014,8371265.0,,,,,,,,,


## Generate MME per cap

In [48]:
# MME divided by the matched county-year population; rows without a population
# match end up as NaN
merge1["MME per cap"] = merge1["MME"].div(merge1["Population"])

# Store data in parquet

merge1_path = "/Users/yangshining/Desktop/plotting_test/merge1.parquet"
merge1.to_parquet(merge1_path, engine='fastparquet')

## Merge vital deaths dataset with population

In [37]:
# Look at the vital deaths table again before merging it with population.
# NOTE(review): the saved output already shows zero-padded codes such as 04027,
# although the padding code appears in a later cell - presumably the cells were
# run out of order; confirm with Restart & Run All.
vital

Unnamed: 0,County,County Code,Year,Deaths
0,"Acadia Parish, LA",22001,2003,11
1,"Acadia Parish, LA",22001,2005,23
2,"Acadia Parish, LA",22001,2006,19
3,"Acadia Parish, LA",22001,2007,19
4,"Acadia Parish, LA",22001,2009,11
...,...,...,...,...
7920,"Yuma County, AZ",04027,2011,20
7921,"Yuma County, AZ",04027,2012,32
7922,"Yuma County, AZ",04027,2013,22
7923,"Yuma County, AZ",04027,2014,35


In [45]:
# Number of population rows per year, i.e. which years the population table covers
population["Year"].value_counts()

2006    3191
2007    3191
2008    3191
2009    3191
2010    3191
2011    3191
2012    3191
2013    3191
2014    3191
2015    3191
Name: Year, dtype: int64

In [43]:
# County codes must be 5-character, zero-padded strings on BOTH sides of the
# join. The population FIPS appears without the leading zero in the sample
# output (e.g. 6043), so padding only the vital side would leave codes such as
# 04027 without a match.
vital["County Code"] = vital["County Code"].astype(str).str.zfill(5)
population["FIPS"] = population["FIPS"].astype(str).str.zfill(5)

# Left join: every vital deaths row is kept, whether or not a population row matches
merge2= pd.merge(vital, population, left_on=["County Code", "Year"], right_on=["FIPS", "Year"], how='left') #Merging based on vital deaths dataset

# Rows with no population match. The population table only has rows for
# 2006-2015, so vital rows from earlier years are expected to show up here.
merge2[merge2["Population"].isnull()]

Unnamed: 0,County,County Code,Year,Deaths,STATE,COUNTY,STNAME,CTYNAME,Population,STATE1,COUNTY1,FIPS
0,"Acadia Parish, LA",22001,2003,11,,,,,,,,
1,"Acadia Parish, LA",22001,2005,23,,,,,,,,
7,"Ada County, ID",16001,2003,17,,,,,,,,
8,"Ada County, ID",16001,2004,21,,,,,,,,
9,"Ada County, ID",16001,2005,20,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
7901,"York County, PA",42133,2005,33,,,,,,,,
7912,"York County, SC",45091,2003,10,,,,,,,,
7913,"York County, SC",45091,2004,11,,,,,,,,
7914,"York County, SC",45091,2005,23,,,,,,,,


## Generate vital deaths per cap

In [None]:
# Deaths divided by the matched county-year population; rows without a
# population match end up as NaN
merge2["vital deaths per cap"] = merge2["Deaths"].div(merge2["Population"])