In [1]:
import os

import numpy as np
import pandas as pd
from fastparquet import ParquetFile
#import addfips

In [2]:
# Load fips data
fips = pd.read_csv("https://raw.githubusercontent.com/kjhealy/fips-codes/master/county_fips_master.csv", encoding="ISO-8859-1")

# Normalise the county name in fips for better merging with the opioid data:
# drop a trailing " County" / " Parish" designation, then upper-case.
# The suffix is matched explicitly (instead of slicing off the last 7
# characters) so that names which do not end in one of those designations
# are kept whole rather than truncated.
fips['county_name'] = (
    fips['county_name']
    .str.replace(r"(?i)\s+(?:county|parish)\s*$", "", regex=True)
    .str.upper()
)

fips.head()

Unnamed: 0,fips,county_name,state_abbr,state_name,long_name,sumlev,region,division,state,county,crosswalk,region_name,division_name
0,1001,AUTAUGA,AL,Alabama,Autauga County AL,50.0,3.0,6.0,1.0,1.0,3-6-1-1,South,East South Central
1,1003,BALDWIN,AL,Alabama,Baldwin County AL,50.0,3.0,6.0,1.0,3.0,3-6-1-3,South,East South Central
2,1005,BARBOUR,AL,Alabama,Barbour County AL,50.0,3.0,6.0,1.0,5.0,3-6-1-5,South,East South Central
3,1007,BIBB,AL,Alabama,Bibb County AL,50.0,3.0,6.0,1.0,7.0,3-6-1-7,South,East South Central
4,1009,BLOUNT,AL,Alabama,Blount County AL,50.0,3.0,6.0,1.0,9.0,3-6-1-9,South,East South Central


In [3]:
# Check the county names inside a certain state (Don't run!)
#fips[fips["state_abbr"] == "FL"]["county_name"].unique()

In [6]:
# Load opioid data.
# The data directory defaults to the location used when this notebook was
# written; set the OPIOID_DATA_DIR environment variable to point at your own
# copy instead of editing this cell.
DATA_DIR = os.environ.get(
    "OPIOID_DATA_DIR",
    "/Users/yangshining/Desktop/pds2021-opioids-pds6/10_modified_data",
)

opi_data = pd.read_parquet(
    os.path.join(DATA_DIR, "opioid_whole_final.parquet"),
    engine='fastparquet',
)
opi_data.sample(10)

Unnamed: 0,BUYER_STATE,BUYER_COUNTY,year,MME
171,AL,COVINGTON,2006,11899530.0
1844,GA,HENRY,2013,74398640.0
7957,TX,SAN AUGUSTINE,2012,2537415.0
4373,MT,GARFIELD,2010,605.4
5143,OK,GARFIELD,2013,18181190.0
2080,GA,MONROE,2006,5680544.0
4870,NM,SANTA FE,2010,73876000.0
8655,WA,SNOHOMISH,2006,257298500.0
1963,GA,LIBERTY,2006,4391264.0
3660,MS,FORREST,2009,49040740.0


In [None]:
# Check the county names inside a certain state (Don't run!)
#opi_data[opi_data["BUYER_STATE"]=="FL"]["BUYER_COUNTY"].unique()

array(['ALACHUA', 'BAKER', 'BAY', 'BRADFORD', 'BREVARD', 'BROWARD',
       'CALHOUN', 'CHARLOTTE', 'CITRUS', 'CLAY', 'COLLIER', 'COLUMBIA',
       'DIXIE', 'DUVAL', 'ESCAMBIA', 'FLAGLER', 'FRANKLIN', 'GILCHRIST',
       'HENDRY', 'HERNANDO', 'HIGHLANDS', 'HILLSBOROUGH', 'HOLMES',
       'INDIAN RIVER', 'JACKSON', 'JEFFERSON', 'LAFAYETTE', 'LAKE', 'LEE',
       'LEON', 'LEVY', 'MANATEE', 'MARION', 'MARTIN', 'MIAMI-DADE',
       'MONROE', 'NASSAU', 'OKALOOSA', 'OKEECHOBEE', 'ORANGE', 'OSCEOLA',
       'PALM BEACH', 'PASCO', 'PINELLAS', 'POLK', 'PUTNAM', 'ST. JOHNS',
       'ST. LUCIE', 'SANTA ROSA', 'SARASOTA', 'SEMINOLE', 'SUMTER',
       'TAYLOR', 'VOLUSIA', 'WASHINGTON', 'HAMILTON', 'UNION', 'WALTON',
       'DE SOTO', 'GADSDEN', 'HARDEE', 'SUWANNEE', 'WAKULLA', 'MADISON',
       'GULF', 'LIBERTY', 'GLADES'], dtype=object)

In [7]:
# Harmonise county-name spellings so that the opioid data and the fips data
# use the same name for the same county.


def _swap_prefix(names, old_prefix, new_prefix):
    """Replace a leading `old_prefix` with `new_prefix`; other names are returned unchanged."""
    cut = len(old_prefix)
    return np.where(names.str[:cut] == old_prefix, new_prefix + names.str[cut:], names)


def _rename_exact(names, old_name, new_name):
    """Replace names equal to `old_name` with `new_name`; other names are returned unchanged."""
    return np.where(names == old_name, new_name, names)


# Opioid data: "ST <name>" (e.g. ST JOHN THE BAPTIST) and "SAINT <name>"
# are both rewritten to "ST. <name>".
for old_prefix in ("ST ", "SAINT "):
    opi_data['BUYER_COUNTY'] = _swap_prefix(opi_data["BUYER_COUNTY"], old_prefix, "ST. ")

# Opioid data: two-word "DE xxx" spellings are collapsed into one word.
for old_name, new_name in (
    ("DE SOTO", "DESOTO"),
    ("DE KALB", "DEKALB"),
    ("DE WITT", "DEWITT"),
):
    opi_data['BUYER_COUNTY'] = _rename_exact(opi_data["BUYER_COUNTY"], old_name, new_name)

# fips data: replace the "DOÐA ANA" spelling with plain "DONA ANA", and
# collapse "DE SOTO" the same way as in the opioid data.
for old_name, new_name in (
    ("DOÐA ANA", "DONA ANA"),
    ("DE SOTO", "DESOTO"),
):
    fips['county_name'] = _rename_exact(fips['county_name'], old_name, new_name)



In [10]:

# Subset the fips data - keep only the columns we need
fips_sub = fips[["county_name", "state_abbr", "fips"]]

# Right join: every opioid row is kept and matched on (state, county name).
opi_merge = pd.merge(
    fips_sub,
    opi_data,
    how="right",
    left_on=["state_abbr", "county_name"],
    right_on=["BUYER_STATE", "BUYER_COUNTY"],
)

# Every opioid row must have received a FIPS code; on failure, list the
# (state, county) pairs that found no match so they can be fixed.
unmatched = opi_merge.loc[
    opi_merge["fips"].isna(), ["BUYER_STATE", "BUYER_COUNTY"]
].drop_duplicates()
assert unmatched.empty, (
    "No FIPS match for these counties:\n" + unmatched.to_string(index=False)
)

# A (state, county) key occurring more than once in fips_sub would silently
# duplicate opioid rows in a right join, so the row count must be unchanged.
assert len(opi_merge) == len(opi_data), (
    f"Merge changed the row count: {len(opi_data)} -> {len(opi_merge)}"
)


In [None]:
# Save it to parquet.
# The output directory defaults to the location used when this notebook was
# written; set the OPIOID_DATA_DIR environment variable to write elsewhere
# instead of editing this cell.
DATA_DIR = os.environ.get(
    "OPIOID_DATA_DIR",
    "/Users/yangshining/Desktop/pds2021-opioids-pds6/10_modified_data",
)

opi_merge.to_parquet(
    os.path.join(DATA_DIR, "opi_merge_final.parquet"),
    engine='fastparquet',
)