In [2]:
import pandas as pd
import os
import requests
import json
import sqlite3 as sq 
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import create_engine, inspect, func

In [3]:
# Load the raw vaccine records; low_memory=False reads the file in one pass
# so pandas infers a single dtype per column.
df = pd.read_csv(
    "Resources/COVID19VaccineRecords.csv",
    encoding='utf-8',
    low_memory=False,
)

In [3]:
# Preview the first rows of the raw vaccine records.
df.head()

Unnamed: 0,_id,as_of_date,zip_code_tabulation_area,local_health_jurisdiction,county,vaccine_equity_metric_quartile,vem_source,age12_plus_population,age5_plus_population,tot_population,persons_fully_vaccinated,persons_partially_vaccinated,percent_of_population_fully_vaccinated,percent_of_population_partially_vaccinated,percent_of_population_with_1_plus_dose,up_to_date_count,redacted
0,1,2021-01-12,93618,Tulare,Tulare,1.0,Healthy Places Index Score,24482.3,28588,31470.0,51.0,252.0,0.001621,0.008008,0.009629,0,Information redacted in accordance with CA sta...
1,2,2021-01-12,95437,Mendocino,Mendocino,2.0,Healthy Places Index Score,12595.5,13932,14859.0,66.0,317.0,0.004442,0.021334,0.025776,0,Information redacted in accordance with CA sta...
2,3,2021-01-12,95991,Sutter,Sutter,1.0,Healthy Places Index Score,33300.8,37870,40861.0,114.0,835.0,0.00279,0.020435,0.023225,0,Information redacted in accordance with CA sta...
3,4,2021-01-12,93444,San Luis Obispo,San Luis Obispo,3.0,Healthy Places Index Score,18951.8,20522,21331.0,155.0,441.0,0.007266,0.020674,0.02794,0,Information redacted in accordance with CA sta...
4,5,2021-01-12,95039,Monterey,Monterey,1.0,CDPH-Derived ZCTA Score,860.0,1032,1074.0,,,,,,0,Information redacted in accordance with CA sta...


In [4]:
# Raw CSV header -> display label used in the rest of the notebook.
renamed_columns = {
    "_id": "ID",
    "as_of_date": "Date",
    "zip_code_tabulation_area": "Zip Code",
    "local_health_jurisdiction": "Health Jurisdiction",
    "county": "County Name",
    "vaccine_equity_metric_quartile": "Vaccine Quartile",
    "vem_source": "Vaccine Source",
    "age12_plus_population": "Age 12+ Population",
    "age5_plus_population": "Age 5+ Population",
    "tot_population": "Total Population",
    "persons_fully_vaccinated": "Fully Vaccinated",
    "persons_partially_vaccinated": "Partially Vaccinated",
    "percent_of_population_fully_vaccinated": "Percent Fully Vaccinated",
    "percent_of_population_partially_vaccinated": "Percent Partially Vaccinated",
    "percent_of_population_with_1_plus_dose": "Percent With 1+ Dose",
    "up_to_date_count": "Up To Date Count",
    "redacted": "Redacted Info",
}

# Rebind df to the relabelled frame, then preview it.
df = df.rename(columns=renamed_columns)
df.head()


Unnamed: 0,ID,Date,Zip Code,Health Jurisdiction,County Name,Vaccine Quartile,Vaccine Source,Age 12+ Population,Age 5+ Population,Total Population,Fully Vaccinated,Partially Vaccinated,Percent Fully Vaccinated,Percent Partially Vaccinated,Percent With 1+ Dose,Up To Date Count,Redacted Info
0,1,2021-01-12,93618,Tulare,Tulare,1.0,Healthy Places Index Score,24482.3,28588,31470.0,51.0,252.0,0.001621,0.008008,0.009629,0,Information redacted in accordance with CA sta...
1,2,2021-01-12,95437,Mendocino,Mendocino,2.0,Healthy Places Index Score,12595.5,13932,14859.0,66.0,317.0,0.004442,0.021334,0.025776,0,Information redacted in accordance with CA sta...
2,3,2021-01-12,95991,Sutter,Sutter,1.0,Healthy Places Index Score,33300.8,37870,40861.0,114.0,835.0,0.00279,0.020435,0.023225,0,Information redacted in accordance with CA sta...
3,4,2021-01-12,93444,San Luis Obispo,San Luis Obispo,3.0,Healthy Places Index Score,18951.8,20522,21331.0,155.0,441.0,0.007266,0.020674,0.02794,0,Information redacted in accordance with CA sta...
4,5,2021-01-12,95039,Monterey,Monterey,1.0,CDPH-Derived ZCTA Score,860.0,1032,1074.0,,,,,,0,Information redacted in accordance with CA sta...


In [5]:
# Drop the redaction-notice column ('Redacted Info', the last column after the
# rename). It is dropped by name rather than by position (df.columns[-1]) so
# that re-running this cell does not remove a further column each time;
# errors='ignore' makes a repeated run a no-op.
df = df.drop(columns=['Redacted Info'], errors='ignore')

In [6]:
# Preview the frame after the trailing column has been removed.
df.head()

Unnamed: 0,ID,Date,Zip Code,Health Jurisdiction,County Name,Vaccine Quartile,Vaccine Source,Age 12+ Population,Age 5+ Population,Total Population,Fully Vaccinated,Partially Vaccinated,Percent Fully Vaccinated,Percent Partially Vaccinated,Percent With 1+ Dose,Up To Date Count
0,1,2021-01-12,93618,Tulare,Tulare,1.0,Healthy Places Index Score,24482.3,28588,31470.0,51.0,252.0,0.001621,0.008008,0.009629,0
1,2,2021-01-12,95437,Mendocino,Mendocino,2.0,Healthy Places Index Score,12595.5,13932,14859.0,66.0,317.0,0.004442,0.021334,0.025776,0
2,3,2021-01-12,95991,Sutter,Sutter,1.0,Healthy Places Index Score,33300.8,37870,40861.0,114.0,835.0,0.00279,0.020435,0.023225,0
3,4,2021-01-12,93444,San Luis Obispo,San Luis Obispo,3.0,Healthy Places Index Score,18951.8,20522,21331.0,155.0,441.0,0.007266,0.020674,0.02794,0
4,5,2021-01-12,95039,Monterey,Monterey,1.0,CDPH-Derived ZCTA Score,860.0,1032,1074.0,,,,,,0


In [7]:
# Columns removed from the vaccine frame before it is de-duplicated and merged.
drops_vaccine = [
    'Age 12+ Population',
    'Age 5+ Population',
    'Vaccine Source',
    'Health Jurisdiction',
    'Up To Date Count',
    'Fully Vaccinated',
    'Partially Vaccinated',
    'Percent With 1+ Dose',
    'Vaccine Quartile',
    'ID',
]

df = df.drop(columns=drops_vaccine)

In [8]:
# Keep one row per Zip Code: order by Date descending, then retain the first
# (i.e. most recent) occurrence of each Zip Code.
df = (
    df
    .sort_values(by='Date', ascending=False)
    .drop_duplicates(subset='Zip Code', keep='first')
)

In [9]:
# Display the de-duplicated frame (one row per Zip Code, taken from its latest Date).
df

Unnamed: 0,Date,Zip Code,County Name,Total Population,Percent Fully Vaccinated,Percent Partially Vaccinated
250487,2023-09-26,93664,Fresno,380.0,0.794737,0.097368
249315,2023-09-26,95460,Mendocino,2508.0,0.868022,0.072967
249304,2023-09-26,95451,Lake,11603.0,0.593640,0.064811
249305,2023-09-26,92832,Orange,25837.0,0.719743,0.075783
249306,2023-09-26,95148,Santa Clara,48273.0,0.875810,0.056968
...,...,...,...,...,...,...
249904,2023-09-26,93410,San Luis Obispo,,1.000000,1.000000
249905,2023-09-26,95376,San Joaquin,55655.0,0.680065,0.068170
249906,2023-09-26,95628,Sacramento,40855.0,0.735920,0.053604
249907,2023-09-26,94533,Solano,75909.0,0.674215,0.093757


In [10]:
# HPI file: fetch the 'hpi2score' indicator by ZIP code for 2022 from the
# Healthy Places Index API and load the JSON payload into a DataFrame.
#
# The API key is taken from the HPI_API_KEY environment variable when set.
# NOTE(review): the fallback is the key that used to be hardcoded in the URL;
# it is committed to source control, so prefer the environment variable and
# consider rotating this key.
url = "https://api.healthyplacesindex.org/api/hpi"
params = {
    "geography": "zips",
    "year": "2022",
    "indicator": "hpi2score",
    "format": "json",
    "key": os.environ.get("HPI_API_KEY", "721a0a48-97f3-4a3f-a794-2bc855972617"),
}
# A timeout prevents the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, params=params, timeout=30)
# Fail loudly on an HTTP error instead of building a frame from an error body.
response.raise_for_status()
data = response.json()
hpi_df = pd.DataFrame(data)
hpi_df

Unnamed: 0,geoid,name,population,value,percentile,numerator,denominator
0,94601,94601,53039,-0.380256,0.246002,,
1,94501,94501,63821,0.584656,0.860625,,
2,94560,94560,47171,0.389073,0.740289,,
3,94587,94587,74722,0.448256,0.783701,,
4,94580,94580,30488,0.246554,0.654227,,
...,...,...,...,...,...,...,...
1308,93274,93274,74000,-0.566216,0.150038,,
1309,95370,95370,28096,0.040532,0.525514,,
1310,95372,95372,2056,-0.000992,0.501904,,
1311,95627,95627,3802,-0.119862,0.418126,,


In [11]:
# API field name -> display label for the HPI frame.
column_name_mapping = {
    "geoid": "Zip Code",
    "name": "Location Name",
    "population": "Population",
    "value": "HPI Value",
    "percentile": "HPI Percentile",
    "numerator": "Numerator",
    "denominator": "Denominator",
}

# Rebind hpi_df to the relabelled frame.
hpi_df = hpi_df.rename(columns=column_name_mapping)

In [12]:
# Cast the 'HPI Value' column to float, leaving every other column untouched.
hpi_df = hpi_df.astype({'HPI Value': float})
hpi_df

Unnamed: 0,Zip Code,Location Name,Population,HPI Value,HPI Percentile,Numerator,Denominator
0,94601,94601,53039,-0.380256,0.246002,,
1,94501,94501,63821,0.584656,0.860625,,
2,94560,94560,47171,0.389073,0.740289,,
3,94587,94587,74722,0.448256,0.783701,,
4,94580,94580,30488,0.246554,0.654227,,
...,...,...,...,...,...,...,...
1308,93274,93274,74000,-0.566216,0.150038,,
1309,95370,95370,28096,0.040532,0.525514,,
1310,95372,95372,2056,-0.000992,0.501904,,
1311,95627,95627,3802,-0.119862,0.418126,,


In [13]:
# Employed file: fetch the 'employed' indicator by ZIP code for 2015 from the
# Healthy Places Index API and load the JSON payload into a DataFrame.
#
# The API key is taken from the HPI_API_KEY environment variable when set.
# NOTE(review): the fallback is the key that used to be hardcoded in the URL;
# it is committed to source control, so prefer the environment variable and
# consider rotating this key.
url = "https://api.healthyplacesindex.org/api/hpi"
params = {
    "geography": "zips",
    "year": "2015",
    "indicator": "employed",
    "format": "json",
    "key": os.environ.get("HPI_API_KEY", "721a0a48-97f3-4a3f-a794-2bc855972617"),
}
# A timeout prevents the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, params=params, timeout=30)
# Fail loudly on an HTTP error instead of building a frame from an error body.
response.raise_for_status()
data = response.json()
employed_df = pd.DataFrame(data)
employed_df

Unnamed: 0,geoid,name,population,value,percentile,numerator,denominator
0,90027,90027,44770,0.784,0.853120,25546,32584
1,90265,90265,17954,0.679,0.304033,6897,10157
2,90293,90293,12728,0.831,0.975266,7597,9142
3,91042,91042,26966,0.715,0.435312,12488,17466
4,91342,91342,94595,0.739,0.568874,41882,56674
...,...,...,...,...,...,...,...
1309,95442,95442,3213,0.878,0.995434,1593,1814
1310,95993,95993,37077,0.679,0.304033,14384,21184
1311,95540,95540,14196,0.690,0.338280,5361,7769
1312,95567,95567,1802,0.651,0.208904,615,944


In [14]:
# API field name -> display label for the employment frame.
employed_columns = {
    "geoid": "Zip Code",
    "name": "Location Name",
    "population": "Employed Population",
    "value": "Employed Value",
    "percentile": "Employed Percentile",
    "numerator": "Employed Numerator",
    "denominator": "Employed Denominator",
}

# Rebind employed_df to the relabelled frame and show it.
employed_df = employed_df.rename(columns=employed_columns)
employed_df

Unnamed: 0,Zip Code,Location Name,Employed Population,Employed Value,Employed Percentile,Employed Numerator,Employed Denominator
0,90027,90027,44770,0.784,0.853120,25546,32584
1,90265,90265,17954,0.679,0.304033,6897,10157
2,90293,90293,12728,0.831,0.975266,7597,9142
3,91042,91042,26966,0.715,0.435312,12488,17466
4,91342,91342,94595,0.739,0.568874,41882,56674
...,...,...,...,...,...,...,...
1309,95442,95442,3213,0.878,0.995434,1593,1814
1310,95993,95993,37077,0.679,0.304033,14384,21184
1311,95540,95540,14196,0.690,0.338280,5361,7769
1312,95567,95567,1802,0.651,0.208904,615,944


In [15]:
# Cast the 'Employed Percentile' column to float, leaving other columns untouched.
employed_df = employed_df.astype({'Employed Percentile': float})
employed_df

Unnamed: 0,Zip Code,Location Name,Employed Population,Employed Value,Employed Percentile,Employed Numerator,Employed Denominator
0,90027,90027,44770,0.784,0.853120,25546,32584
1,90265,90265,17954,0.679,0.304033,6897,10157
2,90293,90293,12728,0.831,0.975266,7597,9142
3,91042,91042,26966,0.715,0.435312,12488,17466
4,91342,91342,94595,0.739,0.568874,41882,56674
...,...,...,...,...,...,...,...
1309,95442,95442,3213,0.878,0.995434,1593,1814
1310,95993,95993,37077,0.679,0.304033,14384,21184
1311,95540,95540,14196,0.690,0.338280,5361,7769
1312,95567,95567,1802,0.651,0.208904,615,944


In [16]:
# Check the dtype of 'Employed Percentile' after the cast above
# (the recorded output shows float64).
column_type = employed_df['Employed Percentile'].dtype
print(column_type)


float64


In [17]:
# Income file: fetch the 'percapitaincome' indicator by ZIP code for 2015 from
# the Healthy Places Index API and load the JSON payload into a DataFrame.
#
# The API key is taken from the HPI_API_KEY environment variable when set.
# NOTE(review): the fallback is the key that used to be hardcoded in the URL;
# it is committed to source control, so prefer the environment variable and
# consider rotating this key.
url = "https://api.healthyplacesindex.org/api/hpi"
params = {
    "geography": "zips",
    "year": "2015",
    "indicator": "percapitaincome",
    "format": "json",
    "key": os.environ.get("HPI_API_KEY", "721a0a48-97f3-4a3f-a794-2bc855972617"),
}
# A timeout prevents the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, params=params, timeout=30)
# Fail loudly on an HTTP error instead of building a frame from an error body.
response.raise_for_status()
data = response.json()
income_df = pd.DataFrame(data)
income_df

Unnamed: 0,geoid,name,population,value,percentile,numerator,denominator
0,90027,90027,44770,52257,0.801370,,
1,90265,90265,17954,110410,0.987062,,
2,90293,90293,12728,78988,0.936834,,
3,91042,91042,26966,31572,0.439498,,
4,91342,91342,94595,24976,0.261035,,
...,...,...,...,...,...,...,...
1309,95442,95442,3213,88887,0.958143,,
1310,95993,95993,37077,31433,0.435312,,
1311,95540,95540,14196,24543,0.248858,,
1312,95567,95567,1802,33420,0.487823,,


In [18]:
# API field name -> display label for the income frame.
income_columns = {
    "geoid": "Zip Code",
    "name": "Location Name",
    "population": "Income Population",
    "value": "Income Value",
    "percentile": "Income Percentile",
    "numerator": "Income Numerator",
    "denominator": "Income Denominator",
}

# Rebind income_df to the relabelled frame and show it.
income_df = income_df.rename(columns=income_columns)
income_df

Unnamed: 0,Zip Code,Location Name,Income Population,Income Value,Income Percentile,Income Numerator,Income Denominator
0,90027,90027,44770,52257,0.801370,,
1,90265,90265,17954,110410,0.987062,,
2,90293,90293,12728,78988,0.936834,,
3,91042,91042,26966,31572,0.439498,,
4,91342,91342,94595,24976,0.261035,,
...,...,...,...,...,...,...,...
1309,95442,95442,3213,88887,0.958143,,
1310,95993,95993,37077,31433,0.435312,,
1311,95540,95540,14196,24543,0.248858,,
1312,95567,95567,1802,33420,0.487823,,


In [19]:
# Cast the 'Income Value' column to int, leaving every other column untouched.
income_df = income_df.astype({'Income Value': int})
income_df

Unnamed: 0,Zip Code,Location Name,Income Population,Income Value,Income Percentile,Income Numerator,Income Denominator
0,90027,90027,44770,52257,0.801370,,
1,90265,90265,17954,110410,0.987062,,
2,90293,90293,12728,78988,0.936834,,
3,91042,91042,26966,31572,0.439498,,
4,91342,91342,94595,24976,0.261035,,
...,...,...,...,...,...,...,...
1309,95442,95442,3213,88887,0.958143,,
1310,95993,95993,37077,31433,0.435312,,
1311,95540,95540,14196,24543,0.248858,,
1312,95567,95567,1802,33420,0.487823,,


In [20]:
# Merge employed_df and income_df on Zip Code. merge() defaults to an inner
# join, so only Zip Codes present in both frames are kept. Column names shared
# by both frames (e.g. 'Location Name') receive the suffixes given here.
merged_df = employed_df.merge(income_df, on="Zip Code", suffixes=("_employed", "_income"))

# Merge the resulting merged_df with hpi_df on Zip Code (again an inner join).
# The suffixes only apply to column names present on both sides.
hpi_poverty = merged_df.merge(hpi_df, on="Zip Code", suffixes=("_merged", "_hpi"))

In [21]:
# Everything except 'Zip Code', 'Employed Percentile', 'Income Value' and
# 'HPI Value' is removed from the combined frame.
columns_to_drop = [
    'Location Name_employed',
    'Employed Numerator',
    'Employed Denominator',
    'Location Name_income',
    'Income Numerator',
    'Income Denominator',
    'Location Name',
    'Numerator',
    'Denominator',
    'Employed Value',
    'Employed Population',
    'Income Population',
    'Income Percentile',
    'Population',
    'HPI Percentile',
]

hpi_poverty = hpi_poverty.drop(columns=columns_to_drop)

In [22]:
# Change 'Zip Code' to int so it can serve as the key for the merge with the
# vaccine frame (presumably that frame's 'Zip Code' was parsed as integers by
# read_csv; verify the dtypes match).
hpi_poverty['Zip Code'] = hpi_poverty['Zip Code'].astype(int)
hpi_poverty

Unnamed: 0,Zip Code,Employed Percentile,Income Value,HPI Value
0,90027,0.853120,52257,0.152726
1,90265,0.304033,110410,0.639108
2,90293,0.975266,78988,0.668620
3,91042,0.435312,31572,-0.256216
4,91342,0.568874,24976,-0.250499
...,...,...,...,...
1308,95442,0.995434,88887,0.558262
1309,95993,0.304033,31433,-0.083302
1310,95540,0.338280,24543,-0.217398
1311,95567,0.208904,33420,-0.253012


In [23]:
# Inner-join the vaccine frame with the HPI/employment/income frame on Zip Code,
# keeping only Zip Codes that appear in both.
vaccines_hpi_poverty_df = df.merge(hpi_poverty, how='inner', on='Zip Code')

In [24]:
# Preview the first rows of the merged frame.
vaccines_hpi_poverty_df.head()

Unnamed: 0,Date,Zip Code,County Name,Total Population,Percent Fully Vaccinated,Percent Partially Vaccinated,Employed Percentile,Income Value,HPI Value
0,2023-09-26,95460,Mendocino,2508.0,0.868022,0.072967,0.997717,46675,0.125566
1,2023-09-26,95451,Lake,11603.0,0.59364,0.064811,0.21347,33270,-0.160988
2,2023-09-26,92832,Orange,25837.0,0.719743,0.075783,0.482496,28287,-0.154457
3,2023-09-26,95148,Santa Clara,48273.0,0.87581,0.056968,0.723364,41013,0.419383
4,2023-09-26,94115,San Francisco,34604.0,0.858196,0.099468,0.934551,89097,0.798741


In [25]:
# Spot-check a single Zip Code (93927) in the merged frame.
zip_code_93927 = vaccines_hpi_poverty_df.loc[
    lambda frame: frame["Zip Code"] == 93927
]
zip_code_93927

Unnamed: 0,Date,Zip Code,County Name,Total Population,Percent Fully Vaccinated,Percent Partially Vaccinated,Employed Percentile,Income Value,HPI Value
339,2023-09-26,93927,Monterey,18812.0,0.749681,0.093185,0.268645,15961,-0.60393


## SQLITE

In [33]:
# Open a connection to the SQLite database file that receives the merged table.
# The sqlite3 module is imported under the alias `sq` in the imports cell, so
# the alias must be used here (the bare name `sqlite3` is not defined).
conn = sq.connect("../Data/CA_COVID_data.sqlite")

In [34]:
# Write the merged frame to the 'vaccines_hpi_poverty' table, replacing the
# table if it already exists and leaving the DataFrame index out.
vaccines_hpi_poverty_df.to_sql("vaccines_hpi_poverty", conn, if_exists='replace', index=False)


1313

In [39]:
# Create an SQLAlchemy engine on the same SQLite file that `conn` points to,
# then use an inspector to list the tables it contains.
engine = create_engine("sqlite:///../Data/CA_COVID_data.sqlite", echo=False)
inspector = inspect(engine)
inspector.get_table_names()

['case_surv', 'vaccine_by_county', 'vaccines_hpi_poverty']

In [40]:

# NOTE(review): this repeats the earlier to_sql call with identical arguments;
# because if_exists='replace', it simply rewrites the same table.
vaccines_hpi_poverty_df.to_sql("vaccines_hpi_poverty", conn, if_exists='replace', index=False) # writes to file

1313

In [41]:
# List the tables again after the write to confirm 'vaccines_hpi_poverty' is present.
inspector.get_table_names()

['case_surv', 'vaccine_by_county', 'vaccines_hpi_poverty']

In [None]:
# Close the sqlite3 connection used for the to_sql writes.
conn.close()