# Imports

Create an `ID2223_Project` folder in Google Drive and put `functions.py` in it before running any command.

In [None]:
# Mount Google Drive under /content/drive so files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Copy functions.py into the working directory so it can be imported as `functions`.
!cp /content/drive/MyDrive/ID2223_Project/functions.py .

In [None]:
# Install third-party dependencies. hopsworks is imported further down;
# python-dotenv is presumably needed by functions.py -- TODO confirm.
!pip install python-dotenv
!pip install hopsworks

In [None]:
import pandas as pd
import numpy as np

from functions import *

# Generating History Data

All code below is based on the documentation on the EPA (US Environmental Protection Agency) website: https://aqs.epa.gov/aqsweb/documents/data_api.html#format

However, this API only provides data from before 2022-09-30.

## Parsing Air Quality API Data

Before fetching data, sign up with the EPA using your email address via the following link (replace "myemail@example.com" in the example with your own email address): https://aqs.epa.gov/data/api/signup?email=myemail@example.com


In [None]:
# Credentials for the EPA API; both are passed to call_air_quality_api below
# and must be replaced with real values before running the API cells.
EMAIL = None # Change to your email address (abc@abc.com)
KEY = None # Change to the api key from EPA

In [None]:
def get_air_quality_df(data, param_name):
    """Build a DataFrame from rows of air-quality measurements.

    No date conversion happens here; values are stored as provided.

    Args:
        data: Row data accepted by ``pd.DataFrame``; each row supplies four
            values in the order city, date, AQI, measurement.
        param_name: Name given to the fourth (measurement) column.

    Returns:
        A ``pd.DataFrame`` with the columns ``city``, ``date``, ``aqi`` and
        ``param_name``.
    """
    return pd.DataFrame(data, columns=['city', 'date', 'aqi', param_name])

In [None]:
def call_air_quality_api(param, email, key, *, state='12', county='086',
                         bdate='20220101', edate='20221231', timeout=120):
    """Fetch daily data for one pollutant from the EPA AQS ``dailyData/byCounty`` endpoint.

    Args:
        param: Parameter code of the pollutant to query (e.g. ``'88101'``).
        email: Email address registered with the EPA API.
        key: API key issued by the EPA for that email address.
        state: State code; defaults to ``'12'`` (Florida, US).
        county: County code; defaults to ``'086'`` (Miami).
        bdate: Begin date of the data in YYYYMMDD format.
        edate: End date of the data in YYYYMMDD format. Must be in the same
            year as ``bdate``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    # Let requests build the query string: it percent-encodes every value, so
    # characters such as '+' or '&' in an email address or key cannot corrupt
    # the URL the way raw f-string interpolation would.
    query = {
        'email': email,
        'key': key,
        'param': param,
        'bdate': bdate,
        'edate': edate,
        'state': state,
        'county': county,
    }
    response = requests.get(
        'https://aqs.epa.gov/data/api/dailyData/byCounty',
        params=query,
        timeout=timeout,
    )
    return response.json()

Codes for the different pollutants

The following codes for PM2.5 and O3 are from the EPA website.

In [None]:
# Pollutant parameter codes, passed as `param` to call_air_quality_api.
PM25_CODE = '88101'  # PM2.5
O3_CODE = '44201'  # O3

In [None]:
def get_air_quality_data(json, standard_index, standard_value):
    """Extract the Miami rows that match one standard from API records.

    Args:
        json: Iterable of record mappings. Each record must have a ``'city'``
            key; Miami records must also have ``standard_index``,
            ``'date_local'``, ``'aqi'`` and ``'arithmetic_mean'``.
        standard_index: Key whose value is compared against
            ``standard_value``.
        standard_value: Value a record must hold under ``standard_index`` to
            be kept.

    Returns:
        A list of ``[city, date_local, aqi, arithmetic_mean]`` lists, one per
        kept record, in input order.
    """
    return [
        [
            record['city'],
            record['date_local'],
            record['aqi'],
            record['arithmetic_mean'],
        ]
        for record in json
        if record['city'] == 'Miami' and record[standard_index] == standard_value
    ]

### PM2.5

In [None]:
# Fetch the raw PM2.5 response and print it for inspection.
pm25_data = call_air_quality_api(PM25_CODE, EMAIL, KEY)
print(pm25_data)

In [None]:
# Reduce the response's 'Data' records to rows whose 'pollutant_standard'
# is "PM25 24-hour 2012".
data_PM25 = (get_air_quality_data(pm25_data['Data'], 'pollutant_standard', "PM25 24-hour 2012"))

print(data_PM25)

In [None]:
# Build the PM2.5 frame and drop its 'aqi' column, which is not needed anymore.
df_PM25 = get_air_quality_df(data_PM25, 'pm25_mean').drop(columns=['aqi'])
df_PM25

### O3

In [None]:
# Fetch the raw O3 response and print it for inspection.
o3_data = call_air_quality_api(O3_CODE, EMAIL, KEY)
print(o3_data)

In [None]:
# Reduce the response's 'Data' records to rows whose 'pollutant_standard'
# is "Ozone 8-hour 2015".
data_O3 = (get_air_quality_data(o3_data['Data'], 'pollutant_standard', "Ozone 8-hour 2015"))

print(data_O3)

In [None]:
# Build the O3 frame; unlike the PM2.5 frame, its 'aqi' column is kept.
df_o3 = get_air_quality_df(data_O3, 'o3_mean')

df_o3

## Create Dataset

Combine the two dataframes.

In [None]:
from functools import reduce

# Outer-join the frames pairwise on city and date, then discard every row
# that has a missing value in any column.
d = [df_o3, df_PM25]
df_air_quality = reduce(
    lambda merged, frame: merged.merge(frame, on=['city', 'date'], how='outer'),
    d,
).dropna()
df_air_quality

In [None]:
# Convert every date with timestamp_2_time, then order the rows by city and
# date with a fresh 0..n-1 index.
df_air_quality['date'] = df_air_quality['date'].apply(timestamp_2_time)
df_air_quality = df_air_quality.sort_values(by=['city', 'date'], ignore_index=True)

df_air_quality

## Weather Data

**Weather data will be created in 2_feature_pipeline**

# Connecting to Hopsworks Feature Store

In [None]:
import hopsworks

# Log in to Hopsworks and keep a handle to the project.
project = hopsworks.login()

# Feature store handle used to create the feature group below.
fs = project.get_feature_store() 

# Creating Feature Groups

## Air Quality Data

In [None]:
# Get or create version 2 of the Miami air-quality feature group, keyed by
# city and date, with 'date' as the event-time column.
air_quality_fg = fs.get_or_create_feature_group(
        name = 'miami_air_quality_fg',
        description = 'Miami Air Quality characteristics of each day',
        version = 2,
        primary_key = ['city','date'],
        online_enabled = True,
        event_time = 'date'
    )    

# Insert the merged air-quality dataframe into the feature group.
air_quality_fg.insert(df_air_quality)