# Imports

Make sure `functions.py` is in the `ID2223_Project` folder in Google Drive.

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Copy the project's helper module into the current working directory so the
# `from functions import *` further down can import it.
!cp /content/drive/MyDrive/ID2223_Project/functions.py .

Install the modules required by `functions.py`.

In [None]:
# Install the packages named in the note above as required by functions.py;
# hopsworks is also imported directly later in this notebook.
!pip install python-dotenv
!pip install hopsworks

In [None]:
import pandas as pd
from datetime import datetime, timedelta
import time 
import requests

from functions import *

# API Params

In [None]:
# Today's date (naive local time of the runtime) formatted as YYYY-MM-DD.
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
date_today = datetime.now().strftime("%Y-%m-%d")


## Parsing Air Quality API Data

The API used here is from AirNow, which was created by the EPA and its partners.
https://www.airnow.gov/

The following part uses the developer tools documented on the website
https://docs.airnowapi.org/

**Please create an account using the developer tools link above and request an API key before continuing.**

In [None]:
# AirNow API key passed to call_air_quality_api below.
# Replace None with your own key before running the cells that call the API.
API_KEY = None # Use your own API key

In [None]:
def get_air_quality_df(data, param_name):
    """Build an air-quality DataFrame whose measurement column is caller-named.

    NOTE: a one-argument function with the same name is defined further down
    in this notebook and replaces this one once that cell has been run.

    Args:
        data: Row-oriented records accepted by ``pd.DataFrame``; every row
            must supply four values, in column order.
        param_name: Label used for the fourth column.

    Returns:
        A ``pd.DataFrame`` with the columns ``city``, ``date``, ``aqi`` and
        ``param_name``.
    """
    return pd.DataFrame(data, columns=['city', 'date', 'aqi', param_name])

In [None]:
def call_air_quality_api(date, key):
    """Fetch AirNow historical observations around a Miami zip code for one day.

    Args:
        date: Day to query. It is spliced into the URL directly in front of
            the fixed ``T00-0000`` suffix; callers in this notebook pass
            ``YYYY-MM-DD`` strings.
        key: AirNow API key.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.Timeout: If the service does not answer within the timeout.
    """
    ZIP_CODE = '33135'  # zip code in Miami

    response = requests.get(
        f'https://www.airnowapi.org/aq/observation/zipCode/historical/?format=application/json&zipCode={ZIP_CODE}&date={date}T00-0000&distance=50&API_KEY={key}',
        # Without a timeout a stalled connection would hang the notebook.
        timeout=30,
    )
    # Surface HTTP failures (bad key, rate limit, outage) here instead of
    # letting an error payload reach the parsing code.
    response.raise_for_status()
    return response.json()
  

In [None]:
def get_air_quality_data(json):
    """Collapse one day's AirNow observations into a single feature row.

    Args:
        json: Iterable of observation dicts, each providing the keys
            ``AQI``, ``ReportingArea``, ``DateObserved`` and
            ``ParameterName``.

    Returns:
        ``[city, date, aqi, o3_mean, pm25_mean]``. ``aqi`` is the highest AQI
        over all observations (0 when there are none). ``city`` and ``date``
        come from the last observation seen. ``o3_mean`` / ``pm25_mean`` stay
        ``None`` when that pollutant was not reported.
    """
    city = None
    this_day = None
    aqi = 0
    pm25_mean = None
    o3_mean = None

    for j in json:
        aqi = max(aqi, j['AQI'])
        city = j['ReportingArea']
        this_day = j['DateObserved']

        parameter = j['ParameterName'].strip().upper()
        # NOTE(review): the /10 and /1000 scalings are kept as they were; their
        # rationale is not visible in this notebook.
        if parameter == 'PM2.5':
            pm25_mean = j['AQI'] / 10
        elif parameter in ('O3', 'OZONE'):
            o3_mean = j['AQI'] / 1000
        # Any other pollutant (e.g. PM10) only contributes to the overall
        # ``aqi``; it must not be stored as the ozone reading.

    return [
        city,
        this_day,
        aqi,
        o3_mean,
        pm25_mean
    ]

In [None]:
# Reference point for every date window built in this notebook.
tod = datetime.now()

# The 105 most recent calendar days, today first, as YYYY-MM-DD strings
# (original note: start from 2022-10-01).
dates = [
    (tod - timedelta(days=offset)).strftime("%Y-%m-%d")
    for offset in range(105)
]


In [None]:
# The 378 most recent calendar days, today first, as YYYY-MM-DD strings
# (original note: start from 2022-01-01).
dates_weather = [
    (tod - timedelta(days=w)).strftime("%Y-%m-%d")
    for w in range(378)
]

# The list was previously sliced with [378:], which starts past the end of a
# 378-element list and therefore always left it empty, so no weather was
# ever requested. The full window is kept instead.
len(dates_weather)

In [None]:
# Query AirNow once per entry in `dates` and reduce each response to one row.
air_quality_data = [
    get_air_quality_data(call_air_quality_api(d, API_KEY))
    for d in dates
]

air_quality_data

# Parsing Data

## Weather Data

Weather data are from VisualCrossing
https://www.visualcrossing.com/

An API key can be generated here:
https://www.visualcrossing.com/weather-api

**Please create an API key with the link above before continuing.**

In [None]:
# Visual Crossing API key, read as a global by get_weather_data below.
# Replace None with your own key before running the weather cells.
WEATHER_API_KEY = None # your API Key

In [None]:
def get_weather_json(city, date, WEATHER_API_KEY):
    """Fetch one day of Visual Crossing timeline weather for a city.

    Args:
        city: Location name; it is lower-cased before being put in the URL.
        date: Day to query, inserted into the URL path as given.
        WEATHER_API_KEY: Visual Crossing API key.

    Returns:
        The decoded JSON body of the response (metric units, daily data only,
        as requested by the query string).

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.Timeout: If the service does not answer within the timeout.
    """
    response = requests.get(
        f'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{city.lower()}/{date}?unitGroup=metric&include=days&key={WEATHER_API_KEY}&contentType=json',
        # Without a timeout a stalled connection would hang the notebook.
        timeout=30,
    )
    # Report HTTP failures (bad key, quota exceeded, ...) as such rather than
    # as a JSON decoding error on the error body.
    response.raise_for_status()
    return response.json()


def get_weather_data(city_name, date):
    """Fetch one day of weather for a city and flatten it into a row.

    The request is made through ``get_weather_json`` using the module-level
    ``WEATHER_API_KEY``.

    Args:
        city_name: Location name forwarded to ``get_weather_json``.
        date: Day to query, forwarded to ``get_weather_json``.

    Returns:
        A list starting with the capitalized ``address`` of the response,
        followed by the values of the first ``days`` entry in the order of
        ``day_fields`` below.
    """
    # Keys read from the first entry of the response's 'days' list, in the
    # order they appear in the returned row.
    day_fields = (
        'datetime',
        'tempmax',
        'tempmin',
        'temp',
        'feelslikemax',
        'feelslikemin',
        'feelslike',
        'dew',
        'humidity',
        'precip',
        'precipprob',
        'precipcover',
        'snow',
        'snowdepth',
        'windgust',
        'windspeed',
        'winddir',
        'pressure',
        'cloudcover',
        'visibility',
        'solarradiation',
        'solarenergy',
        'uvindex',
        'conditions',
    )

    payload = get_weather_json(city_name, date, WEATHER_API_KEY)
    first_day = payload['days'][0]

    row = [payload['address'].capitalize()]
    row.extend(first_day[field] for field in day_fields)
    return row

## Parsing Weather Data

In [None]:
# One weather row for Miami per entry in `dates_weather` (one API call each).
data_weather = [get_weather_data('Miami', z) for z in dates_weather]

# Dataset Preparation

## Air Quality Data Frame

In [None]:
def get_air_quality_df(data):
    """Wrap air-quality rows in a DataFrame with fixed column names.

    Args:
        data: Row-oriented records accepted by ``pd.DataFrame``; every row
            must supply five values, in column order.

    Returns:
        A ``pd.DataFrame`` with the columns ``city``, ``date``, ``aqi``,
        ``o3_mean`` and ``pm25_mean``.
    """
    return pd.DataFrame(
        data,
        columns=['city', 'date', 'aqi', 'o3_mean', 'pm25_mean'],
    )

In [None]:
# Build the air-quality frame, drop rows with any missing value, and order
# the remaining rows by city and date with a fresh 0..n-1 index.
df_air_quality = (
    get_air_quality_df(air_quality_data)
    .dropna()
    .sort_values(by=['city', 'date'], ignore_index=True)
)

# Strip surrounding whitespace from the date strings, then convert them with
# the timestamp_2_time helper.
df_air_quality['date'] = df_air_quality['date'].str.strip().apply(timestamp_2_time)

df_air_quality

## Weather Data

In [None]:
def get_weather_df(data):
    """Wrap weather rows in a DataFrame and convert the ``date`` column.

    Args:
        data: Row-oriented records accepted by ``pd.DataFrame``; every row
            must supply 25 values, in column order.

    Returns:
        A ``pd.DataFrame`` with ``city``, ``date`` and the measurement columns
        listed below, where every ``date`` value has been passed through
        ``timestamp_2_time``.
    """
    measurement_cols = [
        'tempmax',
        'tempmin',
        'temp',
        'feelslikemax',
        'feelslikemin',
        'feelslike',
        'dew',
        'humidity',
        'precip',
        'precipprob',
        'precipcover',
        'snow',
        'snowdepth',
        'windgust',
        'windspeed',
        'winddir',
        'pressure',
        'cloudcover',
        'visibility',
        'solarradiation',
        'solarenergy',
        'uvindex',
        'conditions',
    ]

    frame = pd.DataFrame(data, columns=['city', 'date', *measurement_cols])
    frame['date'] = frame['date'].apply(timestamp_2_time)
    return frame

In [None]:
# Turn the fetched weather rows into a DataFrame and preview the first rows.
df_weather = get_weather_df(data_weather)

df_weather.head()

# Connecting to Hopsworks Feature Store

In [None]:
# Log in to Hopsworks and obtain handles to the project's feature store and
# to the two feature groups this notebook writes to.
import hopsworks

project = hopsworks.login()

fs = project.get_feature_store() 

# Air-quality feature group, version 2.
air_quality_fg = fs.get_or_create_feature_group(
    name = 'miami_air_quality_fg',
    version = 2
)
# Weather feature group, version 1.
# NOTE(review): the commented-out creation snippet further down in this
# notebook names the group 'miami_weather_fg', not 'weather_fg' — confirm
# which name is intended.
weather_fg = fs.get_or_create_feature_group(
    name = 'weather_fg',
    version = 1
)

# Uploading new data to the Feature Store

In [None]:
# Show the air-quality frame that is about to be uploaded.
print(df_air_quality)

In [None]:
# Upload the air-quality rows to the air-quality feature group.
air_quality_fg.insert(df_air_quality)

In [None]:
###### Please uncomment the following part only the first time the feature group is created #######
# It defines the weather feature group with a description, a primary key of
# ('city', 'date'), online storage enabled and 'date' as the event time.
# NOTE(review): the name below is 'miami_weather_fg', while the weather_fg
# handle obtained earlier in this notebook uses 'weather_fg' — confirm which
# name is intended.
# weather_fg = fs.get_or_create_feature_group(
#         name = 'miami_weather_fg',
#         description = 'Miami Weather characteristics of each day',
#         version = 1,
#         primary_key = ['city','date'],
#         online_enabled = True,
#         event_time = 'date'
#     )    

# Upload the weather rows to the weather feature group.
weather_fg.insert(df_weather)

# Creating Training Dataset

In [None]:
# Select every feature of both feature groups and join them into one query.
# NOTE(review): no join keys are given here, so the join columns are whatever
# the library picks by default — confirm they are the intended ones.
query = air_quality_fg.select_all().join(weather_fg.select_all())

query.read()

In [None]:
# Preview the joined query (show(5)) and keep its column names; they are used
# below to decide which transformation each column gets.
query_show = query.show(5)
col_names = query_show.columns

query_show

In [None]:
# Columns that are excluded from standard scaling.
category_cols = ['city', 'date', 'conditions', 'aqi']

# Every other column of the joined query gets the 'standard_scaler'
# transformation function.
mapping_transformers = {}
for col_name in col_names:
    if col_name in category_cols:
        continue
    mapping_transformers[col_name] = fs.get_transformation_function(name='standard_scaler')

# The excluded columns get the 'label_encoder', except 'date' and 'aqi',
# which receive no transformation at all.
label_encoded = {}
for col_name in category_cols:
    if col_name not in ['date', 'aqi']:
        label_encoded[col_name] = fs.get_transformation_function(name='label_encoder')

# NOTE: category_cols is rebound here from the list of names to the dict of
# label encoders.
category_cols = label_encoded

mapping_transformers.update(category_cols)

## Feature View

In [None]:
# Create version 1 of the feature view over the joined query, attaching the
# per-column transformation functions assembled above.
feature_view = fs.create_feature_view(
    name = 'miami_air_quality_fv',
    version = 1,
    transformation_functions = mapping_transformers,
    query = query
)

In [None]:
# Fetch version 1 of the feature view by name; this rebinds `feature_view`,
# which the creation cell also assigns.
feature_view = fs.get_feature_view(
    name = 'miami_air_quality_fv',
    version = 1
)

## Training Data Creation

In [None]:
# Create a training dataset from the feature view, using the library defaults
# for every option.
feature_view.create_training_data()