# Parameter DataFrame

This notebook will merge the tables that contain information from the PM2.5 Purple Air Sensors.

In [1]:
# Import libraries

# File manipulation

import os # For working with Operating System
import requests # Accessing the Web
import datetime as dt # Working with dates/times
import io # Input/Output Bytes objects
import time # For sleep in for loop

# Analysis

import numpy as np
import pandas as pd
import arcpy

# important as it "enhances" Pandas by importing these classes
from arcgis.features import GeoAccessor, GeoSeriesAccessor

In [2]:
# Set working Directory

# Get CWD

cwd = os.getcwd() # This is a global variable for where the notebook is (must change if running in arcpro)

# Create GeoDataBase
# This is the communal GeoDataBase, only run once

if not os.path.exists(os.path.join(cwd, '..', '..', 'data', 'QAQC.gdb')): # If it doesn't exist, create it

    arcpy.management.CreateFileGDB(os.path.join(cwd, '..', '..', 'data'), 'QAQC')

# Make it workspace

arcpy.env.workspace = os.path.join(cwd, '..', '..', 'data', 'QAQC.gdb')

arcpy.env.overwriteOutput = True # Overwrite layers is okay

## Load Data

In [3]:
# Load Csv's 

data_path = os.path.join(cwd,'..', '..', 'data')

# the historic daily summaries (w & w/out spikes)

summaries = pd.read_csv(os.path.join(data_path, 'daily_summaries.csv'))
summaries['date'] = pd.to_datetime(summaries.date)

summaries_no_spikes = pd.read_csv(os.path.join(data_path, 'daily_summaries_no_spikes.csv'))
summaries_no_spikes['date'] = pd.to_datetime(summaries_no_spikes.date)

# The spikes

spikes = pd.read_csv(os.path.join(data_path, 'all_spikes.csv'))

spikes['timestamp'] = pd.to_datetime(spikes.timestamp)

In [4]:
summaries.head()

Unnamed: 0,sensor_index,date,n_observations,pm25_fullDay_mean,pm25_fullDay_min,pm25_fullDay_minTime,pm25_fullDay_max,pm25_fullDay_maxTime,pm25_fullDay_std,pm25_fullDay_minutesAbove12ug,...,pm25_daytimeAmbient_minTime,pm25_daytimeAmbient_max,pm25_daytimeAmbient_maxTime,pm25_daytimeAmbient_std,pm25_nighttimeAmbient_mean,pm25_nighttimeAmbient_min,pm25_nighttimeAmbient_minTime,pm25_nighttimeAmbient_max,pm25_nighttimeAmbient_maxTime,pm25_nighttimeAmbient_std
0,142724,2022-06-15,144,3.460302,1.017,13:40:00,9.037,00:50:00,1.779624,0,...,13:40:00,4.527,12:30:00,1.326673,5.532053,2.886,02:00:00,9.037,00:50:00,2.612054
1,142734,2022-06-15,144,2.607365,0.489,14:40:00,8.508,00:40:00,1.67018,0,...,14:40:00,3.368,12:40:00,1.217207,4.878737,2.1,02:20:00,8.508,00:40:00,2.649151
2,143214,2022-06-15,144,2.784674,0.75,13:30:00,8.294,00:40:00,1.577514,0,...,13:30:00,3.342,12:30:00,1.063382,4.789,2.262,02:00:00,8.294,00:40:00,2.580582
3,143240,2022-06-15,144,2.816788,0.779,15:40:00,11.234,03:40:00,2.011037,0,...,13:30:00,3.134,12:40:00,0.742325,5.394789,2.271,01:50:00,9.522,02:30:00,2.616261
4,145242,2022-06-15,144,4.31474,1.359,13:20:00,11.249,00:50:00,1.97732,0,...,13:20:00,5.553,12:00:00,1.22875,6.572158,3.514,02:00:00,11.249,00:50:00,3.136699


In [5]:
# Load Relevant Tables from GeoDataBase

arcpy.ListTables()

['MPCA_Permitted_Emissions',
 'MPCA_Facilities_HOLD',
 'purpleair_historic',
 'purpleair_historic_errors']

In [6]:

# As Pandas dataframes

# Permitted Emissions

tablename = 'MPCA_Permitted_Emissions'

columns = [f.name for f in arcpy.ListFields(tablename)] #List the fields you want to include.

emissions = pd.DataFrame(data=arcpy.da.SearchCursor(tablename, columns), columns=columns)

current_pm_emissions = emissions[(emissions.YEAR == emissions.YEAR.max())
                                & (emissions.POLLUTANT == 'PM2.5 Primary')] # most recent PM2.5 Emissions

# Historic PurpleAir

tablename = 'purpleair_historic'

columns = [f.name for f in arcpy.ListFields(tablename)] #List the fields you want to include.

historic_purpleAir = pd.DataFrame(data=arcpy.da.SearchCursor(tablename, columns), columns=columns)

historic_purpleAir['timestamp'] = pd.to_datetime(historic_purpleAir.timestamp)

historic_purpleAir['sensor_index'] = historic_purpleAir.sensor_index.astype(int)

In [7]:
historic_purpleAir.head()

Unnamed: 0,OBJECTID,sensor_index,timestamp,humidity,temperature,pressure,pm2_5
0,1,3088,2023-02-06,47.31,35.801,984.713,15.2405
1,2,3088,2023-01-22,51.829,33.121,985.288,10.3585
2,3,3088,2022-09-30,35.705,69.975,992.578,0.7825
3,4,3088,2022-11-23,45.515,42.585,987.057,19.8195
4,5,3088,2022-10-17,35.644,43.267,991.817,0.0425


In [8]:
# Load Purple Air Spatial Information <- we didn't get all the stations... So We're going to query PurpleAir Instead. Leaving code for reference

In [9]:
# arcpy.ListFeatureClasses()

In [10]:
# As Pandas dataframes?

# sensors_geo = pd.DataFrame.spatial.from_featureclass('PURPLEAIR_STATIONS')

In [11]:
# sensors_geo.head()

In [12]:
# This is my personal API key... Please use responsibly!

api = input('Please enter your Purple Air api key')

Please enter your Purple Air api key 51592903-B445-11ED-B6F4-42010A800007


In [13]:
# Get Sensor Locations

indices_path = os.path.join(data_path, 'PA IDs and indexes.xlsx')

sensor_info = pd.read_excel(indices_path) # Load as DataFrame

sensor_ids = sensor_info['Sensor Index'].dropna().astype(int)

# The function to conduct the query

def getSensorsData(query='', api_read_key=''):

    # my_url is assigned the URL we are going to send our request to.
    url = 'https://api.purpleair.com/v1/sensors?' + query
    
    # print('Here is the full url for the API call:\n\n', url)

    # my_headers is assigned the context of our request we want to make. In this case
    # we will pass through our API read key using the variable created above.
    my_headers = {'X-API-Key':api_read_key}

    # This line creates and sends the request and then assigns its response to the
    # variable, r.
    response = requests.get(url, headers=my_headers)

    # We then return the response we received.
    return response

In [14]:
# Locations

sensor_string = 'show_only=' + '%2C'.join(sensor_ids.astype(str))

query = 'fields=latitude%2Clongitude&' + sensor_string

response = getSensorsData(query, api)

response_dict = response.json() # Read response as a json (dictionary)

col_names = response_dict['fields']
data = np.array(response_dict['data'])

sensors_df = pd.DataFrame(data, columns = col_names)

sensors_df['sensor_index'] = sensors_df.sensor_index.astype(int)

In [15]:
# Change Lat/lons into EPSG26915

# Create a list to store the values
coords_list = []

# Iterate over each row and add the WKT representation of a point geometry
for i, row in sensors_df.iterrows():
    
    point = arcpy.Point(row['longitude'], row['latitude'])
    point_geom = arcpy.PointGeometry(point, arcpy.SpatialReference(4326))
    
    pt_utm = point_geom.projectAs(arcpy.SpatialReference(26915))
    
    sensors_df.loc[i, 'X'] = pt_utm.firstPoint.X
    sensors_df.loc[i, 'Y'] = pt_utm.firstPoint.Y
    

In [43]:
# sensors_df.to_csv('sensor_locs.csv', index = False)
sensors_df

Unnamed: 0,sensor_index,latitude,longitude,X,Y
0,142718,44.995792,-93.295395,476716.651264,4.982525e+06
1,142720,44.956170,-93.254710,479909.656178,4.978113e+06
2,142726,45.015070,-93.289030,477225.987069,4.984665e+06
3,142724,44.937218,-93.243866,480758.650292,4.976005e+06
4,142730,44.992180,-93.296270,476646.215864,4.982124e+06
...,...,...,...,...,...
58,157871,44.934963,-93.270420,478662.669463,4.975761e+06
59,157877,44.900864,-93.208380,483548.171747,4.971959e+06
60,157935,44.929830,-93.324410,474400.339975,4.975207e+06
61,166459,44.895750,-93.268290,478816.334099,4.971405e+06


# Definitions

In [17]:
# Spatial Join

%run merge_dataFrame_w_featureClass.py

help(merge_dataFrame_w_featureClass)

Help on function merge_dataFrame_w_featureClass in module __main__:

merge_dataFrame_w_featureClass(df, fc_name, new_name, left_on, right_on, field_types)
    df should be a pandas dataframe
    fc_name should be a string referring to a feature class in your GDB
    new_name should be a string for the new feature class
    left_on should be the field to merge from on the featureClass
    right_on should be the column to merge from the dataframe
    field_types should be a list of ESRI field types - 
    see https://pro.arcgis.com/en/pro-app/latest/tool-reference/data-management/add-fields.htm



## Get Inverse Weighted Sums to Sources

### Permitted Emissions

In [18]:
# Merge Current PM emissions with facility locations

merge_dataFrame_w_featureClass(current_pm_emissions, 'MPCA_Facilities', 
                               'current_pm_emissions',
                                   'FACILITY_ID',
                                   'FACILITY_ID',
                                  ['LONG', 'LONG', 'LONG', 'TEXT', 'FLOAT'])

emissions_geo = pd.DataFrame.spatial.from_featureclass('current_pm_emissions')

# Traffic

aadt_geo = pd.DataFrame.spatial.from_featureclass('clipped_aadt')

ERROR: Please delete or rename the feature class current_pm_emissions


In [19]:
# Iterate through the PurpleAir Stations to get weighted sums

idw_sum_dict = {} # Storage for results

# Facil
facilities_x = emissions_geo.SHAPE.apply(lambda x: x.x)
facilities_y = emissions_geo.SHAPE.apply(lambda x: x.y)

aadt_x = aadt_geo.SHAPE.apply(lambda x: x.centroid[0])
aadt_y = aadt_geo.SHAPE.apply(lambda x: x.centroid[1])

for i, sensor in sensors_df.iterrows():
    
    # Initialize storage for results for this sensor
    
    idw_sum_dict[sensor.sensor_index] = {}
    
    sensor_x = sensor.X
    sensor_y = sensor.Y
    
    # Get distances to all facilities
    
    dists = np.sqrt((facilities_x-sensor_x)**2 + (facilities_y-sensor_y)**2)
    
    is_within = dists < 2000  # Within 2 km?
    
    # Sum those up
    
    idw_sum = np.sum(emissions_geo.LBS_EMITTED[is_within]/dists[is_within])
    
    idw_sum_dict[sensor.sensor_index]['Facilities'] = idw_sum
    
    # Get distances to all road centroids
    
    dists = np.sqrt((aadt_x-sensor_x)**2 + (aadt_y-sensor_y)**2)
    
    is_within = dists < 2000  # Within 2 km?
    
    # Sum those up
    
    idw_sum = np.sum(aadt_geo.CURRENT_VO[is_within]/dists[is_within])
    
    idw_sum_dict[sensor.sensor_index]['Traffic'] = idw_sum
    

## Fill in Table

In [33]:
# Iterable (date & sensor combinations)

# Select only day/sensors with more than 100 observations

select_summaries = summaries[(summaries.n_observations > 100)
                            & (summaries.date > historic_purpleAir.timestamp.min())]

# The combinations of Date and sensor

date_sensor_combos = set(select_summaries[['date', 'sensor_index']].itertuples(
                                                    index=False, name=None))

In [34]:
# Select Spikes

select_spikes = spikes[spikes.timestamp > historic_purpleAir.timestamp.min()]

In [35]:
# Initialize Dataframe

cols = ['sensor_index', 'date', 'is_weekday', 'n_observations', 
              'pm25_fullDay_mean', 'pm25_fullDay_minutesAbove12ug',
              'n_spikes', 'humidity', 'temperature', 'pressure',
        'idwSum_facilities','idwSum_traffic'
             ]

datatypes = [int, dt.date, int, int,
             float, int,
             int, float, float, float,
            float, float]

dtypes = np.dtype(list(zip(cols, datatypes)))

modeling_df = pd.DataFrame(np.empty(0, dtype = dtypes))


In [36]:
# Iterate

print(len(date_sensor_combos), 'combinations to get information for. Takes about 12 minutes')

starttime = dt.datetime.now()

for i, date_sensor_combo in enumerate(date_sensor_combos):
    
    date = date_sensor_combo[0]
    is_weekday = int(date.dayofweek < 5) # Checks if day of the week is a business day (0-4 = Mon-Fri, 5-6 = Sat-Sun)
    sensor_id = date_sensor_combo[1]
    
    ## Select proper row of dataframes
    
    # Daily Summary
    day_sum = summaries.loc[(summaries.date == date)&
                            (summaries.sensor_index == sensor_id)
                           ]#.iloc[0]
    
    # Historic PurpleAir
    
    day_hist = historic_purpleAir.loc[(historic_purpleAir.timestamp == date)&
                            (historic_purpleAir.sensor_index == sensor_id)
                           ]#.iloc[0]
    
    # Number of Spikes
                               
    day_spikes = select_spikes.loc[(select_spikes.timestamp.dt.date == date.date()) &
                                   (select_spikes.sensor_index == sensor_id)]


    # Check if day has too many observations
    
    if (len(day_sum) > 1) & (len(day_hist) > 1):
        
        print('too many observations for', date_sensor_combo)
    
    ## Check if day is observed
    
    elif (len(day_sum) == 1) & (len(day_hist) == 1):
        
     # If yes, we record into modeling_df

        row = [sensor_id, date.date(), is_weekday, day_sum.n_observations.iloc[0],
                  day_sum.pm25_fullDay_mean.iloc[0], day_sum.pm25_fullDay_minutesAbove12ug.iloc[0],
               len(day_spikes), day_hist.humidity.iloc[0], day_hist.temperature.iloc[0], day_hist.pressure.iloc[0],
               idw_sum_dict[sensor_id]['Facilities'], idw_sum_dict[sensor_id]['Traffic']
              ]

        modeling_df.loc[len(modeling_df.index)] = row

print((dt.datetime.now() - starttime)/60, 'Minutes')

9455 combinations to get information for. Takes about 12 minutes
0:00:11.366596 Minutes


In [37]:
modeling_df.groupby('date').count().sensor_index.max()

53

In [38]:
len(modeling_df)

8064

In [39]:
i

9454

In [40]:
modeling_df.head()

Unnamed: 0,sensor_index,date,is_weekday,n_observations,pm25_fullDay_mean,pm25_fullDay_minutesAbove12ug,n_spikes,humidity,temperature,pressure,idwSum_facilities,idwSum_traffic
0,142752,2022-10-30,0,144,19.220806,1420,13,41.164,61.742,986.696,10.575489,590.2918
1,145242,2023-02-04,0,144,9.97334,440,3,50.364,18.58,987.049,48.217345,539.359407
2,142720,2022-11-27,0,144,6.058875,0,0,49.658,43.992,979.961,41.154877,2133.686116
3,143226,2022-12-05,1,144,7.325174,30,0,42.115,39.646,981.014,0.183773,342.481335
4,157837,2022-11-22,1,144,33.999066,1440,114,48.776,34.329,991.308,1.23745,242.134777


In [42]:
# Save as csv

modeling_df.to_csv(os.path.join(data_path, 'Parameter_df.csv'), index = False)