# PurpleAir Stations QAQC

## Import Packages

In [19]:
### Import Packages

# File manipulation

import getpass # Prompting for secrets without echoing them
import os # For working with Operating System
import requests # Accessing the Web
import datetime as dt # Working with dates/times

# Database 

import psycopg2
from psycopg2 import sql

# Analysis

import numpy as np
import arcpy
import pandas as pd

## Set Working Environment

In [2]:
# Directory the notebook is running from.
# This is a global variable (must change if running in arcpro).
cwd = os.getcwd()

# The communal file geodatabase sits two levels up from the notebook, under data/.
data_dir = os.path.join(cwd, '..', '..', 'data')
gdb_path = os.path.join(data_dir, 'QAQC.gdb')

# Create the geodatabase only on first use.
if not os.path.exists(gdb_path):
    arcpy.management.CreateFileGDB(data_dir, 'QAQC')

# Point arcpy at the geodatabase and allow existing layers to be overwritten.
arcpy.env.workspace = gdb_path
arcpy.env.overwriteOutput = True

## Set Bounds for PurpleAir Parameters

In [3]:
# Bounding box for the PurpleAir API, read from the extent of the
# "mpls_8km_wgs" layer in the current workspace.
# NOTE(review): presumably the Minneapolis boundary plus an 8 km buffer in WGS84 - confirm.
extent = arcpy.Describe("mpls_8km_wgs").extent

nwlng, nwlat = extent.XMin, extent.YMax  # north-west corner
selng, selat = extent.XMax, extent.YMin  # south-east corner

## Importing PurpleAir Station Data from PurpleAir API

In [4]:
def getSensorsData(query='', api_read_key='', timeout=30):
    """Send a GET request to the PurpleAir ``/v1/sensors`` endpoint.

    Parameters
    ----------
    query : str
        Query string appended verbatim after ``?`` in the URL. It is not
        encoded here, so the caller must pass it already URL-encoded.
    api_read_key : str
        PurpleAir API read key, sent in the ``X-API-Key`` request header.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Without one, a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    requests.Response
        The response exactly as received. The HTTP status is not checked
        here; the caller decides how to handle errors.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed (including a timeout).
    """
    # Full URL the request is sent to.
    url = 'https://api.purpleair.com/v1/sensors?' + query

    print('Here is the full url for the API call:\n\n', url)

    # The read key travels in a header, so it never appears in the printed URL.
    headers = {'X-API-Key': api_read_key}

    return requests.get(url, headers=headers, timeout=timeout)

In [5]:
# PurpleAir API 'read' key.
# getpass hides the typed value so the key is not echoed into the notebook's saved output.
api = getpass.getpass('Please enter your Purple Air api key')

Please enter your Purple Air api key [REDACTED]


In [6]:
# Turn the four bounding-box corners into "key=value" pairs for the API query
bounds_strings = [
    f'{param}={coord}'
    for param, coord in (
        ('nwlng', nwlng),
        ('nwlat', nwlat),
        ('selng', selng),
        ('selat', selat),
    )
]

# Join the pairs into one query-string fragment
bounds_string = '&'.join(bounds_strings)

print(bounds_string)

nwlng=-93.43047670599998&nwlat=45.12326797900003&selng=-93.09299994199998&selat=44.81858013100003


In [7]:
# Sensor attributes requested from the API
fields = [
    'name',
    'firmware_version',
    'date_created',
    'last_modified',
    'last_seen',
    'uptime',
    'position_rating',
    'channel_state',
    'channel_flags',
    'altitude',
    'location_type',
    'latitude',
    'longitude',
]

# '%2C' is a URL-encoded comma, so the list is sent as one comma-separated value
encoded_fields = '%2C'.join(fields)
fields_string = f'fields={encoded_fields}'

print(fields_string)

fields=name%2Cfirmware_version%2Cdate_created%2Clast_modified%2Clast_seen%2Cuptime%2Cposition_rating%2Cchannel_state%2Cchannel_flags%2Caltitude%2Clocation_type%2Clatitude%2Clongitude


In [8]:
# Full query string for the API function: requested fields first, then the bounding box
query_string = f'{fields_string}&{bounds_string}'

print(query_string)

fields=name%2Cfirmware_version%2Cdate_created%2Clast_modified%2Clast_seen%2Cuptime%2Cposition_rating%2Cchannel_state%2Cchannel_flags%2Caltitude%2Clocation_type%2Clatitude%2Clongitude&nwlng=-93.43047670599998&nwlat=45.12326797900003&selng=-93.09299994199998&selat=44.81858013100003


In [9]:
# Send the request to PurpleAir using the query string and read key built above
response = getSensorsData(query=query_string, api_read_key=api)

Here is the full url for the API call:

 https://api.purpleair.com/v1/sensors?fields=name%2Cfirmware_version%2Cdate_created%2Clast_modified%2Clast_seen%2Cuptime%2Cposition_rating%2Cchannel_state%2Cchannel_flags%2Caltitude%2Clocation_type%2Clatitude%2Clongitude&nwlng=-93.43047670599998&nwlat=45.12326797900003&selng=-93.09299994199998&selat=44.81858013100003


In [12]:
# Fail loudly on an HTTP error (for example a rejected API key) instead of
# hitting a confusing KeyError on 'fields' further down.
response.raise_for_status()

response_dict = response.json() # Read response as a json (dictionary)

# 'fields' holds the column names and 'data' holds one list per sensor.
col_names = response_dict['fields']

# NOTE(review): going through np.array appears to leave every column as
# object/strings (see the df.info() output and the later comparison against '0').
# Downstream cells rely on that, so the conversion is kept as is.
data = np.array(response_dict['data'])

df = pd.DataFrame(data, columns = col_names)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sensor_index      87 non-null     object
 1   last_modified     87 non-null     object
 2   date_created      87 non-null     object
 3   last_seen         87 non-null     object
 4   name              87 non-null     object
 5   location_type     87 non-null     object
 6   firmware_version  87 non-null     object
 7   uptime            87 non-null     object
 8   position_rating   87 non-null     object
 9   latitude          87 non-null     object
 10  longitude         87 non-null     object
 11  altitude          87 non-null     object
 12  channel_state     87 non-null     object
 13  channel_flags     87 non-null     object
dtypes: object(14)
memory usage: 9.6+ KB


## Cleaning PurpleAir Station Data

In [13]:
# Preview the first five rows of the raw API response
df.head(n=5)

Unnamed: 0,sensor_index,last_modified,date_created,last_seen,name,location_type,firmware_version,uptime,position_rating,latitude,longitude,altitude,channel_state,channel_flags
0,3088,1504993349,1504040633,1681154171,Howe Neighborhood,0,6.06b,9461,5,44.935818,-93.21752,833,3,0
1,5582,1660166545,1514335701,1681154177,Vircroft Ashnia,0,7.02,4308,5,44.891655,-93.34291,899,3,0
2,137876,1637086469,1637082783,1681154114,King Field Indoors,1,7.02,44756,0,44.928917,-93.284706,886,3,0
3,11134,1529977499,1527023589,1681154110,Linden Hills,0,7.02,25688,5,44.92776,-93.32235,886,3,0
4,142718,1675359061,1642013869,1681154174,City of Minneapolis Community Air Monitoring P...,0,7.02,30211,5,44.995792,-93.295395,865,3,0


In [14]:
# Keep only the outside sensors: rows whose location_type equals the string '0'
is_outside = df['location_type'].eq('0')
df_outside = df[is_outside]

# Number of outside sensors
df_outside.shape[0]

79

In [16]:
# location_type is constant after the outdoor filter, so remove the column
df_stations = df_outside.drop(columns=['location_type'])

In [17]:
# Convert the three UNIX timestamp columns (seconds) to pandas datetimes
for ts_col in ['last_modified', 'date_created', 'last_seen']:
    df_stations[ts_col] = pd.to_datetime(df_stations[ts_col], unit='s')

In [18]:
# Preview the outdoor stations after the timestamp conversion
df_stations.head(n=5)

Unnamed: 0,sensor_index,last_modified,date_created,last_seen,name,firmware_version,uptime,position_rating,latitude,longitude,altitude,channel_state,channel_flags
0,3088,2017-09-09 21:42:29,2017-08-29 21:03:53,2023-04-10 19:16:11,Howe Neighborhood,6.06b,9461,5,44.935818,-93.21752,833,3,0
1,5582,2022-08-10 21:22:25,2017-12-27 00:48:21,2023-04-10 19:16:17,Vircroft Ashnia,7.02,4308,5,44.891655,-93.34291,899,3,0
3,11134,2018-06-26 01:44:59,2018-05-22 21:13:09,2023-04-10 19:15:10,Linden Hills,7.02,25688,5,44.92776,-93.32235,886,3,0
4,142718,2023-02-02 17:31:01,2022-01-12 18:57:49,2023-04-10 19:16:14,City of Minneapolis Community Air Monitoring P...,7.02,30211,5,44.995792,-93.295395,865,3,0
5,142720,2023-02-02 17:31:45,2022-01-12 18:57:55,2023-04-10 19:14:49,City of Minneapolis community air monitoring p...,7.02,13229,5,44.95617,-93.25471,856,3,0


## Let the QAQC Begin!

In [175]:
# TODO: visually inspect that all the points fall within the boundary + 8 km buffer (no check is implemented in this cell yet)

### Altitude Check

In [20]:
# Parse altitude as a number; values that cannot be parsed become NaN
altitude_raw = df_stations['altitude']
df_stations['altitude'] = pd.to_numeric(altitude_raw, errors='coerce')
def check_range(value, low=680, high=1120):
    """Flag an altitude that is missing or outside the expected range.

    Parameters
    ----------
    value : number or missing
        Altitude to check. ``None``, NaN and other pandas missing markers
        are all treated as missing.
    low, high : number, optional
        Inclusive bounds of the accepted range, in feet. The defaults are the
        680-1120 ft range used for this study area.

    Returns
    -------
    str or None
        ``'no value given'`` when the value is missing, an
        ``'out of range (...)'`` message when it falls outside the bounds,
        and ``None`` when it is within range.
    """
    # pd.to_numeric(..., errors='coerce') yields NaN rather than None for
    # missing values, so an identity check against None would never match.
    if pd.isna(value):
        return 'no value given'

    if low <= value <= high:
        return None

    return f'out of range ({low}-{high}ft)'
    
# Accepted range 680 - 1120 Ref https://en-us.topographic-map.com/map-bbpz4/Minneapolis/
# Run the range check on every altitude and store the resulting flag alongside it
altitude_flags = df_stations['altitude'].apply(check_range)
df_stations['altitude_error'] = altitude_flags

In [23]:
# Preview the stations with the new altitude_error column
df_stations.head(n=5)

Unnamed: 0,sensor_index,last_modified,date_created,last_seen,name,firmware_version,uptime,position_rating,latitude,longitude,altitude,channel_state,channel_flags,altitude_error
0,3088,2017-09-09 21:42:29,2017-08-29 21:03:53,2023-04-10 19:16:11,Howe Neighborhood,6.06b,9461,5,44.935818,-93.21752,833,3,0,
1,5582,2022-08-10 21:22:25,2017-12-27 00:48:21,2023-04-10 19:16:17,Vircroft Ashnia,7.02,4308,5,44.891655,-93.34291,899,3,0,
3,11134,2018-06-26 01:44:59,2018-05-22 21:13:09,2023-04-10 19:15:10,Linden Hills,7.02,25688,5,44.92776,-93.32235,886,3,0,
4,142718,2023-02-02 17:31:01,2022-01-12 18:57:49,2023-04-10 19:16:14,City of Minneapolis Community Air Monitoring P...,7.02,30211,5,44.995792,-93.295395,865,3,0,
5,142720,2023-02-02 17:31:45,2022-01-12 18:57:55,2023-04-10 19:14:49,City of Minneapolis community air monitoring p...,7.02,13229,5,44.95617,-93.25471,856,3,0,


## Upload to Local & Remote Databases

### Converting lat/long to WKT for SQL

In [25]:
# Build one WKT string per station: wrap its longitude/latitude in an arcpy
# point geometry (constructed with 4326) and read that geometry's WKT text.
wkt_list = [
    arcpy.PointGeometry(arcpy.Point(row['longitude'], row['latitude']), 4326).WKT
    for _, row in df_stations.iterrows()
]

# Store the WKT strings as a new column on the DataFrame
df_stations['WKT'] = wkt_list

In [26]:
### Local GDB

# Target feature class: point geometry stored in spatial reference WKID 26915
table_name = 'PURPLEAIR_STATIONS'
geom_type = 'POINT'
out_coordinate_system = arcpy.SpatialReference(26915)

arcpy.management.CreateFeatureclass(
    arcpy.env.workspace,
    table_name,
    geom_type,
    spatial_reference=out_coordinate_system,
)

# Attribute schema as (field name, field type) pairs
field_desc = [
    ('sensor_index', 'LONG'),
    ('last_modified', 'DATE'),
    ('date_created', 'DATE'),
    ('last_seen', 'DATE'),
    ('name', 'TEXT'),
    ('firmware_version', 'TEXT'),
    ('uptime', 'LONG'),
    ('position_rating', 'SHORT'),
    ('altitude', 'SHORT'),
    ('channel_state', 'SHORT'),
    ('channel_flags', 'SHORT'),
]
cols_for_gdb = [field_name for field_name, _ in field_desc]
dtypes_for_gdb = [field_type for _, field_type in field_desc]

arcpy.management.AddFields(table_name, field_desc)

# The WKT strings are parsed with WKID 4326 and then projected to the output system
wgs84 = arcpy.SpatialReference(4326)

# Write one feature per station: the attribute values followed by the projected point
with arcpy.da.InsertCursor(table_name, cols_for_gdb + ['SHAPE@']) as cursor:
    for _, df_row in df_stations.iterrows():
        pt = arcpy.FromWKT(df_row['WKT'], wgs84).projectAs(out_coordinate_system)
        cursor.insertRow([*df_row[cols_for_gdb], pt])

## Insert Data into SQL Table

In [64]:
# Get credentials
# Built from the notebook-level `cwd` so this path follows the same base
# directory as the geodatabase (including when `cwd` is overridden for ArcGIS Pro).
cred_pth = os.path.join(cwd, '..', '..', 'database', 'db_credentials.txt')

# Connection parameters, in the order they appear in the credentials file
cred_keys = ['dbname', 'user', 'password', 'port', 'host']

with open(cred_pth, 'r') as f:

    # Only the first line is read; its values are separated by ', '
    creds = f.readlines()[0].rstrip('\n').split(', ')

# zip() would silently drop parameters if the line had too few values, so check
# the count up front. The values themselves are kept out of the error message.
if len(creds) != len(cred_keys):
    raise ValueError(
        f'Expected {len(cred_keys)} comma-separated values in {cred_pth}, found {len(creds)}'
    )

# Connect to PostGIS Database
pg_connection_dict = dict(zip(cred_keys, creds))

connection = psycopg2.connect(**pg_connection_dict)

0

In [66]:
# DataFrame columns written to PostGIS, in the same order as the value
# expressions in the statement below. The last one (WKT) is turned into a
# geometry by the SQL expression rather than stored as text.
insert_cols = ['sensor_index', 'last_modified', 'date_created', 'last_seen', 'name',
               'firmware_version', 'uptime', 'position_rating', 'latitude', 'longitude',
               'altitude', 'channel_state', 'channel_flags', 'WKT']

# 14 target columns need 14 value expressions: 13 plain placeholders plus the
# geometry built from the WKT text. latitude and longitude must be supplied too,
# otherwise PostgreSQL rejects the INSERT for having more target columns than
# expressions.
# NOTE(review): assumes the table has latitude/longitude columns, as the column list implies - confirm.
insert_sql = '''
    INSERT INTO PURPLEAIR_STATIONS (sensor_index, last_modified, date_created, last_seen, name, firmware_version, uptime, 
                        position_rating, latitude, longitude, altitude, channel_state, channel_flags, WKT)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,
            ST_SetSRID(ST_GeomFromText(%s), 4326)::geometry)
    '''

try:
    # `with connection` commits once if the whole block succeeds and rolls back
    # if anything raises, so a failed run does not leave a partial load behind.
    # `with connection.cursor()` closes the database cursor on exit.
    with connection:
        with connection.cursor() as cur:
            # Insert each station row using a parameterised SQL INSERT statement
            for _, row in df_stations.iterrows():
                cur.execute(insert_sql, tuple(row[col] for col in insert_cols))
finally:
    # Always release the database connection, even when an insert fails
    connection.close()