# PurpleAir Stations QAQC

## Import Packages

In [1]:
### Import Packages

# File manipulation

import datetime as dt # Working with dates/times
import getpass # Prompting for secrets without echoing them
import os # For working with Operating System

import requests # Accessing the Web

# Database 

import psycopg2
from psycopg2 import sql

# Analysis

import geopandas as gpd
import numpy as np
import pandas as pd

# Get CWD
# Captured once at start-up; `datapath` further down is built relative to this
# directory, so results depend on where the kernel was started.

cwd = os.getcwd()

## Definitions

In [2]:
# PurpleAir API read key.
# The key is never written in the notebook: it is read from the
# PURPLEAIR_API_KEY environment variable when that is set (so "Restart & Run All"
# works unattended), otherwise it is prompted for with getpass, which does not
# echo the typed value into the saved cell output.
# NOTE(review): a key was previously committed in this cell and in its output;
# treat that key as compromised and rotate it.

api = (os.environ.get('PURPLEAIR_API_KEY')
       or getpass.getpass('Please enter your Purple Air api key')).strip()

Please enter your Purple Air api key [REDACTED]


In [3]:
# Bounding box of the study area, used as the search window for the PurpleAir API.

datapath = os.path.join(cwd, '..', '..', 'Data')
extent_path = os.path.join(datapath, 'extent.geojson')

extent = gpd.read_file(extent_path)

# Reproject to EPSG:4326 (lon/lat) first. total_bounds is (minx, miny, maxx, maxy),
# i.e. (west lng, south lat, east lng, north lat), which are the NW/SE corner values.
extent_lonlat = extent.to_crs('EPSG:4326')
nwlng, selat, selng, nwlat = extent_lonlat.total_bounds

## Importing PurpleAir Station Data from PurpleAir API

In [4]:
def getSensorsData(query='', api_read_key='', timeout=30):
    """Send a GET request to the PurpleAir ``/v1/sensors`` endpoint.

    Parameters
    ----------
    query : str
        Query string appended verbatim after ``?``; it must already be
        URL-encoded (e.g. ``fields=name%2Clatitude&nwlng=...``).
    api_read_key : str
        PurpleAir API read key, sent in the ``X-API-Key`` request header.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Defaults to 30 seconds so a
        stalled connection cannot block the notebook forever.

    Returns
    -------
    requests.Response
        The raw response. The status is not checked here; the caller decides
        how to handle errors and decodes the JSON body.
    """
    url = 'https://api.purpleair.com/v1/sensors?' + query

    # The key travels in a header, so printing the URL does not expose it.
    print('Here is the full url for the API call:\n\n', url)

    request_headers = {'X-API-Key': api_read_key}

    return requests.get(url, headers=request_headers, timeout=timeout)

In [5]:
# Bounding-box part of the query: one "name=value" pair per corner coordinate,
# in the order nwlng, nwlat, selng, selat.
corner_values = {'nwlng': nwlng, 'nwlat': nwlat, 'selng': selng, 'selat': selat}

bounds_strings = [f'{name}={value}' for name, value in corner_values.items()]
bounds_string = '&'.join(bounds_strings)

print(bounds_string)

nwlng=-93.43046973986235&nwlat=45.12326140727048&selng=-93.09304872066019&selat=44.81857616148092


In [6]:
# Sensor attributes requested from the API.
# Order matters: a later cell derives the database column order from this list.
fields = [
    'firmware_version',
    'date_created',
    'last_modified',
    'last_seen',
    'name',
    'uptime',
    'position_rating',
    'channel_state',
    'channel_flags',
    'altitude',
    'location_type',
    'latitude',
    'longitude',
]

# The list is comma-separated with the comma percent-encoded (%2C), because the
# string is pasted into the request URL as-is.
fields_string = f"fields={'%2C'.join(fields)}"

print(fields_string)

fields=firmware_version%2Cdate_created%2Clast_modified%2Clast_seen%2Cname%2Cuptime%2Cposition_rating%2Cchannel_state%2Cchannel_flags%2Caltitude%2Clocation_type%2Clatitude%2Clongitude


In [7]:
# Complete query string: requested fields first, then the bounding box.
query_string = f'{fields_string}&{bounds_string}'

print(query_string)

fields=firmware_version%2Cdate_created%2Clast_modified%2Clast_seen%2Cname%2Cuptime%2Cposition_rating%2Cchannel_state%2Cchannel_flags%2Caltitude%2Clocation_type%2Clatitude%2Clongitude&nwlng=-93.43046973986235&nwlat=45.12326140727048&selng=-93.09304872066019&selat=44.81857616148092


In [8]:
# Ask PurpleAir for every sensor inside the bounding box.
response = getSensorsData(query=query_string, api_read_key=api)

Here is the full url for the API call:

 https://api.purpleair.com/v1/sensors?fields=firmware_version%2Cdate_created%2Clast_modified%2Clast_seen%2Cname%2Cuptime%2Cposition_rating%2Cchannel_state%2Cchannel_flags%2Caltitude%2Clocation_type%2Clatitude%2Clongitude&nwlng=-93.43046973986235&nwlat=45.12326140727048&selng=-93.09304872066019&selat=44.81857616148092


In [9]:
# Turn the bounding-box response into a DataFrame.

# Fail here with the HTTP status (e.g. for an invalid key) rather than with a
# KeyError on 'fields' a few lines down.
response.raise_for_status()

response_dict = response.json() # Read response as a json (dictionary)

col_names = response_dict['fields']
data = response_dict['data']

# Build the frame straight from the row lists with dtype=object so every value
# keeps its JSON type. Going through np.array first lets NumPy choose a single
# dtype for the whole table, which turns every number into a string whenever no
# row happens to contain a null.
df = pd.DataFrame(data, columns=col_names, dtype=object)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sensor_index      94 non-null     object
 1   last_modified     94 non-null     object
 2   date_created      94 non-null     object
 3   last_seen         94 non-null     object
 4   name              94 non-null     object
 5   location_type     94 non-null     object
 6   firmware_version  94 non-null     object
 7   uptime            94 non-null     object
 8   position_rating   94 non-null     object
 9   latitude          94 non-null     object
 10  longitude         94 non-null     object
 11  altitude          94 non-null     object
 12  channel_state     94 non-null     object
 13  channel_flags     94 non-null     object
dtypes: object(14)
memory usage: 10.4+ KB


### Compare with the list of IDs from the City (or from a previous run)

In [10]:
# Seed list of sensor ids: the stations saved by the previous run of this notebook.
# (An earlier version seeded from the City's spreadsheet 'PA IDs and indexes.xlsx',
# column 'Sensor Index'.)
# NOTE: this is the same PurpleAir_Stations.geojson that the "Save as geojson"
# cell overwrites, so the file has to exist before the first run.

previous_stations_path = os.path.join(datapath, 'PurpleAir_Stations.geojson')
sensor_info = gpd.read_file(previous_stations_path)

ids_from_previous_run = sensor_info['sensor_index'].unique().astype(int)

In [11]:
# Number of distinct sensor ids carried over from the previous run
len(ids_from_previous_run)

64

In [12]:
# From what we just queried from PurpleAir, keep only the City's monitors.
# They are recognised by name: a case-insensitive, literal (non-regex) match.
# A sensor without a name counts as "not the City's" (na=False) instead of
# raising, which the previous per-row lambda did.

is_city = df['name'].str.contains('CITY OF MINNEAPOLIS', case=False, regex=False, na=False)

ids_from_PurpleAir = df.loc[is_city, 'sensor_index'].astype(int)

In [13]:
# Number of City sensors found inside the bounding box by the query above
len(ids_from_PurpleAir)

46

In [14]:
# Union of the two id lists: flatten both into one array, then keep the sorted
# unique values.

all_ids = np.concatenate((ids_from_previous_run, ids_from_PurpleAir), axis=None)
ids_from_either = np.unique(all_ids)

In [15]:
# Size of the combined id list that the final query will request
len(ids_from_either)

64

### Final PurpleAir Query

In [16]:
# One final query, this time for an explicit list of sensor ids (show_only)
# instead of a bounding box. Ids are joined with a percent-encoded comma (%2C).

id_strings = ids_from_either.astype(str)
sensor_string = f"show_only={'%2C'.join(id_strings)}"

query_string = f'{fields_string}&{sensor_string}'

In [17]:
# Run the final query and load the result into a DataFrame.
response = getSensorsData(query_string, api)

# Fail here with the HTTP status rather than with a KeyError on 'fields' below.
response.raise_for_status()

response_dict = response.json() # Read response as a json (dictionary)

col_names = response_dict['fields']
data = response_dict['data']

# dtype=object keeps every value in its JSON type. Routing the rows through
# np.array would let NumPy coerce the whole table to strings whenever no row
# contains a null, and then numeric filters such as `location_type == 0` in the
# cleaning cells would match nothing.
sensors_df = pd.DataFrame(data, columns=col_names, dtype=object)

Here is the full url for the API call:

 https://api.purpleair.com/v1/sensors?fields=firmware_version%2Cdate_created%2Clast_modified%2Clast_seen%2Cname%2Cuptime%2Cposition_rating%2Cchannel_state%2Cchannel_flags%2Caltitude%2Clocation_type%2Clatitude%2Clongitude&show_only=142718%2C142720%2C142724%2C142726%2C142728%2C142730%2C142732%2C142734%2C142736%2C142744%2C142748%2C142750%2C142752%2C142756%2C142772%2C142774%2C142926%2C143214%2C143216%2C143222%2C143224%2C143226%2C143238%2C143240%2C143242%2C143246%2C143248%2C143636%2C143648%2C143656%2C143660%2C143666%2C143668%2C143916%2C143942%2C143944%2C145202%2C145204%2C145242%2C145250%2C145454%2C145470%2C145498%2C145502%2C145504%2C145506%2C145604%2C145610%2C145614%2C145616%2C156605%2C157747%2C157757%2C157785%2C157787%2C157837%2C157845%2C157861%2C157871%2C157877%2C157935%2C166459%2C168327%2C177765


## Cleaning PurpleAir Station Data

In [18]:
# Preview the first rows of the API response before cleaning
sensors_df.head()

Unnamed: 0,sensor_index,last_modified,date_created,last_seen,name,location_type,firmware_version,uptime,position_rating,latitude,longitude,altitude,channel_state,channel_flags
0,142718,1691002217,1642013869,1692665013,City of Minneapolis Community Air Monitoring P...,0,7.02,9565,5,44.99631,-93.29565,857,3,0
1,142720,1690999596,1642013875,1692664994,City of Minneapolis Community Air Monitoring P...,0,7.02,61910,5,44.955555,-93.254974,850,3,0
2,142726,1675359066,1642013897,1692664950,City of Minneapolis Community Air Monitoring P...,0,7.02,468,5,45.01507,-93.28903,889,3,0
3,142724,1690992725,1642013889,1692665001,City of Minneapolis Community Air Monitoring P...,0,7.02,3588,5,44.937733,-93.24356,859,3,0
4,142730,1690990874,1642013916,1687892464,City of Minneapolis Community Air Monitoring P...,0,7.02,15319,5,44.991985,-93.29565,858,3,0


In [19]:
# Keep only the outdoor sensors (location_type equal to 0) and show how many remain
is_outside = sensors_df['location_type'] == 0
df_outside = sensors_df.loc[is_outside]
len(df_outside)

64

In [20]:
# location_type is the same for every remaining row after the outdoor filter,
# so the column is removed
df_stations = df_outside.drop(columns=['location_type'])

In [21]:
# The three timestamp columns arrive as Unix epoch seconds; convert each one to
# a pandas datetime
for time_col in ('last_modified', 'date_created', 'last_seen'):
    df_stations[time_col] = pd.to_datetime(df_stations[time_col], unit='s')

In [22]:
# Check that the three timestamp columns now display as dates
df_stations.head(3)

Unnamed: 0,sensor_index,last_modified,date_created,last_seen,name,firmware_version,uptime,position_rating,latitude,longitude,altitude,channel_state,channel_flags
0,142718,2023-08-02 18:50:17,2022-01-12 18:57:49,2023-08-22 00:43:33,City of Minneapolis Community Air Monitoring P...,7.02,9565,5,44.99631,-93.29565,857,3,0
1,142720,2023-08-02 18:06:36,2022-01-12 18:57:55,2023-08-22 00:43:14,City of Minneapolis Community Air Monitoring P...,7.02,61910,5,44.955555,-93.254974,850,3,0
2,142726,2023-02-02 17:31:06,2022-01-12 18:58:17,2023-08-22 00:42:30,City of Minneapolis Community Air Monitoring P...,7.02,468,5,45.01507,-93.28903,889,3,0


In [23]:
# Turn the station table into a GeoDataFrame and save it as GeoJSON.

# Points are built from longitude/latitude (EPSG:4326) and then projected to
# EPSG:26915. The database insert further down declares the WKT as SRID 26915,
# so the two must stay in sync.
station_points = gpd.points_from_xy(df_stations.longitude,
                                    df_stations.latitude,
                                    crs = 'EPSG:4326')

gdf_stations = gpd.GeoDataFrame(df_stations, geometry = station_points).to_crs('EPSG:26915')

# Columns kept for the file and the database: the sensor id, the requested
# fields minus location_type (dropped earlier) and latitude/longitude (now held
# in the geometry), then the geometry itself.
# They are excluded by name rather than with fields[:-3], so reordering or
# extending `fields` cannot silently drop the wrong columns.
not_for_db = {'location_type', 'latitude', 'longitude'}

cols_for_db = ['sensor_index'] + [f for f in fields if f not in not_for_db] + ['geometry']

sorted_gdf = gdf_stations[cols_for_db]

# Name the driver explicitly instead of relying on it being inferred from the
# file extension.
sorted_gdf.to_file(os.path.join(datapath, 'PurpleAir_Stations.geojson'), driver = 'GeoJSON')

## Insert Data into SQL Table

In [24]:
# Show the column list, in order, that the INSERT statement below is built from
cols_for_db

['sensor_index',
 'firmware_version',
 'date_created',
 'last_modified',
 'last_seen',
 'name',
 'uptime',
 'position_rating',
 'channel_state',
 'channel_flags',
 'altitude',
 'geometry']

In [25]:
# Prepare a plain DataFrame whose values can be handed to the database driver.
# Column order is preserved from sorted_gdf; 'wkt' is appended last and takes the
# place of the dropped geometry column.

timestamp_format = '%Y-%m-%d %H:%M:%S'

sorted_df = sorted_gdf.drop(columns='geometry').copy()

# Geometry as WKT text
sorted_df['wkt'] = sorted_gdf.geometry.apply(lambda geom: geom.wkt)

# Timestamps as 'YYYY-MM-DD HH:MM:SS' strings
for time_col in ('date_created', 'last_modified', 'last_seen'):
    sorted_df[time_col] = sorted_gdf[time_col].apply(lambda ts: ts.strftime(timestamp_format))

In [28]:
# Get credentials
# The first line of db_credentials.txt holds the connection values separated by
# ', ' in this order: dbname, user, password, port, host.

cred_pth = os.path.join(os.getcwd(), '..', '..', 'Scripts', 'database', 'db_credentials.txt')
cred_keys = ['dbname', 'user', 'password', 'port', 'host']

with open(cred_pth, 'r') as f:

    # strip() also removes a trailing '\r' left by Windows line endings, which
    # rstrip('\n') would have kept as part of the host name.
    creds = f.readline().strip().split(', ')

# zip() would silently drop missing entries and leave connect() to fail with an
# unrelated message, so check the count first. Only counts are reported, never
# the credential values.
if len(creds) != len(cred_keys):
    raise ValueError(
        f'Expected {len(cred_keys)} values separated by ", " on the first line of '
        f'{cred_pth}, found {len(creds)}')

# Connect to PostGIS Database

pg_connection_dict = dict(zip(cred_keys, creds))

conn = psycopg2.connect(**pg_connection_dict)

In [31]:
# Insert every station row into the "PurpleAir Stations" table.

# The statement is identical for every row, so it is composed once, outside the
# loop. Column names come from cols_for_db. The last column (geometry) is filled
# from the row's WKT text, which is declared as SRID 26915 and transformed to
# SRID 4326 by PostGIS.
q1 = sql.SQL('INSERT INTO "PurpleAir Stations" ({}) VALUES ({},{});').format(
    sql.SQL(', ').join(map(sql.Identifier, cols_for_db)),
    sql.SQL(', ').join(sql.Placeholder() * (len(cols_for_db)-1)),
    sql.SQL('ST_Transform(ST_SetSRID(ST_GeomFromText(%s), 26915),4326)::geometry'))

# Values are matched to placeholders by position, so sorted_df must hold the
# columns of cols_for_db in the same order, with 'wkt' in place of 'geometry'.
# NOTE(review): re-running this cell inserts the same stations again unless the
# table has a uniqueness constraint - confirm against the schema.
try:
    # The cursor is closed automatically when the with-block exits.
    with conn.cursor() as cur:

        for index, row in sorted_df.iterrows():
            cur.execute(q1, list(row.values))

    # One commit for the whole batch: either every station is stored or none is.
    conn.commit()

finally:
    # Always release the connection. Closing without a commit discards the
    # pending inserts, so a failure part-way through leaves the table unchanged.
    conn.close()