# PurpleAir Real-Time QAQC

## Import Packages

In [34]:
import os
from getpass import getpass

import arcpy
import numpy as np
import pandas as pd
import requests

## Set Working Environment

In [35]:
# Folder the notebook is launched from (global; must be changed by hand when running in ArcGIS Pro).
cwd = os.getcwd()

# Point arcpy at the project geodatabase, two folders above the notebook.
arcpy.env.workspace = os.path.join(cwd, os.pardir, os.pardir, 'data', 'QAQC.gdb')

# Allow geoprocessing tools to overwrite existing layers.
arcpy.env.overwriteOutput = True

## Set Bounds for PurpleAir Parameters

In [36]:
# Reproject the study area from NAD 1983 UTM Zone 15N to geographic WGS 1984.
wgs84_wkt = (
    'GEOGCS["GCS_WGS_1984",'
    'DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],'
    'PRIMEM["Greenwich",0.0],'
    'UNIT["Degree",0.0174532925199433]]'
)
utm15n_wkt = (
    'PROJCS["NAD_1983_UTM_Zone_15N",'
    'GEOGCS["GCS_North_American_1983",'
    'DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137.0,298.257222101]],'
    'PRIMEM["Greenwich",0.0],'
    'UNIT["Degree",0.0174532925199433]],'
    'PROJECTION["Transverse_Mercator"],'
    'PARAMETER["False_Easting",500000.0],'
    'PARAMETER["False_Northing",0.0],'
    'PARAMETER["Central_Meridian",-93.0],'
    'PARAMETER["Scale_Factor",0.9996],'
    'PARAMETER["Latitude_Of_Origin",0.0],'
    'UNIT["Meter",1.0]]'
)
arcpy.management.Project(
    "mpls_8km",
    "mpls_8km_Project",
    wgs84_wkt,
    "WGS_1984_(ITRF00)_To_NAD_1983",
    utm15n_wkt,
    "NO_PRESERVE_SHAPE",
    None,
    "NO_VERTICAL",
)

In [37]:
# Build a bounding rectangle around the reprojected study area.
projected_layer = "mpls_8km_Project"
bounding_layer = "mpls_8km_Proje_MinimumBoundi"
arcpy.management.MinimumBoundingGeometry(projected_layer, bounding_layer, "RECTANGLE_BY_AREA")

In [38]:
# Lat/long corners for the PurpleAir API bounding-box parameters.
# Describe the bounding layer once and reuse its extent for all four values.
bounds_extent = arcpy.Describe("mpls_8km_Proje_MinimumBoundi").extent
nwlng = bounds_extent.XMin  # north-west corner: smallest X (longitude)
nwlat = bounds_extent.YMax  # north-west corner: largest Y (latitude)
selng = bounds_extent.XMax  # south-east corner: largest X (longitude)
selat = bounds_extent.YMin  # south-east corner: smallest Y (latitude)

## Importing PurpleAir Real-Time Data from PurpleAir API

In [39]:
def getSensorsData(query='', api_read_key='', timeout=30):
    """Send a GET request to the PurpleAir ``/v1/sensors`` endpoint.

    Parameters
    ----------
    query : str
        Query string appended after ``?`` (expected to be URL-encoded already).
    api_read_key : str
        PurpleAir read key, sent in the ``X-API-Key`` header.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.
        Without a timeout the call can block indefinitely.

    Returns
    -------
    requests.Response
        The response as received; the HTTP status is not checked here.
    """
    url = 'https://api.purpleair.com/v1/sensors?' + query

    print('Here is the full url for the API call:\n\n', url)

    # The read key travels in a header, so it never appears in the printed URL.
    headers = {'X-API-Key': api_read_key}

    return requests.get(url, headers=headers, timeout=timeout)

In [40]:
# PurpleAir API 'read' key.
# getpass does not echo what is typed, so the key is not saved in the cell output.
api = getpass('Please enter your Purple Air api key')


Please enter your Purple Air api key [REDACTED]


In [41]:
# Bounding-box parameters for the API, one "name=value" string per corner coordinate.
bounding_box = {'nwlng': nwlng, 'nwlat': nwlat, 'selng': selng, 'selat': selat}
bounds_strings = [f'{corner}={coordinate}' for corner, coordinate in bounding_box.items()]

bounds_string = '&'.join(bounds_strings)

print(bounds_string)

nwlng=-93.43083707299996&nwlat=45.12366876300007&selng=-93.09225748799997&selat=44.81791263300005


In [42]:
# Sensor fields to request from the API.
fields = ['humidity', 'temperature', 'pressure', 'pm2.5_cf_1', 'location_type']

# %2C is a URL-encoded comma, used to separate the field names.
encoded_fields = '%2C'.join(fields)
fields_string = f'fields={encoded_fields}'

print(fields_string)

fields=humidity%2Ctemperature%2Cpressure%2Cpm2.5_cf_1%2Clocation_type


In [43]:
# Final query for the API function: requested fields first, then the bounding box.
query_string = f'{fields_string}&{bounds_string}'

print(query_string)

fields=humidity%2Ctemperature%2Cpressure%2Cpm2.5_cf_1%2Clocation_type&nwlng=-93.43083707299996&nwlat=45.12366876300007&selng=-93.09225748799997&selat=44.81791263300005


In [44]:
# Request the sensor data for the study area.
response = getSensorsData(query=query_string, api_read_key=api)

Here is the full url for the API call:

 https://api.purpleair.com/v1/sensors?fields=humidity%2Ctemperature%2Cpressure%2Cpm2.5_cf_1%2Clocation_type&nwlng=-93.43083707299996&nwlat=45.12366876300007&selng=-93.09225748799997&selat=44.81791263300005


In [45]:
# Stop here on an HTTP error (e.g. a rejected key) rather than failing later
# with a KeyError on a response body that has no data.
response.raise_for_status()

response_dict = response.json()  # Read response as a json (dictionary)
timestamp = response_dict['data_time_stamp']
col_names = response_dict['fields']
data = response_dict['data']

# Build the frame directly from the list of rows, which also works when no
# sensors are returned. dtype=object keeps the values as delivered, so missing
# readings stay None for the `is None` tests in the range checks.
df_realtime = pd.DataFrame(data, columns=col_names, dtype=object)
df_realtime['timestamp'] = timestamp

df_realtime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   sensor_index   88 non-null     object
 1   location_type  88 non-null     object
 2   humidity       85 non-null     object
 3   temperature    85 non-null     object
 4   pressure       84 non-null     object
 5   pm2.5_cf_1     88 non-null     object
 6   timestamp      88 non-null     int64 
dtypes: int64(1), object(6)
memory usage: 4.9+ KB


## Cleaning PurpleAir Station Data

In [46]:
# Preview the first five rows of the API response.
df_realtime.head(5)

Unnamed: 0,sensor_index,location_type,humidity,temperature,pressure,pm2.5_cf_1,timestamp
0,3088,0,37,54,981.19,3.6,1680559083
1,5582,0,34,53,979.28,4.6,1680559083
2,137876,1,32,75,979.72,0.5,1680559083
3,11134,0,0,32,,6.4,1680559083
4,142718,0,42,51,981.77,7.8,1680559083


In [47]:
# Print column dtypes and non-null counts for the raw frame before cleaning.
df_realtime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   sensor_index   88 non-null     object
 1   location_type  88 non-null     object
 2   humidity       85 non-null     object
 3   temperature    85 non-null     object
 4   pressure       84 non-null     object
 5   pm2.5_cf_1     88 non-null     object
 6   timestamp      88 non-null     int64 
dtypes: int64(1), object(6)
memory usage: 4.9+ KB


## Removing 'inside' Sensors

In [48]:
# Keep outside sensors only (location_type 0), then report how many remain.
is_outside = df_realtime['location_type'] == 0
df_realtime = df_realtime[is_outside]
len(df_realtime)

80

In [49]:
# location_type is no longer needed once only outdoor sensors remain.
df_realtime = df_realtime.drop(columns='location_type')

In [50]:
# Rename the pm2.5 column to pm2_5 for SQL.
sql_column_names = {'pm2.5_cf_1': 'pm2_5'}
df_realtime = df_realtime.rename(columns=sql_column_names)

In [51]:
# Convert the timestamp from epoch seconds to pandas datetimes.
epoch_seconds = df_realtime['timestamp']
df_realtime['timestamp'] = pd.to_datetime(epoch_seconds, unit='s')

## Let the QAQC Begin!

In [52]:
# Blank dataframe to hold the errors: one (initially empty) column per check,
# plus the sensor id and timestamp copied from the readings.
error_flag_columns = ['humidity_error', 'temperature_error', 'pressure_error', 'pm2_5_error']
purpleair_realtime_errors = pd.DataFrame(columns=error_flag_columns)
for key_column in ('sensor_index', 'timestamp'):
    purpleair_realtime_errors[key_column] = df_realtime[key_column]

### Humidity Check

In [59]:
#ranges pulled from https://www.currentresults.com/Weather/Minnesota/humidity-annual.php

def check_range(value):
    """Flag a humidity reading that is missing or outside 10%-90%.

    Returns None when the reading is acceptable, otherwise a short
    description of the problem.
    """
    # pd.isna is true for both None and NaN, so a missing reading is reported
    # as missing rather than as out of range.
    if pd.isna(value):
        return 'no value given'
    if 10 <= value <= 90:
        return None
    return 'out of range (10%-90%)'
    
# Run the humidity check on every reading and store the resulting flags.
humidity_flags = df_realtime['humidity'].apply(check_range)
purpleair_realtime_errors['humidity_error'] = humidity_flags

print(purpleair_realtime_errors)

            humidity_error temperature_error  \
0                     None              None   
1                     None              None   
3   out of range (10%-90%)              None   
7                     None              None   
10                    None              None   
13                    None              None   
14          no value given    no value given   
15                    None              None   
18                    None              None   
22          no value given    no value given   
23                    None              None   
25                    None              None   
27                    None              None   
29                    None              None   
31                    None              None   
32                    None              None   
39                    None              None   
53                    None              None   
54                    None              None   
58                    None              

### Temperature Check

In [60]:
#winter -4 - 28
#spring 22 - 57
#summer 48 - 81
#fall 29 - 59
#ref from https://www.dnr.state.mn.us/climate/summaries_and_publications/normalsportal.html

def check_range(value):
    """Flag a temperature reading that is missing or outside -20 to 100 F.

    Returns None when the reading is acceptable, otherwise a short
    description of the problem.
    """
    # pd.isna is true for both None and NaN, so a missing reading is reported
    # as missing rather than as out of range.
    if pd.isna(value):
        return 'no value given'
    if -20 <= value <= 100:
        return None
    return 'out of range (-20-100F)'
# TODO: once the reading's date is taken into account, replace the single range
# with seasonal ones. Draft thresholds (F) from the earlier attempt:
#   winter -20 to 35, spring 10 to 70, summer 30 to 100, fall 15 to 70.

purpleair_realtime_errors['temperature_error'] = df_realtime['temperature'].apply(check_range)

print(purpleair_realtime_errors)

            humidity_error temperature_error  \
0                     None              None   
1                     None              None   
3   out of range (10%-90%)              None   
7                     None              None   
10                    None              None   
13                    None              None   
14          no value given    no value given   
15                    None              None   
18                    None              None   
22          no value given    no value given   
23                    None              None   
25                    None              None   
27                    None              None   
29                    None              None   
31                    None              None   
32                    None              None   
39                    None              None   
53                    None              None   
54                    None              None   
58                    None              

### Pressure Check

In [55]:
# range is 25 - 35 inHg according to https://barometricpressure.app/minneapolis
# PurpleAir uses millibars, so I used https://www.weather.gov/epz/wxcalc_pressureconvert to convert
# converted range is 846.6 - 1185.24 millibars

def check_range(value):
    """Flag a pressure reading that is missing or outside 830-1200 millibars.

    Returns None when the reading is acceptable, otherwise a short
    description of the problem.
    """
    # pd.isna is true for both None and NaN, so a missing reading is reported
    # as missing rather than as out of range.
    if pd.isna(value):
        return 'no value given'
    if 830 <= value <= 1200:
        return None
    return 'out of range (830 - 1200 Millibars)'
    
# Run the pressure check on every reading and store the resulting flags.
pressure_flags = df_realtime['pressure'].apply(check_range)
purpleair_realtime_errors['pressure_error'] = pressure_flags

print(purpleair_realtime_errors)

            humidity_error temperature_error  pressure_error pm2_5_error  \
0   out of range (40%-90%)              None            None         NaN   
1   out of range (40%-90%)              None            None         NaN   
3   out of range (40%-90%)              None  no value given         NaN   
4                     None              None            None         NaN   
5                     None              None            None         NaN   
..                     ...               ...             ...         ...   
83                    None              None            None         NaN   
84                    None              None            None         NaN   
85                    None              None            None         NaN   
86                    None              None            None         NaN   
87  out of range (40%-90%)              None            None         NaN   

   sensor_index           timestamp  
0          3088 2023-04-03 21:58:03  
1          

### PM Check

In [56]:
#Average reading in MPLS is 30 ug/m3 per https://www.epa.gov/air-trends/air-quality-cities-and-counties

def check_range(value):
    """Flag a PM2.5 reading that is missing or not strictly between 0.1 and 70.

    Returns None when the reading is acceptable, otherwise a short
    description of the problem.
    """
    # pd.isna is true for both None and NaN, so a missing reading is reported
    # as missing rather than as out of range.
    if pd.isna(value):
        return 'no value given'
    # Low readings (including 0) get their own label instead of 'above 70'.
    if value <= 0.1:
        return 'at or below 0.1'
    if value < 70:
        return None
    return 'above 70'
    
# Run the PM2.5 check on every reading and store the resulting flags.
pm2_5_flags = df_realtime['pm2_5'].apply(check_range)
purpleair_realtime_errors['pm2_5_error'] = pm2_5_flags

print(purpleair_realtime_errors)

            humidity_error temperature_error  pressure_error pm2_5_error  \
0   out of range (40%-90%)              None            None        None   
1   out of range (40%-90%)              None            None        None   
3   out of range (40%-90%)              None  no value given        None   
4                     None              None            None        None   
5                     None              None            None        None   
..                     ...               ...             ...         ...   
83                    None              None            None        None   
84                    None              None            None        None   
85                    None              None            None        None   
86                    None              None            None        None   
87  out of range (40%-90%)              None            None        None   

   sensor_index           timestamp  
0          3088 2023-04-03 21:58:03  
1          

In [61]:
# Keep only sensors with at least one flag: drop rows whose error columns are all empty.
id_columns = ('sensor_index', 'timestamp')
flag_columns = [name for name in purpleair_realtime_errors.columns if name not in id_columns]
purpleair_realtime_errors = purpleair_realtime_errors.dropna(subset=flag_columns, how='all')
purpleair_realtime_errors

Unnamed: 0,humidity_error,temperature_error,pressure_error,pm2_5_error,sensor_index,timestamp
3,out of range (10%-90%),,no value given,,11134,2023-04-03 21:58:03
14,no value given,no value given,no value given,,142748,2023-04-03 21:58:03
15,,,out of range (830 - 1200 Millibars),,142752,2023-04-03 21:58:03
22,no value given,no value given,no value given,,143222,2023-04-03 21:58:03
64,out of range (10%-90%),,,,31243,2023-04-03 21:58:03
74,no value given,no value given,no value given,,75827,2023-04-03 21:58:03
77,,,,above 70,98259,2023-04-03 21:58:03


## Connecting to the Server

In [199]:
import psycopg2
from psycopg2 import sql

In [200]:
# Connection settings are taken from the PG* environment variables when they are set;
# the fallbacks are this project's host, database, user and port.
# The password is not stored in the notebook: it comes from PGPASSWORD, or is
# prompted for without being echoed.
connection = psycopg2.connect(host = os.environ.get('PGHOST', '34.132.44.118'),
                              database = os.environ.get('PGDATABASE', 'lab1-2'),
                              user = os.environ.get('PGUSER', 'postgres'),
                              password = os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: '),
                              port = os.environ.get('PGPORT', '5432'))
# 0 means the connection is open.
connection.closed

0

## Insert Data into SQL Table

In [201]:
def _sql_rows(frame, columns):
    """Return the given columns of frame as a list of tuples, with missing values as None."""
    return [
        tuple(None if pd.isna(value) else value for value in record)
        for record in frame[columns].itertuples(index=False, name=None)
    ]


realtime_insert_columns = ['sensor_index', 'timestamp', 'humidity', 'temperature', 'pressure', 'pm2_5']
error_insert_columns = ['sensor_index', 'humidity_error', 'temperature_error', 'pressure_error', 'pm2_5_error']

# One %s placeholder per listed column: six for the readings, five for the errors.
realtime_insert = '''
    INSERT INTO PURPLEAIR_REALTIME (sensor_index, timestamp, humidity, temperature, pressure, pm2_5)
    VALUES (%s, %s, %s, %s, %s, %s)
    '''
errors_insert = '''
    INSERT INTO PURPLEAIR_ERRORS (sensor_index, humidity_error, temperature_error, pressure_error, pm2_5_error)
    VALUES (%s, %s, %s, %s, %s)
    '''

try:
    # `with connection` commits when the block succeeds and rolls back when it
    # raises, so both tables are written together or not at all.
    # The cursor context manager closes the cursor.
    with connection:
        with connection.cursor() as cur:
            cur.executemany(realtime_insert, _sql_rows(df_realtime, realtime_insert_columns))
            cur.executemany(errors_insert, _sql_rows(purpleair_realtime_errors, error_insert_columns))
finally:
    # Close the connection even when an insert fails.
    connection.close()
#CREATE TABLE PURPLEAIR_REALTIME
#CREATE TABLE PURPLEAIR_ERRORS