# PurpleAir Stations QAQC

## Set Working Environment

In [9]:
cwd = os.getcwd() # This is a global variable for where the notebook is (must change if running in arcpro)

# Make it workspace

arcpy.env.workspace = os.path.join(cwd, '..', '..', 'data', 'QAQC.gdb')

arcpy.env.overwriteOutput = True # Overwrite layers is okay

## Import Packages

In [1]:
import os # For working with Operating System
from sys import platform # Diagnose operating system
import requests 
import datetime as dt # Converting unix time
import json

In [2]:
import numpy as np # For working with Arrays
import pandas as pd # Data Manipulation
import arcpy

In [3]:
from pprint import pprint # Pretty Printing
import matplotlib.pyplot as plt # Basic Plotting

## Import Minneapolis Boundary

In [5]:
#Importing the GeoJSON of MPLS boundary
mpls_bndry_path = os.path.join(os.getcwd(), 'mpls_boundary.geojson')

## Set Bounds for PurpleAir Parameters

In [11]:
#Changing the projection from UMT 15 to NAD 84
arcpy.management.Project("mpls_8km", "mpls_8km_Project", 'GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]]', "WGS_1984_(ITRF00)_To_NAD_1983", 'PROJCS["NAD_1983_UTM_Zone_15N",GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137.0,298.257222101]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Transverse_Mercator"],PARAMETER["False_Easting",500000.0],PARAMETER["False_Northing",0.0],PARAMETER["Central_Meridian",-93.0],PARAMETER["Scale_Factor",0.9996],PARAMETER["Latitude_Of_Origin",0.0],UNIT["Meter",1.0]]', "NO_PRESERVE_SHAPE", None, "NO_VERTICAL")

In [12]:
#Create bounding box
arcpy.management.MinimumBoundingGeometry("mpls_8km_Project", "mpls_8km_Proje_MinimumBoundi", "RECTANGLE_BY_AREA")

In [14]:
#Converting bounding box to feature class
arcpy.conversion.FeatureClassToFeatureClass("mpls_8km_Proje_MinimumBoundi", "PurpleAirQAQC", "mpls_8km", '', 'CTU_ID "CTU_ID" true true false 19 Double 0 0,First,#,mpls_8km_Proje_MinimumBoundi,CTU_ID,-1,-1;CTU_NAME "CTU_NAME" true true false 254 Text 0 0,First,#,mpls_8km_Proje_MinimumBoundi,CTU_NAME,0,254;CTU_CODE "CTU_CODE" true true false 254 Text 0 0,First,#,mpls_8km_Proje_MinimumBoundi,CTU_CODE,0,254;BUFF_DIST "BUFF_DIST" true true false 19 Double 0 0,First,#,mpls_8km_Proje_MinimumBoundi,BUFF_DIST,-1,-1;ORIG_FID "ORIG_FID" true true false 10 Long 0 10,First,#,mpls_8km_Proje_MinimumBoundi,ORIG_FID,-1,-1;ORIG_FID_1 "ORIG_FID_1" true true false 10 Long 0 10,First,#,mpls_8km_Proje_MinimumBoundi,ORIG_FID_1,-1,-1', '')

ExecuteError: Failed to execute. Parameters are not valid.
ERROR 000732: Output Location: Dataset PurpleAirQAQC does not exist or is not supported
Failed to execute (FeatureClassToFeatureClass).


In [16]:
#Setting lat/long for PurpleAir API Parameters
nwlng = arcpy.Describe("mpls_8km_Proje_MinimumBoundi").extent.XMin
nwlat = arcpy.Describe("mpls_8km_Proje_MinimumBoundi").extent.YMax
selng = arcpy.Describe("mpls_8km_Proje_MinimumBoundi").extent.XMax
selat = arcpy.Describe("mpls_8km_Proje_MinimumBoundi").extent.YMin

## Importing PurpleAir STATION Data from PurpleAir API

In [6]:
def getSensorsData(query='', api_read_key=''):

    # my_url is assigned the URL we are going to send our request to.
    url = 'https://api.purpleair.com/v1/sensors?' + query
    
    print('Here is the full url for the API call:\n\n', url)

    # my_headers is assigned the context of our request we want to make. In this case
    # we will pass through our API read key using the variable created above.
    my_headers = {'X-API-Key':api_read_key}

    # This line creates and sends the request and then assigns its response to the
    # variable, r.
    response = requests.get(url, headers=my_headers)

    # We then return the response we received.
    return response

In [7]:
#PurpleAir API 'read' key
api = input('Please enter your Purple Air api key')

Please enter your Purple Air api key 51592903-B445-11ED-B6F4-42010A800007


In [17]:
#Set bounding strings for API parameters
bounds_strings = [f'nwlng={nwlng}',
                  f'nwlat={nwlat}',
                  f'selng={selng}',
                  f'selat={selat}']

bounds_string = '&'.join(bounds_strings)

print(bounds_string)

nwlng=-93.43083707299996&nwlat=45.12366876300007&selng=-93.09225748799997&selat=44.81791263300005


In [18]:
#Setting parameters for API
fields = ['name', 'firmware_version','date_created','last_modified','last_seen','uptime','position_rating','channel_state','channel_flags','altitude',
          'location_type','latitude', 'longitude']


#ignore this - I am trying to pull both station params as well as real-time
#fields = ['time_stamp', 'humidity', 'temperature', 'pressure', 'pm2.5']

fields_string = 'fields=' + '%2C'.join(fields)

print(fields_string)

fields=name%2Cfirmware_version%2Cdate_created%2Clast_modified%2Clast_seen%2Cuptime%2Cposition_rating%2Cchannel_state%2Cchannel_flags%2Caltitude%2Clocation_type%2Clatitude%2Clongitude


In [19]:
#finalizing query for API function
query_string = '&'.join([fields_string, bounds_string])

print(query_string)

fields=name%2Cfirmware_version%2Cdate_created%2Clast_modified%2Clast_seen%2Cuptime%2Cposition_rating%2Cchannel_state%2Cchannel_flags%2Caltitude%2Clocation_type%2Clatitude%2Clongitude&nwlng=-93.43083707299996&nwlat=45.12366876300007&selng=-93.09225748799997&selat=44.81791263300005


In [20]:
#calling the API
response = getSensorsData(query_string, api)

Here is the full url for the API call:

 https://api.purpleair.com/v1/sensors?fields=name%2Cfirmware_version%2Cdate_created%2Clast_modified%2Clast_seen%2Cuptime%2Cposition_rating%2Cchannel_state%2Cchannel_flags%2Caltitude%2Clocation_type%2Clatitude%2Clongitude&nwlng=-93.43083707299996&nwlat=45.12366876300007&selng=-93.09225748799997&selat=44.81791263300005


In [21]:
response_dict = response.json() # Read response as a json (dictionary)

col_names = response_dict['fields']
data = np.array(response_dict['data'])

df_stations = pd.DataFrame(data, columns = col_names)

df_stations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sensor_index      88 non-null     object
 1   last_modified     88 non-null     object
 2   date_created      88 non-null     object
 3   last_seen         88 non-null     object
 4   name              88 non-null     object
 5   location_type     88 non-null     object
 6   firmware_version  88 non-null     object
 7   uptime            88 non-null     object
 8   position_rating   88 non-null     object
 9   latitude          88 non-null     object
 10  longitude         88 non-null     object
 11  altitude          88 non-null     object
 12  channel_state     88 non-null     object
 13  channel_flags     88 non-null     object
dtypes: object(14)
memory usage: 9.8+ KB


## Cleaning PurpleAir Station Data

In [114]:
#visualizing API response
df_stations.head()

Unnamed: 0,sensor_index,last_modified,date_created,last_seen,name,location_type,firmware_version,uptime,position_rating,latitude,longitude,altitude,channel_state,channel_flags
0,3088,1504993349,1504040633,1679971604,Howe Neighborhood,0,6.06b,1166,5,44.935818,-93.21752,833,3,0
1,5582,1660166545,1514335701,1679971590,Vircroft Ashnia,0,7.02,664,5,44.891655,-93.34291,899,3,0
2,137876,1637086469,1637082783,1679971516,King Field Indoors,1,7.02,25046,0,44.928917,-93.284706,886,3,0
3,11134,1529977499,1527023589,1679971517,Linden Hills,0,7.02,5978,5,44.92776,-93.32235,886,3,0
4,142718,1675359061,1642013869,1679971567,City of Minneapolis Community Air Monitoring P...,0,7.02,10501,5,44.995792,-93.295395,865,3,0


In [115]:
#filtering for all 'outside' sensors and removing 'inside' sensors
outside_sensors = df_stations[df_stations.location_type=='0'] # 0 = outside

len(outside_sensors)

80

In [113]:
'''
savepath_timestamps = os.path.join(cwd, '..','..','data', "outside_sensors.csv")
outside_sensors_timestamp = outside_sensors.to_csv(savepath_timestamps, index = False)
outside_sensors_timestamp = pd.read_csv(savepath_timestamps)
'''

'\nsavepath_timestamps = os.path.join(cwd, \'..\',\'..\',\'data\', "outside_sensors.csv")\noutside_sensors_timestamp = outside_sensors.to_csv(savepath_timestamps, index = False)\noutside_sensors_timestamp = pd.read_csv(savepath_timestamps)\n'

In [112]:
'''
#Converting the UNIX dates to time stamps

#keeping commented for now because this is a new addition and I will need to update my SQL table to accommodate
#outside_sensors.loc[:,'last_seen_timestamp'] = outside_sensors.last_seen.apply(lambda x: dt.datetime.fromtimestamp(int(x))).values
#outside_sensors.loc[:,'last_modified_timestamp'] = outside_sensors.last_modified.apply(lambda x: dt.datetime.fromtimestamp(int(x))).values
#outside_sensors.loc[:,'date_created_timestamp'] = outside_sensors.date_created.apply(lambda x: dt.datetime.fromtimestamp(int(x))).values

#print('Least recent sensor was seen: ', last_seen.min())
#print('Most recent sensor was seen: ', last_seen.max())

import csv
import datetime

# Open the input CSV file
#input_path = 'input.csv'
with open(savepath_timestamps, 'r') as input_file:
    # Open the output CSV file
    output_path = 'output.csv'
    with open(output_path, 'w', newline='') as output_file:
        # Create a CSV reader and writer
        reader = csv.reader(input_file)
        writer = csv.writer(output_file)
        
        # Iterate over each row in the input CSV file
        for row in reader:
            # Get the UNIX timestamp value from the original date/time column
            unix_timestamp = int(row[1])  # Replace 2 with the index of the original date/time column
            
            # Convert UNIX timestamp to datetime object
            datetime_obj = datetime.datetime.fromtimestamp(unix_timestamp)
            
            # Add the new datetime value to the end of the row
            row.append(datetime_obj.strftime('%Y-%m-%d %H:%M:%S'))
            
            # Write the updated row to the output CSV file
            writer.writerow(row)
'''

"\n#Converting the UNIX dates to time stamps\n\n#keeping commented for now because this is a new addition and I will need to update my SQL table to accommodate\n#outside_sensors.loc[:,'last_seen_timestamp'] = outside_sensors.last_seen.apply(lambda x: dt.datetime.fromtimestamp(int(x))).values\n#outside_sensors.loc[:,'last_modified_timestamp'] = outside_sensors.last_modified.apply(lambda x: dt.datetime.fromtimestamp(int(x))).values\n#outside_sensors.loc[:,'date_created_timestamp'] = outside_sensors.date_created.apply(lambda x: dt.datetime.fromtimestamp(int(x))).values\n\n#print('Least recent sensor was seen: ', last_seen.min())\n#print('Most recent sensor was seen: ', last_seen.max())\n\nimport csv\nimport datetime\n\n# Open the input CSV file\n#input_path = 'input.csv'\nwith open(savepath_timestamps, 'r') as input_file:\n    # Open the output CSV file\n    output_path = 'output.csv'\n    with open(output_path, 'w', newline='') as output_file:\n        # Create a CSV reader and writer\n 

In [116]:
outside_sensors.head()

Unnamed: 0,sensor_index,last_modified,date_created,last_seen,name,location_type,firmware_version,uptime,position_rating,latitude,longitude,altitude,channel_state,channel_flags
0,3088,1504993349,1504040633,1679971604,Howe Neighborhood,0,6.06b,1166,5,44.935818,-93.21752,833,3,0
1,5582,1660166545,1514335701,1679971590,Vircroft Ashnia,0,7.02,664,5,44.891655,-93.34291,899,3,0
3,11134,1529977499,1527023589,1679971517,Linden Hills,0,7.02,5978,5,44.92776,-93.32235,886,3,0
4,142718,1675359061,1642013869,1679971567,City of Minneapolis Community Air Monitoring P...,0,7.02,10501,5,44.995792,-93.295395,865,3,0
5,142720,1675359105,1642013875,1679971586,City of Minneapolis community air monitoring p...,0,7.02,65104,5,44.95617,-93.25471,856,3,0


## Exporting cleaned data CSV to use with Arcpy

In [117]:
savepath = os.path.join(cwd, '..','..','data', "purpleair_stations_df.csv")
purpleair_stations_df = outside_sensors.to_csv(savepath, index = False)
purpleair_stations_df = pd.read_csv(savepath)

## Let the QAQC Begin!

In [118]:
#visual inspection of points

arcpy.management.XYTableToPoint(savepath, "purpleair_stations_XYTableToPoint", "longitude", "latitude", None, 'GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137.0,298.257223563]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]];-400 -400 1000000000;-100000 10000;-100000 10000;8.98315284119521E-09;0.001;0.001;IsHighPrecision')

In [119]:
#checking the geometry of lat/longs
result = arcpy.management.CheckGeometry("purpleair_stations_XYTableToPoint", "purpleair_stat_CheckGeometry", "OGC")
if result[0] == 'true':
    print(result.getMessages())

else:
    # No problems found
    print("No problems found")

No problems found


In [120]:
#removing null values

bool_series = pd.notnull(purpleair_stations_df)
purpleair_stations_df = purpleair_stations_df[bool_series]
purpleair_stations_df.info()
#still 80 rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   sensor_index      80 non-null     int64  
 1   last_modified     80 non-null     int64  
 2   date_created      80 non-null     int64  
 3   last_seen         80 non-null     int64  
 4   name              80 non-null     object 
 5   location_type     80 non-null     int64  
 6   firmware_version  80 non-null     object 
 7   uptime            80 non-null     int64  
 8   position_rating   80 non-null     int64  
 9   latitude          80 non-null     float64
 10  longitude         80 non-null     float64
 11  altitude          80 non-null     int64  
 12  channel_state     80 non-null     int64  
 13  channel_flags     80 non-null     int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 8.9+ KB


In [163]:
# Checking altitude

# Commented out for now becasue not working properly
#Altitude should be between 687 (Minnehaha Falls) to 830

#outside_sensors['altitude'].astype(int)
#altitude_check = outside_sensors[outside_sensors['altitude'] > 686 and < 831]

SyntaxError: invalid syntax (<string>, line 6)

In [121]:
#converting lat/long to WKT for SQL

import csv
from shapely.geometry import Point

csv_path = r"C:\Users\tande\Documents\GitHub\QualityAirQualityCities\PurpleAirQAQC\purpleair_stations_df.csv"

# Load the input DataFrame from a CSV file
df = pd.read_csv(savepath)

# Create a list to store the WKT values
wkt_list = []

# Iterate over each row and add the WKT representation of a point geometry
for index, row in df.iterrows():
    latitude, longitude = row['latitude'], row['longitude']
    point = Point(longitude, latitude)
    wkt = point.wkt
    wkt_list.append(wkt)

# Add the WKT column to the DataFrame
df['WKT'] = wkt_list

# Write the DataFrame to a CSV file
df.to_csv('output.csv', index=False)

## Connecting to the Server

In [100]:
import psycopg2
from psycopg2 import sql

In [122]:
connection = psycopg2.connect(host = '34.132.44.118',
                              database = 'lab1-2',
                              user = 'postgres',
                              password = 'password')
connection.closed

0

## Create SQL Table

In [124]:
cursor = connection.cursor()
cursor.execute("DROP TABLE IF EXISTS purpleair_stations;")
new_table = sql.SQL("CREATE TABLE purpleair_stations (sensor_index integer, last_modified integer, date_created integer, last_seen integer, name varchar(100), location_type integer, firmware_version varchar(30), uptime integer, position_rating integer, latitude float, longitude float, altitude integer, channel_state integer, channel_flags integer, WKT geometry);")

cursor.execute(new_table)
connection.commit()

## Insert Data into SQL Table

In [125]:
# Open the CSV file
csv_path = r"C:\Users\tande\Documents\ArcGIS\Projects\PurpleAir_PM_QAQC\output.csv"
with open(csv_path) as f:
    # Use the CSV reader to read the file
    reader = csv.reader(f)
    # Skip the header row
    next(reader)
    # Iterate over each row and insert into the table
    for row in reader:
        # Unpack the row fields into separate variables
        sensor_index, last_modified, date_created, last_seen, name, location_type, firmware_version, uptime, position_rating, latitude, longitude, altitude, channel_state, channel_flags, wkt = row
        # Construct the INSERT statement with explicit field names
        query = """INSERT INTO purpleair_stations (
                        sensor_index, last_modified, date_created, last_seen, name, location_type, firmware_version, uptime, 
                        position_rating, latitude, longitude, altitude, channel_state, channel_flags, WKT) 
                   VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
        # Execute the INSERT statement with the row fields as parameters
        cursor.execute(query, (sensor_index, last_modified, date_created, last_seen, name, location_type, firmware_version, 
                               uptime, position_rating, latitude, longitude, altitude, channel_state, channel_flags, wkt))

# Commit the changes to the database
connection.commit()

# Close the cursor and connection
cursor.close()
connection.close()