In [1]:
### Import Packages

# File manipulation

import os # For working with Operating System
import requests # Accessing the Web
import datetime as dt # Working with dates/times

# Database 

import psycopg2
from psycopg2 import sql

# Analysis

import numpy as np
import arcpy
import pandas as pd
from scipy.spatial.distance import cdist
import pyproj


import os # For working with Operating System
import shutil # For deleting folders
import urllib # For accessing websites
import zipfile # For extracting from Zipfiles
from io import BytesIO # For reading bytes objects

# Database 

import psycopg2

# Analysis

import arcpy
import pandas as pd

In [2]:
# Get CWD

cwd = os.getcwd() # This is a global variable for where the notebook is (must change if running in arcpro)

# Create GeoDataBase
# This is the communal GeoDataBase

if not os.path.exists(os.path.join(cwd, '..', '..', 'data', 'QAQC.gdb')): # If it doesn't exist, create it

    arcpy.management.CreateFileGDB(os.path.join(cwd, '..', '..', 'data'), 'QAQC')

# Make it workspace



arcpy.env.overwriteOutput = True # Overwrite layers is okay

save_path_gdb = os.path.join(cwd, '..', '..', 'data', 'QAQC.gdb')

### Bring in boundary of Minneapoolis

In [3]:
### Definitions

def extract_zip_from_url(url=None, path=None):
    '''Extract a zipfile from the internet
    then unpack it in to it's own folder 
    within the working directory.
    Takes a single url (string).'''

    if not os.path.exists(path):
        os.mkdir(path)
    # Unload zip into the new folder
    response = urllib.request.urlopen(url) # Get a response
    zip_folder = zipfile.ZipFile(BytesIO(response.read())) # Read Response
    zip_folder.extractall(path=path) # Extract files
    zip_folder.close() # Close zip object

In [4]:
# Download Data

## Twin Cities Metro Boundaries - Downloaded from MN GeospatialCommons gisdata.mn.gov  (~ 5mb)

url = "https://resources.gisdata.mn.gov/pub/gdrs/data/pub/us_mn_state_metc/bdry_census2020counties_ctus/shp_bdry_census2020counties_ctus.zip"

# Create folder name for file
folder_name = url.split('/')[-1][:-4]
# Make folder for files
savepath = os.path.join(cwd, '..','..','data', folder_name)

extract_zip_from_url(url, savepath)

In [5]:
# Read & Select

# Get path

filename = 'Census2020CTUs.shp'
path = os.path.join(savepath, filename)

arcpy.MakeFeatureLayer_management(path, "TCMA_lyr")

# Select Minneapolis

mpls_boundary = arcpy.management.SelectLayerByAttribute("TCMA_lyr", "SUBSET_SELECTION",
                                                        arcpy.AddFieldDelimiters(datasource='TCMA_lyr', field= 'CTU_NAME') + "= 'Minneapolis'")

# Write the selected features to a new featureclass
arcpy.management.CopyFeatures(mpls_boundary, "mpls_boundary")

ExecuteError: ERROR 000210: Cannot create output mpls_boundary
Failed to execute (CopyFeatures).


In [6]:
# Buffer

arcpy.analysis.Buffer('mpls_boundary', 'mpls_8km', '8 Kilometers')

# Reproject

out_coordinate_system = arcpy.SpatialReference(4326)
mpls_8k_reproject = arcpy.Project_management("mpls_8km", "mpls_8km_wgs", out_coordinate_system)

# # Save as a geojson (Don't need)

# arcpy.conversion.FeaturesToJSON(mpls_boundary, 'mpls_boundary.geojson', geoJSON='GEOJSON')

ExecuteError: Failed to execute. Parameters are not valid.
ERROR 000732: Input Features: Dataset mpls_boundary does not exist or is not supported
Failed to execute (Buffer).


### call the right sensors

In [24]:
#Setting lat/long for PurpleAir API Parameters
nwlng = arcpy.Describe("mpls_8km_wgs").extent.XMin
nwlat = arcpy.Describe("mpls_8km_wgs").extent.YMax
selng = arcpy.Describe("mpls_8km_wgs").extent.XMax
selat = arcpy.Describe("mpls_8km_wgs").extent.YMin

In [25]:
def getSensorsData(query='', api_read_key=''):

    # my_url is assigned the URL we are going to send our request to.
    url = 'https://api.purpleair.com/v1/sensors?' + query
    
    print('Here is the full url for the API call:\n\n', url)

    # my_headers is assigned the context of our request we want to make. In this case
    # we will pass through our API read key using the variable created above.
    my_headers = {'X-API-Key':api_read_key}

    # This line creates and sends the request and then assigns its response to the
    # variable, r.
    response = requests.get(url, headers=my_headers)

    # We then return the response we received.
    return response

In [26]:
#PurpleAir API 'read' key
api = input('Please enter your Purple Air api key')

Please enter your Purple Air api keyA9B09E48-AEE2-11ED-B6F4-42010A800007


In [27]:
#Set bounding strings for API parameters
bounds_strings = [f'nwlng={nwlng}',
                  f'nwlat={nwlat}',
                  f'selng={selng}',
                  f'selat={selat}']

bounds_string = '&'.join(bounds_strings)

print(bounds_string)

nwlng=-93.43047670599998&nwlat=45.12326797900003&selng=-93.09299994199998&selat=44.81858013100003


In [28]:
#Setting parameters for API
fields = ['name', 'firmware_version','date_created','last_modified','last_seen','uptime','position_rating','channel_state','channel_flags','altitude',
          'location_type','latitude', 'longitude']

fields_string = 'fields=' + '%2C'.join(fields)

print(fields_string)

fields=name%2Cfirmware_version%2Cdate_created%2Clast_modified%2Clast_seen%2Cuptime%2Cposition_rating%2Cchannel_state%2Cchannel_flags%2Caltitude%2Clocation_type%2Clatitude%2Clongitude


In [29]:
#finalizing query for API function
query_string = '&'.join([fields_string, bounds_string])

print(query_string)

fields=name%2Cfirmware_version%2Cdate_created%2Clast_modified%2Clast_seen%2Cuptime%2Cposition_rating%2Cchannel_state%2Cchannel_flags%2Caltitude%2Clocation_type%2Clatitude%2Clongitude&nwlng=-93.43047670599998&nwlat=45.12326797900003&selng=-93.09299994199998&selat=44.81858013100003


In [30]:
#calling the API
response = getSensorsData(query_string, api)

Here is the full url for the API call:

 https://api.purpleair.com/v1/sensors?fields=name%2Cfirmware_version%2Cdate_created%2Clast_modified%2Clast_seen%2Cuptime%2Cposition_rating%2Cchannel_state%2Cchannel_flags%2Caltitude%2Clocation_type%2Clatitude%2Clongitude&nwlng=-93.43047670599998&nwlat=45.12326797900003&selng=-93.09299994199998&selat=44.81858013100003


In [31]:
response_dict = response.json() # Read response as a json (dictionary)

col_names = response_dict['fields']
data = np.array(response_dict['data'])

df = pd.DataFrame(data, columns = col_names)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sensor_index      87 non-null     object
 1   last_modified     87 non-null     object
 2   date_created      87 non-null     object
 3   last_seen         87 non-null     object
 4   name              87 non-null     object
 5   location_type     87 non-null     object
 6   firmware_version  87 non-null     object
 7   uptime            87 non-null     object
 8   position_rating   87 non-null     object
 9   latitude          87 non-null     object
 10  longitude         87 non-null     object
 11  altitude          87 non-null     object
 12  channel_state     87 non-null     object
 13  channel_flags     87 non-null     object
dtypes: object(14)
memory usage: 9.6+ KB


In [77]:
#visualizing API response
pd.set_option('display.max_rows', None)
print(df)


    sensor_index last_modified date_created   last_seen  \
0           3088    1504993349   1504040633  1683676773   
1         137876    1637086469   1637082783  1683676773   
2          11134    1529977499   1527023589  1683676770   
3         142718    1675359061   1642013869  1683676762   
4         142720    1675359105   1642013875  1683676689   
5         142726    1675359066   1642013897  1683676736   
6         142724    1681913464   1642013889  1683676751   
7         142730    1675359088   1642013916  1683676749   
8         142728    1675359054   1642013901  1683676695   
9         142734    1675359072   1642013929  1683674958   
10        142732    1675359127   1642013923  1683676771   
11        142736    1675358959   1642013936  1683476819   
12        142742    1683554221   1642014280  1642438720   
13        142744    1675359042   1642014285  1683263261   
14        142750    1675791761   1642014335  1683676746   
15        142748    1675359111   1642014328  1683676808 

# Analysis of PM 2.5 in Minneapolis 

### This notebook models the spread of PM 2.5 throughout Minneapolis using a Huff gravity model. An accuracy assessment is also completed on the model.

### Bring in data of pm 2.5 spikes

In [33]:
spikeData = pd.read_csv('Parameter_df.csv')
spikeData

Unnamed: 0,sensor_index,date,is_weekday,n_observations,pm25_fullDay_mean,pm25_fullDay_minutesAbove12ug,n_spikes,humidity_fullDay_mean,temperature_fullDay_mean,pressure_fullDay_mean,idwSum_facilities,idwSum_traffic
0,143656,2023-03-18,0,144,5.341125,190,0,45.833,23.505,988.205,0.005054,861.930809
1,142774,2022-11-05,0,144,8.901722,370,0,62.438,42.682,973.451,0.004609,452.986261
2,145454,2022-12-06,1,144,11.355750,690,2,48.174,26.181,987.556,67.350629,587.483355
3,142748,2022-12-07,1,144,11.848437,560,3,53.589,25.477,994.330,2.827060,3827.349398
4,143636,2023-01-22,0,144,24.739590,1440,50,60.596,28.776,983.963,24.669941,453.501189
...,...,...,...,...,...,...,...,...,...,...,...,...
10367,142728,2023-01-15,0,144,17.993132,1400,4,58.071,37.246,979.981,52.390284,2373.255885
10368,142774,2023-01-08,0,144,74.232264,1440,144,55.522,18.940,994.828,0.004609,452.986261
10369,143226,2022-06-26,0,144,3.610160,0,0,37.410,76.454,989.615,0.183773,342.481335
10370,143214,2023-03-09,1,144,8.922403,340,0,61.694,39.656,999.879,0.308809,327.047615


# Bring in wind speed averages for mlps

## Average them into daily wind speed and direction

In [34]:
# Load Data
msp_url = 'https://www.ncei.noaa.gov/data/normals-hourly/1991-2020/access/USW00014922.csv'
response = requests.get(msp_url)
msp_wind = pd.read_csv(msp_url, usecols=['STATION', 'LATITUDE', 'LONGITUDE', 'DATE', 'month', 'day', 'hour', 'HLY-WIND-AVGSPD', 'HLY-WIND-VCTDIR'])

# Rename columns
msp_wind = msp_wind.rename(columns={'month': 'MONTH', 'hour':'HOUR', 'day':'DAY', 'HLY-WIND-AVGSPD':'HLY_WIND_AVGSPD', 'HLY-WIND-VCTDIR':'HLY_WIND_VCDIR'})

# Convert 'DATE' to datetime format
msp_wind['DATE'] = pd.to_datetime(msp_wind['DATE'], format='%m-%dT%H:%M:%S')

# Group by station and date
grouped_wind_data = msp_wind.groupby([msp_wind.STATION, msp_wind.DATE.dt.date]).mean().reset_index()

# Rename the 'DATE' column to 'date' for consistency
grouped_wind_data = grouped_wind_data.rename(columns={'DATE': 'date'})

# Calculate daily averages
grouped_wind_data['daily_wind_speed_avg'] = grouped_wind_data['HLY_WIND_AVGSPD']
grouped_wind_data['daily_wind_vector_direction_avg'] = grouped_wind_data['HLY_WIND_VCDIR']

# Keep only relevant columns
daily_wind_data = grouped_wind_data[['STATION', 'date', 'daily_wind_speed_avg', 'daily_wind_vector_direction_avg']]

print(daily_wind_data)


         STATION        date  daily_wind_speed_avg  \
0    USW00014922  1900-01-01              9.075000   
1    USW00014922  1900-01-02              9.150000   
2    USW00014922  1900-01-03              9.216667   
3    USW00014922  1900-01-04              9.216667   
4    USW00014922  1900-01-05              9.275000   
..           ...         ...                   ...   
360  USW00014922  1900-12-27              9.170833   
361  USW00014922  1900-12-28              9.183333   
362  USW00014922  1900-12-29              9.154167   
363  USW00014922  1900-12-30              9.183333   
364  USW00014922  1900-12-31              9.108333   

     daily_wind_vector_direction_avg  
0                         287.666667  
1                         286.875000  
2                         284.958333  
3                         286.875000  
4                         288.125000  
..                               ...  
360                       286.625000  
361                       291.166667  


  grouped_wind_data = msp_wind.groupby([msp_wind.STATION, msp_wind.DATE.dt.date]).mean().reset_index()


### Join the wind and spike data

In [35]:
# Convert the 'date' columns in both DataFrames to datetime objects
spikeData['date'] = pd.to_datetime(spikeData['date'])
daily_wind_data['date'] = pd.to_datetime(daily_wind_data['date'])

# Extract the month and day from the 'date' columns
spikeData['month'] = spikeData['date'].dt.month
spikeData['day'] = spikeData['date'].dt.day

daily_wind_data['month'] = daily_wind_data['date'].dt.month
daily_wind_data['day'] = daily_wind_data['date'].dt.day

# Merge the DataFrames based on the 'month' and 'day' columns
merged_data = spikeData.merge(daily_wind_data, on=['month', 'day'], suffixes=('', '_wind'))

# Drop unnecessary columns
merged_data = merged_data.drop(columns=['STATION', 'date_wind'])

# Display the merged DataFrame
print(merged_data)


       sensor_index       date  is_weekday  n_observations  pm25_fullDay_mean  \
0            143656 2023-03-18           0             144           5.341125   
1            142730 2023-03-18           0             144           2.830403   
2            143214 2023-03-18           0             144           2.742826   
3            145242 2023-03-18           0             144           4.682944   
4            142774 2023-03-18           0             144           2.842639   
...             ...        ...         ...             ...                ...   
10367        142720 2022-06-22           1             144           3.474497   
10368        142724 2022-06-22           1             144           3.284281   
10369        143214 2022-06-22           1             144           2.914250   
10370        145242 2022-06-22           1             144           4.760962   
10371        145470 2022-06-22           1             144           4.151205   

       pm25_fullDay_minutes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  daily_wind_data['date'] = pd.to_datetime(daily_wind_data['date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  daily_wind_data['month'] = daily_wind_data['date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  daily_wind_data['day'] = daily_wind_data['date'].dt.day


### Join station location with joined spike and wind data

In [36]:
# Convert the 'sensor_index' columns to the same data type (e.g., int64)
df['sensor_index'] = df['sensor_index'].astype('int64')
merged_data['sensor_index'] = merged_data['sensor_index'].astype('int64')

# Merge the DataFrames based on the 'sensor_index' column
stationSpikeWindData = merged_data.merge(df, on='sensor_index')

# stationSpikeWindData.to_csv('station_spike_wind_data.csv', index=False)


print(stationSpikeWindData)

       sensor_index       date  is_weekday  n_observations  pm25_fullDay_mean  \
0            143656 2023-03-18           0             144           5.341125   
1            143656 2022-11-05           0             144           8.760681   
2            143656 2022-12-06           1             144           9.107910   
3            143656 2022-12-07           1             144          14.958146   
4            143656 2023-01-22           0             144          26.534174   
...             ...        ...         ...             ...                ...   
10122        145604 2023-04-17           1             144           2.503681   
10123        145604 2023-04-13           1             139           9.498234   
10124        145604 2023-04-15           0             142          10.386077   
10125        145604 2023-04-10           1             144          11.901583   
10126        145604 2023-04-20           1             144           8.795903   

       pm25_fullDay_minutes

## Normalize wind speed

In [37]:
# Assuming 'weight_speed' and 'weight_speed_direction' are column names in your merged_data DataFrame
weight_speed = stationSpikeWindData['daily_wind_speed_avg']

# Normalize weight_speed
weight_speed_min = weight_speed.min()
weight_speed_max = weight_speed.max()
stationSpikeWindData['weight_speed_normalized'] = (weight_speed - weight_speed_min) / (weight_speed_max - weight_speed_min)

# Print the merged_data DataFrame with the new columns 'weight_speed_normalized' and 'weight_speed_direction_normalized'
print(stationSpikeWindData)

       sensor_index       date  is_weekday  n_observations  pm25_fullDay_mean  \
0            143656 2023-03-18           0             144           5.341125   
1            143656 2022-11-05           0             144           8.760681   
2            143656 2022-12-06           1             144           9.107910   
3            143656 2022-12-07           1             144          14.958146   
4            143656 2023-01-22           0             144          26.534174   
...             ...        ...         ...             ...                ...   
10122        145604 2023-04-17           1             144           2.503681   
10123        145604 2023-04-13           1             139           9.498234   
10124        145604 2023-04-15           0             142          10.386077   
10125        145604 2023-04-10           1             144          11.901583   
10126        145604 2023-04-20           1             144           8.795903   

       pm25_fullDay_minutes

## Normalize wind direction

In [38]:
# Assuming 'wind_direction' is a column name in your stationSpikeWindData DataFrame
wind_direction = stationSpikeWindData['daily_wind_vector_direction_avg']

# Normalize wind_direction
wind_direction_normalized = wind_direction / 360

# Add the normalized wind direction as a new column to the stationSpikeWindData DataFrame
stationSpikeWindData['wind_direction_normalized'] = wind_direction_normalized

# Print the updated stationSpikeWindData DataFrame
print(stationSpikeWindData)

       sensor_index       date  is_weekday  n_observations  pm25_fullDay_mean  \
0            143656 2023-03-18           0             144           5.341125   
1            143656 2022-11-05           0             144           8.760681   
2            143656 2022-12-06           1             144           9.107910   
3            143656 2022-12-07           1             144          14.958146   
4            143656 2023-01-22           0             144          26.534174   
...             ...        ...         ...             ...                ...   
10122        145604 2023-04-17           1             144           2.503681   
10123        145604 2023-04-13           1             139           9.498234   
10124        145604 2023-04-15           0             142          10.386077   
10125        145604 2023-04-10           1             144          11.901583   
10126        145604 2023-04-20           1             144           8.795903   

       pm25_fullDay_minutes

# Historic Data daily average

In [39]:
## create new column of present absence based on N_spikes. 
stationSpikeWindData['present_absent'] = stationSpikeWindData['n_spikes'].apply(lambda x: 0 if x == 0 else 1)
stationSpikeWindData

Unnamed: 0,sensor_index,date,is_weekday,n_observations,pm25_fullDay_mean,pm25_fullDay_minutesAbove12ug,n_spikes,humidity_fullDay_mean,temperature_fullDay_mean,pressure_fullDay_mean,...,uptime,position_rating,latitude,longitude,altitude,channel_state,channel_flags,weight_speed_normalized,wind_direction_normalized,present_absent
0,143656,2023-03-18,0,144,5.341125,190,0,45.833,23.505,988.205,...,14805,5,44.932407,-93.28342,874,3,0,0.551807,0.548032,0
1,143656,2022-11-05,0,144,8.760681,290,0,60.125,44.010,971.738,...,14805,5,44.932407,-93.28342,874,3,0,0.577108,0.728472,0
2,143656,2022-12-06,1,144,9.107910,340,0,49.004,27.014,986.269,...,14805,5,44.932407,-93.28342,874,3,0,0.351807,0.753819,0
3,143656,2022-12-07,1,144,14.958146,610,14,49.725,28.465,993.579,...,14805,5,44.932407,-93.28342,874,3,0,0.374699,0.750579,1
4,143656,2023-01-22,0,144,26.534174,1430,55,61.631,28.517,984.403,...,14805,5,44.932407,-93.28342,874,3,0,0.500000,0.810995,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,145604,2023-04-17,1,144,2.503681,0,0,56.064,41.486,981.399,...,34397,5,44.961617,-93.23804,841,3,0,0.885542,0.116667,0
10123,145604,2023-04-13,1,139,9.498234,250,4,31.505,80.174,975.448,...,34397,5,44.961617,-93.23804,841,3,0,0.966265,0.092708,1
10124,145604,2023-04-15,0,142,10.386077,700,5,67.308,55.249,979.440,...,34397,5,44.961617,-93.23804,841,3,0,0.900000,0.091782,1
10125,145604,2023-04-10,1,144,11.901583,490,11,36.110,64.933,994.016,...,34397,5,44.961617,-93.23804,841,3,0,0.965060,0.118171,1


In [40]:
# create more columns of true postive, etc. these will be filled in later
stationSpikeWindData['overall_TP'] = 0
stationSpikeWindData['overall_FP'] = 0
stationSpikeWindData['overall_FN'] = 0
stationSpikeWindData['overall_TN'] = 0
stationSpikeWindData['Presence'] = 0
stationSpikeWindData['Accuracy'] = 0
stationSpikeWindData['Rank'] = 0
stationSpikeWindData

Unnamed: 0,sensor_index,date,is_weekday,n_observations,pm25_fullDay_mean,pm25_fullDay_minutesAbove12ug,n_spikes,humidity_fullDay_mean,temperature_fullDay_mean,pressure_fullDay_mean,...,weight_speed_normalized,wind_direction_normalized,present_absent,overall_TP,overall_FP,overall_FN,overall_TN,Presence,Accuracy,Rank
0,143656,2023-03-18,0,144,5.341125,190,0,45.833,23.505,988.205,...,0.551807,0.548032,0,0,0,0,0,0,0,0
1,143656,2022-11-05,0,144,8.760681,290,0,60.125,44.010,971.738,...,0.577108,0.728472,0,0,0,0,0,0,0,0
2,143656,2022-12-06,1,144,9.107910,340,0,49.004,27.014,986.269,...,0.351807,0.753819,0,0,0,0,0,0,0,0
3,143656,2022-12-07,1,144,14.958146,610,14,49.725,28.465,993.579,...,0.374699,0.750579,1,0,0,0,0,0,0,0
4,143656,2023-01-22,0,144,26.534174,1430,55,61.631,28.517,984.403,...,0.500000,0.810995,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,145604,2023-04-17,1,144,2.503681,0,0,56.064,41.486,981.399,...,0.885542,0.116667,0,0,0,0,0,0,0,0
10123,145604,2023-04-13,1,139,9.498234,250,4,31.505,80.174,975.448,...,0.966265,0.092708,1,0,0,0,0,0,0,0
10124,145604,2023-04-15,0,142,10.386077,700,5,67.308,55.249,979.440,...,0.900000,0.091782,1,0,0,0,0,0,0,0
10125,145604,2023-04-10,1,144,11.901583,490,11,36.110,64.933,994.016,...,0.965060,0.118171,1,0,0,0,0,0,0,0


In [41]:
# find population total for each sensor_index by summing N_spikes. this is our population

# divide by number of days they have been up. Have to find the nubmers of days each sensor has been up

total_n_spikes = stationSpikeWindData.groupby('sensor_index')['n_spikes'].agg(['sum', 'count']).reset_index()
total_n_spikes.columns = ['sensor_index', 'total_n_spikes','total_days']
total_n_spikes['spikes_per_day'] = total_n_spikes['total_n_spikes']/total_n_spikes['total_days']

total_n_spikes

Unnamed: 0,sensor_index,total_n_spikes,total_days,spikes_per_day
0,142718,4654,243,19.152263
1,142720,4363,316,13.806962
2,142724,335,178,1.882022
3,142726,4422,285,15.515789
4,142728,4379,222,19.725225
5,142730,2563,120,21.358333
6,142732,3668,292,12.561644
7,142734,3236,278,11.640288
8,142736,2438,133,18.330827
9,142744,3914,223,17.55157


In [42]:
# find distance between stations as a table

# Extract unique stations and their latitudes and longitudes
unique_stations = stationSpikeWindData[['sensor_index', 'latitude', 'longitude']].drop_duplicates()

# Convert latitude and longitude to Cartesian coordinates (x, y, z)
def latlon_to_xyz(lat, lon):
    proj = pyproj.Proj(proj='utm', zone=15, ellps='WGS84')
    x, y = proj(lon, lat)
    return x, y

unique_stations['x'], unique_stations['y'] = zip(*unique_stations.apply(lambda row: latlon_to_xyz(row['latitude'], row['longitude']), axis=1))

# Calculate pairwise distances between all stations
locations = unique_stations[['x', 'y']].values
distance_matrix = cdist(locations, locations, metric='euclidean')

# Create a DataFrame with the distance matrix
distance_df = pd.DataFrame(distance_matrix, columns=unique_stations['sensor_index'], index=unique_stations['sensor_index'])
distance_df

sensor_index,143656,142730,143214,145242,142774,143636,157935,145250,157871,145470,...,156605,142726,145506,142728,145616,143942,143944,142724,166459,145604
sensor_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
143656,0.0,6716.915796,4959.093743,8384.389769,3532.275768,8629.053685,3247.1738,4768.662836,1064.357853,4924.246726,...,3485.911529,9193.475201,2840.24383,4114.897578,5415.178397,5622.415595,4831.229429,3166.437375,4243.605066,4831.675609
142730,6716.915796,0.0,10784.62749,2985.440459,4610.393571,1948.599124,7273.188115,3127.710367,6675.026944,10658.368203,...,5985.422397,2606.03944,9087.836354,5499.221049,1499.196709,1369.534977,6716.074165,7372.792659,10937.183558,5710.257459
143214,4959.093743,10784.62749,0.0,11425.775457,8481.544812,12483.238182,7715.77646,9570.334628,4371.38651,211.40234,...,5084.482295,12877.667846,2155.298682,5839.230641,9794.030415,10009.310604,5295.46633,3547.415928,3066.951279,6156.776122
145242,8384.389769,2985.440459,11425.775457,0.0,7230.682812,2404.988394,9760.564284,5956.427311,7978.953323,11259.438784,...,6342.500513,2156.856295,10158.408436,5596.483317,4248.563783,4202.712509,6440.145943,7881.580726,12324.555907,5416.900152
142774,3532.275768,4610.393571,8481.544812,7230.682812,0.0,6499.45043,2680.682876,1638.576536,4210.60596,8436.430352,...,5483.819585,7199.426822,6372.474009,5647.093606,3114.496259,3245.049411,6847.821669,6047.335064,7470.791733,6299.272561
143636,8629.053685,1948.599124,12483.238182,2404.988394,6499.45043,0.0,9176.94519,4939.218304,8515.952674,12343.223273,...,7514.192078,741.472337,10895.798878,6907.842847,3423.273866,3265.916036,8000.331381,8988.191042,12824.923448,6955.221864
157935,3247.1738,7273.188115,7715.77646,9760.564284,2680.682876,9176.94519,0.0,4296.571152,4298.257538,7737.991379,...,6490.367914,9871.533569,5589.444226,6950.929545,5773.992599,5915.577558,7898.468383,6408.246357,5827.179317,7674.365865
145250,4768.662836,3127.710367,9570.334628,5956.427311,1638.576536,4939.218304,4296.571152,0.0,5199.766684,9497.272136,...,5793.679958,5657.956436,7551.059738,5722.721048,1708.058505,1775.130736,7014.164232,6708.679508,8910.221375,6262.845435
157871,1064.357853,6675.026944,4371.38651,7978.953323,4210.60596,8515.952674,4298.257538,5199.766684,0.0,4297.952035,...,2516.250361,9019.059731,2434.360721,3217.847972,5523.638153,5738.269143,3808.845324,2110.118973,4359.269893,3910.456355
145470,4924.246726,10658.368203,211.40234,11259.438784,8436.430352,12343.223273,7737.991379,9497.272136,4297.952035,0.0,...,4921.301747,12729.911875,2155.563109,5668.328878,9687.331449,9902.421537,5101.847426,3387.726739,3226.1921,5973.102407


In [43]:
# delete duplicate stations

# Melt the distance matrix DataFrame into a long format
distance_long = pd.melt(distance_df.reset_index(), id_vars=['sensor_index'], var_name='station_2', value_name='distance')
distance_long = distance_long.rename(columns={'sensor_index': 'station_1'})

# Remove duplicate station pairs
distance_long['station_pair'] = distance_long.apply(lambda row: frozenset([row['station_1'], row['station_2']]), axis=1)
distance_unique = distance_long.drop_duplicates(subset=['station_pair']).drop(columns=['station_pair'])

# Remove self-distances (i.e., distance of a station to itself)
distance_unique = distance_unique[distance_unique['station_1'] != distance_unique['station_2']]
distance_unique

Unnamed: 0,station_1,station_2,distance
1,142730,143656,6716.915796
2,143214,143656,4959.093743
3,145242,143656,8384.389769
4,142774,143656,3532.275768
5,143636,143656,8629.053685
...,...,...,...
2546,166459,143944,7131.547418
2547,145604,143944,1047.227506
2598,166459,142724,4993.639880
2599,145604,142724,2749.095572


# Bring in all spike data. And choose a random date to run analysis on

In [44]:
# All spikes read in csv

allSpikeDf = pd.read_csv('all_spikes.csv')
allSpikeDf['timestamp'] = pd.to_datetime(allSpikeDf.timestamp)
allSpikeDf

# Work to select a random day and finding earliest spike of 10 min interval
first_date = dt.datetime(2022, 6, 15) # June 15th, 2022?
datelist = pd.date_range(start = first_date, 
    end = dt.datetime.today(),
    normalize = True)
# print('Last Run on ', dt.datetime.today())

# select random date
rand_index = int(np.random.uniform(0, len(datelist),1)[0])
datelist[rand_index]

# select data from this random day
previous_readings = allSpikeDf[(allSpikeDf.timestamp.dt.date == datelist[rand_index].date())]
init_time = previous_readings.timestamp.min()

print('Selected day' , init_time)

print('Print list of selected spikes:', len(previous_readings))

Selected day 2023-02-02 00:00:00
Print list of selected spikes: 3296


In [70]:
allSpikeDf

Unnamed: 0,sensor_index,timestamp,pm25
0,143226,2022-06-17 22:10:00,58.910
1,143226,2022-06-17 22:20:00,35.495
2,143226,2022-06-17 22:00:00,319.660
3,142720,2022-06-18 03:20:00,33.595
4,142720,2022-06-18 03:30:00,36.026
...,...,...,...
175256,157871,2023-04-30 02:20:00,64.306
175257,157871,2023-04-30 02:40:00,122.301
175258,157871,2023-04-30 01:50:00,396.923
175259,157871,2023-04-30 03:10:00,39.789


In [99]:
# Convert 'timestamp' to datetime
allSpikeDf['timestamp'] = pd.to_datetime(allSpikeDf['timestamp'])

# Define the start and end times for the desired period
start_time = pd.Timestamp('2023-02-02 00:00:00')
end_time = pd.Timestamp('2023-02-02 02:00:00')

# Filter the DataFrame to include only the desired timestamps
filtered_df = allSpikeDf[(allSpikeDf['timestamp'] >= start_time) & (allSpikeDf['timestamp'] <= end_time)]

# Compute the count of 'pm25' values for each 'sensor_index' in the filtered DataFrame
originalPm25 = filtered_df.groupby('sensor_index').size().reset_index(name='originalPm2.5')

# Assuming 'stationSpikeWindData' is the other dataframe you mentioned
# Merging dataframes
originalPm2_5Spikes = pd.merge(originalPm25, df[['sensor_index', 'latitude', 'longitude']].drop_duplicates(), on='sensor_index', how='left')
originalPm2_5Spikes = result.dropna(subset=['latitude', 'longitude'])
originalPm2_5Spikes

Unnamed: 0,station_1,Presence,latitude,longitude
0,142718,3,44.995792,-93.295395
1,142720,2,44.95617,-93.25471
2,142724,1,44.937218,-93.243866
3,142726,3,45.01507,-93.28903
4,142728,0,44.95804,-93.245766
5,142730,0,44.99218,-93.29627
6,142732,0,44.916794,-93.273834
7,142734,0,44.903934,-93.28091
8,142736,0,44.972507,-93.28301
9,142744,45,44.99616,-93.29655


In [45]:
# create inital time of spike

init_presences = previous_readings[previous_readings.timestamp == init_time].sensor_index
init_presences

129116    145454
Name: sensor_index, dtype: int64

In [46]:
# add probabilty column to station distance table

distance_unique['MovementProbability'] = 0
distance_unique['Presence'] = 0
distance_unique

Unnamed: 0,station_1,station_2,distance,MovementProbability,Presence
1,142730,143656,6716.915796,0,0
2,143214,143656,4959.093743,0,0
3,145242,143656,8384.389769,0,0
4,142774,143656,3532.275768,0,0
5,143636,143656,8629.053685,0,0
...,...,...,...,...,...
2546,166459,143944,7131.547418,0,0
2547,145604,143944,1047.227506,0,0
2598,166459,142724,4993.639880,0,0
2599,145604,142724,2749.095572,0,0


In [47]:
# Merge distance_unique with total_n_spikes on station_1 and sensor_index
distance_unique = distance_unique.merge(total_n_spikes, left_on='station_1', right_on='sensor_index', how='left').copy()
distance_unique = distance_unique.rename(columns={'spikes_per_day': 'station_1_spikes_per_day'})

# Drop unnecessary columns
distance_unique = distance_unique.drop(columns=['sensor_index', 'total_n_spikes', 'total_days'])

# Merge distance_unique with total_n_spikes on station_2 and sensor_index
distance_unique = distance_unique.merge(total_n_spikes, left_on='station_2', right_on='sensor_index', how='left').copy()
distance_unique = distance_unique.rename(columns={'spikes_per_day': 'station_2_spikes_per_day'})

# Drop unnecessary columns
distance_unique = distance_unique.drop(columns=['sensor_index', 'total_n_spikes', 'total_days'])

# Display the resulting dataframe
distance_unique

Unnamed: 0,station_1,station_2,distance,MovementProbability,Presence,station_1_spikes_per_day,station_2_spikes_per_day
0,142730,143656,6716.915796,0,0,21.358333,20.983871
1,143214,143656,4959.093743,0,0,11.858491,20.983871
2,145242,143656,8384.389769,0,0,17.825532,20.983871
3,142774,143656,3532.275768,0,0,20.737430,20.983871
4,143636,143656,8629.053685,0,0,13.704861,20.983871
...,...,...,...,...,...,...,...
1321,166459,143944,7131.547418,0,0,2.638889,29.212121
1322,145604,143944,1047.227506,0,0,2.653846,29.212121
1323,166459,142724,4993.639880,0,0,2.638889,1.882022
1324,145604,142724,2749.095572,0,0,2.653846,1.882022


# Run Huff Model

## This helped us frame and understand how to run the model
#### https://gisgeography.com/huff-gravity-model/

In [48]:
# REtry

In [52]:
sensor_indexes = distance_unique.station_1.unique()

In [49]:
def run_single_simulation(distance_unique, alpha=2, beta=2):
    distance_unique_simulation = distance_unique.copy()
    distance_unique_simulation['attractiveness'] = distance_unique_simulation.station_2_spikes_per_day**alpha/distance_unique_simulation.distance**beta
    distance_unique_simulation['MovementProbability'] = distance_unique_simulation.attractiveness / distance_unique_simulation.attractiveness.sum()
    distance_unique_simulation['Presence'] = np.random.rand(len(distance_unique_simulation)) < distance_unique_simulation['MovementProbability']
    distance_unique_simulation['Presence'] = distance_unique_simulation['Presence'].astype(int)
    return distance_unique_simulation

n_simulations = 100
n_iterations = 12

simulation_results = []

for sim in range(n_simulations):
    for _ in range(n_iterations):
        simulation = run_single_simulation(distance_unique)
    simulation_results.append(simulation)

# Combine all simulation results
combined_results = pd.concat(simulation_results, ignore_index=True)
combined_results


Unnamed: 0,station_1,station_2,distance,MovementProbability,Presence,station_1_spikes_per_day,station_2_spikes_per_day,attractiveness
0,142730,143656,6716.915796,0.000099,0,21.358333,20.983871,9.759586e-06
1,143214,143656,4959.093743,0.000181,0,11.858491,20.983871,1.790468e-05
2,145242,143656,8384.389769,0.000063,0,17.825532,20.983871,6.263662e-06
3,142774,143656,3532.275768,0.000357,0,20.737430,20.983871,3.529084e-05
4,143636,143656,8629.053685,0.000060,0,13.704861,20.983871,5.913504e-06
...,...,...,...,...,...,...,...,...
132595,166459,143944,7131.547418,0.000170,0,2.638889,29.212121,1.677871e-05
132596,145604,143944,1047.227506,0.007880,0,2.653846,29.212121,7.781156e-04
132597,166459,142724,4993.639880,0.000001,0,2.638889,1.882022,1.420415e-07
132598,145604,142724,2749.095572,0.000005,0,2.653846,1.882022,4.686730e-07


In [64]:
# how many present and absent
combined_results.Presence.value_counts()

0    132504
1        96
Name: Presence, dtype: int64

In [97]:
# Merge 'PredictedsumPresence' with 'df' on 'station_1'/'sensor_index'
predictedPM2_5 = pd.merge(PredictedsumPresence, df[['sensor_index', 'latitude', 'longitude']], 
                  left_on='station_1', right_on='sensor_index', how='left').drop(columns='sensor_index')

# Drop duplicates based on 'station_1'
predictedPM2_5 = result.drop_duplicates(subset='station_1')
predictedPM2_5

Unnamed: 0,station_1,Presence,latitude,longitude
0,142718,3,44.995792,-93.295395
1,142720,2,44.95617,-93.25471
2,142724,1,44.937218,-93.243866
3,142726,3,45.01507,-93.28903
4,142728,0,44.95804,-93.245766
5,142730,0,44.99218,-93.29627
6,142732,0,44.916794,-93.273834
7,142734,0,44.903934,-93.28091
8,142736,0,44.972507,-93.28301
9,142744,45,44.99616,-93.29655


In [50]:
# end time after 120 minutes

end_time = init_time + dt.timedelta(minutes = (n_iterations) * 10)

# select out spikes
times = pd.date_range(start = init_time, 
                end = end_time, freq = '10min')


# Iterate through time stamps and identify the true presence of PM2.5 at each station

sim_results = []

for i, time in enumerate(times[1:]):
    
    true_presences = previous_readings[previous_readings.timestamp == time].sensor_index
    
    temp = simulation_results[i].copy() 
    
    temp['True Presence'] = 0
    
    condition = temp.station_1.isin(init_presences)
    temp.loc[condition, 'True Presence'] = 1
    
    sim_results.append(temp)
    
    
# combine orignal and predicted

original_predicted_results = pd.concat(sim_results, ignore_index=True)
original_predicted_results


y_true = original_predicted_results['True Presence'] 
y_pred = original_predicted_results['Presence']

In [None]:
# Accuraccy assesment

In [53]:
# accuracy assesment for alpha 2

# Update the table with the computed values
for station_index in sensor_indexes:
    select_df = original_predicted_results[original_predicted_results.station_1==station_index]
    y_true = select_df['True Presence'] 
    y_pred = select_df['Presence']
    
    
    TP = np.sum((y_true == 1) & (y_pred == 1))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))
    TN = np.sum((y_true == 0) & (y_pred == 0))

    
    distance_unique.loc[distance_unique['station_1'] == station_index, 'TP'] = TP
    distance_unique.loc[distance_unique['station_1'] == station_index, 'FP'] = FP
    distance_unique.loc[distance_unique['station_1'] == station_index, 'FN'] = FN
    distance_unique.loc[distance_unique['station_1'] == station_index, 'TN'] = TN
    
# Calculate accuracy
distance_unique['Accuracy'] = (distance_unique['TP'] + distance_unique['TN']) / (distance_unique['TP'] + distance_unique['FP'] + distance_unique['FN'] + distance_unique['TN'])
distance_unique

Unnamed: 0,station_1,station_2,distance,MovementProbability,Presence,station_1_spikes_per_day,station_2_spikes_per_day,TP,FP,FN,TN,Accuracy
0,142730,143656,6716.915796,0,0,21.358333,20.983871,0.0,0.0,0.0,12.0,1.0
1,143214,143656,4959.093743,0,0,11.858491,20.983871,0.0,0.0,0.0,24.0,1.0
2,145242,143656,8384.389769,0,0,17.825532,20.983871,0.0,0.0,0.0,36.0,1.0
3,142774,143656,3532.275768,0,0,20.737430,20.983871,0.0,0.0,0.0,48.0,1.0
4,143636,143656,8629.053685,0,0,13.704861,20.983871,0.0,0.0,0.0,60.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1321,166459,143944,7131.547418,0,0,2.638889,29.212121,0.0,0.0,0.0,600.0,1.0
1322,145604,143944,1047.227506,0,0,2.653846,29.212121,0.0,0.0,0.0,612.0,1.0
1323,166459,142724,4993.639880,0,0,2.638889,1.882022,0.0,0.0,0.0,600.0,1.0
1324,145604,142724,2749.095572,0,0,2.653846,1.882022,0.0,0.0,0.0,612.0,1.0


In [82]:
# average accuracy = alpha 2

alpha2AverageAccuracy = distance_unique['Accuracy'].mean()
alpha2AverageAccuracy

0.9730392156862745

# Run this for alpha 1 and alpha 1.5

In [341]:
sensor_indexes = distance_unique.station_1.unique()

def run_single_simulation(distance_unique, alpha=1, beta=2):
    distance_unique_simulation = distance_unique.copy()
    distance_unique_simulation['attractiveness'] = distance_unique_simulation.station_2_spikes_per_day**alpha/distance_unique_simulation.distance**beta
    distance_unique_simulation['MovementProbability'] = distance_unique_simulation.attractiveness / distance_unique_simulation.attractiveness.sum()
    distance_unique_simulation['Presence'] = np.random.rand(len(distance_unique_simulation)) < distance_unique_simulation['MovementProbability']
    distance_unique_simulation['Presence'] = distance_unique_simulation['Presence'].astype(int)
    return distance_unique_simulation

n_simulations = 100
n_iterations = 12

    # Run for alpha = 1
simulation_results_alpha1 = []

for sim in range(n_simulations):
    for _ in range(n_iterations):
        simulation = run_single_simulation(distance_unique, alpha=1)
    simulation_results_alpha1.append(simulation)

combined_results_alpha1 = pd.concat(simulation_results_alpha1, ignore_index=True)

    # Run for alpha = 1.5
simulation_results_alpha1_5 = []

for sim in range(n_simulations):
    for _ in range(n_iterations):
        simulation = run_single_simulation(distance_unique, alpha=1.5)
    simulation_results_alpha1_5.append(simulation)

combined_results_alpha1_5 = pd.concat(simulation_results_alpha1_5, ignore_index=True)

Unnamed: 0,station_1,station_2,distance,MovementProbability,Presence,station_1_spikes_per_day,station_2_spikes_per_day,TP,FP,FN,TN,Accuracy,attractiveness
0,142730,143656,6716.915796,9.2e-05,0,21.358333,20.983871,0.0,0.0,0.0,12.0,1.0,1e-05
1,143214,143656,4959.093743,0.00017,0,11.858491,20.983871,0.0,0.0,0.0,24.0,1.0,1.8e-05
2,145242,143656,8384.389769,5.9e-05,0,17.825532,20.983871,0.0,0.0,0.0,36.0,1.0,6e-06
3,142774,143656,3532.275768,0.000334,0,20.73743,20.983871,0.0,0.0,0.0,48.0,1.0,3.5e-05
4,143636,143656,8629.053685,5.6e-05,0,13.704861,20.983871,0.0,0.0,0.0,60.0,1.0,6e-06


In [342]:
# Combine all simulation results for 1
combined_results_alpha1 = pd.concat(simulation_results_alpha1, ignore_index=True)
combined_results_alpha1.head()

Unnamed: 0,station_1,station_2,distance,MovementProbability,Presence,station_1_spikes_per_day,station_2_spikes_per_day,TP,FP,FN,TN,Accuracy,attractiveness
0,142730,143656,6716.915796,7.9e-05,0,21.358333,20.983871,0.0,0.0,0.0,12.0,1.0,4.650994e-07
1,143214,143656,4959.093743,0.000145,0,11.858491,20.983871,0.0,0.0,0.0,24.0,1.0,8.532592e-07
2,145242,143656,8384.389769,5.1e-05,0,17.825532,20.983871,0.0,0.0,0.0,36.0,1.0,2.984989e-07
3,142774,143656,3532.275768,0.000286,0,20.73743,20.983871,0.0,0.0,0.0,48.0,1.0,1.681808e-06
4,143636,143656,8629.053685,4.8e-05,0,13.704861,20.983871,0.0,0.0,0.0,60.0,1.0,2.818119e-07


In [343]:
# Combine all simulation results for 1.5
combined_results_alpha1_5 = pd.concat(simulation_results_alpha1_5, ignore_index=True)
combined_results_alpha1_5.head()

Unnamed: 0,station_1,station_2,distance,MovementProbability,Presence,station_1_spikes_per_day,station_2_spikes_per_day,TP,FP,FN,TN,Accuracy,attractiveness
0,142730,143656,6716.915796,8.6e-05,0,21.358333,20.983871,0.0,0.0,0.0,12.0,1.0,2e-06
1,143214,143656,4959.093743,0.000158,0,11.858491,20.983871,0.0,0.0,0.0,24.0,1.0,4e-06
2,145242,143656,8384.389769,5.5e-05,0,17.825532,20.983871,0.0,0.0,0.0,36.0,1.0,1e-06
3,142774,143656,3532.275768,0.000311,0,20.73743,20.983871,0.0,0.0,0.0,48.0,1.0,8e-06
4,143636,143656,8629.053685,5.2e-05,0,13.704861,20.983871,0.0,0.0,0.0,60.0,1.0,1e-06


In [297]:
# how many present and absent
combined_results.Presence.value_counts()

0    143016
1        84
Name: Presence, dtype: int64

In [348]:
# end time after 120 minutes

end_time = init_time + dt.timedelta(minutes = (n_iterations) * 10)

# select out spikes
times = pd.date_range(start = init_time, 
                end = end_time, freq = '10min')

# simulate spread of PM 2.5 over time.

### combine true presence and predicted presence to run accuracy asssement on

In [349]:
# Iterate through time stamps and identify the true presence of PM2.5 at each station for alpha 1

sim_results1 = []

for i, time in enumerate(times[1:]):
    
    true_presences = previous_readings[previous_readings.timestamp == time].sensor_index
    
    temp = simulation_results_alpha1[i].copy() 
    
    temp['True Presence'] = 0
    
    condition = temp.station_1.isin(init_presences)
    temp.loc[condition, 'True Presence'] = 1
    
    sim_results1.append(temp)
    
    # combine orignal and predicted for 1

original_predicted_results1 = pd.concat(sim_results1, ignore_index=True)

y_true = original_predicted_results1['True Presence'] 
y_pred = original_predicted_results1['Presence']

original_predicted_results1

Unnamed: 0,station_1,station_2,distance,MovementProbability,Presence,station_1_spikes_per_day,station_2_spikes_per_day,TP,FP,FN,TN,Accuracy,attractiveness,True Presence
0,142730,143656,6716.915796,0.000079,0,21.358333,20.983871,0.0,0.0,0.0,12.0,1.0,4.650994e-07,0
1,143214,143656,4959.093743,0.000145,0,11.858491,20.983871,0.0,0.0,0.0,24.0,1.0,8.532592e-07,0
2,145242,143656,8384.389769,0.000051,0,17.825532,20.983871,0.0,0.0,0.0,36.0,1.0,2.984989e-07,0
3,142774,143656,3532.275768,0.000286,0,20.737430,20.983871,0.0,0.0,0.0,48.0,1.0,1.681808e-06,0
4,143636,143656,8629.053685,0.000048,0,13.704861,20.983871,0.0,0.0,0.0,60.0,1.0,2.818119e-07,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17167,157757,142724,1938.099172,0.000085,0,2.074074,1.882022,0.0,0.0,0.0,624.0,1.0,5.010405e-07,0
17168,145604,142724,2749.095572,0.000042,0,2.653846,1.882022,0.0,0.0,0.0,636.0,1.0,2.490262e-07,0
17169,157757,166459,6681.443427,0.000010,0,2.074074,2.638889,0.0,0.0,0.0,624.0,1.0,5.911266e-08,0
17170,145604,166459,7696.486014,0.000008,0,2.653846,2.638889,0.0,0.0,0.0,636.0,1.0,4.454881e-08,0


In [350]:
# Iterate through time stamps and identify the true presence of PM2.5 at each station for alpha 1.5

sim_results1_5 = []

for i, time in enumerate(times[1:]):
    
    true_presences = previous_readings[previous_readings.timestamp == time].sensor_index
    
    temp = simulation_results_alpha1_5[i].copy() 
    
    temp['True Presence'] = 0
    
    condition = temp.station_1.isin(init_presences)
    temp.loc[condition, 'True Presence'] = 1
    
    sim_results1_5.append(temp)
    
    # combine orignal and predicted for 1.5

original_predicted_results1_5 = pd.concat(sim_results1_5, ignore_index=True)
original_predicted_results1_5


y_true = original_predicted_results1_5['True Presence'] 
y_pred = original_predicted_results1_5['Presence']
original_predicted_results1_5

Unnamed: 0,station_1,station_2,distance,MovementProbability,Presence,station_1_spikes_per_day,station_2_spikes_per_day,TP,FP,FN,TN,Accuracy,attractiveness,True Presence
0,142730,143656,6716.915796,0.000086,0,21.358333,20.983871,0.0,0.0,0.0,12.0,1.0,2.130535e-06,0
1,143214,143656,4959.093743,0.000158,0,11.858491,20.983871,0.0,0.0,0.0,24.0,1.0,3.908623e-06,0
2,145242,143656,8384.389769,0.000055,0,17.825532,20.983871,0.0,0.0,0.0,36.0,1.0,1.367368e-06,0
3,142774,143656,3532.275768,0.000311,0,20.737430,20.983871,0.0,0.0,0.0,48.0,1.0,7.704052e-06,0
4,143636,143656,8629.053685,0.000052,0,13.704861,20.983871,0.0,0.0,0.0,60.0,1.0,1.290928e-06,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17167,157757,142724,1938.099172,0.000028,0,2.074074,1.882022,0.0,0.0,0.0,624.0,1.0,6.873615e-07,0
17168,145604,142724,2749.095572,0.000014,0,2.653846,1.882022,0.0,0.0,0.0,636.0,1.0,3.416312e-07,0
17169,157757,166459,6681.443427,0.000004,0,2.074074,2.638889,0.0,0.0,0.0,624.0,1.0,9.602649e-08,0
17170,145604,166459,7696.486014,0.000003,0,2.653846,2.638889,0.0,0.0,0.0,636.0,1.0,7.236802e-08,0


# Accuracy Assessment

In [363]:
# accuracy assesment for alpha 1

# Update the table with the computed values
for station_index in sensor_indexes:
    select_df = original_predicted_results1[original_predicted_results1.station_1==station_index]
    y_true = select_df['True Presence'] 
    y_pred = select_df['Presence']
    
    
    TP = np.sum((y_true == 1) & (y_pred == 1))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))
    TN = np.sum((y_true == 0) & (y_pred == 0))

    
    distance_unique.loc[distance_unique['station_1'] == station_index, 'TP'] = TP
    distance_unique.loc[distance_unique['station_1'] == station_index, 'FP'] = FP
    distance_unique.loc[distance_unique['station_1'] == station_index, 'FN'] = FN
    distance_unique.loc[distance_unique['station_1'] == station_index, 'TN'] = TN
    
# Calculate accuracy
distance_unique['Accuracy'] = (distance_unique['TP'] + distance_unique['TN']) / (distance_unique['TP'] + distance_unique['FP'] + distance_unique['FN'] + distance_unique['TN'])

acc1 = distance_unique
acc1

Unnamed: 0,station_1,station_2,distance,MovementProbability,Presence,station_1_spikes_per_day,station_2_spikes_per_day,TP,FP,FN,TN,Accuracy
0,142730,143656,6716.915796,0,0,21.358333,20.983871,0.0,0.0,0.0,12.0,1.0
1,143214,143656,4959.093743,0,0,11.858491,20.983871,0.0,0.0,0.0,24.0,1.0
2,145242,143656,8384.389769,0,0,17.825532,20.983871,0.0,0.0,0.0,36.0,1.0
3,142774,143656,3532.275768,0,0,20.737430,20.983871,0.0,0.0,0.0,48.0,1.0
4,143636,143656,8629.053685,0,0,13.704861,20.983871,0.0,0.0,0.0,60.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1426,157757,142724,1938.099172,0,0,2.074074,1.882022,0.0,0.0,0.0,624.0,1.0
1427,145604,142724,2749.095572,0,0,2.653846,1.882022,0.0,0.0,0.0,636.0,1.0
1428,157757,166459,6681.443427,0,0,2.074074,2.638889,0.0,0.0,0.0,624.0,1.0
1429,145604,166459,7696.486014,0,0,2.653846,2.638889,0.0,0.0,0.0,636.0,1.0


In [366]:
# accuracy assesment for alpha 1.5

# Update the table with the computed values
for station_index in sensor_indexes:
    select_df = original_predicted_results1_5[original_predicted_results1_5.station_1==station_index]
    y_true = select_df['True Presence'] 
    y_pred = select_df['Presence']
    
    
    TP = np.sum((y_true == 1) & (y_pred == 1))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))
    TN = np.sum((y_true == 0) & (y_pred == 0))

    
    distance_unique.loc[distance_unique['station_1'] == station_index, 'TP'] = TP
    distance_unique.loc[distance_unique['station_1'] == station_index, 'FP'] = FP
    distance_unique.loc[distance_unique['station_1'] == station_index, 'FN'] = FN
    distance_unique.loc[distance_unique['station_1'] == station_index, 'TN'] = TN
    
# Calculate accuracy
distance_unique['Accuracy'] = (distance_unique['TP'] + distance_unique['TN']) / (distance_unique['TP'] + distance_unique['FP'] + distance_unique['FN'] + distance_unique['TN'])


acc1_5 = distance_unique
acc1_5

Unnamed: 0,station_1,station_2,distance,MovementProbability,Presence,station_1_spikes_per_day,station_2_spikes_per_day,TP,FP,FN,TN,Accuracy
0,142730,143656,6716.915796,0,0,21.358333,20.983871,0.0,0.0,0.0,12.0,1.0
1,143214,143656,4959.093743,0,0,11.858491,20.983871,0.0,0.0,0.0,24.0,1.0
2,145242,143656,8384.389769,0,0,17.825532,20.983871,0.0,0.0,0.0,36.0,1.0
3,142774,143656,3532.275768,0,0,20.737430,20.983871,0.0,0.0,0.0,48.0,1.0
4,143636,143656,8629.053685,0,0,13.704861,20.983871,0.0,0.0,0.0,60.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1426,157757,142724,1938.099172,0,0,2.074074,1.882022,0.0,0.0,0.0,624.0,1.0
1427,145604,142724,2749.095572,0,0,2.653846,1.882022,0.0,0.0,0.0,636.0,1.0
1428,157757,166459,6681.443427,0,0,2.074074,2.638889,0.0,0.0,0.0,624.0,1.0
1429,145604,166459,7696.486014,0,0,2.653846,2.638889,0.0,0.0,0.0,636.0,1.0


In [364]:
# average accuracy = alpha 1

alpha1AverageAccuracy = acc1['Accuracy'].mean()
alpha1AverageAccuracy

0.9708245981830887

In [367]:
# average accuracy = alpha 1.5

alpha1_5AverageAccuracy = acc1_5['Accuracy'].mean()
alpha1_5AverageAccuracy

0.9707081295131611

# Save to Local and Remote Databases

In [89]:
# Get credentials

cred_pth = os.path.join(os.getcwd(), '..', '..', 'database', 'db_credentials.txt')

with open(cred_pth, 'r') as f:
    
    creds = f.readlines()[0].split(', ')

# Connect to PostGIS Database

pg_connection_dict = dict(zip(['dbname', 'user', 'password', 'port', 'host'], creds))

try:
    conn = psycopg2.connect(**pg_connection_dict)
    print("connected")
except:
    print("connection failed")

connected


In [None]:
#Taylor's test fix

points = os.path.join(save_path_gdb, predictedPM2_5)
fields_points = ['station_1', 'predicted_pm25', "latitude", "longitude"]

# Create SQL table
cursor = conn.cursor()
cursor.execute("DROP TABLE IF EXISTS predictedPM2_5")
cursor.execute("""
    CREATE TABLE predictedPM2_5 (
        id SERIAL,
        station_1 INT,
        predicted_pm25 FLOAT,
        latitude DOUBLE PRECISION,
        longitude DOUBLE PRECISION
        )
""")
conn.commit()

# Populate PostGIS
with arcpy.da.SearchCursor(points, fields_points) as da_cursor:
    for row in da_cursor:
        cursor.execute("INSERT INTO predictedPM2_5 (station_1, predicted_pm25, latitude, longitude) VALUES (%s, %s, %s, %s)", (row[0], row[1], row[2], row[3]))
        conn.commit()

In [101]:
# Create and fill table for predictedPM2_5 and originalPm2_5Spikes

points = os.path.join(save_path_gdb, "predictedPM2_5")
fields_points = ['pointid', 'grid_code', "SHAPE@WKT"]

# For 'predictedPM2_5'
cursor = conn.cursor()
cursor.execute("DROP TABLE IF EXISTS predictedPM2_5")
cursor.execute("""
    CREATE TABLE predictedPM2_5 (
        id SERIAL,
        station_1 INT,
        predicted_pm25 FLOAT,
        latitude FLOAT,
        longitude FLOAT
        )
""")
conn.commit()

# Populate the table
for _, row in predictedPM2_5.iterrows():
    cursor.execute(
        "INSERT INTO predictedPM2_5 (station_1, predicted_pm25, latitude, longitude) VALUES (%s, %s, %s, %s)", 
        (row['station_1'], row['predicted_pm25'], row['latitude'], row['longitude'])
    )
conn.commit()

KeyError: 'predicted_pm25'

In [102]:

points = os.path.join(save_path_gdb, "originalPm2_5Spikes")
fields_points = ['pointid', 'grid_code', "SHAPE@WKT"]


# For 'originalPm2_5Spikes'
cursor = conn.cursor()
cursor.execute("DROP TABLE IF EXISTS originalPm2_5Spikes")
cursor.execute("""
    CREATE TABLE originalPm2_5Spikes (
        id SERIAL,
        station_1 INT,
        original_pm25 FLOAT,
        latitude FLOAT,
        longitude FLOAT
        )
""")
conn.commit()

# Populate the table
for _, row in originalPm2_5Spikes.iterrows():
    cursor.execute(
        "INSERT INTO originalPm2_5Spikes (sensor_index, original_pm25, latitude, longitude) VALUES (%s, %s, %s, %s)", 
        (row['sensor_index'], row['original_pm25'], row['latitude'], row['longitude'])
    )
conn.commit()


KeyError: 'sensor_index'

In [None]:
conn.close