In [10]:
import datetime as dt
import json
import os
import sqlite3 as db
import sys
import time

import pandas as pd
import requests

In [11]:
##read the eBird API key from the environment so the secret is never stored in the notebook
##(set it before launching Jupyter, e.g. `export EBIRD_API_TOKEN=<your key>`)
##NOTE(review): the key that was previously committed here should be treated as exposed and regenerated
ebird_token = os.environ.get('EBIRD_API_TOKEN')
if not ebird_token:
    raise RuntimeError('Set the EBIRD_API_TOKEN environment variable to your eBird API key')
##open the SQLite database file and keep a cursor on the connection
cnx = db.connect('trailheadDirectBirds_sous.db')
cur = cnx.cursor()

In [12]:
##load the trailhead hotspot dataset from the Hotspots table
##TODO: #19 hardcode is an anti-pattern, fix to fetch and load the latest file --DONE
##the stored 'index' column becomes the frame index and both timestamp columns
##are cast to datetime64[ns]
trailheadHotspots = (
    pd.read_sql('select * from Hotspots',con=cnx)
    .set_index('index')
    .astype({'latestObsDt':'datetime64[ns]','latestUpdate':'datetime64[ns]'})
)

Duplicate Check

Duplicate checking is done here rather than in the hotspot factory so that trailheads in close proximity to each other can still pull the same observation data, without the eBird API being hit repeatedly for the same hotspot.

In [13]:
##TODO #21 --DONE
##one row per locId: sort descending so the newest latestObsDt comes first for each locId,
##keep only the two columns needed, then drop the later duplicates
allHotspots = (
    trailheadHotspots
    .sort_values(by=['locId','latestObsDt'],ascending=False,ignore_index=True)
    .loc[:,['locId','latestObsDt']]
    .drop_duplicates(subset=['locId'],keep='first')
)

## Checking for new hotspots and observations
1. Compare the latest trailheadHotspots eBird locId's with the eBird locId's in the cooking DB
2. For new locId's:
    1. Fetch historical data from the beginning of 2019 to today()-1 from the eBird REST API
    2. Concat and export the results as a .csv
3. For locId's with observations in the DB, obtain the most recent observation date from the eBird REST API for that hotspot:
    1. Make a list of locId's WHERE eBird's latest observation date is more recent than the latest observation date in the DB. Using the most recent DB obsDt as a start date and the most recent eBird API obsDt as an end date, fetch historical sightings from the eBird service
    2. IF the latest obsDt for a locId/hotspot from eBird is within 15 days of the latest obsDt in the DB, remove the locId from the list of locId's to be passed into the historical eBird sighting service
        - TODO: #20 write 'aging' and 'decayed' testing, error logging

In [56]:
##TODO #27 build comparison table between the latest hotspot observation vs latest obs in 'historicObservations' table --DONE
##read the stored observations and cast obsDt to datetime64[ns]
histHotObs = pd.read_sql('select * from historicObservations',con=cnx)
histHotObs['obsDt'] = histHotObs['obsDt'].astype('datetime64[ns]')
##reduce to the newest obsDt per locId: descending sort, then keep the first row of each locId
histHotObs = (
    histHotObs[['locId','obsDt']]
    .sort_values(by=['locId','obsDt'],ascending=False,ignore_index=True)
    .drop_duplicates(subset=['locId'],keep='first')
)
##left join keeps every row of allHotspots; locIds absent from histHotObs get a missing obsDt
dfk = allHotspots.merge(
    right=histHotObs,
    how='left',
    left_on='locId',
    right_on='locId'
)

In [57]:
##replace missing obsDt values in dfk with the earliest obsDt found in the historicObservations table
stored_obsDt = pd.read_sql('select obsDt from historicObservations',con=cnx)['obsDt'].astype('datetime64[ns]')
##Series.min() skips NaT and returns NaT for an empty table; the builtin min() can return NaT
##when one appears early in the column and raises on an empty column
min_obsDt = stored_obsDt.min()

##isna() covers both None and NaT (isnull() is an alias of it); obsDt is cast to datetime64
##before the merge, so a comparison against '' can never match and is not needed
dfk.loc[dfk['obsDt'].isna(),'obsDt'] = min_obsDt

In [165]:
##fetch the eBird hotspot info record for each unique locId/hotspot
##22: update the mess in the lines below to get only subnational2code from trailheadHotspots
THlocIds = (
    trailheadHotspots[['subnational2Code','locId']]
    .sort_values(by=['subnational2Code','locId'],ascending=False,ignore_index=True)
    .drop_duplicates(subset=['locId'],keep='first')
)
##22: update loop below with fetches of https://api.ebird.org/v2/ref/hotspot/info/{{subnational2Code}}
##result contains the latest sighting date at each location

##the base URL, auth header and query parameters are the same for every request, so build them once
ebird_baseUrl = 'https://api.ebird.org/v2/ref/hotspot/info/'
ebird_auth = {'X-eBirdApiToken': ebird_token}
ebird_params = {
    'back':'30',
    'fmt':'json'
    }
ebirdLocs = []
for hotspot in THlocIds.itertuples():
    ##short pause between calls
    time.sleep(0.3)
    ebird_url = ebird_baseUrl + hotspot.locId
    ##the timeout stops one stalled connection from hanging the whole run
    ebird_request = requests.get(ebird_url,params=ebird_params,headers=ebird_auth,timeout=30)
    ##raise on 4xx/5xx before touching the payload
    ebird_request.raise_for_status()
    if ebird_request.status_code == requests.codes.ok:
        ##index=[0] turns the single JSON object into a one-row frame
        ebird_resp = pd.DataFrame(ebird_request.json(),index=[0])
        ebirdLocs.append(ebird_resp)

##pd.concat raises on an empty list, so fall back to an empty frame when nothing was fetched
ebirdLocs_toDate = pd.concat(ebirdLocs) if ebirdLocs else pd.DataFrame()

In [167]:
##TODO #22 recency refactor: get the latest date some other way; eBird hotspot info does not contain the latest date
##display the dtype of every column in the combined hotspot info frame
ebirdLocs_toDate.dtypes
##TODO #23 recency refactor: pare down results of hotspot batches to match known Trailhead Direct eBird hotspots

locId                object
name                 object
latitude            float64
longitude           float64
countryCode          object
countryName          object
subnational1Name     object
subnational1Code     object
subnational2Code     object
subnational2Name     object
isHotspot              bool
locName              object
lat                 float64
lng                 float64
hierarchicalName     object
locID                object
dtype: object

In [58]:
##todo #8: write recency check engine and updater. --DONE
##TODO: #15 Check for a gap between the latest run time and current date --DONE

##whole days between each hotspot's latest observation (latestObsDt) and obsDt;
##the subtraction is vectorized over the columns, so no loop is needed
dfk['DaysDiff'] = (dfk['latestObsDt']-dfk['obsDt']).dt.days

##avoiding running the big differences for now:
##only gaps strictly between these two bounds (in days) are selected for a fetch
MIN_GAP_DAYS = 8
MAX_GAP_DAYS = 900
needs_fetch = (dfk['DaysDiff']>MIN_GAP_DAYS) & (dfk['DaysDiff']<MAX_GAP_DAYS)

##'run' is False for rows that fail the recency test (gap inside the bounds) and True for
##everything else, including rows whose DaysDiff is missing; only the False rows are kept
dfk['run'] = ~needs_fetch
##.copy() makes the filtered frame independent, so re-running this cell does not assign into a slice
dfk = dfk[dfk['run'] == False].copy()
dfk.head()

Unnamed: 0,locId,latestObsDt,obsDt,DaysDiff,run
1,L854460,2022-01-23 10:16:00,2020-12-07 07:25:00,412.0,False
2,L8365620,2022-01-31 16:16:00,2020-08-28 13:30:00,521.0,False
6,L8102503,2022-01-27 08:01:00,2020-10-17 09:22:00,466.0,False
8,L7672326,2021-08-21 12:30:00,2020-08-15 15:30:00,370.0,False
9,L7485982,2021-02-20 12:24:00,2019-01-01 08:18:00,781.0,False


In [63]:
##TODO #28 write a daterange series for each locId in dfk that fails (false) the run test --DONE
##trial subset: only the first two rows of dfk
dfk_test = pd.DataFrame(dfk.iloc[:2])
dfk_test.head()

Unnamed: 0,locId,latestObsDt,obsDt,DaysDiff,run
1,L854460,2022-01-23 10:16:00,2020-12-07 07:25:00,412.0,False
2,L8365620,2022-01-31 16:16:00,2020-08-28 13:30:00,521.0,False


In [None]:
##TODO #30 write a test that aborts the rest of the scripts if all locId's are up to date (true)

In [74]:
##fetch eBird historic observations for every day between obsDt and latestObsDt of each row in dfk_test
##the base URL, auth header and query parameters are the same for every request, so build them once
ebird_baseUrl = 'https://api.ebird.org/v2/data/obs/'
ebird_auth = {'X-eBirdApiToken': ebird_token}
ebird_params = {
    'fmt':'json',
    'detail':'simple'
}
ObsHist = []

for hotspot in dfk_test.itertuples():
    ##normalize=True snaps both ends to midnight, so the final calendar day is always included
    ##even when latestObsDt falls earlier in the day than obsDt
    for day in pd.date_range(start=hotspot.obsDt,end=hotspot.latestObsDt,freq='D',normalize=True):
        ##short pause between calls
        time.sleep(0.3)
        ymd = '{}/{}/{}'.format(day.year,day.month,day.day)
        ebird_url = ebird_baseUrl + hotspot.locId + '/historic/' + ymd
        ##the timeout stops one stalled connection from hanging the whole run
        ebird_request = requests.get(ebird_url,params=ebird_params,headers=ebird_auth,timeout=30)
        ##raise on 4xx/5xx before touching the payload
        ebird_request.raise_for_status()
        if ebird_request.status_code == requests.codes.ok:
            ebird_resp = pd.DataFrame(ebird_request.json())
            ObsHist.append(ebird_resp)

In [75]:
##TODO #29 write the new observations to the database
##stack the frames collected in ObsHist into a single table
historicObservations = pd.concat(objs=ObsHist)

In [76]:
##append the combined observations to the historicObservations_cooking table through cnx
historicObservations.to_sql(
    'historicObservations_cooking',
    cnx,
    if_exists='append',
)

In [24]:
##removing made obsolete by other scripts written for issue 28
##NOTE(review): xCookedHotspots and dateList_hist are not defined in any cell shown here;
##confirm where they come from, or delete this cell, before relying on Restart & Run All

##the base URL, auth header and query parameters are the same for every request, so build them once
ebird_baseUrl = 'https://api.ebird.org/v2/data/obs/'
ebird_auth = {'X-eBirdApiToken': ebird_token}
ebird_params = {
    'fmt':'json',
    'detail':'simple'
}
ObsHist = []

for hotspot in xCookedHotspots.itertuples():
    for obs_date in dateList_hist:
        ##short pause between calls
        time.sleep(0.5)
        ymd = '{}/{}/{}'.format(obs_date.year,obs_date.month,obs_date.day)
        ebird_url = ebird_baseUrl + hotspot.locId + '/historic/' + ymd
        ##the timeout stops one stalled connection from hanging the whole run
        ebird_request = requests.get(ebird_url,params=ebird_params,headers=ebird_auth,timeout=30)
        ##raise on 4xx/5xx before touching the payload
        ebird_request.raise_for_status()
        if ebird_request.status_code == requests.codes.ok:
            ebird_resp = pd.DataFrame(ebird_request.json())
            ObsHist.append(ebird_resp)

In [25]:
##combine the frames collected in ObsHist and preview the first rows
batch_ObsHist = pd.concat(objs=ObsHist)
batch_ObsHist.head()

Unnamed: 0,speciesCode,comName,sciName,locId,locName,obsDt,howMany,lat,lng,obsValid,obsReviewed,locationPrivate,subId
0,commer,Common Merganser,Mergus merganser,L4381196,Cedar Grove Natual Area,2020-04-26 13:30,1,47.462973,-122.080936,True,False,False,S67866709
1,baleag,Bald Eagle,Haliaeetus leucocephalus,L4381196,Cedar Grove Natual Area,2020-04-26 13:30,2,47.462973,-122.080936,True,False,False,S67866709
2,rebsap,Red-breasted Sapsucker,Sphyrapicus ruber,L4381196,Cedar Grove Natual Area,2020-04-26 13:30,2,47.462973,-122.080936,True,False,False,S67866709
3,amerob,American Robin,Turdus migratorius,L4381196,Cedar Grove Natual Area,2020-04-26 13:30,2,47.462973,-122.080936,True,False,False,S67866709
0,rocpig,Rock Pigeon,Columba livia,L4381196,Cedar Grove Natual Area,2020-04-28 12:18,3,47.462973,-122.080936,True,False,False,S70289791


In [26]:
##TODO #16 dynamically name files to denote recency
##export the batch as a comma-separated file, leaving the frame index out
batch_ObsHist.to_csv(
    path_or_buf='MountSi_Obs_2019to2021.csv',
    sep=',',
    index=False,
)