In [15]:
import pandas as pd
import requests
import pytest
import datetime as dt
import json
import sys
import os
import sqlite3 as db

In [16]:
# eBird API token. Prefer the EBIRD_API_TOKEN environment variable so the
# credential does not have to live in the notebook; the literal is kept only as
# a backward-compatible fallback.
# NOTE(review): a token committed to a notebook should be treated as leaked --
# rotate it and drop the fallback once the environment variable is in place.
ebird_token = os.environ.get('EBIRD_API_TOKEN', 'j6c7l80ga2ib')
# SQLite database file opened by connectDB().
db_name = 'trailheadDirectBirds_sous.db'

In [17]:
##connect to database
def connectDB():
    """Open and return a SQLite connection to the file named by ``db_name``.

    Returns:
        The connection object produced by ``db.connect(db_name)``.

    Raises:
        UserWarning: if the connection attempt raises any exception; the
            original error text is embedded in the message.
    """
    try:
        return db.connect(db_name)
    except Exception as cnxError:
        raise UserWarning(f'Unable to connect to database due to: {cnxError}')

In [88]:
def get_hotspots(minDays=7, maxDays=700):
    """Select the eBird hotspots whose stored observation history needs refreshing.

    Reads the ``Hotspots`` table (skipping stops whose ``StopName`` starts with
    ``DiscoveryPark``) and the ``historicObservations`` table through
    ``connectDB()``, keeps the newest row per ``locId`` from each, and
    left-joins them on ``locId``.

    A hotspot is dropped when it already has stored observations and its
    ``latestObsDt`` is earlier than the ``loadDate`` of its newest stored
    observation. Hotspots with no stored observations are given the earliest
    ``obsDt`` found in ``historicObservations`` as their starting point.
    Finally, only hotspots whose gap ``latestObsDt - obsDt`` lies between
    ``minDays`` and ``maxDays`` whole days (inclusive) are returned.

    Args:
        minDays: smallest gap, in days, that still triggers a refresh.
        maxDays: largest gap, in days, that still triggers a refresh.

    Returns:
        DataFrame with the columns ``locId``, ``latestObsDt`` and ``obsDt``.

    Raises:
        UserWarning: wraps any exception raised while reading or processing the
            tables (the original exception is chained as ``__cause__``).
    """
    cnx = connectDB()
    try:
        # Latest hotspots from the database. Discovery Park stops are excluded
        # until a later date. The pattern is single-quoted: SQLite treats
        # double-quoted text as an identifier and only falls back to a string
        # literal as a legacy quirk.
        trailheadHotspots = pd.read_sql(
            "select * from Hotspots where Hotspots.StopName not like 'DiscoveryPark%'",
            con=cnx,
            parse_dates=['latestObsDt', 'latestUpdate'],
        )
        # Newest latestObsDt per locId (descending sort, keep first).
        uniqueHotspots = (
            trailheadHotspots
            .sort_values(by=['locId', 'latestObsDt'], ascending=False, ignore_index=True)
            [['locId', 'latestObsDt']]
            .drop_duplicates(subset=['locId'], keep='first')
        )

        # Observations already stored in the database.
        histHotObs = pd.read_sql(
            'select locId,obsDt,loadDate from historicObservations',
            con=cnx,
            parse_dates=['obsDt', 'loadDate'],
        )
        # Series.min() skips NaT; the builtin min() returns an order-dependent
        # result when NaT is present and raises on an empty table.
        min_obsDt = histHotObs['obsDt'].min()
        latestHistObs = (
            histHotObs
            .sort_values(by=['locId', 'obsDt'], ascending=False, ignore_index=True)
            .drop_duplicates(subset=['locId'], keep='first')
        )

        # Merge the tables; hotspots without stored observations get NaT in
        # obsDt and loadDate.
        hotspots = pd.merge(left=uniqueHotspots, right=latestHistObs, how='left', on='locId')

        # Remove hotspots without a recent checklist: history exists AND the
        # hotspot's latest observation predates the last load. The two
        # conditions must be and-ed; or-ing them removes every hotspot that has
        # any history and leaves the date comparison without effect.
        agingMask = hotspots['loadDate'].notna() & (hotspots['latestObsDt'] < hotspots['loadDate'])
        # .copy() so the column assignments below act on an independent frame.
        hotspots = hotspots[~agingMask].copy()

        # Replace missing observation dates with the earliest value in the table.
        hotspots['obsDt'] = hotspots['obsDt'].fillna(min_obsDt)

        # Recency check: whole days between the newest eBird observation and
        # the newest stored one.
        hotspots['dateDiff'] = (hotspots['latestObsDt'] - hotspots['obsDt']).dt.days
        runMask = hotspots['dateDiff'].between(minDays, maxDays, inclusive='both')
        dfk = hotspots.loc[runMask, ['locId', 'latestObsDt', 'obsDt']]
    except Exception as ee:
        raise UserWarning(ee) from ee
    finally:
        cnx.close()
    return dfk

## Checking for new hotspots and observations
1. Compare the latest trailheadHotspots eBird locIds with the eBird locIds in the cooking DB.
2. For new locIds:
    1. Fetch historical data from the beginning of 2019 through yesterday (today minus one day) from the eBird REST API.
    2. Concatenate the results and export them as a .csv file.
3. For locIds with observations in the DB, obtain the most recent observation date from the eBird REST API for that hotspot:
    1. Make a list of locIds where eBird's latest observation date is more recent than the latest observation date in the DB. Using the most recent DB obsDt as the start date and the most recent eBird API obsDt as the end date, fetch historical sightings from the eBird service.
    2. If the latest obsDt for a locId/hotspot from eBird is within 15 days of the latest obsDt in the DB, remove the locId from the list of locIds to be passed to the historical eBird sightings service.
        - TODO: #20 write 'aging' and 'decayed' testing, error logging

In [33]:
##TODO #22 recency refactor: get the latest date some other way; eBird hotspot info does not contain the latest date
##TODO #23 recency refactor: pare down results of hotspot batches to match known Trailhead Direct eBird hotspots

In [90]:
def get_histObs(limit=2, timeout=30):
    """Fetch missing daily observations from eBird and append them to the database.

    For each hotspot returned by ``get_hotspots()`` (restricted to the first
    ``limit`` rows), requests the eBird ``historic`` observations endpoint once
    per calendar day from the day after the hotspot's ``obsDt`` through its
    ``latestObsDt``. Every response is stamped with a ``loadDate`` column, all
    responses are concatenated, and the result is appended to the
    ``historicObservations_cooking`` table.

    Args:
        limit: number of hotspots to process; ``None`` processes all of them.
            The default of 2 keeps the original test-sized run.
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        DataFrame holding every observation that was fetched and stored.

    Raises:
        Exception: if ``get_hotspots()`` yields no rows.
        requests.HTTPError: if eBird answers with an error status.
    """
    import time
    from datetime import timedelta

    # Request pieces that do not change between calls.
    ebird_baseUrl = 'https://api.ebird.org/v2/data/obs/'
    ebird_auth = {'X-eBirdApiToken': ebird_token}
    ebird_params = {
        'fmt': 'json',
        'detail': 'simple'
    }

    hotspots = get_hotspots()
    if limit is not None:
        hotspots = hotspots.head(limit)
    if hotspots.empty:
        raise Exception('The provided dataframe is empty. There are either no eBird hotspots with new checklists or an error has occurred in the evaluation of the get_hotspots() function.')

    ObsHist = []
    for hotspot in hotspots.itertuples():
        # Start the day after the newest observation already stored.
        startDate = hotspot.obsDt + timedelta(days=1)
        for day in pd.date_range(start=startDate, end=hotspot.latestObsDt, freq='D'):
            # Pause between calls to stay gentle on the eBird API.
            time.sleep(0.5)
            ymd = '{}/{}/{}'.format(day.year, day.month, day.day)
            ebird_url = ebird_baseUrl + hotspot.locId + '/historic/' + ymd
            # A timeout prevents one stalled request from hanging the whole run.
            ebird_request = requests.get(
                ebird_url,
                params=ebird_params,
                headers=ebird_auth,
                timeout=timeout,
            )
            # Fail fast on error statuses before touching the payload.
            ebird_request.raise_for_status()
            if ebird_request.status_code == requests.codes.ok:
                ebird_resp = pd.DataFrame(ebird_request.json())
                ebird_resp['loadDate'] = dt.datetime.today()
                ObsHist.append(ebird_resp)

    historicObservations = pd.concat(ObsHist)

    # Open the connection only for the write and always release it, so a
    # failure while fetching or writing cannot leak the handle.
    cnx = connectDB()
    try:
        historicObservations.to_sql(name='historicObservations_cooking', con=cnx, if_exists='append')
    finally:
        cnx.close()
    return historicObservations

In [91]:
# Run the historic-observation fetch/load and preview the first rows of the result.
xr = get_histObs()
xr.head()

Unnamed: 0,loadDate,speciesCode,comName,sciName,locId,locName,obsDt,howMany,lat,lng,obsValid,obsReviewed,locationPrivate,subId
0,2022-02-23 13:15:40.822073,cangoo,Canada Goose,Branta canadensis,L4703260,Square Lake State Park,2021-03-30 14:33,3.0,47.48027,-122.684439,True,False,False,S84423174
1,2022-02-23 13:15:40.822073,mallar3,Mallard,Anas platyrhynchos,L4703260,Square Lake State Park,2021-03-30 14:33,8.0,47.48027,-122.684439,True,False,False,S84423174
2,2022-02-23 13:15:40.822073,gnwtea,Green-winged Teal,Anas crecca,L4703260,Square Lake State Park,2021-03-30 14:33,6.0,47.48027,-122.684439,True,False,False,S84423174
3,2022-02-23 13:15:40.822073,buffle,Bufflehead,Bucephala albeola,L4703260,Square Lake State Park,2021-03-30 14:33,10.0,47.48027,-122.684439,True,False,False,S84423174
4,2022-02-23 13:15:40.822073,killde,Killdeer,Charadrius vociferus,L4703260,Square Lake State Park,2021-03-30 14:33,3.0,47.48027,-122.684439,True,False,False,S84423174
