In [355]:
import pandas as pd
import sqlite3 as db
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score,explained_variance_score
from sklearn.cluster import KMeans
from statsmodels.tsa.seasonal import seasonal_decompose

In [3]:
github_userName = 'Tanag3r'
ebird_token = 'j6c7l80ga2ib'
db_name = 'trailheadDirectBirds_sous.db'

In [4]:
##connect to database
def connectDB():
    try:
        cnx = db.connect(db_name)
    except Exception as cnxError:
        raise UserWarning(f'Unable to connect to database due to: {cnxError}')
    return cnx

In [5]:
def stopQualityMask(speciesCode: str,closestStop: str):
    cnx = connectDB()
    try:
        gap = dt.date.today().year-2018
        query = f'SELECT COUNT(DISTINCT(year)) as "frq" FROM coefficients_bySpecies WHERE speciesCode = "{speciesCode}" AND closestStop = "{closestStop}"'
        coeficients = pd.read_sql(query,con=cnx)
        coeficients['frq'] = coeficients.apply(lambda g: (g.frq/gap),axis=1)
    except Exception as maskExc:
        raise maskExc
    return coeficients['frq']

In [224]:
def log_stopMetrics(speciesCode,key,testBlob,latestUpdate):
    cnx = connectDB()
    cur = cnx.cursor()
    try:
        sqliteInsert = """INSERT INTO stopMetricsBlob (speciesCode,key,testBlob,latestUpdate) VALUES (?,?,?,?);"""
        logTuple = (speciesCode,key,testBlob,latestUpdate)
        cur.execute(sqliteInsert,logTuple)
        cnx.commit()
        cur.close()
    except db.Error as sqlError:
        raise sqlError
    finally:
        if cnx:
            cnx.close()

In [274]:
#baseline request from the application layer
#outputs a list of birds at the stop and a classification based solely off the number of observations
def birdList_request(StopName: str,cnx):
    try:
        query = f'SELECT speciesCode,count(subId) as "checklists",(SELECT count(subId) FROM historicObservations hxobx WHERE hxobx.speciesCode=hsob.speciesCode) as "sightings" FROM historicObservations hsob LEFT JOIN closestStop on hsob.locId=closestStop.locId WHERE StopName = "{StopName}" GROUP BY speciesCode;'
        sightings = pd.read_sql(sql=query,con=cnx)
    #rareness at the stop
        sightings['stopGroup'] = int
        bucket = sightings['checklists'].quantile([0,0.15,0.5,0.85,1])
        sightings.loc[sightings['checklists'] <= bucket[0.15],'stopGroup'] = 1  #mythic
        sightings.loc[(sightings['checklists'] > bucket[0.15]) & (sightings['checklists'] <= bucket[0.5]),'stopGroup'] = 2  #rare
        sightings.loc[(sightings['checklists'] > bucket[0.5]) & (sightings['checklists'] < bucket[0.85]),'stopGroup'] = 3   #uncommon
        sightings.loc[(sightings['checklists'] >= bucket[0.85]) & (sightings['checklists'] <=bucket[1]),'stopGroup'] = 4    #common
    #overall rareness
        sightings['overall'] = int
        bucket = sightings['sightings'].quantile([0,0.15,0.5,0.85,1])
        sightings.loc[sightings['sightings'] <= bucket[0.15],'overall'] = 1
        sightings.loc[(sightings['sightings'] > bucket[0.15]) & (sightings['sightings'] <= bucket[0.5]),'overall'] = 2
        sightings.loc[(sightings['sightings'] > bucket[0.5]) & (sightings['sightings'] < bucket[0.85]),'overall'] = 3
        sightings.loc[(sightings['sightings'] >= bucket[0.85]) & (sightings['sightings'] <=bucket[1]),'overall'] = 4
        sightings['StopName'] = StopName
    #raise an exception if the stopName given is not valid and return a list of valid stop names
    except Exception as ex:
        raise ex
    return sightings


In [275]:
birdlist = birdList_request(StopName='EastSunsetWay',cnx=connectDB())

In [442]:
birdlist

Unnamed: 0,speciesCode,checklists,sightings,stopGroup,overall,StopName
0,amecro,91,293,4,4,EastSunsetWay
1,amedip,10,10,3,2,EastSunsetWay
2,amegfi,11,46,3,2,EastSunsetWay
3,amekes,1,1,1,1,EastSunsetWay
4,amerob,80,371,4,4,EastSunsetWay
...,...,...,...,...,...,...
100,wooduc,5,95,2,3,EastSunsetWay
101,x00051,1,1,1,1,EastSunsetWay
102,y00475,9,36,3,2,EastSunsetWay
103,yelwar,3,16,2,2,EastSunsetWay


For birds with robust data, do they behave the same at all stops? If not, what stops does each species appear to prefer? Is that preference explained by habitat?

In [483]:
#TODO #92 change over flat annual average to weekly average in wklyAbd_selectSpecies() and other calcs
def wklyAbd_selectSpecies(cnx,speciesList: list):
    cnx=cnx
    try:
        querySpecies = []
        for i in speciesList:
            i = str(i)
            querySpecies.append(i)
        querySpecies = str(querySpecies).strip('[]')
        #query = f'SELECT speciesCode,FX.locId,StopName,obsDt,howMany FROM historicObservations AS FX  LEFT JOIN closestStop on FX.locId = closestStop.locId WHERE (SELECT count(distinct(subId)) FROM historicObservations AS QA WHERE QA.comName = FX.comName) > 2 AND FX.locId in ({query_locIds})'
    #call baseline
        queryBaseline = f'SELECT speciesCode,obsDt,howMany FROM historicObservations WHERE speciesCode in ({querySpecies})'
        baseline = pd.read_sql(sql=queryBaseline,con=cnx,parse_dates=['obsDt'])
        baseline = baseline.assign(obsDt_week=baseline.obsDt.dt.isocalendar().week)
        baseline['howMany'].fillna(1,inplace=True)
        baselineAvg = baseline.groupby(['speciesCode'])['howMany'].mean()

        query = f'SELECT speciesCode,FX.locId,StopName,obsDt,howMany FROM historicObservations AS FX LEFT JOIN closestStop on FX.locId = closestStop.locId WHERE (SELECT count(distinct(subId)) FROM historicObservations AS QA WHERE QA.comName = FX.comName) > 2 AND FX.speciesCode in ({querySpecies});'
        obsData = pd.read_sql(query,con=cnx,parse_dates=['obsDt'])
        obsData = obsData.assign(obsDt_week=obsData.obsDt.dt.isocalendar().week)
        obsData['howMany'].fillna(1,inplace=True)
        obsData = obsData.groupby(['speciesCode','StopName','obsDt_week'])['howMany'].mean().reset_index()
    #compute relative abundance
        obsData['relativeAbundance'] = obsData.apply(lambda x: (x.howMany/baselineAvg[x.speciesCode]),axis=1)  #baseline relative abd, same process as Fink et. all
        #obsData['relativeAbundance'] = obsData.apply(lambda f: stopQualityMask(f.speciesCode,f.StopName)*f.relativeAbundance,axis=1)  #apply frequency mask
        #avgAbd = obsData.groupby(['speciesCode'])['relativeAbundance'].mean()
        #obsData['relativeAbundance'] = obsData.apply(lambda n: ((n.relativeAbundance)/(avgAbd[n.speciesCode])),axis=1)  #normalizing around average relative abundance
        obsData.sort_values(by=['StopName','obsDt_week'],ascending=True,inplace=True)
    except db.DatabaseError as dbExc:
        raise f'there was an issue with the database request: {dbExc}'
    except Exception as ex:
        raise ex
    finally: cnx.close()
    return obsData[['obsDt_week','speciesCode','StopName','relativeAbundance']]

Interpolation testing

In [492]:
gocks = wklyAbd_selectSpecies(cnx=connectDB(),speciesList=['gockin'])

In [565]:
gocksWeek = gocks[gocks['StopName']=='EastSunsetWay']
allweek = pd.DataFrame({'obsDt_week':range(1,53)})
gocksWeek = pd.merge(left=allweek,left_on='obsDt_week',right=gocksWeek,right_on='obsDt_week',how='left')
#mask
gocksWeek['mask'] = gocksWeek['relativeAbundance'].interpolate(method='ffill',limit=4,limit_direction='forward')
gocksWeek['mask'] = gocksWeek['mask'].interpolate(method='bfill',limit=2,limit_direction='backward')
gocksWeek.loc[gocksWeek['mask'].isna() == True,'relativeAbundance'] = 0
gocksWeek = gocksWeek[gocksWeek['relativeAbundance'].notna()].drop(columns=['mask'])
gocksWeek

Unnamed: 0,obsDt_week,speciesCode,StopName,relativeAbundance
0,1,gockin,EastSunsetWay,0.296554
1,2,gockin,EastSunsetWay,0.533797
2,3,gockin,EastSunsetWay,0.177932
3,4,gockin,EastSunsetWay,1.957256
4,5,gockin,EastSunsetWay,0.355865
7,8,gockin,EastSunsetWay,1.779324
9,10,gockin,EastSunsetWay,0.355865
12,13,gockin,EastSunsetWay,0.355865
13,14,gockin,EastSunsetWay,0.355865
18,19,,,0.0


In [645]:
sample_gocksWeek = gocksWeek[['obsDt_week','relativeAbundance']].sample(frac=0.35,random_state=1)

In [646]:
interpTest = pd.merge(left=gocksWeek,left_on='obsDt_week',right=sample_gocksWeek,right_on='obsDt_week',how='left')

In [648]:
interpTest['cubic3'] = interpTest['relativeAbundance_y'].interpolate(method='cubic',order=3,limit_direction='both').fillna(interpTest['relativeAbundance_x'])
interpTest['slinear'] = interpTest['relativeAbundance_y'].interpolate(method='slinear',order=5,limit_direction='both').fillna(interpTest['relativeAbundance_x'])
interpTest['linear'] = interpTest['relativeAbundance_y'].interpolate(method='linear',limit_direction='both').fillna(interpTest['relativeAbundance_x'])
interpTest['polynomial3'] = interpTest['relativeAbundance_y'].interpolate(method='polynomial',order=3,limit=5).fillna(interpTest['relativeAbundance_x'])
interpTest['polynomial5'] = interpTest['relativeAbundance_y'].interpolate(method='polynomial',order=5,limit=5).fillna(interpTest['relativeAbundance_x'])
interpTest['polynomial7'] = interpTest['relativeAbundance_y'].interpolate(method='polynomial',order=7).fillna(interpTest['relativeAbundance_x'])
interpTest['spline3'] = interpTest['relativeAbundance_y'].interpolate(method='spline',order=3).fillna(interpTest['relativeAbundance_x'])
interpTest['spline5'] = interpTest['relativeAbundance_y'].interpolate(method='spline',order=5).fillna(interpTest['relativeAbundance_x'])
interpTest['quadratic'] = interpTest['relativeAbundance_y'].interpolate(method='quadratic').fillna(interpTest['relativeAbundance_x'])


In [649]:
r2score = [(method,r2_score(interpTest['relativeAbundance_x'],interpTest[method])) for method in list(interpTest)[5:]]
r2score

[('cubic3', 0.6479143895715169),
 ('slinear', 0.8143718841492182),
 ('linear', -0.03983576806099798),
 ('polynomial3', 0.6689218010201817),
 ('polynomial5', -1417.5000433013677),
 ('polynomial7', -908333.8506988395),
 ('spline3', 0.1652746427785614),
 ('spline5', -1.055052335806987),
 ('quadratic', 0.7987413187884767)]

In [650]:
max(r2score,key=lambda x:x[1])

('slinear', 0.8143718841492182)

In [435]:
##TODO #90 rewrite stopCovariance to run for a single species at a time --DONE

def stopCovariance(obsData: pd.DataFrame,species: str,StopKey: str,cnx):
    try:
        obsData.sort_values(by=['StopName','obsDt_week'],ascending=True,inplace=True)
        container = []
        resultsContainer = []
        stopKeys = obsData.drop_duplicates(subset=['StopName'])
        for StopName in stopKeys.itertuples():
            stop_obsData = obsData[obsData['speciesCode']==species]
            stop_obsData = stop_obsData[stop_obsData['StopName']==StopName.StopName]
            allweek = pd.DataFrame({'obsDt_week':range(1,53)})
            stop_obsData.drop(columns=['speciesCode','StopName'],inplace=True)
            stop_obsData = pd.merge(left=stop_obsData,right=allweek,left_on='obsDt_week',right_on='obsDt_week',how='outer')
            stop_obsData.set_index('obsDt_week',inplace=True)
            stop_obsData.sort_index(axis='index',ascending=True,inplace=True)
        #TODO #91 write interpolation switches dependent on rarity for stopCovariance()
            stop_obsData['mask'] = stop_obsData['relativeAbundance'].interpolate(method='ffill',limit=5,limit_direction='forward')  #mask, values do not matter
            stop_obsData['mask'] = stop_obsData['mask'].interpolate(method='bfill',limit=2,limit_direction='backward')  #mask, values do not matter
            stop_obsData.loc[stop_obsData['mask'].isna() == True,'relativeAbundance'] = 0
            stop_obsData['fx_relativeAbundance'] = stop_obsData['relativeAbundance'].interpolate(method='linear',limit=5,limit_direction='forward')
            stop_obsData['fx_relativeAbundance'] = stop_obsData['fx_relativeAbundance'].interpolate(method='linear',limit=2,limit_direction='backward')

            stop_obsData.drop(columns=['relativeAbundance','mask'],inplace=True)
            stop_obsData.rename(columns={'fx_relativeAbundance':StopName.StopName},inplace=True)              
            container.append(stop_obsData)
        weeklySpeciesAbd = pd.concat(container,ignore_index=False,axis=1)
        weeklySpeciesAbd = weeklySpeciesAbd.fillna(value=0,axis=0)
        weeklySpeciesAbd = weeklySpeciesAbd.apply(lambda n:(np.log1p(n)),axis=1)
        y_true = weeklySpeciesAbd[StopKey]
    #return related stops
        fit = [(stop,r2_score(y_true,weeklySpeciesAbd[stop])) for stop in list(weeklySpeciesAbd)]
        rSqr = filter(lambda r: (r[1] > 0.25) and (r[1]<1),fit)
        var = [(stop,explained_variance_score(y_true,weeklySpeciesAbd[stop])) for stop in list(weeklySpeciesAbd)]
        expVar = filter(lambda e: (e[1]>0.25) and (e[1]<1),var)
        blob = {'rSquared':list(rSqr),'explVar':list(expVar)}
        log_stopMetrics(speciesCode=species,key=StopKey,testBlob=str(blob),latestUpdate=str(dt.datetime.today()))

    except Exception as ex:
        raise ex
    return {'speciesCode':species,'stopKey':StopKey,'rSquared':list(rSqr),'explVar':list(expVar)}
    #return y_true

In [490]:
amerob = wklyAbd_selectSpecies(cnx=connectDB(),speciesList=['amecro'])

for x in list(amerob['StopName'].drop_duplicates()):
    stopCovariance(obsData=amerob,species='amecro',StopKey=x,cnx=connectDB())

Habitats

In [332]:
def kmeans_habitat(cnx,distinctHabitats: int):
    try:
        data = pd.read_sql(sql='SELECT * FROM FAO_by_locId;',con=cnx)
        data = data.drop(columns=['locName']).set_index('locId')
        data.fillna(0,inplace=True)
    #normalize
        maxValue = data.apply(max,axis=1)
        data = data.apply(lambda x: (x/maxValue[x.index]),axis=0)   #min-max normalizing to smooth in proportionality
    #compute kmeans for each locId
        habitat_kmeans = KMeans(n_clusters=distinctHabitats,init='k-means++')
        habitat_kmeans = habitat_kmeans.fit(data.values)
        clusterLabels = habitat_kmeans.labels_
    #define habitats
        habitatFrame = pd.DataFrame(data=clusterLabels,columns=['clusterLabel'],index=data.index).sort_values(by='clusterLabel').reset_index()
        habitatFrame = pd.merge(left=habitatFrame,left_on='locId',right=data,right_on='locId',how='left')
    except Exception as kmeansExc:
        raise kmeansExc
    return habitatFrame     

In [477]:
def wklyAbd_selectSpecies_locIds(cnx,speciesList: list,locIdList: list):
    cnx=cnx
    try:
        querySpecies = []
        for i in speciesList:
            i = str(i)
            querySpecies.append(i)
        querySpecies = str(querySpecies).strip('[]')
        queryLocs = []
        for i in locIdList:
            i = str(i)
            queryLocs.append(i)
        queryLocs = str(queryLocs).strip('[]')
    #baseline
        baseQuery = f'SELECT speciesCode,obsDt,howMany FROM historicObservations WHERE speciesCode in ({querySpecies});'
        baseline = pd.read_sql(baseQuery,con=cnx,parse_dates=['obsDt'])
        baseline = baseline.assign(obsDt_week=baseline.obsDt.dt.isocalendar().week)
        baselineAvg = baseline.groupby(['speciesCode'])['howMany'].mean()
        #query = f'SELECT speciesCode,FX.locId,StopName,obsDt,howMany FROM historicObservations AS FX  LEFT JOIN closestStop on FX.locId = closestStop.locId WHERE (SELECT count(distinct(subId)) FROM historicObservations AS QA WHERE QA.comName = FX.comName) > 2 AND FX.speciesCode in ({querySpecies}) AND FX.locId IN ({queryLocs});'
        query = f'SELECT speciesCode,FX.locId,obsDt,howMany FROM historicObservations AS FX WHERE (SELECT count(distinct(subId)) FROM historicObservations AS QA WHERE QA.comName = FX.comName) > 2 AND FX.speciesCode in ({querySpecies}) AND FX.locId IN ({queryLocs});'
        obsData = pd.read_sql(query,con=cnx,parse_dates=['obsDt'])
        obsData = obsData.assign(obsDt_week=obsData.obsDt.dt.isocalendar().week)
        obsData['howMany'].fillna(1,inplace=True)
        obsData = obsData.groupby(['speciesCode','obsDt_week'])['howMany'].mean().reset_index()
        #maxCount = obsData.groupby(['speciesCode'])['howMany'].max()
        
    #derive relative abundance
        obsData['relativeAbundance'] = obsData.apply(lambda x: (x.howMany/baselineAvg[x.speciesCode]),axis=1)  #baseline relative abd, same process as Fink et. all
        #obsData['relativeAbundance'] = obsData.apply(lambda f: stopQualityMask(f.speciesCode,f.StopName)*f.relativeAbundance,axis=1)  #apply frequency mask
        #avgAnnualAbd = obsData.groupby(['speciesCode'])['relativeAbundance'].mean()
        #obsData['relativeAbundance'] = obsData.apply(lambda n: ((n.relativeAbundance)/(avgAnnualAbd[n.speciesCode])),axis=1)  #normalizing around average relative abundance
        obsData.sort_values(by=['obsDt_week'],ascending=True,inplace=True)
    except db.DatabaseError as dbExc:
        raise f'there was an issue with the database request: {dbExc}'
    except Exception as ex:
        raise ex
    finally: cnx.close()
    return obsData

In [384]:
habs = kmeans_habitat(cnx=connectDB(),distinctHabitats=10)
habs.value_counts(subset=['clusterLabel'])

clusterLabel
6               13
0                9
2                8
1                5
8                5
9                5
3                4
7                3
4                2
5                2
dtype: int64

In [479]:
labelList = habs[habs['clusterLabel']==0]
labelZero = list(labelList['locId'])

In [480]:
result = wklyAbd_selectSpecies_locIds(cnx=connectDB(),speciesList=['gockin'],locIdList=labelZero)
result

Unnamed: 0,speciesCode,obsDt_week,howMany,relativeAbundance
0,gockin,1,4.666667,0.80347
1,gockin,2,8.0,1.377377
2,gockin,3,9.25,1.592593
3,gockin,4,7.0,1.205205
4,gockin,5,2.0,0.344344
5,gockin,8,10.0,1.721722
6,gockin,9,3.0,0.516517
7,gockin,10,6.0,1.033033
8,gockin,13,2.0,0.344344
9,gockin,14,2.5,0.43043


In [402]:
np.sqrt(((5.837-2.901)**2/3))

1.6951003903407413

In [391]:
gockin = amerob[amerob['speciesCode']=='gockin']

In [544]:
list(amerob[2,])

KeyError: (2,)