In [83]:
import pandas as pd
import sqlite3 as db
import datetime as dt
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score,explained_variance_score
from statsmodels.tsa.seasonal import seasonal_decompose

In [3]:
# Notebook-wide configuration values.
github_userName = 'Tanag3r'
# NOTE(review): this looks like a live API credential stored in plain text; consider rotating it and loading it from an environment variable instead — confirm.
ebird_token = 'j6c7l80ga2ib'
# File name handed to sqlite3 by connectDB().
db_name = 'trailheadDirectBirds_sous.db'

In [4]:
def connectDB():
    """Open a sqlite3 connection to the database file named by ``db_name``.

    Returns:
        The connection object produced by ``db.connect(db_name)``.

    Raises:
        UserWarning: if opening the connection raises any ``Exception``; the
            text of the underlying error is embedded in the message.
    """
    try:
        connection = db.connect(db_name)
    except Exception as connectFailure:
        message = f'Unable to connect to database due to: {connectFailure}'
        raise UserWarning(message)
    else:
        return connection

In [5]:
def stopQualityMask(speciesCode: str,closestStop: str,cnx=None):
    """Return the log-scaled year-coverage weight for one species at one stop.

    The weight is ``log(frq / gap)`` where ``frq`` is the number of distinct
    ``year`` values stored in ``coefficients_bySpecies`` for the
    species/stop pair and ``gap`` is the number of years elapsed since 2018.

    Args:
        speciesCode: value matched against ``coefficients_bySpecies.speciesCode``.
        closestStop: value matched against ``coefficients_bySpecies.closestStop``.
        cnx: optional open sqlite3 connection to reuse. When omitted, a
            connection is opened with ``connectDB()`` and closed before
            returning. A caller-supplied connection is left open.

    Returns:
        A one-element ``pandas.Series`` named ``frq`` holding the weight.
        When no year is recorded for the pair, ``frq`` is 0 and the weight
        is ``-inf`` (numpy emits a RuntimeWarning).

    Raises:
        Whatever ``connectDB`` or ``pandas.read_sql`` raise; errors are not
        wrapped.
    """
    ownsConnection = cnx is None
    if ownsConnection:
        cnx = connectDB()
    try:
        # number of years the mask can possibly cover; 2018 is the baseline year
        gap = dt.date.today().year-2018
        # bound parameters instead of string interpolation: values containing quotes
        # can no longer break the statement or be read as column identifiers
        query = 'SELECT COUNT(DISTINCT(year)) as "frq" FROM coefficients_bySpecies WHERE speciesCode = ? AND closestStop = ?'
        coeficients = pd.read_sql(query,con=cnx,params=(speciesCode,closestStop))
        # the notebook never imports a bare `log`; use numpy's vectorised natural log
        coeficients['frq'] = np.log(coeficients['frq']/gap)
    finally:
        # only close connections this function opened itself
        if ownsConnection:
            cnx.close()
    return coeficients['frq']

In [17]:
#baseline request from the application layer
#outputs a list of birds at the stop and a classification based solely off the number of observations
def _rarityLabels(counts: pd.Series) -> pd.Series:
    """Bucket counts into 'rare'/'uncommon'/'normal'/'common' by their own quantiles.

    Cut points are the 15th, 50th and 85th percentiles of ``counts``. The
    assignments run from 'rare' to 'common', so when cut points coincide the
    later (more common) label wins. Rows matching no rule stay NaN.
    """
    labels = pd.Series(index=counts.index,dtype=object)
    if counts.empty:
        # nothing to bucket; skip the quantile computation on an empty column
        return labels
    bucket = counts.quantile([0,0.15,0.5,0.85,1])
    labels.loc[counts <= bucket[0.15]] = 'rare'
    labels.loc[(counts > bucket[0.15]) & (counts <= bucket[0.5])] = 'uncommon'
    labels.loc[(counts > bucket[0.5]) & (counts < bucket[0.85])] = 'normal'
    labels.loc[(counts >= bucket[0.85]) & (counts <= bucket[1])] = 'common'
    return labels


def birdList_request(StopName: str,cnx):
    """List the species recorded at a stop with two rarity classifications.

    Args:
        StopName: stop name matched against ``closestStop.StopName``.
        cnx: open database connection; it is not closed by this function.

    Returns:
        DataFrame with one row per species and the columns ``speciesCode``,
        ``checklists`` (rows counted at this stop), ``sightings`` (rows
        counted for the species across all of ``historicObservations``),
        ``stopGroup`` (rarity bucket of ``checklists``) and ``overall``
        (rarity bucket of ``sightings``). An unknown stop name yields an
        empty frame with the same columns.

    Raises:
        Whatever ``pandas.read_sql`` raises; errors are not wrapped.
    """
    # StopName is bound as a parameter rather than interpolated into the SQL text
    query = 'SELECT speciesCode,count(subId) as "checklists",(SELECT count(subId) FROM historicObservations hxobx WHERE hxobx.speciesCode=hsob.speciesCode) as "sightings" FROM historicObservations hsob LEFT JOIN closestStop on hsob.locId=closestStop.locId WHERE StopName = ? GROUP BY speciesCode;'
    sightings = pd.read_sql(sql=query,con=cnx,params=(StopName,))
    #rareness at the stop
    sightings['stopGroup'] = _rarityLabels(sightings['checklists'])
    #overall rareness
    sightings['overall'] = _rarityLabels(sightings['sightings'])
    #TODO: raise an exception if the stopName given is not valid and return a list of valid stop names
    return sightings


In [18]:
# Classify every species recorded at the EastSunsetWay stop; the connection opened here is not closed by birdList_request.
birdList_request(StopName='EastSunsetWay',cnx=connectDB())

Unnamed: 0,speciesCode,checklists,sightings,stopGroup,overall
0,amecro,90,286,common,common
1,amedip,10,10,normal,uncommon
2,amegfi,11,46,normal,uncommon
3,amekes,1,1,rare,rare
4,amerob,79,361,common,common
...,...,...,...,...,...
100,wooduc,5,94,uncommon,normal
101,x00051,1,1,rare,rare
102,y00475,9,36,normal,uncommon
103,yelwar,3,16,uncommon,uncommon


For birds with robust data, do they behave the same at all stops? If not, which stops does each species appear to prefer? Is that preference explained by habitat?

In [85]:
def wklyAbd_selectSpecies(cnx,speciesList: list):
    """Compute masked, normalised weekly relative abundance per species and stop.

    Steps:
      1. Load the observations of the requested species, keeping only species
         whose common name appears on more than two distinct checklists.
      2. Average ``howMany`` per species / stop / ISO week (missing counts
         are treated as 1).
      3. Divide by the species' mean weekly count (baseline relative abundance).
      4. Multiply by ``stopQualityMask(speciesCode, StopName)``.
      5. Divide by the species' mean masked abundance so values centre on 1.

    Args:
        cnx: open sqlite3 connection. NOTE: it is closed before this function
            returns or raises.
        speciesList: species codes to load; items are converted with ``str``.

    Returns:
        DataFrame with the columns ``speciesCode``, ``StopName``,
        ``obsDt_week``, ``howMany`` and ``relativeAbundance``, sorted by
        ``StopName`` then ``obsDt_week``.

    Raises:
        sqlite3.DatabaseError: when a sqlite3 database error surfaces while
            loading or masking the data (re-raised with a descriptive
            message, chained to the original error).
        Any other exception propagates unchanged.
    """
    try:
        querySpecies = [str(code) for code in speciesList]
        # one bound placeholder per species code instead of pasting str(list) into the SQL
        placeholders = ','.join('?' for _ in querySpecies)
        query = f'SELECT speciesCode,FX.locId,StopName,obsDt,howMany FROM historicObservations AS FX  LEFT JOIN closestStop on FX.locId = closestStop.locId WHERE (SELECT count(distinct(subId)) FROM historicObservations AS QA WHERE QA.comName = FX.comName) > 2 AND FX.speciesCode in ({placeholders});'
        obsData = pd.read_sql(query,con=cnx,params=querySpecies,parse_dates=['obsDt'])
        obsData = obsData.assign(obsDt_week=obsData.obsDt.dt.isocalendar().week)
        # assign back instead of a chained inplace fillna, which may act on a temporary copy
        obsData['howMany'] = obsData['howMany'].fillna(1)
        obsData = obsData.groupby(['speciesCode','StopName','obsDt_week'])['howMany'].mean().reset_index()
        #derive relative abundance
        avgCount = obsData.groupby('speciesCode')['howMany'].transform('mean')
        obsData['relativeAbundance'] = obsData['howMany']/avgCount  #baseline relative abd, same process as Fink et. all
        #apply frequency mask; the mask only depends on (species, stop), so query it once per pair, not once per row
        pairs = obsData[['speciesCode','StopName']].drop_duplicates()
        maskByPair = {
            (code,stop): float(np.ravel(stopQualityMask(code,stop))[0])
            for code,stop in pairs.itertuples(index=False,name=None)
        }
        maskValues = [maskByPair[pair] for pair in zip(obsData['speciesCode'],obsData['StopName'])]
        obsData['relativeAbundance'] = obsData['relativeAbundance']*np.asarray(maskValues,dtype=float)
        #normalizing around average relative abundance
        avgAbd = obsData.groupby('speciesCode')['relativeAbundance'].transform('mean')
        obsData['relativeAbundance'] = obsData['relativeAbundance']/avgAbd
        obsData = obsData.sort_values(by=['StopName','obsDt_week'],ascending=True)
    except db.DatabaseError as dbExc:
        # a bare string cannot be raised; raise a real exception carrying the message
        raise db.DatabaseError(f'there was an issue with the database request: {dbExc}') from dbExc
    finally:
        cnx.close()
    return obsData

In [153]:
##TODO #90 rewrite stopCovariance to run for a single species at a time

def stopCovariance(obsData: pd.DataFrame,species: str,cnx):
    """Build a week-by-stop table of log-scaled relative abundance for one species.

    For every distinct ``StopName`` in ``obsData`` (including stops where the
    species was never recorded) the species' weekly ``relativeAbundance`` is
    laid out over weeks 1..52 (plus any further week present in the data):

      * weeks more than 5 weeks after and more than 2 weeks before an observed
        week are set to 0;
      * the remaining gaps are filled by linear interpolation;
      * the resulting table is transformed with ``log1p``.

    Args:
        obsData: frame with at least the columns ``speciesCode``, ``StopName``,
            ``obsDt_week`` and ``relativeAbundance``. It is not modified.
        species: species code selecting the rows to use.
        cnx: currently unused; kept so existing callers keep working.

    Returns:
        DataFrame indexed by ``obsDt_week`` with one column per stop name
        (in sorted order).

    Raises:
        ValueError: from ``pandas.concat`` when ``obsData`` contains no stops.
    """
    # stop names in sorted order; derived without sorting the caller's frame in place
    stopNames = obsData['StopName'].drop_duplicates().sort_values()
    # the species filter does not depend on the stop, so apply it once up front
    speciesObs = obsData.loc[obsData['speciesCode']==species,['StopName','obsDt_week','relativeAbundance']]
    allweek = pd.DataFrame({'obsDt_week':range(1,53)})
    container = []
    for stopName in stopNames:
        stopWeeks = speciesObs.loc[speciesObs['StopName']==stopName,['obsDt_week','relativeAbundance']]
        stopWeeks = pd.merge(left=stopWeeks,right=allweek,on='obsDt_week',how='outer')
        abundance = stopWeeks.set_index('obsDt_week')['relativeAbundance'].sort_index()
        # mask of weeks close enough to an observation to be interpolated;
        # ffill/bfill replace the deprecated interpolate(method='ffill'/'bfill') calls
        nearObservation = abundance.ffill(limit=5).bfill(limit=2).notna()
        # weeks far from any observation are pinned to zero abundance
        abundance = abundance.where(nearObservation,0)
        abundance = abundance.interpolate(method='linear',limit=5,limit_direction='forward')
        abundance = abundance.interpolate(method='linear',limit=2,limit_direction='backward')
        container.append(abundance.rename(stopName))
    weeklySpeciesAbd = pd.concat(container,ignore_index=False,axis=1)
    weeklySpeciesAbd = weeklySpeciesAbd.fillna(value=0,axis=0)
    weeklySpeciesAbd = np.log1p(weeklySpeciesAbd)
    #TODO write r2, var to db -- the earlier loop computed pairwise scores per stop and discarded them
    return weeklySpeciesAbd

In [130]:
# NOTE: despite its name, `amerob` holds the weekly abundance rows for every species in speciesList that the query returns, not only 'amerob'.
amerob = wklyAbd_selectSpecies(cnx=connectDB(),speciesList=['gockin','amerob','amegfi'])


In [154]:
# Week-by-stop abundance table for a single species ('gockin').
# stopCovariance does not use its cnx argument, so the connection opened here is never used or closed.
tr = stopCovariance(obsData=amerob,species='gockin',cnx=connectDB())
tr

Unnamed: 0_level_0,ChiricoTrail_PooPooPoint,EastSunsetWay,HighSchool,MargaretsWay,MountTeneriffe,SquakMountain
obsDt_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,0.317886,1.144388,1.144388,0.381463,0.953657
2,0.190731,0.572194,1.072864,1.525851,1.525851,0.66756
3,0.381463,0.190731,1.00134,2.288777,1.398697,0.286097
4,0.572194,2.098045,0.929816,0.381463,1.271543,2.67024
5,1.549693,0.381463,0.858291,0.381463,1.144388,0.619877
6,2.527191,0.89008,0.786767,0.429146,1.025181,0.381463
7,3.50469,1.398697,0.715243,0.476829,0.905974,1.573534
8,4.482188,1.907314,0.801072,0.524511,0.786767,0.190731
9,5.459686,1.144388,0.886901,0.572194,0.66756,0.254309
10,6.437185,0.381463,0.97273,1.144388,0.548353,0.317886


In [71]:
# Score every column of `tr` against the MountTeneriffe column, which is passed as the first argument of each metric.
# NOTE(review): the saved output includes an 'obsDt_week' entry, so `tr` presumably still carried the week as a regular column when this cell last ran — re-run to confirm.
fit = [(stop,r2_score(tr.MountTeneriffe,tr[stop])) for stop in list(tr)]
var = [(stop,explained_variance_score(tr.MountTeneriffe,tr[stop])) for stop in list(tr)]

# Commented-out draft for tabulating the scores per stop:
#fit_df = pd.DataFrame(data=[fit,var],columns=['stopName','r_squared','explainedVariance'])
#fit_df.sort_values(by='r_squared',ascending=False,inplace=True)
#fit_df['r_squared'] = fit_df['r_squared'].astype('float64')
#fit_df['error'] = 1-fit_df['r_squared']
#fit_df = fit_df[fit_df['r_squared'] > 0.65]
fit

[('obsDt_week', -19529.88130867905),
 ('ChiricoTrail_PooPooPoint', -9.710450300191956),
 ('EastSunsetWay', -0.85410725363617),
 ('HighSchool', -0.03400692575890707),
 ('MargaretsWay', -6.5087812350932746),
 ('MountTeneriffe', 1.0),
 ('SquakMountain', -0.6615870492345806)]

In [32]:
# NOTE(review): the saved output lists stop names as columns, which does not match the columns wklyAbd_selectSpecies returns; `amerob` was presumably bound to a different frame when this cell last ran — re-run to confirm.
list(amerob)

['obsDt_week',
 'ChiricoTrail_PooPooPoint',
 'EastSunsetWay',
 'HighSchool',
 'MargaretsWay',
 'MountTeneriffe',
 'SquakMountain']

In [74]:
# Pairwise explained-variance scores between all columns of `tr`.
fitList = []
for x in list(tr):
    # scores of every column against reference column x
    score = [(stop,explained_variance_score(tr[x],tr[stop])) for stop in list(tr)]
    fitList.append(score)
    # the reference column name is appended after its scores, so fitList alternates [scores], name, [scores], name, ...
    fitList.append(x)
fitList

[[('obsDt_week', 1.0),
  ('ChiricoTrail_PooPooPoint', -0.012699299879923798),
  ('EastSunsetWay', 0.014737193782515035),
  ('HighSchool', -0.012078096389026571),
  ('MargaretsWay', -0.017287708118643996),
  ('MountTeneriffe', -0.004256623847688656),
  ('SquakMountain', 0.008274247479162722)],
 'obsDt_week',
 [('obsDt_week', -467.43503426668104),
  ('ChiricoTrail_PooPooPoint', 1.0),
  ('EastSunsetWay', 0.19226074192253273),
  ('HighSchool', 0.16230112119154216),
  ('MargaretsWay', 0.075202445144674),
  ('MountTeneriffe', 0.12683119632080808),
  ('SquakMountain', 0.07879682917609043)],
 'ChiricoTrail_PooPooPoint',
 [('obsDt_week', -1685.8834171691663),
  ('ChiricoTrail_PooPooPoint', -1.989753017500714),
  ('EastSunsetWay', 1.0),
  ('HighSchool', 0.11457865857019867),
  ('MargaretsWay', -2.2020718959407235),
  ('MountTeneriffe', 0.3797443139794656),
  ('SquakMountain', 0.5002153640305175)],
 'EastSunsetWay',
 [('obsDt_week', -5483.603697154399),
  ('ChiricoTrail_PooPooPoint', -8.814095994

In [61]:
# NOTE(review): the frame returned by wklyAbd_selectSpecies has no 'fx_relativeAbundance' column; this cell presumably ran against an older `amerob` — confirm.
selection = amerob[amerob['StopName']=='MountTeneriffe']
# Passing a Series as `data` together with `index` makes pandas align on index labels rather than position;
# the all-NaN output suggests none of the labels 1..52 exist in selection's index — TODO confirm.
series = pd.Series(name='MountTeneriffe',data=selection['fx_relativeAbundance'],index=range(1,53))
series

1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
11   NaN
12   NaN
13   NaN
14   NaN
15   NaN
16   NaN
17   NaN
18   NaN
19   NaN
20   NaN
21   NaN
22   NaN
23   NaN
24   NaN
25   NaN
26   NaN
27   NaN
28   NaN
29   NaN
30   NaN
31   NaN
32   NaN
33   NaN
34   NaN
35   NaN
36   NaN
37   NaN
38   NaN
39   NaN
40   NaN
41   NaN
42   NaN
43   NaN
44   NaN
45   NaN
46   NaN
47   NaN
48   NaN
49   NaN
50   NaN
51   NaN
52   NaN
Name: MountTeneriffe, dtype: float64