In [563]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import datetime as dt
import sqlite3 as db
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans

In [4]:
github_userName = 'Tanag3r'
ebird_token = 'j6c7l80ga2ib'
db_name = 'trailheadDirectBirds_sous.db'

In [5]:
##connect to database
def connectDB():
    try:
        cnx = db.connect(db_name)
    except Exception as cnxError:
        raise UserWarning(f'Unable to connect to database due to: {cnxError}')
    return cnx

In [162]:
def stopQualityMask(speciesCode: str,closestStop: str):
    cnx = connectDB()
    try:
        gap = dt.date.today().year-2018
        query = f'SELECT COUNT(DISTINCT(year)) as "frq" FROM coefficients_bySpecies WHERE speciesCode = "{speciesCode}" AND closestStop = "{closestStop}"'
        coeficients = pd.read_sql(query,con=cnx)
        coeficients['frq'] = coeficients.apply(lambda g: (g.frq/gap),axis=1)
    except Exception as maskExc:
        raise maskExc
    return coeficients['frq']

In [161]:
##TODO #88 change normalization method for relative abundance to studentized t-test
def weeklyAbundance():
    cnx = connectDB()
    try:
        #note that the query filters out birds only seen by two people
        ObsDataset = pd.read_sql('SELECT speciesCode,FX.locId,StopName,obsDt,howMany,subId FROM historicObservations AS FX  LEFT JOIN closestStop on FX.locId = closestStop.locId WHERE (SELECT count(distinct(subId)) FROM historicObservations AS QA WHERE QA.speciesCode = FX.speciesCode) > 2;',con=cnx,parse_dates=['obsDt']) #Canada Jay filter is for testing purposes only
        ObsDataset['obsDt_week'] = ObsDataset['obsDt'].dt.isocalendar().week
        ObsDataset['howMany'] = ObsDataset['howMany'].fillna(1)
    #group up data
        ObsDataset = ObsDataset.groupby(['speciesCode','StopName','locId','obsDt_week'])['howMany'].mean().reset_index()
        #maxCount = ObsDataset.groupby(['speciesCode'])['howMany'].max()
        avgCount = ObsDataset.groupby(['speciesCode'])['howMany'].mean()
    #calculations
        ObsDataset['relativeAbundance'] = ObsDataset.apply(lambda x: (x.howMany/(avgCount[x.speciesCode])),axis=1)  #baseline
        ObsDataset['relativeAbundance'] = ObsDataset.apply(lambda f: stopQualityMask(f.speciesCode,f.StopName)*f.relativeAbundance,axis=1)  #apply frequency mask
        avgAbd = ObsDataset.groupby(['speciesCode'])['relativeAbundance'].mean() 
        ObsDataset['relativeAbundance'] = ObsDataset.apply(lambda n: ((n.relativeAbundance)/(avgAbd[n.speciesCode])),axis=1)  #normalizing around average relative abundance
        ObsDataset.sort_values(by=['speciesCode','obsDt_week'],ascending=True,inplace=True)
    #apply absence smoothing mask
        list = []
        ObsDataset = ObsDataset.groupby(['speciesCode','obsDt_week'])['relativeAbundance'].mean().reset_index()
        speciesKeys = ObsDataset.drop_duplicates(subset=['speciesCode'])
        for speciesCode in speciesKeys.itertuples():
            species_obsDataset = ObsDataset[ObsDataset['speciesCode']==speciesCode.speciesCode]
            #weekMin = (species_obsDataset['obsDt_week'].min()-8)
            #if weekMin <= 0: weekMin = 1
            #weekMax = (species_obsDataset['obsDt_week'].max()+8)
            #if weekMax > 53: weekMax = 53
            #allweek = pd.DataFrame({'obsDt_week':range(weekMin,weekMax)})
            allweek = pd.DataFrame({'obsDt_week':range(1,53)})
            species_obsDataset.drop(columns=['speciesCode'],inplace=True)
            species_obsDataset = pd.merge(left=species_obsDataset,right=allweek,left_on='obsDt_week',right_on='obsDt_week',how='outer')
            species_obsDataset.set_index('obsDt_week',inplace=True)
            species_obsDataset.sort_index(axis='index',ascending=True,inplace=True)
            species_obsDataset['mask'] = species_obsDataset['relativeAbundance'].interpolate(method='index',limit=5,limit_direction='both')  #mask, values do not matter
            species_obsDataset.loc[species_obsDataset['mask'].isna() == True,'relativeAbundance'] = 0
            species_obsDataset.reset_index(inplace=True)
            species_obsDataset['speciesCode'] = speciesCode.speciesCode
            list.append(species_obsDataset)
        weeklyAbd = pd.DataFrame(pd.concat(list,ignore_index=True))
    except Exception as calcEx:
        raise calcEx
    finally: cnx.close()
    #return weeklyAbd[['speciesCode','StopName','obsDt_week','locId','relativeAbundance']]
    return weeklyAbd

In [346]:
##TODO #82 write a function that returns a dataframe of the annual relative abundance for a given list of birds
##TODO #84 add StopName as an optional argument for the function wklyAbd_setSpecies()
##TODO #86 replace comName with speciesCode in all functions --DONE

##TODO #89 update the function wklyAbd_selectLocId to take a list of locId's 
def wklyAbd_selectLocId(locIdList: list):
    cnx=connectDB()
    try:
        query_locIds = []
        for i in locIdList:
            i = str(i)
            query_locIds.append(i)
        query_locIds = str(query_locIds).strip('[]')
        query = f'SELECT speciesCode,FX.locId,StopName,obsDt,howMany FROM historicObservations AS FX  LEFT JOIN closestStop on FX.locId = closestStop.locId WHERE (SELECT count(distinct(subId)) FROM historicObservations AS QA WHERE QA.comName = FX.comName) > 2 AND FX.locId in ({query_locIds})'
        ##todo #83 write an exception to convert non-string speciesList values to strings that returns the db.error if the strings do not represent columns --DONE
        #query = f'SELECT speciesCode,FX.locId,StopName,obsDt,howMany FROM historicObservations AS FX  LEFT JOIN closestStop on FX.locId = closestStop.locId WHERE (SELECT count(distinct(subId)) FROM historicObservations AS QA WHERE QA.comName = FX.comName) > 2 AND FX.comName in ({querySpecies});'
        obsData = pd.read_sql(query,con=cnx,parse_dates=['obsDt'])
        obsData = obsData.assign(obsDt_week=obsData.obsDt.dt.isocalendar().week)
        obsData['howMany'].fillna(1,inplace=True)
        obsData = obsData.groupby(['speciesCode','StopName','obsDt_week'])['howMany'].mean().reset_index()
        #maxCount = obsData.groupby(['speciesCode'])['howMany'].max()
        avgCount = obsData.groupby(['speciesCode'])['howMany'].mean()
        #derive relative abundance
        obsData['relativeAbundance'] = obsData.apply(lambda x: (x.howMany/(avgCount[x.speciesCode])),axis=1)  #baseline
        obsData['relativeAbundance'] = obsData.apply(lambda f: stopQualityMask(f.speciesCode,f.StopName)*f.relativeAbundance,axis=1)  #apply frequency mask
        avgAbd = obsData.groupby(['speciesCode'])['relativeAbundance'].mean()
        obsData['relativeAbundance'] = obsData.apply(lambda n: ((n.relativeAbundance)/(avgAbd[n.speciesCode])),axis=1)  #normalizing around average relative abundance


        #apply absence smoothing mask
        list = []
        obsData = obsData.groupby(['speciesCode','obsDt_week'])['relativeAbundance'].mean().reset_index()
        speciesKeys = obsData.drop_duplicates(subset=['speciesCode'])
        for speciesCode in speciesKeys.itertuples():
            species_obsData = obsData[obsData['speciesCode']==speciesCode.speciesCode]

            ##removing this block, it corrupts interpolation comparisons
            #weekMin = (species_obsData['obsDt_week'].min()-8)
            #if weekMin <= 0: weekMin = 1
            #weekMax = (species_obsData['obsDt_week'].max()+8)
            #if weekMax > 53: weekMax = 53
            #allweek = pd.DataFrame({'obsDt_week':range(weekMin,weekMax)})
            
            allweek = pd.DataFrame({'obsDt_week':range(1,53)})
            species_obsData.drop(columns=['speciesCode'],inplace=True)
            species_obsData = pd.merge(left=species_obsData,right=allweek,left_on='obsDt_week',right_on='obsDt_week',how='outer')
            species_obsData.set_index('obsDt_week',inplace=True)
            species_obsData.sort_index(axis='index',ascending=True,inplace=True)
            species_obsData['mask'] = species_obsData['relativeAbundance'].interpolate(method='index',limit=5,limit_direction='both')  #mask, values do not matter
            species_obsData.loc[species_obsData['mask'].isna() == True,'relativeAbundance'] = 0
            species_obsData.reset_index(inplace=True)
            species_obsData['speciesCode'] = speciesCode.speciesCode
            species_obsData.drop(columns=['mask'],inplace=True)
            list.append(species_obsData)
        weeklySpeciesAbd = pd.DataFrame(pd.concat(list,ignore_index=True))
    except db.DatabaseError as dbExc:
        raise f'there was an issue with the database request: {dbExc}'
    except Exception as ex:
        raise ex
    finally: cnx.close()
    return weeklySpeciesAbd

Easy question first: for each species of bird, which stop's relative abundance score are explained by the all-stop relative abundance score? Evaluate using r2 and co-variance.

Habitat compostions and preference. First, develop a KMeans function to cluster locId's by habitat composition

In [350]:
def kmeans_habitat(cnx,distinctHabitats: int):
    try:
        data = pd.read_sql(sql='SELECT * FROM FAO_by_locId;',con=cnx)
        data = data.drop(columns=['locName']).set_index('locId')
        data.fillna(0,inplace=True)
    #normalize
        maxValue = data.apply(max,axis=1)
        data = data.apply(lambda x: (x/maxValue[x.index]),axis=0)   #min-max normalizing to smooth in proportionality
    #compute kmeans for each locId
        habitat_kmeans = KMeans(n_clusters=distinctHabitats,init='k-means++')
        habitat_kmeans = habitat_kmeans.fit(data.values)
        clusterLabels = habitat_kmeans.labels_
    #define habitats
        habitatFrame = pd.DataFrame(data=clusterLabels,columns=['clusterLabel'],index=data.index).sort_values(by='clusterLabel').reset_index()
        habitatFrame = pd.merge(left=habitatFrame,left_on='locId',right=data,right_on='locId',how='left')
    except Exception as kmeansExc:
        raise kmeansExc
    return habitatFrame        

In [357]:
habitat = kmeans_habitat(cnx=connectDB(),distinctHabitats=10)

Unnamed: 0,locId,clusterLabel,Barren,Urban,Grasslands,Savannas,waterBodies,waterUnclassified,evergreenNeedleleafForest,evergreenBroadleafForests,openForests,mixedBroadleafandNeedleleafForests,deciduousBroadleafForests,sparseForests,denseHerbaceous,sparseHerbaceous
0,L5980530,0,0.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42,0.0,0.08
1,L1096378,0,0.67,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,0.11
2,L1465184,0,0.44,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.56,0.0,0.0
3,L8312096,1,0.0,0.0,0.0,0.0,0.0,0.0,0.147727,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,L10129054,1,0.0,0.0,0.0,0.0,0.0,0.0,0.147727,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,L3943215,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147727,0.0,1.0,0.0,0.0,0.0,0.0
6,L7672326,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,L424117,1,0.0,0.0,0.0,0.0,0.0,0.0,0.185185,0.160494,0.0,1.0,0.0,0.0,0.0,0.0
8,L4381196,1,0.0,0.0,0.0,0.0,0.0,0.0,0.388889,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,L6226126,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [358]:
habitat_one = habitat[habitat['clusterLabel']==1]
loclist_one = list(habitat_one['locId'])
habitat_one

Unnamed: 0,locId,clusterLabel,Barren,Urban,Grasslands,Savannas,waterBodies,waterUnclassified,evergreenNeedleleafForest,evergreenBroadleafForests,openForests,mixedBroadleafandNeedleleafForests,deciduousBroadleafForests,sparseForests,denseHerbaceous,sparseHerbaceous
3,L8312096,1,0.0,0.0,0.0,0.0,0.0,0.0,0.147727,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,L10129054,1,0.0,0.0,0.0,0.0,0.0,0.0,0.147727,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,L3943215,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147727,0.0,1.0,0.0,0.0,0.0,0.0
6,L7672326,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,L424117,1,0.0,0.0,0.0,0.0,0.0,0.0,0.185185,0.160494,0.0,1.0,0.0,0.0,0.0,0.0
8,L4381196,1,0.0,0.0,0.0,0.0,0.0,0.0,0.388889,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,L6226126,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
10,L450305,1,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.066667,0.0,1.0,0.066667,0.0,0.0,0.0
11,L128530,1,0.0,0.0,0.0,0.15873,0.31746,0.0,0.47619,0.0,0.15873,1.0,0.793651,0.0,0.0,0.0


In [359]:
habOne_abundance = wklyAbd_selectLocId(locIdList=loclist_one)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


For each habitat, interpolate relative abundance data for birds with very low and very high number of sightings using data from the coefficients_bySpecies table. Processing the outliers first removes noise from covariation analysis.

In [561]:
def build_outlierAbd(abundanceFrame: pd.DataFrame,habitatLocIds: list,cnx):
    try:
    #locId's to a concat string
        query_locIds = []
        for i in habitatLocIds:
            i = str(i)
            query_locIds.append(i)
        query_locIds = str(query_locIds).strip('[]')
    #speciesCodes to a concat string
        query_speciesCodes = []
        for s in list(abundanceFrame['speciesCode'].drop_duplicates()):
            s = str(s)
            query_speciesCodes.append(s)
        query_speciesCodes = str(query_speciesCodes).strip('[]')
        coeffQuery = f'SELECT speciesCode,sum(uniqueSightings) as "totalUniqueSightings",max(WEEKfrequency) as "maxWeekFreq" FROM coefficients_bySpecies WHERE speciesCode IN ({query_speciesCodes}) AND locId IN ({query_locIds}) GROUP BY speciesCode;'
        coeff = pd.read_sql(sql=coeffQuery,con=cnx)
    #build quantiles
        coeff['group'] = int
        sightingsConf = coeff['totalUniqueSightings'].quantile([0,0.15,0.85,1])        
        coeff.loc[coeff['totalUniqueSightings'] <= sightingsConf[0.15],'group'] = 1
        coeff.loc[(coeff['totalUniqueSightings'] > sightingsConf[0.15]) & (coeff['totalUniqueSightings'] < sightingsConf[0.85]),'group'] = 2
        coeff.loc[(coeff['totalUniqueSightings'] >= sightingsConf[0.85]) & (coeff['totalUniqueSightings'] <=sightingsConf[1]),'group'] = 3
    #interpolate for outlier species
        resultList = []
        for species in coeff.itertuples():
            habitatAbd = abundanceFrame[abundanceFrame['speciesCode'] == species.speciesCode].reset_index()
            habitatAbd = habitatAbd[['relativeAbundance','obsDt_week']].set_index('obsDt_week')
            habitatAbd['inter'] = bool
            if species.group == 3:
                habitatAbd['relativeAbundance'] = habitatAbd['relativeAbundance'].interpolate(method='linear',limit=5,limit_direction='both')
                habitatAbd['inter'] = True
            if species.group == 1:
                habitatAbd['relativeAbundance'] = habitatAbd['relativeAbundance'].interpolate(method='polynomial',order=2,limit=3,limit_direction='both')
                habitatAbd.fillna(value=0,inplace=True)
                habitatAbd['inter'] = True
            if species.group == 2:
                habitatAbd['inter'] = False
            habitatAbd = habitatAbd.assign(speciesCode=species.speciesCode)
            resultList.append(habitatAbd)
        results = pd.concat(resultList)
    except Exception as buildExc:
        raise buildExc
    return results

>`Hey everyone, look at this loser doing ML inside a network!`

Since some quality data is now available (group 3 birds), it is now possible to do some testing and model teaching within this neuron. The input of this network is a bird and a stop name and the output is a relative abundance regression so this is the first opportunity to do some fitting and deliver the output.

1. Fit each stop to a predicted kmeans habitat cluster, with the distance from the cluster center as the 'error'
2. As the null hypothesis, plot the all-stop relative abundance for each group three bird. If the all-stop abundance regression explains the habitat abundance regression then the...
3. Plot the relative abundance of group 3 birds against the relative abundance of group 3 birds at each stop
4. For each group three bird, isolate stops that reasonably fit within the habitat cluster and plot with an acceptable level of error against the group three relative abundance plot.
5. Derive a relative abundance model for the fitted stops for each group three bird using supervised reinforcement sampling with the group three abundance regression acting as the reference.

An interesting aside: look for a linear relationship between the kmeans_predict error and the relative abundance regression error. If there is a linear relationship, that [x,y] pairing may need to be fleshed out as another neuron

In [562]:
habOne_processed = build_outlierAbd(abundanceFrame=habOne_abundance,habitatLocIds=loclist_one,cnx=connectDB())
habOne_processed

Unnamed: 0_level_0,relativeAbundance,inter,speciesCode
obsDt_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4.070953,True,amecro
2,3.139690,True,amecro
3,2.208426,True,amecro
4,1.277162,True,amecro
5,0.478936,True,amecro
...,...,...,...
48,0.000000,False,yerwar
49,0.000000,False,yerwar
50,0.000000,False,yerwar
51,0.000000,False,yerwar


For each week of the year, which habitat cluster(s) are chosen by each bird? Strength of choice is expressed by z-score

Within each habitat cluster, do the relative abundance trends explain one another? In other words, are there groups of birds that express similar habitat preferences at certain times of the year? Micro-trends lend context to macro trends and helps ultimately build better forecasting and ML models.

In [None]:
##TODO #80 develop a reference model for birds present in winter and early spring that returns a list of species that fit the model
##TODO #81 write a function that tests the fit of each species' observerd smoothed relative abundance against each model, then returns a model

#winter abundance model



In [150]:
methods = [{'method':'cubic','order':3},
{'method':'cubic','order':5},
{'method':'cubic','order':7},
{'method':'linear','order':3},
{'method':'slinear','order':3},
{'method':'polynomial','order':3},
{'method':'polynomial','order':5},
{'method':'polynomial','order':7},
{'method':'spline','order':3},
{'method':'spline','order':5},
{'method':'spline','order':7}]

for modes in methods:
    for method in modes:
        modes['method']
        modes['order']

In [None]:
def build_abundanceModels(allBirds: pd.DataFrame, methods: list)

In [375]:
habOne_abundance.value_counts(['speciesCode'])

speciesCode
amecro         53
varthr         53
pacwre1        53
pilwoo         53
pinsis         53
               ..
glwgul         52
gadwal         52
foxspa         52
evegro         52
yerwar         52
Length: 100, dtype: int64

In [576]:
#interpolation testing
##TODO #87 turn block into a function that takes relative abundance values as and returns a list of like birds with the best interpolation method as an r2 score
allBirds = habOne_processed
speciesKeys = allBirds.drop_duplicates(subset=['speciesCode'])
helpList = []
weekSeries = pd.DataFrame(range(1,53),columns=['week'])
for speciesCode in speciesKeys.itertuples():
    winterGuy = allBirds[allBirds['speciesCode']== speciesCode.speciesCode].reset_index()
    winterGuy = winterGuy[['obsDt_week','relativeAbundance']].set_index('obsDt_week')
    winterGuy = winterGuy.interpolate(method='spline',order=5,limit=5,limit_direction='both')
#normalize
    winterGuy_max = winterGuy['relativeAbundance'].max()
    winterGuy_min = winterGuy['relativeAbundance'].min()
    winterGuy['relativeAbundance'] = winterGuy['relativeAbundance'].apply(lambda n: ((n-winterGuy_min)/(winterGuy_max-winterGuy_min)))
    
    winterGuy.rename(columns={'relativeAbundance':f'{speciesCode.speciesCode}'},inplace=True)
    helpList.append(winterGuy)
    #results = pd.merge(right=weekSeries,right_on='week',left=winterGuy,left_on='obsDt_week',how='left')
results = pd.concat(helpList,axis=1)
results = results.fillna(value=0,axis=0)
#results = results.fillna(method='ffill',axis=0)
results['model_one'] = results[['bkhgro','westan','swathr']].apply(np.average,axis=1)
results['model_two'] = results[['pasfly','btywar']].apply(np.average,axis=1)
results['model_three'] = results[['orcwar','batpig1']].apply(np.average,axis=1) #in testing, whcspa should be passed into this model then dropped on re-evaluation

##goal output for this function is a dictionary: 
#goals = ({'leadSpecies':'pasfly','method':'polynomial','order':3,'explains':[{'btywar':-1.059640118399928},{'rufhum':-0.04706819541929441},{'warvir':0.4480046324671728}]})

In [577]:
fit = [(species,r2_score(results.wlswar,results[species])) for species in list(results)]
fit_df = pd.DataFrame(np.array(fit),columns=['species','r_squared'])
fit_df.sort_values(by='r_squared',ascending=False,inplace=True)
fit_df['r_squared'] = fit_df['r_squared'].astype('float64')
fit_df['error'] = 1-fit_df['r_squared']
fit_df = fit_df[fit_df['r_squared'] > 0.65]
fit_df

Unnamed: 0,species,r_squared,error
95,wlswar,1.0,0.0
100,model_one,0.826849,0.173151
91,westan,0.821938,0.178062


In [156]:
model_grnher = results[['grnher','vauswi','orcwar']]
model_grnher['blendedAbundance'] = model_grnher.apply(np.average,axis=1)
model_grnher.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_grnher['blendedAbundance'] = model_grnher.apply(np.average,axis=1)


Unnamed: 0_level_0,grnher,vauswi,orcwar,blendedAbundance
obsDt_week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.146277,0.01795,0.092073,0.085434
2,0.146278,0.01795,0.092073,0.085434
3,0.146277,0.01795,0.092073,0.085434
4,0.146277,0.01795,0.092073,0.085433
5,0.146277,0.01795,0.092073,0.085434


In [157]:
modelFit = [(modelSpecies,r2_score(model_grnher.blendedAbundance,model_grnher[modelSpecies])) for modelSpecies in list(model_grnher)]
modelFit_df = pd.DataFrame(np.array(modelFit),columns=['modelSpecies','r2_score'])
modelFit_df.sort_values(by='r2_score',ascending=False,inplace=True)
modelFit_df

Unnamed: 0,modelSpecies,r2_score
3,blendedAbundance,1.0
0,grnher,0.9265214554459535
1,vauswi,0.8184361100894103
2,orcwar,0.7862963243683426


Forecasting Architecture:

1. A relative abundance model is calculated for the entire observed region for all birds to deliver a picture of behavior/sighting opportunities independent of habitat preference. 
2. LocId's are grouped by Type1/Type3 habitat types using k-means clustering to identify distinct habitats, then the relative abundance of each bird species is calculated within each habitat cluster. Blended relative abundance models are then derived from relative abundance trends that demonstrate moderate co-efficiency (r-squared score above 0.65) and tested against the whole as a form of psudeo-unsupervised machine learning.
3. Habitat preference is calculated as a function of relative abundance and demonstrated choice given known availability. This methodology is borrowed from some paper I need to find and cite. 
    1. Available habitats are first derived for each bird by making a list of all habitats each species has been observed in
    2. Habitat choice throughout the year is derived as a function of relative abundance and presence. If the Orange-Crowned Warbler prefers open deciduous forests from March to June and Evergreen Needleleaf Forests in July and August, then the Orange-Crowned Warbler is actively not choosing to reside in open deciduous forests for those months. 
    3. Spearman ranking is used to calculate the strength of the habitat preference modifier placed on the relative abundance projection (positive and negative, similar to Fink, D., T. Auer, A. Johnston, M. Strimas-Mackey, O. Robinson, S. Ligocki, W. Hochachka, L. Jaromczyk, C. Wood, I. Davies, M. Iliff, L. Seitz. 2021. eBird Status and Trends, Data Version: 2020; Released: 2021. Cornell Lab of Ornithology, Ithaca, New York.)
4. Relative abundance at each stop is calculated, then changes in habitat preferences are used to infer whether deviations from global relative abundance trends are explained by changes in habitat prefrence



- Reference models group birds based on expressed habitat preference, seasonality and rolling volume. Where neccessary, linear interpolation is applied to the reference models to reasonabily fill in gaps. The strength (re:limit value) of this interpolation and the accompanying mask is determined by rarity. Each bird+stop combination is assigned a reference model.
- R-squared testing assigns an interpolation method to each bird+stop combination to estimate the relative abundance of a bird at each stop based off of the given relative abundance and the reference model group
- Dependent forecasting uses existing relative abundance values to make inferences about relative abundance and habitat preference using a baseline of known values/sightings
- Independent forecasting projects the habitat preference of each bird over the habitat makeup of each stop, ignoring the sightings used to confirm the dependent forecasting model. For example: although there are not any Canada Jay sightings logged at Mt. Si, the habitat makeup of that stop correlates (Spearman rank) with the Canada Jay's preferred late-season habitat

In [None]:
##TODO #77 identify birds that express seasonality/can only be found in the observed regions during specific times of year to inform the method of interpolation

In [369]:
##TODO #79 add 'StopName' as a loop layer to the relative abundance forecasting --DONE
def forecast_weeklyAbd():
    realObsAbd = weeklyAbundance()
    try:
        keyList = realObsAbd.drop_duplicates(subset=['comName','StopName'])
        realObsAbd = realObsAbd.groupby(['comName','StopName','obsDt_week'])['relativeAbundance'].mean().reset_index()
        forecastList = []
        for comName in keyList.itertuples():
            #species = pd.DataFrame(keyList[keyList['comName'] == comName.comName])
            obsAbd = realObsAbd[realObsAbd['comName'] == comName.comName]
            obsAbd = obsAbd[obsAbd['StopName'] == comName.StopName]
    #build presence time series scope
            weekMin = (obsAbd['obsDt_week'].min()-8)
            if weekMin <= 0: weekMin = 1
            weekMax = (obsAbd['obsDt_week'].max()+8)
            if weekMax > 53: weekMax = 53
            allweek = pd.DataFrame({'obsDt_week':range(weekMin,weekMax)})
            obsAbd.drop(columns=['comName','StopName'],inplace=True)
            ptForxt = pd.merge(left=obsAbd,right=allweek,left_on='obsDt_week',right_on='obsDt_week',how='outer')
            ptForxt.set_index('obsDt_week',inplace=True)
            ptForxt.sort_index(axis='index',ascending=True,inplace=True)
    #mask in seasonality, where there are no birds present in the observed location
            ptForxt['mask'] = ptForxt['relativeAbundance'].interpolate(method='index',limit=5,limit_direction='both')  #mask, values do not matter
            ptForxt.loc[ptForxt['mask'].isna() == True,'relativeAbundance'] = 0
            ptForxt['forecastAbd'] = ptForxt['relativeAbundance'].interpolate(method='polynomial',order=5,limit=5,limit_direction='both')   #real forecast
            ptForxt['comName'] = comName.comName
            ptForxt['StopName'] = comName.StopName
            ptForxt = ptForxt.reset_index()
            forecastList.append(ptForxt)
        seasonalAbd = pd.DataFrame()
        seasonalAbd = pd.DataFrame(pd.concat(forecastList,ignore_index=True))
    except Exception as abdForxExc:
        raise abdForxExc
    return seasonalAbd

In [370]:
forx = forecast_weeklyAbd()

In [None]:
##TODO #66 split out birds into quartiles based on the number of observations available and the YOY frequency. Results will determine the strength of forecasting required

In [406]:
##TODO #64 using the relative abundance value from weeklyAbundance(), write a function that returns FAO land coverage preference --DONE
##TODO #68 add seasons to relative abundance function, then add in a loop over each season in the function seasonalHabitatPreference
def seasonalHabitatPref():
    cnx = connectDB()
    try:
        wklyAbd = weeklyAbundance()
        #start season loop here
        rawList = []
        for locId in wklyAbd.itertuples():
            query = f'SELECT Barren,Urban,Grasslands,Savannas,waterBodies,evergreenNeedleleafForest,evergreenBroadleafForests,openForests,mixedBroadleafandNeedleleafForests,sparseForests,denseHerbaceous,sparseHerbaceous FROM FAO_by_locId WHERE FAO_by_locId.locId = "{locId.locId}"'
            lx = pd.read_sql(query,con=cnx)
            lx = lx.applymap(lambda x: locId.relativeAbundance*x,na_action='ignore')
            lx = lx.assign(comName=locId.comName)
            lx = lx.assign(obsDt_week=locId.obsDt_week)
            #lx['comName'] = locId.comName
            #lx['obsDt_week'] = locId.obsDt_week
            rawList.append(lx)
        results = pd.DataFrame()
        results = pd.concat(rawList,ignore_index=True)
        results = results.groupby(['comName','obsDt_week']).mean()
        results.reset_index(inplace=True)
        results = pd.DataFrame(results)
    except Exception as prefExc:
        raise UserWarning(prefExc)
    finally: cnx.close()
    return results

In [103]:
##TODO #65 using historic habitat preference data, forecast the seasonal habitat preference for each species of bird --DONE
##TODO #74 update the interpolation method used in forecasting habitat preference to polynomial or just use the forecasted relative abundance
def forecastHabtPref():
    realHabtPref = seasonalHabitatPref()
    habtKeys = realHabtPref.drop_duplicates(subset=['comName'])
    try:
        forxHabtList = []
        for i in habtKeys.itertuples():
            allweek = pd.DataFrame({'obsDt_week':range(1,53)})
            obsPref = realHabtPref[realHabtPref['comName'] == i.comName]
            obsPref.drop(columns=['comName'],inplace=True)
            forxPref = pd.merge(left=obsPref,right=allweek,left_on='obsDt_week',right_on='obsDt_week',how='outer')
            forxPref.set_index('obsDt_week',inplace=True)
            forxPref.sort_index(axis='index',ascending=True,inplace=True)
            #forxPref.interpolate(method='polynomial',order=3,inplace=True,limit=3,limit_direction='both')
            forxPref.interpolate(method='linear',inplace=True,limit=3,limit_direction='both')
            forxPref['comName'] = i.comName
            forxPref = forxPref.reset_index()
            forxHabtList.append(forxPref)
        forecastPref = pd.DataFrame()
        forecastPref = pd.concat(forxHabtList,ignore_index=True)
    except Exception as habtPrefForxExcp:
        raise UserWarning(habtPrefForxExcp)
    return forecastPref

In [None]:
##TODO #65 using historic habitat preference data, forecast the seasonal habitat preference for each species of bird --DONE
##TODO #74 update the interpolation method used in forecasting habitat preference to polynomial or just use the forecasted relative abundance
def forecastHabtPref():
    realHabtPref = seasonalHabitatPref()
    habtKeys = realHabtPref.drop_duplicates(subset=['comName'])
    try:
        forxHabtList = []
        for i in habtKeys.itertuples():
            allweek = pd.DataFrame({'obsDt_week':range(1,53)})
            obsPref = realHabtPref[realHabtPref['comName'] == i.comName]
            obsPref.drop(columns=['comName'],inplace=True)
            forxPref = pd.merge(left=obsPref,right=allweek,left_on='obsDt_week',right_on='obsDt_week',how='outer')
            forxPref.set_index('obsDt_week',inplace=True)
            forxPref.sort_index(axis='index',ascending=True,inplace=True)
            #forxPref.interpolate(method='polynomial',order=3,inplace=True,limit=3,limit_direction='both')
            forxPref.interpolate(method='linear',inplace=True,limit=3,limit_direction='both')
            forxPref['comName'] = i.comName
            forxPref = forxPref.reset_index()
            forxHabtList.append(forxPref)
        forecastPref = pd.DataFrame()
        forecastPref = pd.concat(forxHabtList,ignore_index=True)
    except Exception as habtPrefForxExcp:
        raise UserWarning(habtPrefForxExcp)
    return forecastPref

In [104]:
forgex = forecastHabtPref()
forgex

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,obsDt_week,Urban,Savannas,evergreenNeedleleafForest,evergreenBroadleafForests,openForests,mixedBroadleafandNeedleleafForests,sparseForests,comName
0,1,,,0.010000,,,0.067692,,Canada Jay
1,2,,,0.060171,,,0.067692,,Canada Jay
2,3,,,0.110342,,,0.067692,,Canada Jay
3,4,,,0.160513,,,0.067692,,Canada Jay
4,5,,,,,,,,Canada Jay
...,...,...,...,...,...,...,...,...,...
100,48,,,,,,,,Orange-crowned Warbler
101,49,,,,,,,,Orange-crowned Warbler
102,50,,,,,,,,Orange-crowned Warbler
103,51,,,,,,,,Orange-crowned Warbler
