# Notebook to Read and Process eBird Data
- Download dataset here: https://ebird.org/science/use-ebird-data/download-ebird-data-products 
- Please read readme.txt available at: https://github.com/mila-iqia/ecosystem-embedding

---------------------------------------------------------------------------------------------------------

### Import libraries

In [1]:
import os
import sys
import datetime
import time
import pandas as pd
# from datetime import time
from collections import OrderedDict
from tqdm._tqdm_notebook import tqdm_notebook
from tqdm import tqdm

### All data at checklists level
- `complete-checklists.txt` is the file which has the data of complete checklists.
- I got this `complete-checklists.txt` file from the R-code provided by Matt (maintainer of ebird)

In [2]:
import pandas as pd

# Load the checklist-level table. The file is tab-separated, and
# keep_default_na=False stops pandas from parsing any string as NaN.
df = pd.read_csv(
    '../complete-checklists.txt',
    sep="\t",
    keep_default_na=False,
)
df.head()

Unnamed: 0,country,STATE,LOCALITY,LOCALITY ID,LOCALITY TYPE,LATITUDE,LONGITUDE,OBSERVER ID,SAMPLING EVENT IDENTIFIER,GROUP IDENTIFIER
0,United States,New Jersey,07866 Rockaway,L126977,PC,40.917198,-74.509209,obs17638,S84135601,
1,Canada,British Columbia,Trial Islands,L2747482,H,48.398617,-123.304933,obs961105,S84138959,
2,Canada,British Columbia,Stevens Lane,L455677,P,49.014585,-123.086266,obs133217,S84136606,
3,United States,Washington,Wallingford Steps,L5759293,H,47.647237,-122.336347,obs554444,S85200104,
4,United States,New York,Durland Preserve,L446377,H,42.437996,-76.397982,obs9009,S924173,


### Number of checklists per hotspot (sorted in descending order)
- `n_checklists.csv` was also calculated from R code 

In [None]:
import pandas as pd

# Per-hotspot table: column 'n' holds the number of complete checklists.
df_nc_hotspot = pd.read_csv('../n_checklists.csv', keep_default_na=False)
print(df_nc_hotspot.shape)

# Order hotspots from the most to the fewest complete checklists.
df_nc_hotspot_sorted = df_nc_hotspot.sort_values('n', ascending=False)
print(type(df_nc_hotspot_sorted))

# Use the same column names as the checklist-level dataframe.
df_new = df_nc_hotspot_sorted.rename(
    columns={'locality_id': 'LOCALITY ID', 'locality': 'LOCALITY'}
)
df_new.head()

### Merge two dataframes such that each hotspot has:
- 'number' of complete checklists
- other information, such as latitude and longitude

In [None]:
# Inner join of the per-hotspot counts (left) with the checklist-level rows
# (right) on the shared 'LOCALITY ID' key.
# This step is slow and memory-hungry; the kernel may restart while it runs.
df_checklist_count = pd.merge(df_new, df, how='inner', on='LOCALITY ID')

In [None]:
# Preview the first five rows of the merged dataframe.
df_checklist_count.head(n=5)

### Select only US based hotspots

In [None]:
# Keep only the rows whose 'country' value is exactly 'United States'.
df_checklist_usa = df_checklist_count[
    df_checklist_count['country'].eq('United States')
]

### Drop duplicate columns when data was merged

In [None]:
# Drop the 'LOCALITY_y' column left over from the merge ("uc" = unique column).
# Presumably 'LOCALITY' exists in both merged tables, so pandas suffixed the
# two copies with _x / _y.
# The axis is selected by keyword: the positional form drop(label, 1) was
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onward.
df_checklist_usa_uc = df_checklist_usa.drop(columns='LOCALITY_y')
print(df_checklist_usa_uc.shape)
df_checklist_usa_uc.head(5)

### Drop duplicate rows

In [None]:
# One row per hotspot: keep the first row seen for each 'LOCALITY ID', then
# order the hotspots from the most to the fewest complete checklists.
df_checklist_usa_unique = (
    df_checklist_usa
    .drop_duplicates(subset='LOCALITY ID')
    .sort_values('n', ascending=False)
)
print("Number of unique hotspots in USA", df_checklist_usa_unique.shape[0])
df_checklist_usa_unique.head()

### Select only continental USA

In [None]:
# c_usa = continental USA: every row whose STATE is neither Alaska nor Hawaii.

df_c_usa = df_checklist_usa_unique[
    ~df_checklist_usa_unique['STATE'].isin(["Alaska", "Hawaii"])
]

### All hotspots with their number of checklists in Continental USA

In [None]:
# Continental-USA hotspots ordered from the most to the fewest checklists.
df_usa_count = df_c_usa.sort_values('n', ascending=False)
print("Number of hotspots in continental USA", df_usa_count.shape[0])
df_usa_count.head()

In [None]:
# Hotspots with at least `threshold` complete checklists.
# The bound is inclusive: a hotspot with exactly 50 checklists is kept.

threshold = 50
df_usa_count_threshold = df_usa_count[df_usa_count['n'].ge(threshold)]
print(df_usa_count_threshold.shape[0])

### Create a dict with locality IDs and country

In [None]:
# df_usa_loc_country = df_usa_count[['LOCALITY ID_x', 'country']].values
# dict_usa = dict(df_usa_loc_country)
# print ("Length of dictionary:", len(dict_usa))
# # dict_usa

### Make a list of hotspot IDs

In [None]:
# Collect the IDs of the hotspots that passed the checklist-count threshold.
# The merge is keyed on 'LOCALITY ID', so the merged dataframe holds a single,
# unsuffixed 'LOCALITY ID' column (the same one drop_duplicates uses).
# Indexing 'LOCALITY ID_x' raised a KeyError because no such column exists.
list_loc = df_usa_count_threshold['LOCALITY ID'].to_list()
print("Length of list of locality:", len(list_loc))
list_loc[0]

### Verify if a locality exists in the list `list_loc`. 
This will be needed when we filter the whole data file (250 GB+) to keep only those rows associated with a hotspot ID present in `list_loc`.

In [None]:
# Build a set once: `in` on a set is a constant-time hash lookup, whereas
# `in list_loc` scans the whole list on every test. This matters when the
# check is repeated for every row of the full eBird file.
set_loc = set(list_loc)
"L348850" in set_loc

### Read ebird data
- File location : /miniscratch/srishtiy/ebd_relJan-2021.txt
- This file is very large (~250 GB+) and needs to be read in chunks

### Read eBIRD data file in chunks because original file is huge.
- We want all columns for now so we are not filtering column wise
- How chunking works: https://stackoverflow.com/questions/25962114/how-do-i-read-a-large-csv-file-with-pandas
- chunksize = 1000 implies 1000 rows in each chunk will be processed