# Notebook to Read and Process eBird Data
- Download dataset here: https://ebird.org/science/use-ebird-data/download-ebird-data-products 
- Please read readme.txt available at: https://github.com/mila-iqia/ecosystem-embedding

---------------------------------------------------------------------------------------------------------

### Import libraries

In [1]:
import os
import sys
import datetime
import time
import pandas as pd
# from datetime import time
from collections import OrderedDict
from tqdm._tqdm_notebook import tqdm_notebook
from tqdm import tqdm

### All data at checklists level
- `complete-checklists.txt` is the file which has the data of complete checklists.
- I got this `complete-checklists.txt` file from the R-code provided by Matt (maintainer of ebird)

In [2]:
import pandas as pd
df = pd.read_csv('complete-checklists.txt', delimiter = "\t", keep_default_na=False)
df.head()

Unnamed: 0,country,STATE,LOCALITY,LOCALITY ID,LOCALITY TYPE,LATITUDE,LONGITUDE,OBSERVER ID,SAMPLING EVENT IDENTIFIER,GROUP IDENTIFIER
0,United States,New York,12203 Albany,L11418,PC,42.67515,-73.82348,obs16693,S922566,
1,United States,New York,12203 Albany,L11418,PC,42.67515,-73.82348,obs16693,S922570,
2,United States,New York,13045 Cortland,L11457,PC,42.59461,-76.17981,obs0,S922732,
3,Canada,Ontario,"ONT, Ottawa South",L142631,P,45.291786,-75.676125,obs18664,S1015872,
4,United States,Nevada,Gardnerville Area,L142280,P,38.87056,-119.777924,obs21134,S1015970,


### Number of checklists per hotspot (sorted in descending order)
- `n_checklists.csv` was also calculated from R code 

In [3]:
import pandas as pd

# n_checklists is the number of complete checklist per hotspot
df_nc_hotspot = pd.read_csv('n_checklists.csv', keep_default_na=False)
print(df_nc_hotspot.shape)

# sort by number of complete checklists
df_nc_hotspot_sorted = df_nc_hotspot.sort_values(by=['n'], ascending = False)
print(type(df_nc_hotspot_sorted))

# Rename header of the dataframe
df_new = df_nc_hotspot_sorted.rename({'locality_id': 'LOCALITY ID', 'locality': 'LOCALITY'}, axis=1)
df_new.head()

(98875, 3)
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,LOCALITY ID,LOCALITY,n
26205,L2329015,Observatorio de Cazalla,30334
26274,L2334855,Algeciras--Observatorio de Algarrobo,26624
22493,L191106,Central Park,20743
4978,L109516,Prospect Park,20687
4735,L109132,Cape Island--Cape May Point SP (CMPSP),13178


### Merge two dataframes such that each hotspot has:
- 'number' of complete checklists
- other informations like latitude longitude information etc

In [4]:
# This step takes time to execute. May lead to kernel restarting
df_checklist_count = df_new.merge(df, on ='LOCALITY', how='inner') 

### Select only US based hotspots

In [5]:
df_checklist_usa = df_checklist_count.loc[df_checklist_count['country'] == 'United States']

### Drop duplicate columns when data was merged

In [6]:
df_checklist_usa_uc = df_checklist_usa.drop('LOCALITY ID_y', 1) # uc in df_checklist_usa_uc: unique column
print(df_checklist_usa_uc.shape)
df_checklist_usa_uc.head(5)

(5129776, 11)


Unnamed: 0,LOCALITY ID_x,LOCALITY,n,country,STATE,LOCALITY TYPE,LATITUDE,LONGITUDE,OBSERVER ID,SAMPLING EVENT IDENTIFIER,GROUP IDENTIFIER
57243,L191106,Central Park,20743,United States,New York,H,40.771548,-73.972482,obs44102,S1746158,
57244,L191106,Central Park,20743,United States,New York,H,40.771548,-73.972482,obs44102,S1767831,
57245,L191106,Central Park,20743,United States,New York,H,40.771548,-73.972482,obs27480,S1994726,
57246,L191106,Central Park,20743,United States,New York,H,40.771548,-73.972482,obs37690,S2021897,
57247,L191106,Central Park,20743,United States,New York,H,40.771548,-73.972482,obs59506,S2029768,


### Drop duplicate rows

In [7]:
df_checklist_usa_unique = df_checklist_usa.drop_duplicates('LOCALITY ID_x')
df_checklist_usa_unique = df_checklist_usa_unique.sort_values(by=['n'], ascending = False)
print("Number of unique hotspots in USA", len(df_checklist_usa_unique))
df_checklist_usa_unique.head()

Number of unique hotspots in USA 41817


Unnamed: 0,LOCALITY ID_x,LOCALITY,n,country,STATE,LOCALITY ID_y,LOCALITY TYPE,LATITUDE,LONGITUDE,OBSERVER ID,SAMPLING EVENT IDENTIFIER,GROUP IDENTIFIER
57243,L191106,Central Park,20743,United States,New York,L191106,H,40.771548,-73.972482,obs44102,S1746158,
147211,L109516,Prospect Park,20687,United States,New York,L109516,H,40.660284,-73.968953,obs18140,S955217,
214627,L109132,Cape Island--Cape May Point SP (CMPSP),13178,United States,New Jersey,L109132,H,38.933827,-74.955085,obs16394,S1206533,
258869,L109339,"IRWD San Joaquin Marsh & Wildlife Sanctuary, I...",10758,United States,California,L109339,H,33.663727,-117.842681,obs38282,S1645754,
271525,L207391,Mt. Auburn Cemetery,10549,United States,Massachusetts,L1320096,P,42.37347,-71.14201,obs284928,S9290576,


### Selectonly conttinental USA

In [8]:
# c_usa = continental USA

df_c_usa = df_checklist_usa_unique[(df_checklist_usa_unique.STATE != "Alaska") &
                                   (df_checklist_usa_unique.STATE != "District of Columbia") &
                                   (df_checklist_usa_unique.STATE != "Hawaii")]

### All hotspots with their number of checklists in Continental USA

In [9]:
df_usa_count = df_c_usa.sort_values(by=['n'], ascending = False)
print("Number of hotspots in continental USA", len(df_usa_count))
df_usa_count.head()

Number of hotspots in continental USA 40963


Unnamed: 0,LOCALITY ID_x,LOCALITY,n,country,STATE,LOCALITY ID_y,LOCALITY TYPE,LATITUDE,LONGITUDE,OBSERVER ID,SAMPLING EVENT IDENTIFIER,GROUP IDENTIFIER
57243,L191106,Central Park,20743,United States,New York,L191106,H,40.771548,-73.972482,obs44102,S1746158,
147211,L109516,Prospect Park,20687,United States,New York,L109516,H,40.660284,-73.968953,obs18140,S955217,
214627,L109132,Cape Island--Cape May Point SP (CMPSP),13178,United States,New Jersey,L109132,H,38.933827,-74.955085,obs16394,S1206533,
258869,L109339,"IRWD San Joaquin Marsh & Wildlife Sanctuary, I...",10758,United States,California,L109339,H,33.663727,-117.842681,obs38282,S1645754,
271525,L207391,Mt. Auburn Cemetery,10549,United States,Massachusetts,L1320096,P,42.37347,-71.14201,obs284928,S9290576,


In [10]:
# Places with complete checklist > 50

threshold = 50
df_usa_count_threshold = df_usa_count[(df_usa_count['n'] >= threshold)]
print(len(df_usa_count_threshold))

11981


### Create a dict with locaity ids and country

In [None]:
# df_usa_loc_country = df_usa_count[['LOCALITY ID_x', 'country']].values
# dict_usa = dict(df_usa_loc_country)
# print ("Length of dictionary:", len(dict_usa))
# # dict_usa

### Make a list of hostspot IDs

In [11]:
list_loc = df_usa_count_threshold['LOCALITY ID_x'].to_list()
print ("Length of list of locality:", len(list_loc))
list_loc[0]

Length of list of locality: 11981


'L191106'

### Verify if a locality exists in the list `list_loc`. 
This will be needed when we need to filter the whole data file (250 GB+) to distill only those rows which are associated with hotspot id present in `list_loc`

In [12]:
"L348850" in list_loc

True

### Read ebird data
- File location : /miniscratch/srishtiy/ebd_relJan-2021.txt
- This file is very large (~250 GB+) and needs to read in chunks

In [14]:
# EBIRD DATA FILE LOCATION

base_folder = '/miniscratch/srishtiy/'
data = base_folder + 'ebd_relJan-2021.txt'


### Read eBIRD data file in chunks because original file is huge.
- We want all columns for now so we are not filtering column wise
- How chuunk works: https://stackoverflow.com/questions/25962114/how-do-i-read-a-large-csv-file-with-pandas
- chunksize = 1000 implies 1000 rows in each chunk will be processed

In [16]:
reader = pd.read_csv(data , 
                     delimiter = "\t",
                     chunksize = 1000
                    )

### Read the data in chunks 
- Save the processed file in a csv file
- Sample file generated is usa_hotspot_data.csv and is available on github

**WARNING: DON'T RUN THIS CELL INSTANTLY. TAKES HOURS**

In [None]:
# Remove CSV file if it already exists
# Running this cell take a long time so need to work on how to do it. 

output_path = 'usa_hotspot_data.csv'
print("Output file: ", output_path)

try:
    os.remove(output_path)
except OSError:
    pass

# Read chunks and save to a new csv
for i,chunk in enumerate(reader):
        usa_chunk = chunk.loc[chunk['COUNTRY'] == 'United States']
        usa_chunk_locality = usa_chunk.loc[usa_chunk['LOCALITY ID'].isin(list_loc)]
        usa_chunk_locality.to_csv(output_path ,mode='a', header=not os.path.exists(output_path))   
        # Progress Bar
        if (i% 1000 == 0):
            print("#", end ='')