In [2]:
'''
Author.      : Aditya Jain
Date Started : 17th March, 2021
About        : This script scraps data from iNaturalist for moth species in Quebec, Canada
'''

!pip install pyinaturalist
!pip install ipyleaflet

Collecting pyinaturalist
[?25l  Downloading https://files.pythonhosted.org/packages/f6/80/6bb131f97e7e0a22bfd5e1c701a17f6373ce7d5af61c981168f2e1d1a08e/pyinaturalist-0.12.1-py3-none-any.whl (56kB)
[K     |████████████████████████████████| 61kB 4.5MB/s  eta 0:00:01
Collecting python-forge
  Downloading https://files.pythonhosted.org/packages/41/d6/e9af8e22d153ebbf584833c1c96d590046f522ae2a86978d4efe496b4aac/python_forge-18.6.0-py35-none-any.whl
Collecting keyring~=21.4.0
  Downloading https://files.pythonhosted.org/packages/e4/ed/7be20815f248b0d6aae406783c2bee392640924623c4e17b50ca90c7f74d/keyring-21.4.0-py3-none-any.whl
Collecting requests~=2.25.0
[?25l  Downloading https://files.pythonhosted.org/packages/29/c1/24814557f1d22c56d50280771a17307e6bf87b70727d975fd6b2ce6b014a/requests-2.25.1-py2.py3-none-any.whl (61kB)
[K     |████████████████████████████████| 61kB 4.3MB/s 
Collecting SecretStorage>=3; sys_platform == "linux"
  Downloading https://files.pythonhosted.org/packages/d9/1e/29

In [3]:
from pyinaturalist.node_api import get_geojson_observations, get_taxa, get_places_nearby, get_places_by_id, get_observations, get_taxa_by_id, get_all_observations
# from pyinaturalist.rest_api import get_observations
from ipyleaflet import Map
import json
import math
from bson import json_util
from datetime import date
from tqdm import tqdm
import csv
import numpy as np
import pandas as pd

# iNat place and species IDs
CANADA_ID  = 6712
QUEBEC_ID  = 13336
LEPIDOP_ID = 47157

CUR_DATE   = date.today()
DATA_DIR   = "/content/drive/My Drive/Data/"



## 1. Fetch and Save json data from iNat
##### a) The below snippet finds per-page results

In [None]:
# getting data for lepidopterans spotted in Quebec
quebec_res = get_observations(
    taxon_id=LEPIDOP_ID,
    photos=True,
    geo=True,
    place_id=QUEBEC_ID,
    quality_grade='research'
)

# getting data for lepidopterans spotted in Canada
canada_res = get_observations(
    taxon_id=LEPIDOP_ID,
    photos=True,
    geo=True,
    place_id=CANADA_ID,
    quality_grade='research'
)

total_quebec_points = quebec_res['total_results']

print("No. of lepidopterans spotted in Quebec: ", quebec_res['total_results'])
print("No. of lepidopterans spotted in Canada: ", canada_res['total_results'])

# saving Quebec data in json format
# with open('quebec_full-' + str(CUR_DATE) + '.json', 'w') as f:
#     json.dump(quebec_res, f, default=json_util.default)

No. of lepidopterans spotted in Quebec:  46420
No. of lepidopterans spotted in Canada:  420430


##### b) [Under Work] The get_all_observations API is troublesome, so manually going through all the pages and fetching the data

In [None]:
max_per_page = 200                                  # no. of entries to display per page
no_of_pages  = math.ceil(total_quebec_points/200)   # no. of pages
iNat_dict    = {}                                   # dictionary to store lepidopteran species on iNat

# just for vizualisation
lat_list     = []
lon_list     = []

for i in tqdm(range(1,no_of_pages+1)):
  data = get_observations(
    taxon_id=LEPIDOP_ID,
    photos=True,
    geo=True,
    place_id=QUEBEC_ID,
    quality_grade='research',
    page=i,
    per_page=max_per_page,
    order_by='observed_on')
  
  data = data['results']
  
  for item in data:
    taxa_name = item['taxon']['name']
    iNat_ID   = item['taxon']['id']
    
    if iNat_ID not in iNat_dict.keys():
      iNat_dict[iNat_ID] = [taxa_name, 1]
    else:
      iNat_dict[iNat_ID][1] += 1

    lat_list.append(item['location'][0])
    lon_list.append(item['location'][1])


##### c) The below code snippet fetches a more concise data for lepidopterans in quebec

In [None]:
quebec_res = get_geojson_observations(
    taxon_id=LEPIDOP_ID,
    photos=True,
    geo=True,
    place_id=QUEBEC_ID,
    quality_grade='research'
)

with open('quebec_full-' + 'geojson' + '.json', 'w') as f:
    json.dump(quebec_res, f, default=json_util.default)

data = quebec_res["features"]

iNat_dict    = {}                                   # dictionary to store lepidopteran species on iNat
lat_list     = []
lon_list     = []

for item in data:  
  taxa_name = item["properties"]["taxon_name"]
  iNat_taxonID   = item["properties"]["taxon_id"]
    
  if iNat_taxonID not in iNat_dict.keys():
    iNat_dict[iNat_taxonID] = [taxa_name, 1]
  else:
    iNat_dict[iNat_taxonID][1] += 1

  lat_list.append(item["geometry"]["coordinates"][1])
  lon_list.append(item["geometry"]["coordinates"][0])

print('Total species count:', len(iNat_dict.keys()))

## 2. Load json file

Loading quebec lepidopterans list

In [None]:
filename  = "quebec_full-geojson.json"
path      = DATA_DIR + filename
f         = open(path,)
data      = json.load(f)['features']
iNat_dict = {}                        # dictionary to store lepidopteran species on iNat


for item in data:  
  taxa_name = item["properties"]["taxon_name"]
  iNat_taxonID   = item["properties"]["taxon_id"]
    
  if iNat_taxonID not in iNat_dict.keys():
    iNat_dict[iNat_taxonID] = [taxa_name, 1]
  else:
    iNat_dict[iNat_taxonID][1] += 1


Once the data is loaded, the species list is saved in a csv with their counts

In [None]:
species_list = []

for id, value in iNat_dict.items():
  species_list.append([id, value[0], value[1]])

species_list = np.array(species_list, dtype=np.dtype(object))
species_list = species_list[species_list[:,2].argsort()[::-1]]
field_names  = np.array([['iNat_TaxonID', 'Scientific_Name', 'Total_Observations_in_Quebec']])
species_list = np.concatenate((field_names, species_list))
np.savetxt('Quebec-Lepidop-Species_iNat.csv', species_list, delimiter=",", fmt='%s')

#### Visualizing the observations geo-spatially

In [None]:
import plotly.express as px

fig = px.scatter_geo(lat=lat_list,
                     lon=lon_list,
                     scope='north america',
                     title="Geo-locations of Lepidopterans spotted in Quebec"
                     )
fig.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

plt.hist(hist, bins=len(hist))
plt.title("Histogram")
plt.show()

In [None]:
hist = []
for key in iNat_dict.keys():
  hist.append(iNat_dict[key][1])

print(hist)

[47, 5, 14, 138, 26, 104, 175, 99, 45, 6, 99, 75, 214, 104, 30, 24, 158, 260, 13, 48, 71, 11, 165, 15, 100, 78, 101, 40, 25, 37, 84, 22, 18, 10, 18, 46, 318, 31, 325, 104, 112, 163, 23, 36, 8, 10, 13, 42, 114, 7, 11, 15, 108, 145, 18, 20, 64, 55, 20, 19, 204, 25, 2, 37, 123, 50, 25, 18, 6, 65, 81, 4, 206, 21, 18, 238, 6, 55, 14, 59, 151, 8, 28, 47, 4, 4, 5, 20, 30, 60, 72, 26, 18, 140, 8, 34, 281, 5, 12, 56, 6, 6, 235, 8, 115, 170, 28, 168, 9, 140, 5, 38, 5, 5, 13, 31, 198, 69, 121, 420, 12, 13, 18, 43, 134, 16, 95, 17, 177, 2, 10, 3, 36, 20, 1, 29, 37, 92, 38, 30, 7, 25, 1, 7, 30, 19, 40, 11, 21, 25, 1, 68, 15, 80, 170, 19, 44, 28, 7, 114, 320, 56, 186, 295, 59, 161, 23, 17, 102, 48, 26, 9, 6, 20, 8, 25, 28, 94, 30, 101, 25, 41, 4, 8, 31, 130, 242, 63, 48, 39, 50, 87, 76, 2, 19, 10, 24, 30, 9, 39, 78, 80, 9, 45, 426, 24, 19, 52, 12, 41, 20, 4, 6, 107, 2, 6, 95, 21, 44, 71, 49, 54, 38, 62, 143, 158, 42, 27, 99, 138, 30, 31, 11, 163, 13, 9, 79, 9, 47, 36, 41, 2, 52, 4, 151, 41, 84, 22, 

## 3. Reading moth species list (shared by Maxim)
First reading the file, getting the relevant fields, cleaning it and generating a new csv file

In [None]:
filename  = 'quebec_mothlist.xlsx'
file_path = DATA_DIR + filename
file_path = 'listB_Quebec_Pohl2018.xlsx'
data      = pd.read_excel(file_path, sheet_name=None)
data      = data['Pohl et al 2018 Qc moths']
data      = data[['SP NO.', 'superfamily', 'family', 'subfamily', 'val genus', 'name', 'FULL NAME']]

# data.to_csv('quebec_moth_v1.csv')   # this contains relevant columns but some empty rows

# removing empty entries by checking a specific column
for row in data.index:
  if pd.isnull(data['family'][row]):
    data.drop(row, inplace=True)

print(data)
data.to_csv('quebec_moth_v2.csv')   # this contains relevant columns along with removed empty rows

      SP NO.  ...                                   FULL NAME
0     010001  ...         Epimartyria auricrinella Walsingham
1     070001  ...  Dyseriocrania griseocapitella (Walsingham)
2     070003  ...        Eriocrania semipurpurella (Stephens)
3        NaN  ...                                         NaN
4     070004  ...                  Eriocrania breviapex Davis
...      ...  ...                                         ...
3502  933680  ...                 Abagrotis alternata (Grote)
3503  933682  ...                Abagrotis forbesi (Benjamin)
3504  933683  ...             Abagrotis brunneipennis (Grote)
3505  933685  ...                    Abagrotis cupida (Grote)
3506  933688  ...           Abagrotis anchocelioides (Guenée)

[3507 rows x 7 columns]
       SP NO.  ...                                   FULL NAME
0      010001  ...         Epimartyria auricrinella Walsingham
1      070001  ...  Dyseriocrania griseocapitella (Walsingham)
2      070003  ...        Eriocrania semip

## 4. Reading Vermont moth species list (shared by KP)
First reading the file, getting the relevant fields, cleaning it and generating a new csv file

In [9]:
filename  = 'listC_Vermont_29March2021.xlsx'
file_path = DATA_DIR + filename
data      = pd.read_excel(file_path, sheet_name=None)
data      = data['Sheet2']
data      = data[['Hodges_No', 'P3no', 'group', 'superfamily', 'family', 
                  'subfamily', 'genus', 'specificEpithet', 'taxonRank',
                  'scientificName']]

# removing empty entries by checking a specific column
for row in data.index:
  if pd.isnull(data['scientificName'][row]):
    data.drop(row, inplace=True)

print(data)

data.to_csv('listC_Vermont_29March2021.csv')

      Hodges_No      P3no  ... taxonRank                 scientificName
0           1.0   10001.0  ...   species       Epimartyria auricrinella
1           3.0   70001.0  ...   species  Dyseriocrania griseocapitella
2           5.0   70003.0  ...   species      Eriocrania semipurpurella
3          31.0  110011.0  ...   species        Korscheltellus gracilis
4          18.0  110016.0  ...   species    Sthenopis argenteomaculatus
...         ...       ...  ...       ...                            ...
1935    11043.0  933685.0  ...   species               Abagrotis cupida
1936    11045.0  933688.0  ...   species       Abagrotis anchocelioides
1937    10658.0   34151.0  ...   species               Agrotis stigmosa
1938     3412.2  621291.2  ...   species          Dichrorampha aeratana
1939     6405.0  910822.0  ...   species          Digrammia gnophosaria

[1940 rows x 10 columns]


## 5. Comparing iNat and Maxim's Quebec Moth species list
Comparing the two lists to find which species are common in the two lists and which are missing in either of them

In [None]:
iNat_name      = 'Quebec-Lepidop-Species_iNat.csv'
quebec_name    = 'quebec_moth_v2.csv'
iNat_file      = DATA_DIR + iNat_name
quebec_file    = DATA_DIR + quebec_name

iNat_data      = pd.read_csv(iNat_file)
quebec_data    = pd.read_csv(moth_file)

iNat_species   = iNat_data['Scientific_Name']
quebec_species = quebec_data['val genus'] + ' ' + quebec_data['name']

#### A) In iNat, not in quebec list
The iNat list is of complete lepidoptera order, hence, it will also contain butterfly species. However, if there is any quebec moth species in iNat but not in maxim list, that would be interesting (because the latter is supposedly a superset of quebec moth species)

In [None]:
list_not_in_Moth = []
for item in iNat_species.index:
  if iNat_species[item] not in list(quebec_species):
    list_not_in_Moth.append(iNat_species[item])

list_not_in_Moth = pd.DataFrame(list_not_in_Moth)
list_not_in_Moth.to_csv('IniNat_NotQuebec.csv') 

#### B) In quebec list, not in iNat
This code snippet will find out the species in quebec list which are not spotted on iNat

In [None]:
list_not_in_iNat = []
for item in quebec_species.index:
  if quebec_species[item] not in list(iNat_species):
    list_not_in_iNat.append(quebec_species[item])

list_not_in_iNat = pd.DataFrame(list_not_in_iNat)
list_not_in_iNat.to_csv('InQuebec_NotiNat.csv')

## 5. Fetch Data
Saving images from iNat

In [None]:
iNat_name      = 'Quebec-Lepidop-Species_iNat.csv'
iNat_file      = DATA_DIR + iNat_name

iNat_data      = pd.read_csv(iNat_file)
# print(iNat_data)

iNat_species   = list(iNat_data['iNat_TaxonID'])
# iNat_species   = list(iNat_data['Scientific_Name'])

print(iNat_species)

data = get_observations(
    # taxon_name=iNat_species[:2],
    taxon_id=iNat_species[0],
    photos=True,
    geo=True,
    geoprivacy='open',
    quality_grade='research')


[48662, 58583, 55641, 47802, 57463, 59675, 58523, 81663, 47919, 67691, 145659, 56832, 1063932, 55626, 143274, 48548, 120217, 81677, 143728, 127460, 119013, 154446, 49133, 81681, 219746, 130017, 211009, 120214, 121183, 60839, 229817, 205300, 143518, 142993, 47916, 58576, 49716, 208521, 337467, 81665, 81687, 81656, 205231, 58531, 127457, 736797, 223997, 143445, 198813, 84428, 201281, 47980, 153076, 233010, 154582, 128242, 58586, 119492, 1081323, 224075, 198812, 82379, 143118, 205197, 143867, 219124, 212435, 226649, 119953, 121236, 127133, 133447, 220826, 63128, 81657, 154353, 211012, 68262, 143009, 144107, 81582, 232156, 212430, 84662, 126276, 58484, 143517, 143121, 82153, 81680, 56776, 118901, 177320, 143112, 233838, 213838, 217190, 131673, 205204, 215211, 227466, 217970, 82225, 82204, 211100, 213514, 143446, 129226, 213826, 119063, 217054, 122259, 121534, 54064, 48094, 228855, 117452, 218900, 81679, 124181, 606580, 233677, 321779, 226646, 129572, 205213, 211007, 204699, 219856, 231396,

In [None]:
PER_PAGE  = 200
MAX_PAGES = 25   # allows for max 5000 observations to be fetched
import math

for id in iNat_species:
  
  print(id)
  data = get_observations(
    taxon_id=id,
    photos=True,
    geo=True,
    geoprivacy='open',
    quality_grade='research',)
  
  total_count = data['total_results']
  no_pages    = total_count/PER_PAGE
  no_pages    = math.floor(min(no_pages, MAX_PAGES))

  print(no_pages)

48662
25
58583
25
55641


Could not parse timestamp: Sun Jul 14 2019 14:59:23 GMT+0100 (GMT+1): Unknown string format: Sun Jul 14 2019 14:59:23 GMT+0100 (GMT+1)
Could not parse timestamp: Sat Aug 15 2020 10:50:23 GMT+0200 (GMT+2): Unknown string format: Sat Aug 15 2020 10:50:23 GMT+0200 (GMT+2)


25
47802


Could not parse timestamp: Sun Aug 08 2010 18:43:14 GMT+0400 (GMT+4): Unknown string format: Sun Aug 08 2010 18:43:14 GMT+0400 (GMT+4)


25
57463


Could not parse timestamp: 2010/05/26 EST: Unknown string format: 2010/05/26 EST


21
59675
25
58523


Could not parse timestamp: Sat Mar 27 2021 13:21:56 GMT -0400 (EDT): Unknown string format: Sat Mar 27 2021 13:21:56 GMT -0400 (EDT)


25
81663


Could not parse timestamp: Mon Mar 29 2021 13:28:39 GMT -0400 (EDT): Unknown string format: Mon Mar 29 2021 13:28:39 GMT -0400 (EDT)
Could not parse timestamp: Sun Mar 28 2021 15:42:24 GMT -0500 (CDT): Unknown string format: Sun Mar 28 2021 15:42:24 GMT -0500 (CDT)


25
47919
25
67691
25
145659
25
56832


Could not parse timestamp: Tue Mar 23 2021 13:00:23 GMT -0400 (EDT): Unknown string format: Tue Mar 23 2021 13:00:23 GMT -0400 (EDT)


25
1063932
6
55626


Could not parse timestamp: Mon Mar 29 2021 14:02:34 GMT+0800 (GMT+8): Unknown string format: Mon Mar 29 2021 14:02:34 GMT+0800 (GMT+8)
Could not parse timestamp: Sun Mar 28 2021 10:09:16 GMT+1300 (GMT+13): Unknown string format: Sun Mar 28 2021 10:09:16 GMT+1300 (GMT+13)
Could not parse timestamp: Mon Mar 29 2021 14:53:46 GMT+1100 (GMT+11): Unknown string format: Mon Mar 29 2021 14:53:46 GMT+1100 (GMT+11)


25
143274


Could not parse timestamp: 2019/07/12 1:30: AM EDT: Unknown string format: 2019/07/12 1:30: AM EDT


10
48548


Could not parse timestamp: Thu Aug 22 2019 14:12:47 GMT -0500 (CDT): Unknown string format: Thu Aug 22 2019 14:12:47 GMT -0500 (CDT)


25
120217
25
81677
25
143728
15
127460
25
119013
25
154446
14
49133


Could not parse timestamp: Sun Mar 15 2020 11:36:49 GMT+0100 (GMT+1): Unknown string format: Sun Mar 15 2020 11:36:49 GMT+0100 (GMT+1)
Could not parse timestamp: Tue Mar 23 2021 09:43:08 GMT+0100 (GMT+1): Unknown string format: Tue Mar 23 2021 09:43:08 GMT+0100 (GMT+1)
Could not parse timestamp: Mon Mar 29 2021 12:16:43 GMT+0100 (GMT+1): Unknown string format: Mon Mar 29 2021 12:16:43 GMT+0100 (GMT+1)
Could not parse timestamp: Mon Mar 29 2021 15:26:55 GMT +0200 (GMT+2): Unknown string format: Mon Mar 29 2021 15:26:55 GMT +0200 (GMT+2)


25
81681
25
219746
3
130017
7
211009
3
120214
6
121183
25
60839
25
229817
9
205300
3
143518
24
142993
7
47916
25
58576
21
49716
19
208521
2
337467
3
81665
25
81687


Could not parse timestamp: Fri Mar 26 2021 16:32:05 GMT+0100 (GMT+1): Unknown string format: Fri Mar 26 2021 16:32:05 GMT+0100 (GMT+1)
Could not parse timestamp: Fri Mar 26 2021 14:02:57 GMT+0100 (GMT+1): Unknown string format: Fri Mar 26 2021 14:02:57 GMT+0100 (GMT+1)
Could not parse timestamp: Wed Mar 24 2021 20:54:24 GMT+0100 (GMT+1): Unknown string format: Wed Mar 24 2021 20:54:24 GMT+0100 (GMT+1)


25
81656


Could not parse timestamp: 2019/07/12 1:30: AM EDT: Unknown string format: 2019/07/12 1:30: AM EDT


13
205231


Could not parse timestamp: Mon Aug 17 2020 00:56:06 GMT+0100 (GMT+1): Unknown string format: Mon Aug 17 2020 00:56:06 GMT+0100 (GMT+1)


13
58531
25
127457
21
736797
18
223997
1
143445
10
198813


Could not parse timestamp: Sat Mar 20 2021 13:48:59 GMT -0400 (EDT): Unknown string format: Sat Mar 20 2021 13:48:59 GMT -0400 (EDT)


12
84428
12
201281


Could not parse timestamp: 2019/07/12 1:30: AM EDT: Unknown string format: 2019/07/12 1:30: AM EDT


10
47980
25
153076
9
233010
4
154582
10
128242
14
58586
25
119492
5
1081323
18
224075
3
198812
25
82379
25
143118
13
205197
15
143867


Could not parse timestamp: 2019/07/12 1:30: AM EDT: Unknown string format: 2019/07/12 1:30: AM EDT


8
219124
1
212435


HTTPError: ignored

In [None]:
print(data.keys())
print(data['page'])

dict_keys(['total_results', 'page', 'per_page', 'results'])
1


Getting iNat taxon ids for species in quebec list

In [None]:
quebec_name    = 'quebec_moth_v2.csv'
quebec_file    = DATA_DIR + quebec_name
quebec_data    = pd.read_csv(quebec_file)

quebec_species = quebec_data['val genus'] + ' ' + quebec_data['name']
print(quebec_species)

0            Epimartyria auricrinella
1       Dyseriocrania griseocapitella
2           Eriocrania semipurpurella
3              Gazoryctra hyperboreus
4               Gazoryctra novigannus
                    ...              
2876                Abagrotis placida
2877              Abagrotis alternata
2878          Abagrotis brunneipennis
2879                 Abagrotis cupida
2880         Abagrotis anchocelioides
Length: 2881, dtype: object


In [None]:
def get_iNat_id(species_list):
  '''
  given a list of species name, this function returns the unique iNat taxon ID
  '''
  id_list = []

  for item in species_list.index:
    species_name = species_list[item]
    data         = get_taxa(q=species_name, rank='species', per_page=200)

    if data['total_results']==0:
      id_list.append('None')
    else:
      id_list.append(data['results'][0]['id'])
      
  return id_list

In [None]:
print(get_iNat_id(quebec_species))

### Miscellaneous - Not to Run

In [None]:
print(len(hist))

1824


In [None]:
id_list = []
for p in response['results']['community']:
  id_list.append(p['id'])
  print(p['name'], p['id'])

for p in response['results']['standard']:
  id_list.append(p['id'])
  print(p['name'], p['id'])

Great Lakes Basin 57637
New England-Acadian forests 128777
Northeast & Mid-Atlantic States 96557
Labrador 132047
Northeastern United States 64492
Gulf of St. Lawrence 134488
Maritimes Provinces of Canada 82171
Maritimes Provinces of Canada 82180
New England 52339
Experimenting with polygons 154600
North America 97394
United States 1
Canada 6712
Québec 13336
Newfoundland and Labrador 7289
New Brunswick 7587
Maine 17
Nord-du-Québec 49209
Division No. 10 49165
Sept-Rivières--Caniapiscau 49211


In [None]:
QUEBEC_BOX = (62.350899, -64.170174, 45.083101, -78.449810)       # lat/long coordinates for bounding box for quebec
response   = get_places_nearby(*QUEBEC_BOX)
print(response)

In [None]:
a = {
    'A': [0, 'Aditya'],
    'B': [1, 'Suman']
}

print(a)

{'A': [0, 'Aditya'], 'B': [1, 'Suman']}


In [None]:
a['B'][0] += 1
print(a)


{'A': [0, 'Aditya'], 'B': [2, 'Suman']}
