# Entropy and Persistent Homology

## &copy;  [Omkar Mehta](omehta2@illinois.edu) ##
### Industrial and Enterprise Systems Engineering, The Grainger College of Engineering,  UIUC ###

<hr style="border:2px solid blue"> </hr>

# [Reference](https://towardsdatascience.com/how-to-pull-data-from-an-api-using-python-requests-edcc8d6441b1)

# Part 1: Download Data from INaturalist website using API
From API documentation,we get the following information:

`Please note that we throttle API usage to a max of 100 requests per minute, though we ask that you try to keep it to 60 requests per minute or lower, and to keep under 10,000 requests per day. If we notice usage that has serious impact on our performance we may institute blocks without notification.`

`per_page
Allowed values: 1 to 200`

Locations data collected from:
* place_id: 1563
* place_id: 49906

In [2]:
# importing required libraries
import requests
import math
import time
import pandas as pd
from datetime import datetime
import os

In [3]:
# Data for storing all the pulled data
data = {
    'time_observed_at': list(),
    'species_guess': list(),
    'genus_name': list(),
    'rank': list(),
    'wikipedia_url': list(),
    'iconic_taxon_name': list(),
    'preferred_common_name': list(),
    'uri': list(),
    'longitude': list(),
    'latitude': list(),
    'place_guess': list()
}

## Part 1.1. Function for getting the dictionary of pages and page_numbers

For each page, there is a different per_page limit. This was not mentioned in the api documentation of the inaturalist website.

In [4]:
# pages = {}
# place_id = 1563
# page_number = 1
# per_page = 200 #max_limit
# i = 1
# sleep_time = 60
# while True:
#   while per_page != 0:
#     #go through each page and per_page, get the request's response, if it is 200, append page_number:per_page to the dictionary.
#     #if for any per_page, we get status_code != 200, we decrement the per_page by 1, and check response. 
    
#     response = requests.get("https://api.inaturalist.org/v1/observations?place_id={}&page={}&per_page={}".format(place_id, page_number, per_page))
#     if i%50 == 0: #We have a limit of 60 pages per minute, for pulling data from API
#         time.sleep(sleep_time)
#     if response.status_code == 200:
#       pages[page_number] = per_page
#       page_number +=1
#     elif response.status_code != 200:
#       per_page -= 1
#     i += 1
#   break 

def findPerPage(place_id, page_number, per_page, how_many_pages):
    pages = {}
    i = 1
    sleep_time = 60
    while True:
        while per_page != 0:
            #go through each page and per_page, get the request's response, 
            # if it is 200, append page_number:per_page to the dictionary.
            # if for any per_page, we get status_code != 200, we decrement the per_page by 1, and check response. 
            response = requests.get("https://api.inaturalist.org/v1/observations?place_id={}&page={}&per_page={}".format(place_id, page_number, per_page))
            if i%50 == 0: #We have a limit of 60 pages per minute, for pulling data from API
                time.sleep(sleep_time)
            if response.status_code == 200:
                pages[page_number] = per_page
                page_number +=1
            elif response.status_code != 200:
                per_page -= 1
            i += 1

            # if how_many_pages >= 200:
            #     return pages
        break
    return pages

def download_csv_pages(pages):
    pages_Series = pd.Series(pages)
    #pages_Series.to_csv('pages.csv')
    pages_df = pd.DataFrame(pages_Series, columns=['page_number', 'per_page'])
    pages_df.to_csv('pages.csv')

def download_pickle_pages(pages):
    pages_Series = pd.Series(pages)
    #pages_Series.to_csv('pages.csv')
    pages_df = pd.DataFrame(pages_Series, columns=['page_number', 'per_page'])
    pages_df.to_pickle('pages.pkl')

In [5]:
# pages_Series = pd.Series(pages)
# pages_Series.to_csv('pages.csv')
# pages_df = pd.read_csv('pages.csv', names=['page_number', 'per_page'])
# pages_df.info()
# pages_df.to_csv('pages.csv')
# pages_df.to_pickle('pages.pkl')

## Part 1.2. Pull data

In [6]:
# Function for pulling the data
def pull_data(data, place_id, sleep_time, page_number, per_page):
    '''
    place_id: each location has a place_id, which we can get from the website
    sleep_time: time in seconds we want to sleep
    page_number: depends on the number of observations. total_observations/per_page
    per_page: #observations per page
    '''
    if page_number%40 == 0:
        time.sleep(sleep_time)
    else:
        response = requests.get("https://api.inaturalist.org/v1/observations?place_id={}&page={}&per_page={}".format(place_id, page_number, per_page))
        file_dict = response.json()
        if ('results' in file_dict):
            for j in range(per_page):
            
                if 'time_observed_at' in file_dict['results'][j]:
                    data['time_observed_at'].append(file_dict['results'][j]['time_observed_at'])
                else:
                    data['time_observed_at'].append(None)
                if 'species_guess' in file_dict['results'][j]:
                    data['species_guess'].append(file_dict['results'][j]['species_guess'])
                else:
                    data['species_guess'].append(None)

                if ('taxon' in file_dict['results'][j]) & (file_dict['results'][j]['taxon'] is not None) :
                    #print(j)
                    if 'name' in file_dict['results'][j]['taxon']:
                        data['genus_name'].append(file_dict['results'][j]['taxon']['name'])
                    else:
                        data['genus_name'].append(None)
                    if 'rank' in file_dict['results'][j]['taxon']:
                        data['rank'].append(file_dict['results'][j]['taxon']['rank'])
                    else:
                        data['rank'].append(None)
                    if 'wikipedia_url' in file_dict['results'][j]['taxon']:
                        data['wikipedia_url'].append(file_dict['results'][j]['taxon']['wikipedia_url'])
                    else:
                        data['wikipedia_url'].append(None)
                    if 'iconic_taxon_name' in file_dict['results'][j]['taxon']:
                        data['iconic_taxon_name'].append(file_dict['results'][j]['taxon']['iconic_taxon_name'])
                    else:
                        data['iconic_taxon_name'].append(None)
                    #print(j)
                    if 'preferred_common_name' in file_dict['results'][j]['taxon']:
                        data['preferred_common_name'].append(file_dict['results'][j]['taxon']['preferred_common_name'])
                    else:
                        data['preferred_common_name'].append(None)
                else:
                    data['genus_name'].append(None)
                    data['rank'].append(None)
                    data['wikipedia_url'].append(None)
                    data['iconic_taxon_name'].append(None)
                    data['preferred_common_name'].append(None)
                if 'uri' in file_dict['results'][j]:
                    data['uri'].append(file_dict['results'][j]['uri'])
                else:
                    data['uri'].append(None)
                if 'geojson' in file_dict['results'][j]:
                    data['longitude'].append(file_dict['results'][j]['geojson']['coordinates'][0])
                    data['latitude'].append(file_dict['results'][j]['geojson']['coordinates'][1])
                else:
                    data['longitude'].append(None)
                    data['latitude'].append(None)
                if 'place_guess' in file_dict['results'][j]:
                    data['place_guess'].append(file_dict['results'][j]['place_guess'])
                else:
                    data['place_guess'].append(None)
        else:
            print(page_number)

    

`pages.csv' contains the 
```python
{'page_number': 'per_page'}
```
dictionary. For each page, we have a maximum limit on the number of observations that one can pull.

In [7]:
pages = pd.read_csv('pages.csv')

In [9]:
# Uncomment for pulling data. Change place_id
# for i in range(1, len(pages)):

#     pull_data(data, 1563, 60, int(pages['page_number'][i]), int(pages['per_page'][i]))
# data_df = pd.DataFrame(data)
# data_df.info()

# place_id = 1563
# csv_file = 'data_' + str(place_id) + '_' + str(datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.csv'
# pickle_file = 'data_' + str(place_id) + '_' + str(datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.pkl'
# data_df.to_csv(csv_file)
# data_df.to_pickle(pickle_file)
# data_df.info()

def getData(pages_df, data, place_id):

    for i in range(1, len(pages)):
        pull_data(data, place_id, 60, int(pages_df['page_number'][i]), int(pages_df['per_page'][i]))
    data_df = pd.DataFrame(data)
    return data_df
def download_csv_data(data, place_id, ):

    csv_file = 'data_' + str(place_id) + '_' + str(datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.csv'
    pickle_file = 'data_' + str(place_id) + '_' + str(datetime.now().strftime('%Y_%m_%d_%H_%M_%S')) + '.pkl'
    data_df.to_csv(csv_file)
    data_df.to_pickle(pickle_file)


***Use pickle instead of csv***

In [134]:
data_df = pd.read_csv('data_1563_2021_06_24_12_57_28.csv', index_col= 0)
# data_df.to_pickle('data_1563_2021_06_06_20_47_20.pickle')
# data_df = pd.read_pickle('data_1563_2021_06_06_20_47_20.pickle')
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45576 entries, 0 to 45575
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   time_observed_at       45240 non-null  object 
 1   species_guess          31406 non-null  object 
 2   genus_name             44627 non-null  object 
 3   rank                   44627 non-null  object 
 4   wikipedia_url          42470 non-null  object 
 5   iconic_taxon_name      44611 non-null  object 
 6   preferred_common_name  42654 non-null  object 
 7   uri                    45576 non-null  object 
 8   longitude              45576 non-null  float64
 9   latitude               45576 non-null  float64
 10  place_guess            45576 non-null  object 
dtypes: float64(2), object(9)
memory usage: 4.2+ MB


In [135]:
data_df.dropna(subset = ['wikipedia_url'], inplace = True)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42470 entries, 0 to 45575
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   time_observed_at       42201 non-null  object 
 1   species_guess          29515 non-null  object 
 2   genus_name             42470 non-null  object 
 3   rank                   42470 non-null  object 
 4   wikipedia_url          42470 non-null  object 
 5   iconic_taxon_name      42454 non-null  object 
 6   preferred_common_name  41305 non-null  object 
 7   uri                    42470 non-null  object 
 8   longitude              42470 non-null  float64
 9   latitude               42470 non-null  float64
 10  place_guess            42470 non-null  object 
dtypes: float64(2), object(9)
memory usage: 3.9+ MB


In [136]:
data_df.reset_index(inplace=True, drop = True)

In [137]:
data_df.head(15)

Unnamed: 0,time_observed_at,species_guess,genus_name,rank,wikipedia_url,iconic_taxon_name,preferred_common_name,uri,longitude,latitude,place_guess
0,2021-06-23T16:22:12-05:00,,Storeria dekayi,species,http://en.wikipedia.org/wiki/Storeria_dekayi,Reptilia,Dekay's Brownsnake,https://www.inaturalist.org/observations/84313002,-88.367813,40.212863,"Mahomet, IL, US"
1,2021-06-24T10:29:25-05:00,,Potentilla norvegica,species,http://en.wikipedia.org/wiki/Potentilla_norvegica,Plantae,rough cinquefoil,https://www.inaturalist.org/observations/84311385,-88.369087,40.21303,"N Lake of the Woods Rd, Mahomet, IL, US"
2,2021-06-23T14:19:35-05:00,,Tettigoniinae,subfamily,http://en.wikipedia.org/wiki/Tettigoniinae,Insecta,Shieldback Katydids,https://www.inaturalist.org/observations/84311301,-88.368783,40.210567,"Mahomet, IL, US"
3,2021-06-24T09:45:31+00:00,,Marpissa lineata,species,http://en.wikipedia.org/wiki/Marpissa_lineata,Arachnida,Four-lined Slender Jumping Spider,https://www.inaturalist.org/observations/84306238,-88.208083,40.112786,"W Main St, Urbana, IL, US"
4,2021-06-24T06:03:52-05:00,Eastern Box Turtle,Terrapene carolina carolina,subspecies,https://en.wikipedia.org/wiki/Eastern_box_turtle,Reptilia,Eastern Box Turtle,https://www.inaturalist.org/observations/84287773,-88.164075,40.030186,"Illinois, US"
5,2021-06-21T15:57:37-05:00,,Aesculus parviflora,species,http://en.wikipedia.org/wiki/Aesculus_parviflora,Plantae,bottlebrush buckeye,https://www.inaturalist.org/observations/84273805,-88.204346,40.110535,Urbana
6,2021-06-23T20:09:39+00:00,Xylotrechus colonus,Xylotrechus colonus,species,http://en.wikipedia.org/wiki/Xylotrechus_colonus,Insecta,Rustic Borer,https://www.inaturalist.org/observations/84257323,-88.185602,40.13284,"Hamilton Ave, Urbana, IL, US"
7,2021-06-23T10:45:09-05:00,garden petunia,Petunia × atkinsiana,hybrid,http://en.wikipedia.org/wiki/Petunia_×_atkinsiana,Plantae,garden petunia,https://www.inaturalist.org/observations/84252834,-88.255166,40.115403,"W University Ave, Champaign, IL, US"
8,2021-06-23T11:00:59-05:00,Spiked Sedge,Carex spicata,species,http://en.wikipedia.org/wiki/Carex_spicata,Plantae,Spiked Sedge,https://www.inaturalist.org/observations/84252426,-88.267401,40.116266,"S Russell St, Champaign, IL, US"
9,2021-06-23T10:36:28-05:00,Dill,Anethum graveolens,species,http://en.wikipedia.org/wiki/Dill,Plantae,Dill,https://www.inaturalist.org/observations/84252189,-88.246905,40.115404,"S State St, Champaign, IL, US"


# Part 2. Get taxonomy data from the rows

## Search for a term in wikipedia search and get its page

Uncomment for looking at results. I didn't find it useful.

In [2]:
# import wikipedia
# result = wikipedia.search("Monarda_fistulosa")
# print(result)
# # get the page: Neural network
# page = wikipedia.page(result[0])
# print(page)
# # get the title of the page
# title = page.title
# print(title)
# # get the categories of the page
# categories = page.categories
# print(categories)
# # get the whole wikipedia page text (content)
# content = page.content
# print(content)
# # get all the links in the page
# links = page.links
# print(links)
# # get the page references
# references = page.references
# print(references)
# # summary
# summary = page.summary
# print(summary)

## Part 2.1. Scraping data using beautiful soup


In [39]:
# For each row, if wikipedia_uri exists, we will use beautiful soup to extract the data 
# related to taxonomy
# import required modules
import requests
from bs4 import BeautifulSoup
# get URL
page = requests.get("http://en.wikipedia.org/wiki/Monarda_fistulosa")
  
# display status code
# print(page.status_code)
  
# display scrapped data
# print(page.content)

# scrape webpage
soup = BeautifulSoup(page.content, 'html.parser') #.get_text(strip=True) #This removes \xa0

# display scrapped data
# print(soup.prettify())

# list(soup.children)

# find all occurance of p in HTML
# includes HTML tags
# print(soup.find_all('p'))

In [40]:
data_df[data_df['rank'] == 'species' ]['wikipedia_url'][5]

'http://en.wikipedia.org/wiki/Aesculus_parviflora'

### Part 3.1.1. Get table from the id

In [72]:
# create object
# object = soup.find(id="mw-content-text")

# # find tags
# items = object.find_all(class_="infobox biota")
# result = items[0]
  
# # display tags
# print(result.prettify())

table = soup.find_all('table')

# table[0]

# for child in soup.find_all('table')[0].children:
#     for td in child:
#         print(td)

# list(soup.find_all('table')[0].tr.next_siblings)

# for sibling in soup.find_all('table')[0].tr.next_siblings:
#     for td in sibling:
#         print(td)

table = soup.find('table', attrs={'class':'infobox biota'}) #class="infobox biota"
table_rows = table.find_all('tr')

# table_rows

# len(table_rows)
data_taxonomy = {
    'Kingdom': list(),
    'Phylum': list(),
    'Class': list(),
    'Order': list(),
    'Suborder': list(),
    'Family': list(),
    'Genus': list(),
    'Species': list()
}
l = []
data_taxonomy['Kingdom'].insert(0, None)
data_taxonomy['Phylum'].insert(0, None)
data_taxonomy['Class'].insert(0, None)
data_taxonomy['Order'].insert(0, None)
data_taxonomy['Suborder'].insert(0, None)
data_taxonomy['Family'].insert(0, None)
data_taxonomy['Genus'].insert(0, None)
data_taxonomy['Species'].insert(0, None)
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.replace('\n', '').replace(':', '').replace(u'\xa0', ' ') for tr in td]
    

    if 'Kingdom' in row:
        data_taxonomy['Kingdom'][0] = row[1]
        l.append(row)
    if 'Phylum' in row:
        data_taxonomy['Phylum'][0] = row[1]
        l.append(row)   
    if 'Class' in row:
        data_taxonomy['Class'][0] = row[1]
        l.append(row)
    if 'Order' in row:
        data_taxonomy['Order'][0] = row[1]
        l.append(row)
    if 'Suborder' in row:
        data_taxonomy['Suborder'][0] = row[1]
        l.append(row)
    if 'Family' in row:
        data_taxonomy['Family'][0] = row[1]
        l.append(row)
    if 'Genus' in row:
        data_taxonomy['Genus'][0] = row[1]
        l.append(row)
    if 'Species' in row:
        data_taxonomy['Species'][0] = row[1]
        l.append(row)
    
    # elif 'Clade' in row:
    #     l.append(row)
l

# 'Kingdom' in l[4]

# table_rows = table_rows[4:13]

# l = []
# for tr in table_rows:
#     td = tr.find_all('td')
#     row = [tr.text for tr in td]
#     l.append(row)

# l

[['Kingdom', 'Plantae'],
 ['Order', 'Lamiales'],
 ['Family', 'Lamiaceae'],
 ['Genus', 'Monarda'],
 ['Species', 'M. fistulosa']]

In [73]:
data_taxonomy

{'Kingdom': ['Plantae'],
 'Phylum': [None],
 'Class': [None],
 'Order': ['Lamiales'],
 'Suborder': [None],
 'Family': ['Lamiaceae'],
 'Genus': ['Monarda'],
 'Species': ['M. fistulosa']}

In [138]:
len(data_df)

42470

In [146]:
data_taxonomy = {
    'Kingdom': list(),
    'Phylum': list(),
    'Class': list(),
    'Order': list(),
    'Suborder': list(),
    'Family': list(),
    'Genus': list(),
    'Species': list()
}
for i in range(len(data_df)):
    if 'http' in data_df['wikipedia_url'][i]:
        #print(i)
        # get URL
        data_taxonomy['Kingdom'].insert(i, None)
        data_taxonomy['Phylum'].insert(i, None)
        data_taxonomy['Class'].insert(i, None)
        data_taxonomy['Order'].insert(i, None)
        data_taxonomy['Suborder'].insert(i, None)
        data_taxonomy['Family'].insert(i, None)
        data_taxonomy['Genus'].insert(i, None)
        data_taxonomy['Species'].insert(i, None)

        page = requests.get(data_df['wikipedia_url'][i])

        # scrape webpage
        soup = BeautifulSoup(page.content, 'html.parser')

        #table = soup.find_all('table')

        table = soup.find('table', attrs={'class':'infobox biota'}) #class="infobox biota"
        if table is not None:
        
        
            table_rows = table.find_all('tr')
            for tr in table_rows:
                td = tr.find_all('td')
                row = [tr.text.replace('\n', '').replace(':', '').replace(u'\xa0', ' ') for tr in td]
        

                if 'Kingdom' in row:
                    data_taxonomy['Kingdom'][i] = row[1]
                    #l.append(row)
                if 'Phylum' in row:
                    data_taxonomy['Phylum'][i] = row[1]
                    #l.append(row)   
                if 'Class' in row:
                    data_taxonomy['Class'][i] = row[1]
                    #l.append(row)
                if 'Order' in row:
                    data_taxonomy['Order'][i] = row[1]
                    #l.append(row)
                if 'Suborder' in row:
                    data_taxonomy['Suborder'][i] = row[1]
                    #l.append(row)
                if 'Family' in row:
                    data_taxonomy['Family'][i] = row[1]
                    #l.append(row)
                if 'Genus' in row:
                    data_taxonomy['Genus'][i] = row[1]
                    #l.append(row)
                if 'Species' in row:
                    data_taxonomy['Species'][i] = row[1]
                    #l.append(row)
    


ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [149]:
len(data_taxonomy['Species'])

1660

In [150]:
data_taxonomy_df = pd.DataFrame(data_taxonomy) 
data_taxonomy_df.to_csv('data_taxonomy_df.csv')

In [151]:
page = requests.get(data_df['wikipedia_url'][0])

In [152]:
data_taxonomy_df.head()

Unnamed: 0,Kingdom,Phylum,Class,Order,Suborder,Family,Genus,Species
0,Animalia,Chordata,Reptilia,Squamata,Serpentes,Colubridae,Storeria,S. dekayi
1,Plantae,,,Rosales,,Rosaceae,Potentilla,P. norvegica
2,Animalia,Arthropoda,Insecta,Orthoptera,Ensifera,Tettigoniidae,,
3,Animalia,Arthropoda,Arachnida,Araneae,,Salticidae,Marpissa,M. lineata
4,Animalia,Chordata,Reptilia,Testudines,Cryptodira,Emydidae,Terrapene,T. carolina


## 
1. Put dictionary in database
    a. Use something like Unstructured database like MongoDB.
        * Each record is a dictionary.
        
3. 