In [1]:
import time 
import re 
import sys
import requests 
from pathlib import Path

import numpy as np
import pandas as pd 

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

In [2]:
def get_page_source(url):
    """
    Esta es una función que extrae el código fuente
    de una página de Aliexpress
    
    Returns
        código fuente
    """
    
    randint = np.random.randint
    # --------------------------- #
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
#     options.add_argument('--headless') # no aparezca el driver
    # --------------------------- #
    driver = webdriver.Chrome(r'C:\Users\irvg\Downloads\chromedriver_win32 (1)\chromedriver.exe',
                             chrome_options=options)
    driver.get(url)
    time.sleep(randint(3, high=7))
    ActionChains(driver).send_keys(Keys.END).perform()
    time.sleep(randint(3, high=7))
    ActionChains(driver).send_keys(Keys.END).perform()
    time.sleep(randint(3, high=7))
    
    page_source = driver.page_source
    driver.quit()
    
    return page_source

In [3]:
def get_links(page_source):
    """
    Recibe el código fuente de una página 
    extrae las urls y las regresa.
    
    Returns
        list
    """
    mamiferos = BeautifulSoup(page_source, 'lxml')
    link_to_animal = mamiferos.select('a[class="display-name comname"]')
    links = [link['href'] for link in link_to_animal]
    protocol = 'https://www.naturalista.mx'
    links_with_protocol = [protocol + link for link in links]
    return links_with_protocol


In [4]:
def random_keypress_generator():
    keys = [Keys.ARROW_DOWN, Keys.ARROW_LEFT, Keys.ARROW_UP, Keys.ARROW_RIGHT, Keys.DOWN, Keys.END, Keys.PAGE_UP]
    action = np.random.choice(keys, p=[0.3, 0.05, 0.05, 0.1, 0.2, 0.2, 0.1])
    return action

In [5]:
def web_content(link):
    """
    Recibe una url de producto
    y extrae el codigo fuente
        
    Returns
        código fuente
    """
    random = np.random.choice
    # --------------------------- #
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
#     options.add_argument('--headless') # no aparezca el driver
    # --------------------------- #
    driver = webdriver.Chrome(r'C:\Users\irvg\Downloads\chromedriver_win32 (1)\chromedriver.exe',
                             chrome_options=options)
    driver.get(link)
    time.sleep(random(5))
    for ran in range(random(3)):       # keys.END
        ActionChains(driver).send_keys(random_keypress_generator()).perform()
        time.sleep(random(4))
    page_source = driver.page_source
    driver.quit()
    return page_source

In [10]:
def naturalista_animal(page_source):
    """
    Recibe una url y extrae cinco atributos:
        -nombre del animal
        -nombre del observador
        -numero de observaciones
        -fecha
        -lugar
        
    Returns
        tuple
    """
    animal = BeautifulSoup(page_source, 'lxml')
    animal_observer = animal.select('div[class="title"]')
    text = [x.text for x in animal_observer]
    observador = text[0]
    animal_observations = animal.select('div[class="subtitle"]')
    text = [x.text for x in animal_observations]
    observations = re.findall(r'[0-9].+?', text[0])
    animal_date = animal.select('div[class="col-xs-6"]')
    text = [x.text for x in animal_date]
    date = animal.find('span', class_='date').get_text()
    animal_place = animal.select('div[class="place-guess"]')
    text = [x.text for x in animal_place]
    place = text[0]
    try:
        animal_name = animal.find('span', class_='SplitTaxon taxon species Mammalia has-com-name parens').get_text()
    except AttributeError:
         animal_name = animal.find('span', class_='SplitTaxon taxon subfamily Mammalia has-com-name parens').get_text()
        

    return animal_name, observador, observations, date, place

In [11]:
def naturalist_animal_project():
    url = "https://www.naturalista.mx/observations?taxon_id=40151"
    page_source = get_page_source(url)
    links = get_links(page_source)
    print(len(links))

    for num, link in enumerate(links, start=1):

        print(f"This link is #{num}")
        filename = f"animal_{num}_naturalista.csv"
        filename = "data/"+ filename
        print("Document will be place and call: " + filename)

        if not Path(filename).is_file():
            page_source = web_content(link)
            print(link)
            print("Finish extracting animal's page source.")
            print("Parsing content.")
            variables = naturalista_animal(page_source)
            print("Finish Parsing. Appending.")
            variables = list(variables) + [link]
            print(variables)
            df = pd.DataFrame([variables], columns=["Animal_Name", "Observer",
                                     "Observations", "Date", 
                                     "Place", 
                                     "Link"],
                          dtype=object)

            df.to_csv(filename, index = False)
            print(f"{filename} saved.")

    print("Finished!")

In [13]:
naturalist_animal_project()

  driver = webdriver.Chrome(r'C:\Users\irvg\Downloads\chromedriver_win32 (1)\chromedriver.exe',
  driver = webdriver.Chrome(r'C:\Users\irvg\Downloads\chromedriver_win32 (1)\chromedriver.exe',


266
This link is #1
Document will be place and call: data/animal_1_naturalista.csv


  driver = webdriver.Chrome(r'C:\Users\irvg\Downloads\chromedriver_win32 (1)\chromedriver.exe',
  driver = webdriver.Chrome(r'C:\Users\irvg\Downloads\chromedriver_win32 (1)\chromedriver.exe',


https://www.naturalista.mx/observations/131507777
Finish extracting animal's page source.
Parsing content.
Finish Parsing. Appending.
['Mapache Procyon lotor', 'tepelmeme16', ['83'], 'nov. 18, 2019 · 17:16 CST', 'Tepelmeme Villa de Morelos, Oax....Muestra', 'https://www.naturalista.mx/observations/131507777']
data/animal_1_naturalista.csv saved.
This link is #2
Document will be place and call: data/animal_2_naturalista.csv


  driver = webdriver.Chrome(r'C:\Users\irvg\Downloads\chromedriver_win32 (1)\chromedriver.exe',
  driver = webdriver.Chrome(r'C:\Users\irvg\Downloads\chromedriver_win32 (1)\chromedriver.exe',


https://www.naturalista.mx/observations/131507414
Finish extracting animal's page source.
Parsing content.


IndexError: list index out of range