In [1]:
import collections
import glob
import os
import pickle
import re
from itertools import chain

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [6]:
# Load the POWO descriptions; `.keys()` is used below, so this is a mapping
# (presumably species name -> descriptions -- TODO confirm the value type).
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('../../data/processed/descriptions_web_plants_powo.pkl', 'rb') as pkl_file:
    powo = pickle.load(pkl_file)

# Load the IPNI name table. The CSV has no header row, so the column names
# are assigned here; only 'link', 'species' and 'subname' are given real names.
ipni_data = pd.read_csv('../../data/external/ipni.csv', header=None)
ipni_data.columns = ['link', 'species', 'subname', 
                     'x4',    'x5',     'x6',
                     'x7',    'x8',     'x9', 
                     'x10',   'x11',    'x12',]

# List species
powo_list = list(powo.keys())
# Keep only the IPNI rows whose species is a key of the POWO mapping
inpi_df = ipni_data[ipni_data['species'].isin(powo_list)]

In [8]:
# Preview the IPNI table; pandas elides the middle rows of a large frame when displaying it.
ipni_data

Unnamed: 0,link,species,subname,x4,x5,x6,x7,x8,x9,x10,x11,x12
0,urn:lsid:ipni.org:names:327752-1,Desmia conferta,D.Don,spec.,Ericaceae,,urn:lsid:ipni.org:names:14701-1,,1834.0,Edinburgh New Philos. J. 17: 153. 1834 [Jul 1...,unknown,https://www.ipni.org/n/urn:lsid:ipni.org:names...
1,urn:lsid:ipni.org:names:327752-2,trib. Paniceae,R.Br. in Flinders,trib.,Poaceae,urn:lsid:ipni.org:names:321663-2,urn:lsid:ipni.org:names:30000032-2,urn:lsid:ipni.org:publications:9050-2,1814.0,Voy. Terra Austral. 2: 582. 1814 [19 Jul 1814],tax. nov.,https://www.ipni.org/n/urn:lsid:ipni.org:names...
2,urn:lsid:ipni.org:names:327753-1,Desmia polifolia,D.Don,spec.,Ericaceae,,urn:lsid:ipni.org:names:14701-1,,1834.0,Edinburgh New Philos. J. 17: 153. 1834 [Jul 1...,unknown,https://www.ipni.org/n/urn:lsid:ipni.org:names...
3,urn:lsid:ipni.org:names:327753-2,Chamaesyce,Gray,gen.,Euphorbiaceae,,urn:lsid:ipni.org:names:30000064-2,urn:lsid:ipni.org:publications:4868-2,1821.0,Nat. Arr. Brit. Pl. 2: 260. 1821 [1 Nov 1821],tax. nov.,https://www.ipni.org/n/urn:lsid:ipni.org:names...
4,urn:lsid:ipni.org:names:327754-1,Desmogyne angustifolia,Knagg,spec.,Ericaceae,,urn:lsid:ipni.org:names:50325670-1,urn:lsid:ipni.org:publications:758-2,1923.0,Notes Roy. Bot. Gard. Edinburgh 14: 73. 1923,unknown,https://www.ipni.org/n/urn:lsid:ipni.org:names...
...,...,...,...,...,...,...,...,...,...,...,...,...
1352345,urn:lsid:ipni.org:names:32775-1,Cyclaminos,Heldr.,gen.,Primulaceae,,urn:lsid:ipni.org:names:50032665-1,urn:lsid:ipni.org:publications:573-2,,Bull. Herb. Boissier vi. (1898) 386.,unknown,https://www.ipni.org/n/urn:lsid:ipni.org:names...
1352346,urn:lsid:ipni.org:names:32775-2,Bigelowia nuttallii,L.C.Anderson,spec.,Asteraceae,,urn:lsid:ipni.org:names:331402-2,urn:lsid:ipni.org:publications:1068-2,1970.0,Sida 3(7): 460. 1970,tax. nov.,https://www.ipni.org/n/urn:lsid:ipni.org:names...
1352347,urn:lsid:ipni.org:names:327750-2,subtrib. Zoysiinae,Benth.,subtrib.,Poaceae,,urn:lsid:ipni.org:names:30000032-2,urn:lsid:ipni.org:publications:921-2,1878.0,"Fl. Austral. 7: 453, 505. 1878 [Mar 1878]",tax. nov.,https://www.ipni.org/n/urn:lsid:ipni.org:names...
1352348,urn:lsid:ipni.org:names:327751-1,Desmia aequalis,D.Don,spec.,Ericaceae,,urn:lsid:ipni.org:names:14701-1,,1834.0,Edinburgh New Philos. J. 17: 153. 1834 [Jul 1...,unknown,https://www.ipni.org/n/urn:lsid:ipni.org:names...


In [None]:
# Download the POWO taxon page of every record in `inpi_df` and store the HTML on disk.
# For a quick trial run, slice `inpi_df` first (e.g. inpi_df.iloc[0:20, :]).

# Same data root ('../../data/...') as the other cells of this notebook.
powo_html_dir = '../../data/raw/POWO/'
# Create folder
os.makedirs(powo_html_dir, exist_ok=True)

# (ipni_link, species, error) for every record that could not be saved
failed_downloads = []

# loop over URLs
for ipni_link, species, subname in zip(tqdm(inpi_df.link), inpi_df.species, inpi_df.subname):

    URL = 'http://powo.science.kew.org/taxon/' + ipni_link

    try:
        page = requests.get(URL, timeout=5)
        # Do not store HTTP error pages (4xx/5xx) as if they were taxon pages.
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        # Dump the HTML file. str.format (rather than `+`) so that a missing
        # (NaN) subname does not raise TypeError and drop the record.
        html_path = os.path.join(powo_html_dir, '{} - {}.html'.format(species, subname))
        # Explicit encoding so the dump does not depend on the platform locale.
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(str(soup))
    except (requests.RequestException, OSError) as err:
        # Best effort: skip this record, but remember why instead of hiding it.
        # Only network and file-system errors are caught, so Ctrl-C still
        # interrupts the loop.
        failed_downloads.append((ipni_link, species, repr(err)))
        continue

if failed_downloads:
    print('{} of {} pages could not be saved; see `failed_downloads`.'.format(
        len(failed_downloads), len(inpi_df)))

In [None]:
# Build a labelled text dataset from the saved POWO pages:
# data_powo[species] is a list of (label, text) tuples, one per <dd> element,
# where label is 1 if the heading of the element is listed in `descriptions`, else 0.

# Read files
html_list = glob.glob('../../data/raw/POWO/*')
# Open dict
data_powo = collections.defaultdict(list)
# Init Description list
descriptions = ['Morphology', 'General Description', 'Diagnostic', 'sex', 'Sterile', 'Fertile']

for html in tqdm(html_list):
    # Extract title: the file name up to the first ' - '.
    # os.path.basename is used instead of str.lstrip because lstrip removes a
    # *set of characters*, not a prefix, and therefore also ate the leading
    # letters of species names (e.g. 'Poa annua' became 'oa annua').
    name = os.path.basename(html).split(' - ')[0]

    # Open HTML file (explicit encoding so parsing does not depend on the locale)
    with open(html, encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    # Loop over text
    for text in soup.find_all('dd'):
        try:
            heading = text.find_previous_sibling().span.text
        except AttributeError:
            # No previous sibling, or the sibling has no <span>: either lookup
            # returns None, so there is no heading to label by -- skip the element.
            continue
        label = 1 if heading in descriptions else 0
        data_powo[name].append((label, text.text.strip()))

with open('../../data/processed/train_dataPOWO.pkl', 'wb') as f:
    pickle.dump(data_powo, f)