In [1]:
import collections
import glob
import json
import pickle
import random
import re
import sys
import time
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm as tqdm_notebook

# Top level parts

In [2]:
# Collects the URL of every listing page discovered while paginating.
base_URLs = []
# Site root that hrefs scraped from the listing are appended to.
base = "http://dev.floranorthamerica.org/"
# Page to fetch next; starts at the category page and is overwritten as
# 'next page' links are followed.
URL = "http://dev.floranorthamerica.org/Category:Top_Level_Property"

In [None]:
# Get all base URLS
#
# Follow the listing's 'next page' link from page to page, recording every
# newly discovered page URL in base_URLs, until a page offers no link that
# has not been seen before.

finished = False

while not finished:

    # Random 0-3 s pause before each request.
    time.sleep(random.randint(0, 3))

    response = requests.get(URL, timeout=5)
    # An HTTP error page contains no 'next page' link and would otherwise be
    # mistaken for the end of the listing.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Get next page
    next_links = [
        anchor for anchor in soup.find_all('a')
        if anchor.text == 'next page' and anchor.has_attr('href')
    ]

    # Assume this is the last page unless an unseen link turns up. Without
    # this, a page whose links were all recorded already (for example when
    # the cell is re-run) left URL unchanged and the loop never ended.
    finished = True
    for link in next_links:
        # urljoin avoids the '//' that base + href produces when base ends
        # with '/' and the href starts with '/'.
        next_URL = urljoin(base, link['href'])
        if next_URL not in base_URLs:
            base_URLs.append(next_URL)
            URL = next_URL
            finished = False


In [None]:
# The pagination loop only records pages reached through a 'next page' link,
# so the first listing page is added here. The membership check keeps a
# re-run of this cell from appending the same URL a second time.
first_page_URL = "http://dev.floranorthamerica.org/Category:Top_Level_Property"
if first_page_URL not in base_URLs:
    base_URLs.append(first_page_URL)

In [None]:
# For every listing page, visit each linked '/Property:' page and record, per
# 'Structure' list entry on that page, which property it was found under:
# glossary[<entry link text>] -> [<property name>, ...]
glossary = collections.defaultdict(list)
base = "http://dev.floranorthamerica.org"

for URL in tqdm_notebook(base_URLs[0:], desc='Main'):

    time.sleep(random.randint(0, 3))

    page = requests.get(URL, timeout=20)
    soup = BeautifulSoup(page.content, 'html.parser')
    items = soup.find_all('a')

    for item in tqdm_notebook(items[0:], leave=False, desc='Item'):
        # Only anchors that point at a property page are of interest.
        if not item.has_attr('href') or not item['href'].startswith('/Property:'):
            continue

        candidate = re.sub(r'Property:', '', item.text)
        sub_URL = base + item['href']

        time.sleep(random.randint(1, 3))

        # The sub-page gets its own names: reusing item/page/soup here made
        # the error report below read the wrong tag.
        try:
            sub_page = requests.get(sub_URL, timeout=20)
            sub_page.raise_for_status()
        except requests.RequestException:
            # Best effort: report the property that could not be fetched and
            # move on. Unlike a bare except, this lets Ctrl-C through.
            print(item['href'])
            continue

        sub_soup = BeautifulSoup(sub_page.content, 'html.parser')
        content = sub_soup.find("div", {"id": "mw-content-text"})
        if content is None:
            # Page without the expected content container.
            print(item['href'])
            continue

        for entry in content.find_all('li'):
            link = entry.a
            # Skip list entries without a link; they no longer abort the
            # rest of the page.
            if link is None or not link.has_attr('href'):
                continue
            if entry.text.startswith('Structure'):
                glossary[link.text].append(candidate)

In [None]:
# Write the scraped top-level glossary to disk as a pickle.
with open('../../data/glossaries/FNA_toplevels.pkl', 'wb') as pickle_file:
    pickle.dump(glossary, pickle_file)

# Parts

In [None]:
# Filled by the 'Part of' scrape: keys are the texts of the value links in a
# table row, values are lists of the first link's text from those rows.
parts = collections.defaultdict(list)

In [None]:
# Page through the 'Part of' property listing, 2000 rows per request, and
# record each row as parts[<value link text>] -> [<first link text>, ...].
for i in tqdm_notebook(range(0, 20000, 2000)):

    URL = f"http://dev.floranorthamerica.org/w/index.php?title=Property:Part_of&limit=2000&offset={i}&from=&until=&filter="

    page = requests.get(URL, timeout=30)
    # An HTTP error page has no value rows; without this check it would be
    # indistinguishable from the end of the listing.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    rows = soup.find_all("div", {"class": "smw-table-row value-row"})
    if not rows:
        # The offset is past the last row, so larger offsets are empty too.
        print('No info?')
        break

    for row in rows:
        anchors = row.find_all('a')
        if not anchors:
            continue
        # First link in the row is the subject; the rest are its values.
        info = anchors[0].text
        for anchor in anchors[1:]:
            # '+' is a link in the row that is not a value.
            if anchor.text != '+':
                parts[anchor.text].append(info)

In [None]:
# Write the scraped 'Part of' table to disk as a pickle.
with open('../../data/glossaries/FNA_parts.pkl', 'wb') as pickle_file:
    pickle.dump(parts, pickle_file)

# Properties

In [None]:
# Filled by the 'Subproperty of' scrape: keys are the texts of the value links
# in a table row, values are lists of the first link's text from those rows.
properties = collections.defaultdict(list)

In [None]:
# Page through the 'Subproperty of' property listing, 2000 rows per request,
# and record each row as properties[<value link text>] -> [<first link text>, ...].
for i in tqdm_notebook(range(0, 108000, 2000)):
    URL = f"http://dev.floranorthamerica.org/w/index.php?title=Property:Subproperty_of&limit=2000&offset={i}&from=&until=&filter="

    page = requests.get(URL, timeout=30)
    # An HTTP error page has no value rows; without this check it would be
    # indistinguishable from the end of the listing.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')

    rows = soup.find_all("div", {"class": "smw-table-row value-row"})
    if not rows:
        # The offset is past the last row, so larger offsets are empty too.
        print('No info?')
        break

    for row in rows:
        anchors = row.find_all('a')
        if not anchors:
            continue
        # First link in the row is the subject; the rest are its values.
        info = anchors[0].text
        for anchor in anchors[1:]:
            # '+' is a link in the row that is not a value.
            if anchor.text != '+':
                properties[anchor.text].append(info)

In [None]:
# Write the scraped 'Subproperty of' table to disk as a pickle.
with open('../../data/glossaries/FNA_properties.pkl', 'wb') as pickle_file:
    pickle.dump(properties, pickle_file)

In [None]:
# Number of distinct keys collected by the 'Subproperty of' scrape.
# (`properties_keys` is not defined in any cell, so the count is taken from
# the dict itself.)
len(properties)

# Create clean glossary

In [41]:
# Reload the scraped tables from disk. The context managers close the files
# again; passing open(...) straight to pickle.load left both handles open.
# NOTE: unpickling can execute arbitrary code - only load files that this
# notebook wrote itself.
with open('../../data/glossaries/FNA_parts.pkl', 'rb') as f:
    parts = pickle.load(f)
with open('../../data/glossaries/FNA_properties.pkl', 'rb') as f:
    properties = pickle.load(f)

In [42]:
# Empty mapping for the cleaned glossary. This rebinds the name `glossary`,
# which the top-level scrape earlier in the notebook also uses.
glossary = collections.defaultdict(list)

In [43]:
# Build the cleaned glossary: under each lower-cased key of `parts`, collect
# the lower-cased single-word entries listed for that key in `parts` and in
# `properties`, leaving out anything found in `compound_list`.
# NOTE(review): `compound_list` is not defined in any visible cell; it has to
# exist in the kernel before this cell runs - confirm where it comes from.
for mainpart in list(parts):
    # .get() rather than properties[mainpart]: indexing the defaultdict would
    # insert an empty list into `properties` for every key it does not have.
    for source in (parts[mainpart], properties.get(mainpart, [])):
        for subpart in source:
            # Keep single-word terms only.
            if len(subpart.split()) != 1:
                continue
            term = subpart.lower()
            if term not in compound_list:
                glossary[mainpart.lower()].append(term)

In [44]:
#glossary['Compounds'] = compound_list

In [45]:
# Replace every entry list with its unique terms in sorted order.
for term, entries in glossary.items():
    glossary[term] = sorted(set(entries))

In [46]:
# Show the glossary as 4-space-indented JSON, keys in insertion order.
print(
    json.dumps(
        glossary,
        indent=4,
        sort_keys=False,
    )
)

{
    "flower": [
        "androecium",
        "androgynophore",
        "androphore",
        "anther",
        "anthocyanotic",
        "bristle",
        "bursicle",
        "calyx",
        "capitulum",
        "caudicle",
        "clinandrium",
        "column-foot",
        "corolla",
        "corolla-tube",
        "disc",
        "disc-floret",
        "epicalyx",
        "floral",
        "floral-cup",
        "floral-tube",
        "floret",
        "flower",
        "gynoecium",
        "gynophore",
        "gynostemium",
        "hypanthium",
        "mentum",
        "perianth",
        "perigonium",
        "petal",
        "ray",
        "ray-floret",
        "scale",
        "sepal",
        "sinal",
        "stamen",
        "staminal",
        "stigmatic",
        "tepal",
        "tubule"
    ],
    "phyllary": [
        "appendage",
        "phyllary",
        "tip"
    ],
    "style": [
        "appendage",
        "stylar",
        "style",
        "style-beak"
 

In [47]:
# Write the cleaned glossary to disk as a pickle.
with open('../../data/glossaries/FNA_glossary.pkl', 'wb') as pickle_file:
    pickle.dump(glossary, pickle_file)

In [None]:
# Spot-check one entry. The empty subscript `glossary[]` was a SyntaxError;
# 'flower' is one of the keys shown in the printed glossary.
glossary['flower']

In [None]:
# Load through a context manager so the file handle is closed again.
# NOTE(review): this reads FNA_parts.pkl into the name `properties`, replacing
# the 'Subproperty of' table - confirm whether FNA_properties.pkl (or the
# variable `parts`) was intended.
with open('../../data/glossaries/FNA_parts.pkl', 'rb') as f:
    properties = pickle.load(f)