# Generate Documentation

Generate documentation pages from website. 

In [23]:
import pandas as pd 
import requests
from requests.exceptions import HTTPError
from bs4 import BeautifulSoup
from pathlib import Path
from io import StringIO
import tomli
from pandoc import pandoc
from tempfile import TemporaryFile

In [2]:
BASE_URL = "https://xgboost.readthedocs.io/en/stable/index.html"

In [3]:
# load in toml 
my_toml = tomli.loads((Path.cwd() / "docs.toml").read_text())

my_toml.keys()

dict_keys(['doc-pages'])

In [4]:
# load in page urls 
urls = my_toml.get("doc-pages")
urls

{'parameters': 'https://xgboost.readthedocs.io/en/stable/parameter.html',
 'gpu_support': 'https://xgboost.readthedocs.io/en/stable/gpu/index.html',
 'faw': 'https://xgboost.readthedocs.io/en/stable/faq.html',
 'tutorials': 'https://xgboost.readthedocs.io/en/stable/tutorials/index.html'}

In [5]:
# get pages 
def get_page(url: str, doc_page: str = None): 
    with requests.request("GET",url) as req: 
        if req.status_code != 200: 
            raise HTTPError(req.status_code)
        
        # SOUP
        soup = BeautifulSoup(req.text)

        # otherwise, just get page
        article_content = soup.select("div.document > div > section")[0]

        if doc_page is not None: 
            # save to file path
            _path =Path.cwd().parents[2] / f"docs/{doc_page}.html"
            
            _path.write_text(article_content)

        return article_content
            

In [6]:
page_text = get_page(url=next(iter(urls.values())))

In [24]:
with TemporaryFile("w", encoding='utf-8') as temp: 
    temp.write(page_text.text)

    my_text = temp.read()

    print(1)

UnsupportedOperation: not readable

In [14]:
(Path.cwd() / "Data/temp.html")\
    .write_text(
        page_text.text,
        encoding = 'utf-8'
    )

29022

In [15]:
pdoc_read = pandoc.read(page_text.text, format='html')

In [22]:
pandoc._configuration

{'auto': True,
 'path': '/usr/local/bin/pandoc',
 'version': '3.1.12.3',
 'pandoc_types_version': '1.22.2.1'}

In [21]:
pandoc.write(
    pdoc_read, format='markdown'
)

ProcessExecutionError: Unexpected exit code: 64
Command line: | /usr/local/bin/pandoc -t markdown -o /var/folders/vz/x1_phppx4jlfxt612qb8vcs80000gn/T/tmp20wsl85d/output -f json /var/folders/vz/x1_phppx4jlfxt612qb8vcs80000gn/T/tmp20wsl85d/input.js
Stderr:       | JSON parse error: Error in $: Incompatible API versions: encoded with [1,22,2,1] but attempted to decode with [1,23,1].

In [9]:
# try just getting base url page 
with requests.request("GET",BASE_URL) as req: 
    if req.status_code != 200: 
        raise HTTPError(req.status_code)
    
    # otherwise save 
    html_path = Path.cwd() / "data/base_url.html"

    # save base html to buffer 
    buff = StringIO(req.text)

    # generate soup
    soup = BeautifulSoup(buff)

    html_path.write_text(req.text)

In [41]:
_selector = "body > div.wy-grid-for-nav > nav > div > div.wy-menu.wy-menu-vertical > ul > li > a"

menu_items = soup.select(_selector)

In [43]:
http_title = [x.get("href") for x in menu_items if x.get("href")[:4] != 'http']
full_link = [f"https://xgboost.readthedocs.io/en/stable/{x}#" for x in http_title]

full_link

['https://xgboost.readthedocs.io/en/stable/install.html#',
 'https://xgboost.readthedocs.io/en/stable/build.html#',
 'https://xgboost.readthedocs.io/en/stable/get_started.html#',
 'https://xgboost.readthedocs.io/en/stable/tutorials/index.html#',
 'https://xgboost.readthedocs.io/en/stable/faq.html#',
 'https://xgboost.readthedocs.io/en/stable/gpu/index.html#',
 'https://xgboost.readthedocs.io/en/stable/parameter.html#',
 'https://xgboost.readthedocs.io/en/stable/prediction.html#',
 'https://xgboost.readthedocs.io/en/stable/treemethod.html#',
 'https://xgboost.readthedocs.io/en/stable/python/index.html#',
 'https://xgboost.readthedocs.io/en/stable/R-package/index.html#',
 'https://xgboost.readthedocs.io/en/stable/jvm/index.html#',
 'https://xgboost.readthedocs.io/en/stable/julia.html#',
 'https://xgboost.readthedocs.io/en/stable/c.html#',
 'https://xgboost.readthedocs.io/en/stable/c%2B%2B.html#',
 'https://xgboost.readthedocs.io/en/stable/cli.html#',
 'https://xgboost.readthedocs.io/en/s