Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Commit

Permalink
Update URL JSON loading for compatibility with toolbox URL scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
sco1 committed Mar 20, 2018
1 parent 3fc2ccc commit 68d8964
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 6 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
# Ignore Jupyter notebook things
*.ipynb
*.ipynb

# Caches
__pycache__
17 changes: 12 additions & 5 deletions MATLABfcnscrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,18 @@ def loadURLdict(sourceJSON):
"""
Load URL dictionary from input JSON file
Expected dict format is key: toolbox name, value: alphabetical function list URL
Expected input dict format is a nested dict:
Top level dict is MATLAB's "Family" group
Next level is a dict of toolbox:URL KV-pairs for each group
Output is a single layer dict containing the toolbox:url KV-pairs
"""
sourceJSON = Path(sourceJSON)
with sourceJSON.open(mode='r') as fID:
return json.load(fID)
tmp = json.load(fID)

squeezegen = (tmp[grouping] for grouping in tmp.keys())
return {k: v for d in squeezegen for k, v in d.items()}

def scrapedocpage(URL):
"""
Expand All @@ -35,7 +42,7 @@ def scrapedocpage(URL):
Returns a list of function name strings
"""
r = requests.get(URL, timeout=1)
r = requests.get(URL, timeout=2)
soup = BeautifulSoup(r.content, 'html.parser')

tags = soup.find_all(attrs={'class': 'function'})
Expand Down Expand Up @@ -83,7 +90,7 @@ def scrapetoolboxes(URL="https://www.mathworks.com/help/index.html", JSONpath =
Dictionary is dumped to JSON/fname.JSON
"""
r = requests.get(URL, timeout=1)
r = requests.get(URL, timeout=2)
soup = BeautifulSoup(r.content, 'html.parser')

# Get first header that matches 'MATLAB', this should be our 'MATLAB Family' column
Expand Down Expand Up @@ -131,7 +138,7 @@ def helpURLbuilder(shortlink, prefix="https://www.mathworks.com/help/", suffix="
fcnlist = scrapedocpage(URL)
writeToolboxJSON(fcnlist, toolbox, outpath)
except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
# TODO: Add a retry pipeline
# TODO: Add a retry pipeline and more verbose exception reporting
logging.info(f"Unable to access online docs for '{toolbox}': '{URL}'")
else:
concatenatefcns(outpath)

0 comments on commit 68d8964

Please sign in to comment.