Skip to content

Commit

Permalink
New function to persist progress between runs eg. of scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
mcarans committed Nov 15, 2019
1 parent f034faa commit 3e66dce
Show file tree
Hide file tree
Showing 5 changed files with 229 additions and 17 deletions.
20 changes: 18 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -349,8 +349,24 @@ Examples:
# Gets temporary directory from environment variable
# TEMP_DIR and falls back to os function,
# optionally appends the given folder, creates the
# folder and on exiting, deletes the folder
with temp_dir('papa') as tempdir:
# folder and deletes the folder if exiting
# successfully, else keeps the folder if there was
# an exception
with temp_dir('papa', delete_on_success=True, delete_on_failure=False) as tempdir:
...
# Sometimes it is necessary to be able to resume runs if they fail. The following
# example creates a temporary folder and iterates through a list of items.
# On each iteration, the current state of progress is stored in the temporary
# folder. If the iteration were to fail, the temporary folder is not deleted and
# on the next run, it will resume where it failed. Once the whole list is iterated
# through, the temporary folder is deleted.
# The environment variable WHERETOSTART can be set to the starting value. If it is
# set to RESET, then the temporary folder is deleted before the run starts to ensure
# it starts from the beginning.
iterator = [{'iso3': 'AFG', 'name': 'Afghanistan'}, {'iso3': 'SDN', 'name': 'Sudan'},
{'iso3': 'YEM', 'name': 'Yemen'}, {'iso3': 'ZAM', 'name': 'Zambia'}]
result = list()
for tempdir, nextdict in progress_storing_tempdir(tempfolder, iterator, 'iso3'):
...

# Get current directory of script
Expand Down
68 changes: 61 additions & 7 deletions src/hdx/utilities/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@
"""Directory Path Utilities"""
import contextlib
import inspect
import logging
import sys
from os import getenv, makedirs
from os.path import abspath, realpath, dirname, join, exists
from shutil import rmtree
from tempfile import gettempdir
from typing import Any, Optional
from typing import Any, Optional, Iterable, Tuple, Dict

from hdx.utilities.loader import load_file_to_str
from hdx.utilities.saver import save_str_to_file

logger = logging.getLogger(__name__)


def script_dir(pyobject, follow_symlinks=True):
Expand Down Expand Up @@ -48,7 +54,7 @@ def script_dir_plus_file(filename, pyobject, follow_symlinks=True):
def get_temp_dir():
# type: () -> str
"""Get a temporary directory. Looks for environment variable TEMP_DIR and falls
back on os.ggettempdir.
back on os.gettempdir.
Returns:
str: A temporary directory
Expand All @@ -57,13 +63,14 @@ def get_temp_dir():


@contextlib.contextmanager
def temp_dir(folder=None, delete=True):
# type: (Optional[str], bool) -> str
def temp_dir(folder=None, delete_on_success=True, delete_on_failure=True):
# type: (Optional[str], bool, bool) -> str
"""Get a temporary directory optionally with folder appended (and created if it doesn't exist)
Args:
folder (Optional[str]): Folder to create in temporary folder. Defaults to None.
delete (bool): Whether to delete folder (assuming folder arg supplied) on exiting with statement
delete_on_success (bool): Whether to delete folder (if folder supplied) on exiting with statement successfully. Defaults to True.
delete_on_failure (bool): Whether to delete folder (if folder supplied) on exiting with statement unsuccessfully. Defaults to True.
Returns:
str: A temporary directory
Expand All @@ -75,6 +82,53 @@ def temp_dir(folder=None, delete=True):
makedirs(tempdir)
try:
yield tempdir
finally:
if folder and delete:
if folder and delete_on_success:
rmtree(tempdir)
except:
if folder and delete_on_failure:
rmtree(tempdir)
raise


def progress_storing_tempdir(folder, iterator, key):
    # type: (str, Iterable[Dict], str) -> Iterable[Tuple[str,Dict]]
    """Iterate over iterator inside a temporary folder, persisting progress so that a failed run can
    be resumed. This is a generator: before each item is yielded, the value of its key is written to
    progress.txt in the temporary folder. The folder is created through temp_dir with
    delete_on_success=True and delete_on_failure=False.

    The starting point is taken from the environment variable WHERETOSTART (upper cased) if it is set,
    otherwise from progress.txt if that file exists. Items are skipped until one whose key value
    equals the starting point is reached. If WHERETOSTART is RESET, the folder is emptied and the
    iteration starts from the first item.

    Args:
        folder (str): Folder to create in temporary folder
        iterator (Iterable[Dict]): Iterate over this object persisting progress
        key (str): Key to examine from dictionary from iterator

    Returns:
        Iterable[Tuple[str,Dict]]: Yields tuples of the form (temporary directory, next object in iterator)

    """
    with temp_dir(folder, delete_on_success=True, delete_on_failure=False) as tempdir:
        progress_file = join(tempdir, 'progress.txt')
        wheretostart = getenv('WHERETOSTART')
        if wheretostart:
            wheretostart = wheretostart.upper()
            if wheretostart == 'RESET':
                # Recreate the folder empty so that no stale progress file survives
                rmtree(tempdir)
                makedirs(tempdir)
                wheretostart = None
                logger.info('Removing progress file and will start from beginning!')
            else:
                logger.info('Environment variable WHERETOSTART = %s', wheretostart)
        elif exists(progress_file):
            # No explicit starting point given: resume from where the previous run stopped
            wheretostart = load_file_to_str(progress_file, strip=True)
            logger.info('File WHERETOSTART = %s', wheretostart)
        found = False
        for nextdict in iterator:
            currentlocation = nextdict[key]
            if wheretostart and not found:
                if currentlocation != wheretostart:
                    logger.info('Run not started. Ignoring %s. WHERETOSTART (%s) not matched.',
                                currentlocation, wheretostart)
                    continue
                found = True
                logger.info('Starting run from WHERETOSTART %s', wheretostart)
            # Record the item before handing it to the caller so a failure while it is being
            # processed resumes from this same item
            save_str_to_file(currentlocation, progress_file)
            yield tempdir, nextdict
        if wheretostart and not found:
            # Every item was skipped; make that visible instead of finishing silently
            logger.warning('WHERETOSTART (%s) did not match any item. Nothing was processed!', wheretostart)
2 changes: 1 addition & 1 deletion src/hdx/utilities/version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.8.7
1.8.8
2 changes: 1 addition & 1 deletion tests/hdx/utilities/test_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def test_load_json_into_existing_dict(self, configfolder):
assert result == TestLoader.expected_json

def test_load_file_to_str(self):
with temp_dir(folder='test_text', delete=True) as tmpdir:
with temp_dir(folder='test_text') as tmpdir:
text_file = join(tmpdir, 'text_file.txt')
save_str_to_file(TestLoader.text, text_file)
result = load_file_to_str(text_file)
Expand Down
154 changes: 148 additions & 6 deletions tests/hdx/utilities/test_path.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,160 @@
# -*- coding: UTF-8 -*-
"""Path Utility Tests"""
from os.path import join
from os.path import join, exists
from shutil import rmtree
from tempfile import gettempdir

from hdx.utilities.path import get_temp_dir, temp_dir
import pytest

from hdx.utilities.path import get_temp_dir, temp_dir, progress_storing_tempdir


class TestPath:
def test_tempdir(self, monkeypatch):
@pytest.fixture(scope='class')
def mytestdir(self):
return join('haha', 'lala')

def test_get_temp_dir(self, monkeypatch, mytestdir):
assert get_temp_dir() == gettempdir()
mytestdir = join('haha', 'lala')
monkeypatch.setenv('TEMP_DIR', mytestdir)
assert get_temp_dir() == mytestdir
monkeypatch.delenv('TEMP_DIR')

def test_temp_dir(self, monkeypatch, mytestdir):
monkeypatch.setenv('TEMP_DIR', mytestdir)
with temp_dir() as tempdir:
assert tempdir == mytestdir
with temp_dir('papa') as tempdir:
assert tempdir == join(mytestdir, 'papa')
monkeypatch.delenv('TEMP_DIR')

tempfolder = 'papa'
expected_dir = join(gettempdir(), tempfolder)

with temp_dir(tempfolder) as tempdir:
assert tempdir == expected_dir
assert exists(tempdir) is False
try:
with temp_dir(tempfolder) as tempdir:
assert tempdir == expected_dir
raise ValueError('Fail!')
except:
pass
assert exists(tempdir) is False

with temp_dir(tempfolder, delete_on_success=True, delete_on_failure=True) as tempdir:
assert tempdir == expected_dir
assert exists(tempdir) is False
try:
with temp_dir(tempfolder, delete_on_success=True, delete_on_failure=True) as tempdir:
assert tempdir == expected_dir
raise ValueError('Fail!')
except:
pass
assert exists(tempdir) is False

with temp_dir(tempfolder, delete_on_success=False, delete_on_failure=False) as tempdir:
assert tempdir == expected_dir
assert exists(tempdir) is True
rmtree(tempdir)
try:
with temp_dir(tempfolder, delete_on_success=False, delete_on_failure=False) as tempdir:
assert tempdir == expected_dir
raise ValueError('Fail!')
except:
pass
assert exists(tempdir) is True

with temp_dir(tempfolder, delete_on_success=True, delete_on_failure=False) as tempdir:
assert tempdir == expected_dir
assert exists(tempdir) is False
try:
with temp_dir(tempfolder, delete_on_success=True, delete_on_failure=False) as tempdir:
assert tempdir == expected_dir
raise ValueError('Fail!')
except:
pass
assert exists(tempdir) is True
rmtree(tempdir)

with temp_dir(tempfolder, delete_on_success=False, delete_on_failure=True) as tempdir:
assert tempdir == expected_dir
assert exists(tempdir) is True
rmtree(tempdir)
try:
with temp_dir(tempfolder, delete_on_success=False, delete_on_failure=True) as tempdir:
assert tempdir == expected_dir
raise ValueError('Fail!')
except:
pass
assert exists(tempdir) is False

def test_progress_storing_tempdir(self, monkeypatch):
    """Check that progress_storing_tempdir runs to completion, honours WHERETOSTART (including
    RESET) and resumes from the stored progress after a failed run."""
    tempfolder = 'papa'
    expected_dir = join(gettempdir(), tempfolder)
    iterator = [{'iso3': 'AFG', 'name': 'Afghanistan'}, {'iso3': 'SDN', 'name': 'Sudan'},
                {'iso3': 'YEM', 'name': 'Yemen'}, {'iso3': 'ZAM', 'name': 'Zambia'}]

    # Start from a known state: no starting point in the environment and no folder
    # left behind by an earlier failed run
    monkeypatch.delenv('WHERETOSTART', raising=False)
    rmtree(expected_dir, ignore_errors=True)

    def complete_run():
        # Iterate to the end, returning the items seen; the folder must exist during
        # the run and be gone once it has finished
        result = list()
        for tempdir, nextdict in progress_storing_tempdir(tempfolder, iterator, 'iso3'):
            assert exists(tempdir) is True
            assert tempdir == expected_dir
            result.append(nextdict)
        assert exists(expected_dir) is False
        return result

    def failing_run():
        # Fail while processing YEM; the folder must be kept so the run can be resumed
        with pytest.raises(ValueError):
            for tempdir, nextdict in progress_storing_tempdir(tempfolder, iterator, 'iso3'):
                if nextdict['iso3'] == 'YEM':
                    raise ValueError('Problem!')
        assert exists(expected_dir) is True

    # Plain run processes everything
    assert complete_run() == iterator

    # Starting point supplied by the environment
    monkeypatch.setenv('WHERETOSTART', 'SDN')
    assert complete_run() == iterator[1:]
    monkeypatch.delenv('WHERETOSTART')

    # After a failure the next run resumes from the item that failed
    failing_run()
    assert complete_run() == iterator[2:]

    # RESET discards the stored progress
    failing_run()
    monkeypatch.setenv('WHERETOSTART', 'RESET')
    assert complete_run() == iterator
    monkeypatch.delenv('WHERETOSTART')

    # An explicit starting point takes precedence over the stored progress
    failing_run()
    monkeypatch.setenv('WHERETOSTART', 'SDN')
    assert complete_run() == iterator[1:]
    monkeypatch.delenv('WHERETOSTART')

0 comments on commit 3e66dce

Please sign in to comment.