/
resource_downloader.py
49 lines (38 loc) · 1.51 KB
/
resource_downloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import logging
from os.path import join
from shutil import copy2
from slugify import slugify
from ..base_scraper import BaseScraper
logger = logging.getLogger(__name__)
class ResourceDownloader(BaseScraper):
    """Each resource downloader is configured from dataset information that can come
    from a YAML file for example. When run it downloads the resource described in the
    dataset information from HDX and puts it in the given folder.

    Args:
        datasetinfo (Dict): Information about dataset. The keys "hxltag" (used to
            build the scraper name and to register the source) and "filename" (the
            name given to the copy placed in folder) are read by this class.
        folder (str): Folder to which to download. Defaults to "".
    """

    def __init__(self, datasetinfo: dict, folder: str = "") -> None:
        # The scraper name is derived from the lower-cased, slugified HXL tag
        hxltag_slug = slugify(datasetinfo["hxltag"].lower(), separator="_")
        name = f"resource_downloader_{hxltag_slug}"
        # ResourceDownloader only outputs to sources, hence the empty third argument
        super().__init__(name, datasetinfo, {})
        self.folder = folder

    def run(self) -> None:
        """Runs one resource downloader given dataset information

        Obtains the "hdx" reader, reads the resource metadata for the dataset,
        downloads the resource (using the scraper name as file prefix) and copies
        the downloaded file into the configured folder under the name given by
        datasetinfo["filename"].

        Returns:
            None

        Raises:
            KeyError: If datasetinfo has no "filename" entry.
        """
        reader = self.get_reader("hdx")
        resource = reader.read_hdx_metadata(self.datasetinfo)
        url, path = reader.download_resource(resource, file_prefix=self.name)
        # NOTE(review): this is logged after download_resource has returned, so the
        # download has presumably already completed at this point - confirm.
        logger.info("Downloading %s to %s", url, path)
        # With the default folder of "", join() yields just the filename, i.e. a
        # path relative to the current working directory.
        destination = join(self.folder, self.datasetinfo["filename"])
        copy2(path, destination)

    def add_sources(self) -> None:
        """Add source for resource download

        Registers a source for the dataset's HXL tag under the key
        "ResourceDownloader".

        Returns:
            None
        """
        self.add_hxltag_source(
            self.datasetinfo["hxltag"],
            key="ResourceDownloader",
        )