Skip to content

Commit

Permalink
get_tabular_rows enhancement
Browse files Browse the repository at this point in the history
  • Loading branch information
mcarans committed Feb 7, 2020
1 parent b0f8ba4 commit 8dce7bf
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 40 deletions.
15 changes: 7 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,16 +77,15 @@ if that library is included), then it can be configured once and used automatica
The get_tabular_rows method enables iteration through tabular data. It returns the header of the tabular file pointed to by
the url and an iterator where each row is returned as a list or dictionary depending on the dict_rows argument.
Optionally, headers and values can be inserted at specific positions. This is achieved using the insertions argument.
If supplied, it must be a dictionary containing the keys "headers" and "functions". "headers" contains a list of tuples
of the form (position, header) to be inserted and "functions" is a list of functions each of which takes a parameter
extended_rows which contains the row's number, file headers list and the row values list. Example:
If supplied, it must be a dictionary containing the keys "headers" and "function". "headers" contains a list of tuples
of the form (position, header) to be inserted and "function" is a function which takes in the arguments headers (prior
to any insertions) and row (which will be in dict or list form depending upon the dict_rows argument) and returns the
row with the new values added. Example:

def testfn(extended_rows):
for row_number, headers, row in extended_rows:
row.insert(2, 'lala')
yield row_number, headers, row
def testfn(headers, row):
row['la'] = 'lala'
return row

insertions = {'headers': [(2, 'la')], 'functions': [testfn]}
insertions = {'headers': [(2, 'la')], 'function': testfn}
headers, generator = downloader.get_tabular_rows(url, headers=3, dict_rows=True, insertions=insertions)

Other useful functions:
Expand Down
22 changes: 15 additions & 7 deletions src/hdx/utilities/downloader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
"""Downloading utilities for urls"""
import copy
import hashlib
import logging
from collections import OrderedDict
Expand Down Expand Up @@ -376,9 +377,9 @@ def get_tabular_rows(self, url, headers=1, dict_rows=False, insertions=None, **k
"""Returns header of tabular file pointed to by url and an iterator where each row is returned as a list
or dictionary depending on the dict_rows argument. Optionally, headers and values can be inserted at specific
positions. This is achieved using the insertions argument. If supplied, it must be a dictionary containing
the keys "headers" and "functions". "headers" contains a list of tuples of the form (position, header) to be
inserted and "functions" is a list of functions each of which takes a parameter extended_rows which contains
the row's number, file headers list and the row values list.
the keys "headers" and "function". "headers" contains a list of tuples of the form (position, header) to be
inserted and "function" is a function which takes in the arguments headers (prior to any insertions) and
row (which will be in dict or list form depending upon the dict_rows argument) and returns the row with the
new values added.
Args:
url (str): URL to download
Expand All @@ -393,14 +394,21 @@ def get_tabular_rows(self, url, headers=1, dict_rows=False, insertions=None, **k
Tuple[List[str],Iterator[Union[List,Dict]]]: Tuple (headers, iterator where each row is a list or dictionary)
"""
if insertions is not None:
kwargs['post_parse'] = insertions['functions']
stream = self.get_tabular_stream(url, headers=headers, **kwargs)
headers = stream.headers
origheaders = stream.headers
if insertions is None:
return origheaders, stream.iter(keyed=dict_rows)
headers = copy.deepcopy(origheaders)
if insertions is not None:
for position, header in insertions['headers']:
headers.insert(position, header)
return headers, stream.iter(keyed=dict_rows)

def get_next():
for row in stream.iter(keyed=dict_rows):
row = insertions['function'](origheaders, row)
yield row

return headers, get_next()

def download_tabular_key_value(self, url, **kwargs):
# type: (str, Any) -> Dict
Expand Down
2 changes: 1 addition & 1 deletion src/hdx/utilities/version.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.1.2
2.1.3
52 changes: 28 additions & 24 deletions tests/hdx/utilities/test_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,49 +277,53 @@ def test_get_tabular_rows(self, fixtureprocessurl):
expected = [['la1', 'ha1', 'ba1', 'ma1'], ['header1', 'header2', 'header3', 'header4'],
['coal', '3', '7.4', 'needed'], ['gas', '2', '6.5', 'n/a']]
expected_headers = expected[0]
headers, generator = downloader.get_tabular_rows(fixtureprocessurl)
headers, iterator = downloader.get_tabular_rows(fixtureprocessurl)
assert headers == expected_headers
assert list(generator) == expected[1:]
headers, generator = downloader.get_tabular_rows(fixtureprocessurl, headers=1)
assert list(iterator) == expected[1:]
headers, iterator = downloader.get_tabular_rows(fixtureprocessurl, headers=1)
assert headers == expected_headers
assert list(generator) == expected[1:]
headers, generator = downloader.get_tabular_rows(fixtureprocessurl, headers=2)
assert list(iterator) == expected[1:]
headers, iterator = downloader.get_tabular_rows(fixtureprocessurl, headers=2)
assert headers == expected[1]
assert list(generator) == expected[2:]
headers, generator = downloader.get_tabular_rows(fixtureprocessurl, headers=[1, 2])
assert list(iterator) == expected[2:]
headers, iterator = downloader.get_tabular_rows(fixtureprocessurl, headers=[1, 2])
assert headers == ['la1 header1', 'ha1 header2', 'ba1 header3', 'ma1 header4']
assert list(generator) == expected[2:]
assert list(iterator) == expected[2:]
myheaders = ['a', 'b', 'c', 'd']
headers, generator = downloader.get_tabular_rows(fixtureprocessurl, headers=myheaders)
headers, iterator = downloader.get_tabular_rows(fixtureprocessurl, headers=myheaders)
assert headers == myheaders
assert list(generator) == expected
headers, generator = downloader.get_tabular_rows(fixtureprocessurl, dict_rows=True, headers=1)
assert list(iterator) == expected
headers, iterator = downloader.get_tabular_rows(fixtureprocessurl, dict_rows=True, headers=1)
assert headers == expected_headers
expected_dicts = [{'la1': 'header1', 'ha1': 'header2', 'ba1': 'header3', 'ma1': 'header4'},
{'la1': 'coal', 'ha1': '3', 'ba1': '7.4', 'ma1': 'needed'},
{'la1': 'gas', 'ha1': '2', 'ba1': '6.5', 'ma1': 'n/a'}]
assert list(generator) == expected_dicts
headers, generator = downloader.get_tabular_rows(fixtureprocessurl, dict_rows=True, headers=3)
assert list(iterator) == expected_dicts
headers, iterator = downloader.get_tabular_rows(fixtureprocessurl, dict_rows=True, headers=3)
assert headers == expected[2]
expected_dicts = [{'coal': 'gas', '3': '2', '7.4': '6.5', 'needed': 'n/a'}]
assert list(generator) == expected_dicts
assert list(iterator) == expected_dicts

def testfn(extended_rows):
for row_number, headers, row in extended_rows:
row.insert(2, 'lala')
yield row_number, headers, row
def testfn(headers, row):
row.insert(2, 'lala')
return row

insertions = {'headers': [(2, 'la')], 'functions': [testfn]}
headers, generator = downloader.get_tabular_rows(fixtureprocessurl, headers=3, insertions=insertions)
insertions = {'headers': [(2, 'la')], 'function': testfn}
headers, iterator = downloader.get_tabular_rows(fixtureprocessurl, headers=3, insertions=insertions)
expected_headers_la = ['coal', '3', 'la', '7.4', 'needed']
assert headers == expected_headers_la
assert list(generator) == [['gas', '2', 'lala', '6.5', 'n/a']]
assert list(iterator) == [['gas', '2', 'lala', '6.5', 'n/a']]

headers, generator = downloader.get_tabular_rows(fixtureprocessurl, dict_rows=True, headers=3,
insertions=insertions)
def testfn(headers, row):
row['la'] = 'lala'
return row

insertions = {'headers': [(2, 'la')], 'function': testfn}
headers, iterator = downloader.get_tabular_rows(fixtureprocessurl, dict_rows=True, headers=3,
insertions=insertions)
assert headers == expected_headers_la
expected_dicts[0]['la'] = 'lala'
assert list(generator) == expected_dicts
assert list(iterator) == expected_dicts

def test_download_tabular_rows_as_dicts(self, fixtureprocessurl):
with Download() as downloader:
Expand Down

0 comments on commit 8dce7bf

Please sign in to comment.