add data quality assurance checks
nathanhilbert committed Aug 18, 2015
1 parent 7be7b8c commit ee72051
Showing 1 changed file with 76 additions and 2 deletions.
openspending/admin/helpers.py
@@ -8,12 +8,16 @@

from flask import current_app, url_for

from openspending.core import db

log = logging.getLogger(__name__)


OUTPUT_TEXT = """
%(label)s
Put more information here
%(name)s
Min/Max Values: %(minval)s / %(maxval)s
Year Range: %(minyear)s - %(maxyear)s
"""
@@ -30,10 +34,13 @@ def __init__(self, dataset=None):
    self.dataset = dataset
    self.namedfile = None
    self.zf = None
    self.tablebase = self.dataset.name
    self.model = self.dataset.source.model

    self._buildzf()
    self._write_basefile()
    #self._write_preloaddata()
    self._perform_qa_checks()
    self._write_logs()
    self._write_loaded_data()

@@ -42,7 +49,29 @@ def _buildzf(self):
    self.zf = zipfile.ZipFile(self.namedfile, "w")

def _write_basefile(self):
    self.zf.writestr("metadata.csv", OUTPUT_TEXT % dict(label=self.dataset.label))

    result = db.engine.execute("SELECT MAX(amount), MIN(amount) FROM %s__entry" % self.tablebase).first()
    if result:
        maxval = result[0]
        minval = result[1]
    else:
        minval = None
        maxval = None
    timeresult = db.engine.execute("SELECT MAX(year), MIN(year) FROM %s__time" % self.tablebase).first()
    if timeresult:
        maxyear = timeresult[0]
        minyear = timeresult[1]
    else:
        minyear = None
        maxyear = None

    # check years shown
    self.zf.writestr("metadata.csv", OUTPUT_TEXT % dict(label=self.dataset.label,
                                                        name=self.dataset.name,
                                                        maxval=maxval,
                                                        minval=minval,
                                                        minyear=minyear,
                                                        maxyear=maxyear))
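
For reference, the rendered metadata.csv produced by the template above would look roughly like this; the label, name, and numbers below are made up for illustration:

    World Bank Development Indicators
    Put more information here
    worldbank-indicators
    Min/Max Values: 0.0 / 1532000000.0
    Year Range: 1960 - 2014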

def _write_logs(self):

@@ -102,6 +131,51 @@ def _write_loaded_data(self):
        print e
        log.warn("could not write postload value")

def _perform_qa_checks(self):
    # Countries present in the source data but not matched in the FIND country list
    missing_countries_file = io.StringIO()
    missing_countries_file.write(",".join(['label', 'name']) + u"\n")
    missingcountries = db.engine.execute("SELECT name, label \
        FROM %s__country_level0 \
        WHERE countryid = 0" % self.tablebase).fetchall()
    missing_countries_file.write(u"This list shows the countries that were in the original data but were not found in the FIND country list. They are typically aggregate regions, as is the case with the World Bank.\n")
    for row in missingcountries:
        missing_countries_file.write(",".join(row) + u"\n")
    if len(missingcountries) == 0:
        missing_countries_file.write(u'No Missing Countries\n')
    self.zf.writestr("qualitychecks/countriesnotfound.csv", missing_countries_file.getvalue().encode(encoding='UTF-8'))

    # Countries known to FIND that have no rows in this dataset
    countriesnotreped_file = io.StringIO()
    countriesnotreped_file.write(",".join(['label', 'sovereignt']) + u"\n")
    countriesnotreped = db.engine.execute("SELECT gc.label, gc.sovereignt FROM geometry__country_level0 as gc \
        LEFT OUTER JOIN %s__country_level0 as tab \
        ON gc.gid=tab.countryid WHERE tab.countryid IS null" % self.tablebase).fetchall()
    countriesnotreped_file.write(u"These are the countries that are in the FIND system but not represented in the dataset. Typically these are smaller countries that do not track data or whose data are rolled into the sovereignty.\n")
    for row1 in countriesnotreped:
        countriesnotreped_file.write(",".join(row1) + u"\n")
    if len(countriesnotreped) == 0:
        countriesnotreped_file.write(u'All countries are represented\n')
    self.zf.writestr("qualitychecks/countries-not-represented.csv", countriesnotreped_file.getvalue().encode(encoding='UTF-8'))

    # Country/year combinations that appear more than once
    dupvals_file = io.StringIO()
    dupvals_file.write(",".join(['counter', 'label', 'year']) + u"\n")
    dups_result = db.engine.execute("SELECT COUNT(*) as counter, MAX(label) as label, MAX(time) as year \
        FROM \
            public.%s__entry, \
            public.geometry__country_level0, \
            public.geometry__time \
        WHERE \
            %s__entry.geom_time_id = geometry__time.id AND \
            geometry__time.gid = geometry__country_level0.gid \
        GROUP BY geom_time_id HAVING COUNT(*)>1;" % (self.tablebase, self.tablebase,)).fetchall()
    dupvals_file.write(u"This is a list of country/year combinations that have more than one entry. There should be only one entry per country per year.\n")
    for row2 in dups_result:
        temprow = [str(x) for x in row2]
        dupvals_file.write(",".join(temprow) + u"\n")
    if len(dups_result) == 0:
        dupvals_file.write(u'There are no duplicate year-country combinations\n')
    self.zf.writestr("qualitychecks/duplicate-countries-years.csv", dupvals_file.getvalue().encode(encoding="UTF-8"))


def get_output(self):
    # return the zip value as a string
    self.zf.close()
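For context, the exporter is used roughly as in the sketch below. The class name FileDump, the my_dataset object, and the output filename are assumptions (the enclosing class definition sits outside this hunk); the dataset keyword argument and get_output() follow the signatures shown above.

    # Hypothetical usage sketch; FileDump is an assumed name for the enclosing class.
    dump = FileDump(dataset=my_dataset)        # __init__ runs the QA checks and writes the zip entries
    with open("dataset-export.zip", "wb") as fh:
        fh.write(dump.get_output())            # closes the zip and returns its contents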
