Skip to content

Commit

Permalink
Merge 43207c8 into a9dd12d
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelwood committed May 24, 2022
2 parents a9dd12d + 43207c8 commit 0167a96
Show file tree
Hide file tree
Showing 3 changed files with 192 additions and 23 deletions.
30 changes: 23 additions & 7 deletions datastore/data_quality/management/commands/rewrite_quality_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,18 @@ def add_arguments(self, parser):
help="The datagetter run id or latest",
)

parser.add_argument(
"--publisher-only",
action="store_true",
help="Only rewrite publisher data",
)

parser.add_argument(
"--sourcefile-only",
action="store_true",
help="Only rewrite sourcefile data",
)

def handle(self, *args, **options):

if "latest" in options["getter_run"]:
Expand Down Expand Up @@ -50,8 +62,10 @@ def process_source_file(source_file):

connection.close()

with Pool(4) as process_pool:
process_pool.starmap(process_source_file, zip(source_files))
if not options["publisher_only"]:
print("Processing sourcefile data")
with Pool(4) as process_pool:
process_pool.starmap(process_source_file, zip(source_files))

def process_publishers(source_file):
publisher = source_file.get_publisher()
Expand All @@ -67,11 +81,13 @@ def process_publishers(source_file):
print(e)
connection.close()

with Pool(4) as process_pool:
process_pool.starmap(
process_publishers,
zip(source_files.distinct("data__publisher__prefix")),
)
if not options["sourcefile_only"]:
print("Processing publisher data")
with Pool(4) as process_pool:
process_pool.starmap(
process_publishers,
zip(source_files.distinct("data__publisher__prefix")),
)

# Clear all caches - data has changed
cache.clear()
54 changes: 38 additions & 16 deletions datastore/data_quality/quality_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,14 +225,25 @@ def get_total_publishers(self):
return self.source_file_set.distinct("data__publisher__prefix").count()

def get_total_recipients(self):
latest_id = db.Latest.objects.get(series=db.Latest.CURRENT).pk

query = f"""
SELECT DISTINCT(jsonb_array_elements(db_sourcefile.aggregate->'recipients'))
FROM db_sourcefile
INNER JOIN db_sourcefile_latest on db_sourcefile.id=db_sourcefile_latest.sourcefile_id
WHERE db_sourcefile_latest.latest_id={latest_id}
"""
# When the source file set covers exactly one publisher, query only the source
# files in this set; otherwise query every source file in 'latest'.
if self.source_file_set.distinct("data__publisher__prefix").count() == 1:
source_file_ids = ",".join(
map(str, self.source_file_set.values_list("id", flat=True))
)
query = f"""
SELECT DISTINCT(jsonb_array_elements(db_sourcefile.aggregate->'recipients'))
FROM db_sourcefile
WHERE db_sourcefile.id IN ({source_file_ids})
"""
else:
latest_id = db.Latest.objects.get(series=db.Latest.CURRENT).pk
query = f"""
SELECT DISTINCT(jsonb_array_elements(db_sourcefile.aggregate->'recipients'))
FROM db_sourcefile
INNER JOIN db_sourcefile_latest on db_sourcefile.id=db_sourcefile_latest.sourcefile_id
WHERE db_sourcefile_latest.latest_id={latest_id}
"""

with connection.cursor() as cursor:
cursor.execute(query)
Expand All @@ -241,14 +252,25 @@ def get_total_recipients(self):
return total

def get_total_funders(self):
latest_id = db.Latest.objects.get(series=db.Latest.CURRENT).pk

query = f"""
SELECT DISTINCT(jsonb_array_elements(db_sourcefile.aggregate->'funders'))
FROM db_sourcefile
INNER JOIN db_sourcefile_latest on db_sourcefile.id=db_sourcefile_latest.sourcefile_id
WHERE db_sourcefile_latest.latest_id={latest_id}
"""
# When the source file set covers exactly one publisher, query only the source
# files in this set; otherwise query every source file in 'latest'.
if self.source_file_set.distinct("data__publisher__prefix").count() == 1:
source_file_ids = ",".join(
map(str, self.source_file_set.values_list("id", flat=True))
)
query = f"""
SELECT DISTINCT(jsonb_array_elements(db_sourcefile.aggregate->'funders'))
FROM db_sourcefile
WHERE db_sourcefile.id IN ({source_file_ids})
"""
else:
latest_id = db.Latest.objects.get(series=db.Latest.CURRENT).pk
query = f"""
SELECT DISTINCT(jsonb_array_elements(db_sourcefile.aggregate->'funders'))
FROM db_sourcefile
INNER JOIN db_sourcefile_latest on db_sourcefile.id=db_sourcefile_latest.sourcefile_id
WHERE db_sourcefile_latest.latest_id={latest_id}
"""

with connection.cursor() as cursor:
cursor.execute(query)
Expand Down
131 changes: 131 additions & 0 deletions datastore/tests/test_quality_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,134 @@ def test_create_data_quality_data(self):
# Our test data in the test_data.json currently generates 2
# data quality usefulness results
self.assertEqual(len(quality), 2)

def test_create_sourcefile_publisher_quality_data(self):
    """Check generated quality/aggregate data for a source file and its publisher.

    Takes the first ``SourceFile`` returned by the ORM, regenerates its
    quality and aggregate data from its grants, saves it, and compares both
    values with the expected fixtures. The same is then done for the
    publisher returned by ``get_publisher()`` using
    ``quality_data.create_publisher_stats``.
    """
    # --- Expected values for the source file -------------------------------
    want_file_aggregate = {
        "count": 5,
        "recipients": ["360G-example-a"],
        "funders": ["GB-example-b"],
        "max_award_date": "2019-10-03",
        "min_award_date": "2019-10-03",
        "currencies": {
            "GBP": {
                "count": 5,
                "total_amount": 1341,
                "max_amount": 583,
                "min_amount": 92,
                "currency_symbol": "£",
            }
        },
        "award_years": {"2019": 5},
        "recipient_org_types": {},
    }
    want_file_quality = {
        "RecipientOrg360GPrefix": {
            "heading": "100% of grants have a <span class=\"highlight-background-text\">Recipient Org:Identifier</span> that starts '360G-'",
            "count": 5,
            "fail": True,
        },
        "FundingOrg360GPrefix": {"count": 0, "fail": False},
        "NoRecipientOrgCompanyCharityNumber": {
            "heading": '100% of grants do not have either a <span class="highlight-background-text">Recipient Org:Company Number</span> or a <span class="highlight-background-text">Recipient Org:Charity Number</span>',
            "count": 5,
            "fail": True,
        },
        "IncompleteRecipientOrg": {"count": 0, "fail": False},
        "NoGrantProgramme": {"count": 0, "fail": False},
        "NoBeneficiaryLocation": {"count": 0, "fail": False},
        "TitleDescriptionSame": {"count": 0, "fail": False},
        "TitleLength": {"count": 0, "fail": False},
        "NoLastModified": {"count": 0, "fail": False},
        "NoDataSource": {
            "heading": '100% of grants do not have <span class="highlight-background-text">Data Source</span> information',
            "count": 5,
            "fail": True,
        },
        "ClassificationNotPresent": {
            "heading": "100% of grants do not contain classifications field",
            "count": 5,
            "fail": True,
        },
        "BeneficiaryLocationNameNotPresent": {"count": 0, "fail": False},
        "BeneficiaryLocationCountryCodeNotPresent": {
            "heading": "100% of grants do not contain beneficiaryLocation/0/countryCode field",
            "count": 5,
            "fail": True,
        },
        "BeneficiaryLocationGeoCodeNotPresent": {"count": 0, "fail": False},
        "PlannedDurationNotPresent": {"count": 0, "fail": False},
        "GrantProgrammeTitleNotPresent": {"count": 0, "fail": False},
        "RecipientOrgPrefixExternal": {
            "count": 0,
            "fail": True,
            "heading": "Recipient Orgs with external org identifier",
        },
        "RecipientOrgPrefix50pcExternal": {"count": 0, "fail": True},
    }

    # --- Expected values for the publisher ---------------------------------
    want_publisher_aggregate = {
        "total": {
            "grants": 5,
            "GBP": 1341.0,
            "publishers": 1,
            "recipients": 1,
            "funders": 1,
        },
        "jsonFiles": 0,
        "csvFiles": 0,
        "xlsxFiles": 100,
        "odsFiles": 0,
        # NOTE(review): the keys are fixed calendar years (2013-2022). If
        # create_publisher_stats derives this window from the current date,
        # this expectation will stop matching once the year changes - confirm.
        "awardYears": {
            "2022": 0,
            "2021": 0,
            "2020": 0,
            "2019": 5,
            "2018": 0,
            "2017": 0,
            "2016": 0,
            "2015": 0,
            "2014": 0,
            "2013": 0,
        },
        "orgIdTypes": {},
        "awardedThisYear": 0,
        "awardedLastThreeMonths": 0,
    }
    want_publisher_quality = {
        "hasBeneficiaryLocationName": 100,
        "hasRecipientOrgLocations": 100,
        "hasGrantDuration": 100,
        "hasGrantProgrammeTitle": 100,
        "hasGrantClassification": 0,
        "hasBeneficiaryLocationGeoCode": 100,
        "hasRecipientOrgCompanyOrCharityNumber": 0,
        "has50pcExternalOrgId": 0,
    }

    # --- Source file: regenerate quality + aggregate from its grants --------
    first_file = db.SourceFile.objects.first()
    grant_rows = first_file.grant_set.values_list("data", flat=True)
    generated = quality_data.create({"grants": list(grant_rows)})
    first_file.quality, first_file.aggregate = generated
    first_file.save()

    self.assertEqual(first_file.quality, want_file_quality)
    self.assertEqual(first_file.aggregate, want_file_aggregate)

    # --- Publisher: regenerate stats for the source file's publisher --------
    owner = first_file.get_publisher()
    owner.quality, owner.aggregate = quality_data.create_publisher_stats(owner)
    owner.save()

    self.assertEqual(owner.aggregate, want_publisher_aggregate)
    self.assertEqual(owner.quality, want_publisher_quality)

0 comments on commit 0167a96

Please sign in to comment.