Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelwood committed Oct 27, 2021
1 parent cdf7fec commit f3f22d4
Showing 1 changed file with 33 additions and 18 deletions.
51 changes: 33 additions & 18 deletions datastore/data_quality/quality_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from lib360dataquality.cove.schema import Schema360
from lib360dataquality import check_field_present

from django.db.models import Q, Sum
from django.db.models import Sum
from django.db.models.expressions import RawSQL
from django.db import connection

Expand Down Expand Up @@ -57,8 +57,13 @@ def create(grants):
for available_test in TEST_CLASSES[USEFULNESS_TEST_CLASS]:
quality_results[available_test.__name__] = {"count": 0, "fail": False}

# This is a test that we actually copy from another test; see below.
quality_results["RecipientOrgPrefixExternal"] = {"count": 0, "fail": False}
# Initialise two new tests
# These will be derived from RecipientOrg360GPrefix
quality_results["RecipientOrgPrefixExternal"] = {
"count": cove_results["grants_aggregates"]["count"],
"fail": False,
}
quality_results["RecipientOrgPrefix50pcExternal"] = {"count": 0, "fail": False}

# Update with a heading and count template.
for test in cove_results["usefulness_checks"]:
Expand All @@ -68,19 +73,22 @@ def create(grants):
# If all the grants fail a test then we mark as fail true
"fail": test[0]["count"] == cove_results["grants_aggregates"]["count"],
}

# Our fail/pass conditions for this test are based on at least 50% of recipients
# having an external (non 360G) org id.
if "RecipientOrg360GPrefix" in test[0]["type"]:
quality_results["RecipientOrgPrefix50pcExternal"] = test[0]
# Create a version of this test for 50% ext org ids
# Our fail/pass conditions for this test are based on at least 50% of recipients
# having an external (non 360G) org id.
quality_results["RecipientOrgPrefix50pcExternal"]["fail"] = (
test[0]["count"] >= cove_results["grants_aggregates"]["count"] / 2
)

# Create an inverted version of this test for simplicity
# total grants - the number of grants with 360 prefix to give the number *with*
# an ext org id
count = cove_results["grants_aggregates"]["count"] - test[0]["count"]

quality_results["RecipientOrgPrefixExternal"] = {
"count": cove_results["grants_aggregates"]["count"] - test[0]["count"],
"fail": test[0]["count"] == 0,
"count": count,
"fail": count == 0,
"heading": "Recipient Orgs with external org identifier",
}

Expand All @@ -106,17 +114,22 @@ def create(grants):
aggregates["recipient_org_types"] = {}

def extract_org_id_type(org_id):
# Ignore internal org ids
if "360G-" in org_id:
return None

try:
return org_id.split("-")[1]
except IndexError:
return "Unknown"
return None

for grant in grants["grants"]:
org_id_type = extract_org_id_type(grant["recipientOrganization"][0]["id"])
try:
aggregates["recipient_org_types"][org_id_type] += 1
except KeyError:
aggregates["recipient_org_types"][org_id_type] = 1
if org_id_type:
try:
aggregates["recipient_org_types"][org_id_type] += 1
except KeyError:
aggregates["recipient_org_types"][org_id_type] = 1

return quality_results, aggregates

Expand Down Expand Up @@ -298,10 +311,10 @@ def get_pc_publishers_with_recipient_ext_org(self):
),
)

for i in ranges:
ret["{}% - {}%".format(*i)] = (
query.distinct("data__publisher__prefix")
.filter(Q(pc__gte=i[0]) & Q(pc__lt=i[1]))
for range in ranges:
ret["{}% - {}%".format(*range)] = (
query.filter(pc__range=(range[0], range[1]))
.distinct("data__publisher__prefix")
.count()
/ total_publishers
* 100
Expand Down Expand Up @@ -369,6 +382,8 @@ def get_grant_org_id_types_used(self):
aggregate->'recipient_org_types' is not null AND
db_sourcefile_latest.latest_id={latest_id}
GROUP BY keyval.key
ORDER BY sum DESC
LIMIT 10
"""

cursor = connection.cursor()
Expand Down

0 comments on commit f3f22d4

Please sign in to comment.