Skip to content

Commit

Permalink
Merge pull request #127 from ThreeSixtyGiving/mw/org-rework
Browse files Browse the repository at this point in the history
Datastore work for org pages
  • Loading branch information
michaelwood committed Nov 15, 2022
2 parents 3c48006 + bb632f6 commit 60d5b45
Show file tree
Hide file tree
Showing 12 changed files with 511 additions and 129 deletions.
4 changes: 3 additions & 1 deletion .gitattributes
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
*test_data.json binary
*.json binary
*.min.js
*.min.css
2 changes: 1 addition & 1 deletion datastore/additional_data/admin.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from django.contrib import admin
from django.contrib.postgres.fields import JSONField
from django.db.models import JSONField
from prettyjson import PrettyJSONWidget

import additional_data.models as db
Expand Down
3 changes: 2 additions & 1 deletion datastore/additional_data/models.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import re

from django.contrib.postgres.fields import ArrayField, JSONField
from django.contrib.postgres.fields import ArrayField
from django.contrib.postgres.indexes import GinIndex
from django.core.exceptions import ValidationError
from django.db.models import JSONField
from django.db import models


Expand Down
16 changes: 16 additions & 0 deletions datastore/additional_data/sources/find_that_charity.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,19 @@ def import_from_path(self, path, org_type=None):

class OrgTypeNotKnownError(Exception):
    """Error type for an org type that is not known.

    NOTE(review): the raise sites are outside this view -- presumably the
    importer raises this for an unrecognised ``org_type``; confirm.
    """


def non_primary_org_ids_map():
    """Map every non-primary org-id to the primary org-id of its organisation.

    Reads the ``org_ids`` list of each ``OrgInfoCache`` row that holds more
    than one org-id. The first entry of a list is treated as the primary
    org-id; every later entry becomes a key pointing at it.

    Returns:
        dict: ``{non_primary_org_id: primary_org_id}``. When the same org-id
        appears as non-primary in several rows, the last row read wins.
    """
    # Each item has the shape [primary-org-id, secondary-org-id, ...org-id]
    org_id_lists = OrgInfoCache.objects.filter(org_ids__len__gt=1).values_list(
        "org_ids", flat=True
    )

    return {
        secondary_id: id_list[0]
        for id_list in org_id_lists
        for secondary_id in id_list[1:]
    }
4 changes: 3 additions & 1 deletion datastore/db/admin.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from django.contrib import admin
from django.contrib.postgres.fields import JSONField
from django.db.models import JSONField
from prettyjson import PrettyJSONWidget

import db.models as db
Expand Down Expand Up @@ -29,3 +29,5 @@ class GrantAdmin(JSONFieldAdmin):
# Expose the datastore models in the Django admin site.
# SourceFile and Status are registered with the JSONFieldAdmin admin class;
# Latest, Funder and Recipient are registered without an explicit admin class,
# so Django falls back to its default ModelAdmin for them.
admin.site.register(db.SourceFile, JSONFieldAdmin)
admin.site.register(db.Status, JSONFieldAdmin)
admin.site.register(db.Latest)
admin.site.register(db.Funder)
admin.site.register(db.Recipient)
2 changes: 1 addition & 1 deletion datastore/db/fixtures/test_data.json

Large diffs are not rendered by default.

120 changes: 12 additions & 108 deletions datastore/db/management/commands/create_data_package.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
import os

from django.core.management.base import BaseCommand
from django.db import connection

from db.management.spinner import Spinner
from db.models import Latest

from db.management.commands.manage_entities_data import create_orgs_list


class Command(BaseCommand):
help = "Outputs a data package of our best Latest data"
Expand Down Expand Up @@ -35,9 +36,11 @@ def handle(self, *args, **options):
- data_all.json (all sources)
- json_all/
|- grants.json (lists of grants)
|- grants.json (contains lists of grants)
|- grants.json
...
|...
- funders.jl
- recipients.jl
"""
spinner = Spinner()
spinner.start()
Expand All @@ -48,32 +51,16 @@ def handle(self, *args, **options):
os.makedirs("%s/json_all/" % options["dir"], mode=0o700)

data_all = []
data_all_file = "%s/data_all.json" % options["dir"]

data_all_file = "%s/data_all.json" % options["dir"]
recipients_file = "%s/recipients.jl" % options["dir"]
funders_file = "%s/funders.jl" % options["dir"]

with connection.cursor() as cursor, open(
recipients_file, "w"
) as recipientfp, open(funders_file, "w") as funderfp:
cursor.execute(CREATE_RELATED_ORGIDS)

cursor.execute(ORG_SELECT.format(org="recipient"))
columns = [col[0] for col in cursor.description]
rows = 0
for row in cursor.fetchall():
recipient = json.dumps(dict(zip(columns, row)))
recipientfp.write(recipient)
recipientfp.write("\r\n")
rows += 1

cursor.execute(ORG_SELECT.format(org="funding"))
rows = 0
for row in cursor.fetchall():
funder = json.dumps(dict(zip(columns, row)))
funderfp.write(funder)
funderfp.write("\r\n")
rows += 1
with open(funders_file, "w") as funders_fp:
create_orgs_list("funder", funders_fp)

with open(recipients_file, "w") as recipients_fp:
create_orgs_list("recipient", recipients_fp)

def flatten_grant(in_grant):
"""Add the additional_data inside grant object"""
Expand Down Expand Up @@ -112,86 +99,3 @@ def flatten_grant(in_grant):
data_all_fp.write(json.dumps(data_all, indent=options["indent"]))

spinner.stop()


# SQL that rebuilds the helper table tmp_related_orgids (dropped first if it
# already exists). Note this is a regular CREATE TABLE, not a TEMPORARY one.
#
# The org_name CTE groups additional_data_orginfocache by the first entry of
# data -> 'linked_orgs' (the "canonical" org-id) and joins the names and
# org_ids of each group into '||'-separated strings.
#
# The main SELECT then emits one row per element of data -> 'linked_orgs' for
# every row whose first linked_orgs entry is not null, carrying: org_id (taken
# from the element), the full linked_orgs array, canonical_orgid, and the
# aggregated name / name_org_id strings for that canonical org-id.
CREATE_RELATED_ORGIDS = """
DROP TABLE IF EXISTS tmp_related_orgids;
CREATE TABLE tmp_related_orgids AS (
with org_name AS (SELECT
data -> 'linked_orgs' ->> 0 AS canonical_orgid,
string_agg(data ->> 'name', '||') AS name,
string_agg(org_id, '||') AS name_org_id
FROM additional_data_orginfocache GROUP BY 1
)
SELECT
linked_org ->> 0 org_id,
data -> 'linked_orgs' tmp_related_orgids,
data -> 'linked_orgs' ->> 0 AS canonical_orgid,
max(name) AS name,
max(name_org_id) AS name_org_id
FROM
additional_data_orginfocache orgi
JOIN LATERAL
jsonb_array_elements(data -> 'linked_orgs') linked_org ON true
JOIN
org_name orgn ON orgi.data -> 'linked_orgs' ->> 0 = canonical_orgid
WHERE data -> 'linked_orgs' ->> 0 is not null
GROUP by 1,2,3
);
"""

# SQL template producing one summary row per organisation. The `{org}`
# placeholder is filled with str.format (this file uses org="recipient" and
# org="funding"), selecting the grant's `{org}Organization` array. Because the
# value is interpolated into the SQL text rather than bound as a parameter,
# only trusted, hard-coded values should ever be passed.
#
# Requires the tmp_related_orgids table built by CREATE_RELATED_ORGIDS.
#
# Stages:
#   latest_grant      - grants attached to db_latest rows with series 'CURRENT'.
#   {org}_by_currency - grouped by (org id, currency, org-id list): grant count,
#                       sum/max/min/avg of amountAwarded (cast to numeric) and
#                       min/max awardDate. Only the first entry of the
#                       `{org}Organization` array is considered. The org id is
#                       the canonical org-id from tmp_related_orgids when the
#                       grant's org id matches a row there, otherwise the
#                       grant's own org id.
#   final SELECT      - one row per (org_id, orgids); names are re-split on
#                       '||' into arrays and the per-currency figures are
#                       folded into JSON objects keyed by currency.
#
# NOTE: awardDate is aggregated with max()/min() on its text value
# (data ->> 'awardDate'), so ordering is textual rather than date-typed.
ORG_SELECT = """
WITH latest_grant AS (
SELECT
*
FROM
db_grant
JOIN
db_grant_latest ON db_grant.id = db_grant_latest.grant_id
JOIN
db_latest on db_grant_latest.latest_id = db_latest.id
WHERE
db_latest.series = 'CURRENT'
),
{org}_by_currency AS (SELECT
coalesce(o.canonical_orgid, g.data -> '{org}Organization' -> 0 ->> 'id') org_id,
g.data ->> 'currency' AS currency,
coalesce(tmp_related_orgids, to_jsonb(ARRAY[g.data -> '{org}Organization' -> 0 ->> 'id'])) AS orgids,
max(coalesce(
o.name || '||' || (g.data -> '{org}Organization' -> 0 ->> 'name'),
g.data -> '{org}Organization' -> 0 ->> 'name')
) AS name,
max(name_org_id) AS org_ids_charity_finder,
max(o.name) AS name_charity_finder,
count(*) AS grants,
sum((g.data ->> 'amountAwarded')::numeric) total_amount,
max((g.data ->> 'amountAwarded')::numeric) max_amount,
min((g.data ->> 'amountAwarded')::numeric) min_amount,
avg((g.data ->> 'amountAwarded')::numeric) avg_amount,
max(g.data ->> 'awardDate') max_award_date,
min(g.data ->> 'awardDate') min_award_date
FROM
latest_grant g
LEFT JOIN
tmp_related_orgids o ON o.org_id = g.data -> '{org}Organization' -> 0 ->> 'id'
GROUP BY 1, 2, 3)
SELECT
org_id as id,
string_to_array(string_agg(name, '||'), '||') AS "organizationName",
orgids AS "orgIDs",
coalesce(string_to_array(string_agg(org_ids_charity_finder, '||'), '||'), Array[]::text[]) AS "orgIDsCharityFinder",
coalesce(string_to_array(string_agg(name_charity_finder, '||'), '||'), Array[]::text[]) AS "nameCharityFinder",
sum(grants)::int AS grants,
array_agg(currency) as currency,
jsonb_object_agg(currency, grants) AS "currencyGrants",
jsonb_object_agg(currency, total_amount) AS "currencyTotal",
jsonb_object_agg(currency, max_amount) AS "currencyMaxAmount",
jsonb_object_agg(currency, min_amount) AS "currencyMinAmount",
jsonb_object_agg(currency, avg_amount) AS "currencyAvgAmount",
max(max_award_date) "maxAwardDate",
min(min_award_date) "minAwardDate"
FROM
{org}_by_currency
GROUP BY org_id, orgids;
"""
38 changes: 28 additions & 10 deletions datastore/db/management/commands/load_datagetter_data.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import json
import os

from django.conf import settings
from django.db import transaction
from django.core.management import call_command
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from django.core.cache import cache

import db.models as db
Expand All @@ -23,6 +23,13 @@ def add_arguments(self, parser):
help="The location of the data dir created by datagetter",
)

parser.add_argument(
"--skip-missing",
action="store_true",
help="Skip any missing dataset files instead of raising an error",
default=False,
)

def check_dir_looks_right(self):
"""Quickly check if the supplied dir looks correct"""
ls = os.listdir(self.options["data_dir"][0])
Expand Down Expand Up @@ -50,8 +57,14 @@ def load_grant_data(self, path):

new_path = os.path.join(self.options["data_dir"][0], "json_all", filename)

with open(new_path, encoding="utf-8") as f:
return json.loads(f.read())
try:
with open(new_path, encoding="utf-8") as f:
return json.loads(f.read())
except FileNotFoundError as e:
if self.options["skip_missing"]:
return {"grants": []}
else:
raise e

def load_data(self):
grant_additional_data_generator = AdditionalDataGenerator()
Expand All @@ -63,7 +76,12 @@ def load_data(self):
for ob in dataset:
prefix = ob["publisher"]["prefix"]
publisher, c = db.Publisher.objects.get_or_create(
getter_run=getter_run, prefix=prefix, data=ob["publisher"]
getter_run=getter_run,
prefix=prefix,
data=ob["publisher"],
org_id=ob["publisher"].get("org_id", "unknown"),
name=ob["publisher"]["name"],
source=db.Entity.PUBLISHER,
)

source_file = db.SourceFile.objects.create(data=ob, getter_run=getter_run)
Expand Down Expand Up @@ -103,6 +121,8 @@ def load_data(self):
"Skipping loading due to: '%s'" % e,
file=self.stdout,
)
if settings.DEBUG == True:
raise e
continue

return grants_added
Expand All @@ -126,11 +146,9 @@ def handle(self, *args, **options):
db.Latest.update()

print("Updating quality data", file=self.stdout)
try:
call_command("rewrite_quality_data", "latest")
except Exception as e:
print("Error running rewrite_quality_data %s" % e, file=self.stderr)
pass
call_command("rewrite_quality_data", "latest")
# Update entities data for funders and recipients
call_command("manage_entities_data", "--update")

# Clear all cached objects - The latest data as well as new data has been added
cache.clear()
Loading

0 comments on commit 60d5b45

Please sign in to comment.