## Initialize the CKAN environment and requests session

In [1]:
from os import path, environ
import requests
from dataflows import Flow, load
from datapackage_pipelines_ckanext.helpers import get_plugin_configuration

# Plugin configuration; its 'data_path' entry is the base directory used
# below when building datapackage paths.
config = get_plugin_configuration('odata_org_il')
data_path = config['data_path']

# Both CKAN settings come from the environment and must be non-empty.
CKAN_API_KEY, CKAN_URL = environ.get('CKAN_API_KEY'), environ.get('CKAN_URL')
assert CKAN_API_KEY and CKAN_URL

# A single shared session so every request sends the Authorization header.
CKAN_AUTH_HEADERS = {'Authorization': CKAN_API_KEY}
session = requests.Session()
session.headers.update(CKAN_AUTH_HEADERS)

DEBUG   :Starting new HTTPS connection (1): www.odata.org.il:443
DEBUG   :https://www.odata.org.il:443 "GET /pipelines/config/odata_org_il HTTP/1.1" 200 None


## Common imports and functions

In [52]:
from dataflows import Flow, load
import yaml
from load_existing_entities import get_existing_entities_resource, get_existing_entities_resource_descriptor
from collections import defaultdict

def load_odata_resource(name, resources=None):
    """Load a resource from the datapackage stored under ``data_path``.

    The datapackage is read from ``{data_path}/{name}/datapackage.json`` and
    run through a dataflows ``Flow``. Progress is printed before and after
    loading.

    :param name: directory name of the datapackage under ``data_path``.
    :param resources: passed through unchanged to ``load`` as ``resources``.
    :return: ``results()[0][0]`` of the flow (presumably the rows of the
        first loaded resource; its length is reported as the row count).
    """
    path = f'{data_path}/{name}/datapackage.json'
    print(f'Loading resource from {path}')
    flow_results = Flow(load(path, resources=resources)).results()
    loaded_resources = flow_results[0]
    resource = loaded_resources[0]
    print(f'Resource loaded: {name} ({len(resource)} rows)')
    return resource

def yaml_print(data):
    """Print ``data`` as block-style YAML, keeping unicode characters unescaped."""
    rendered = yaml.dump(data, default_flow_style=False, allow_unicode=True)
    print(rendered)

def get_existing_entities_flow():
    """Build a Flow that loads the existing-entities resource.

    :return: ``(flow, stats)`` where ``stats`` is the ``defaultdict(int)``
        handed to ``get_existing_entities_resource``.
    """
    stats = defaultdict(int)
    # Descriptor first, then the row source, matching the evaluation order
    # of the single-expression form.
    datapackage_descriptor = {'resources': [get_existing_entities_resource_descriptor()]}
    resource_iterables = [get_existing_entities_resource(stats)]
    existing_entities_flow = Flow(load((datapackage_descriptor, resource_iterables)))
    return existing_entities_flow, stats


## Load resources

In [16]:
# Load both pipeline outputs in order: the FOI offices first, then the
# FOI-office/group matching table.
foi_offices_resource, foi_groups_matching_resource = map(
    load_odata_resource, ('new_foi_offices', 'foi_groups_matching')
)

Loading resource from /var/lib/ckan/data/pipelines/odata_org_il/new_foi_offices/datapackage.json
Resource loaded: new_foi_offices (1677 rows)
Loading resource from /var/lib/ckan/data/pipelines/odata_org_il/foi_groups_matching/datapackage.json
Resource loaded: foi_groups_matching (1422 rows)


## Load existing entities (groups) from the CKAN API

In [12]:
# Build the flow and its stats counter, then materialise the flow results.
existing_entities_flow, existing_entities_stats = get_existing_entities_flow()
existing_entities_results = existing_entities_flow.results()
existing_entities_resource = existing_entities_results[0][0]

# The counters are filled while the flow runs, so print them only afterwards.
yaml_print(dict(existing_entities_stats))
print(f'Loaded existing entities ({len(existing_entities_resource)} rows)')

INFO    :Loading existing entities / groups, 500 results per page
INFO    :offset=0
DEBUG   :Starting new HTTPS connection (1): www.odata.org.il:443
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/group_list HTTP/1.1" 200 None
INFO    :offset=500
DEBUG   :Starting new HTTPS connection (1): www.odata.org.il:443
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/group_list HTTP/1.1" 200 None
INFO    :offset=1000
DEBUG   :Starting new HTTPS connection (1): www.odata.org.il:443
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/group_list HTTP/1.1" 200 None
INFO    :offset=1500
DEBUG   :Starting new HTTPS connection (1): www.odata.org.il:443
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/group_list HTTP/1.1" 200 None


existing_groups_with_entity: 1155
existing_groups_without_entity: 127

Loaded existing entities (1282 rows)


## Find mismatched foi office groups

In [64]:
# Index the existing CKAN groups by group id and by entity id.
existing_entities_group_ids = {}
existing_entities_entity_ids = {}

for existing_entity in existing_entities_resource:
    existing_entities_group_ids[existing_entity['group_id']] = existing_entity
    existing_entities_entity_ids[existing_entity['entity_id']] = existing_entity

# Entity ids from the matching table that no existing group carries yet,
# and the ids of the groups they are supposed to be attached to.
mismatched_entity_ids = set()
mismatched_group_ids = set()

for foi_group_match in foi_groups_matching_resource:
    match_group_id = foi_group_match['Column3']
    match_entity_id = foi_group_match['entity_id']
    # Only rows that name both a group and an entity can be checked.
    if not (match_group_id and match_entity_id):
        continue
    # The entity is already attached to some existing group: nothing to fix.
    if match_entity_id in existing_entities_entity_ids:
        continue
    matching_entity_by_group_id = existing_entities_group_ids.get(match_group_id)
    if not matching_entity_by_group_id:
        # The matching table refers to a group that was not loaded from CKAN;
        # stop rather than silently skip so the data can be inspected.
        raise Exception(
            f'matching row refers to unknown group {match_group_id} '
            f'(entity {match_entity_id})'
        )
    print(f'group {matching_entity_by_group_id["title"]}: no matching entity {match_entity_id}')
    mismatched_entity_ids.add(match_entity_id)
    mismatched_group_ids.add(match_group_id)
        

group כאן תאגיד השידור הישראלי: no matching entity foi-office-2279
group משרד העבודה, הרווחה והשירותים החברתיים: no matching entity foi-office-185
group משרד התחבורה: no matching entity foi-office-186
group נתיבי ישראל: no matching entity foi-office-599
group עיריית אום אל פחם: no matching entity foi-office-812
group עיריית באקה אל גרבייה: no matching entity foi-office-459
group עיריית בית"ר עלית: no matching entity foi-office-463
group עיריית מודיעין: no matching entity foi-office-907
group עיריית סכנין: no matching entity foi-office-931
group עיריית קריית מוצקין: no matching entity foi-office-522
group עיריית קרית אונו: no matching entity foi-office-515
group עיריית קרית ים: no matching entity foi-office-520
group עמידר החדשה: no matching entity foi-office-602
group צבא ההגנה לישראל- צה"ל: no matching entity foi-office-442
group קופת חולים מכבי: no matching entity foi-office-639
group רשות הגבלים עסקיים: no matching entity foi-office-445
group רשות החדשנות- המדען הראשי: no matching e

## Run update_foi_offices dry run

In [62]:
from update_foi_offices_entities import get_foi_offices_resource, get_existing_entities, get_foi_groups_matching
from collections import defaultdict
import yaml

def _exhaust(rows):
    """Iterate ``rows`` to completion, discarding every item."""
    for _ in rows:
        pass


DRY_RUN = True

stats = defaultdict(int)
existing_entities = {}

# The generators are consumed only for their effects on `existing_entities`
# and `stats`, which are passed into each call; the yielded rows are unused.
_exhaust(get_existing_entities(existing_entities_resource, existing_entities, stats))
_exhaust(get_foi_groups_matching(foi_groups_matching_resource, existing_entities, stats))

# Restrict the update to offices whose entity id was flagged as mismatched.
foi_offices_resource_only_mismatched = [
    r
    for r in foi_offices_resource
    if f"foi-office-{r['nid']}" in mismatched_entity_ids
]

_exhaust(get_foi_offices_resource(foi_offices_resource_only_mismatched, existing_entities, stats, DRY_RUN))

INFO    :updating group id 6c0ca2e7-d2b5-43cd-ba15-e6a403753075
INFO    :updating group id 4fd50e02-c9f9-49b0-a1a9-6dc941b1d149
INFO    :updating group id cc68beb9-ee06-4725-8708-ea038fd3fa9e
INFO    :updating group id 7f6af937-e855-4dbb-b43c-2c9440843e3c
INFO    :updating group id 83817cca-312e-43fb-be3d-b0b3624f7c2e
INFO    :updating group id 41064ba7-34ec-4e83-96b1-633f632a9dfc
INFO    :updating group id a3510080-d5a2-4bea-ac54-5a348c39468a
INFO    :updating group id 9f460c68-c4b4-4e68-8dd7-cf0fb41e91ba
INFO    :updating group id bcb9d0ef-53bd-46fb-bcd7-966b1412767e
INFO    :updating group id 45d44c9e-c002-4842-984b-c8177df6a37f
INFO    :updating group id 83ae9a23-6d45-4aff-9238-07f0cf712e9d
INFO    :updating group id 9aa6c4bb-996b-4332-a46c-441f5bb040a5
INFO    :updating group id cfe53b51-c3ed-4b17-a9cb-b0f01c540692
INFO    :updating group id 23834e72-2649-426d-bebb-b172ce61c698
INFO    :updating group id 5e5acb5c-8a48-417b-b23f-4d36a610fc8f
INFO    :updating group id 9301eff2-b6c3

## Before updating, save each group's datasets; otherwise they will be disconnected from their groups

In [None]:
%%sh
# Run the dump_group_datasets pipeline with verbose output. The next cell loads
# the `group_datasets` resource from the dump_group_datasets datapackage,
# presumably written by this run.
dpp run --verbose ./dump_group_datasets

In [58]:
# Histogram of how many datasets each dumped group holds.
# A dedicated name is used instead of `stats` so this cell does not replace
# the counters that the update cells pass to get_foi_offices_resource.
group_dataset_stats = defaultdict(int)
for group_row in load_odata_resource('dump_group_datasets', resources=['group_datasets']):
    dataset_ids = group_row['dataset_ids']
    if not dataset_ids:
        group_dataset_stats['0 | groups without datasets'] += 1
    elif len(dataset_ids) == 1:
        group_dataset_stats['1 | groups with 1 dataset'] += 1
    elif len(dataset_ids) <= 10:
        group_dataset_stats['2 | groups with 2-10 datasets'] += 1
    else:
        group_dataset_stats['3 | groups with more then 10 datasets'] += 1
yaml_print(dict(group_dataset_stats))

Loading resource from /var/lib/ckan/data/pipelines/odata_org_il/dump_group_datasets/datapackage.json
Resource loaded: dump_group_datasets (1282 rows)
0 | groups without datasets: 1061
1 | groups with 1 dataset: 99
2 | groups with 2-10 datasets: 88
3 | groups with more then 10 datasets: 34



## Do the update

In [63]:
# Same call as the dry run, but with DRY_RUN switched off.
DRY_RUN = False

foi_offices_update_rows = get_foi_offices_resource(
    foi_offices_resource_only_mismatched, existing_entities, stats, DRY_RUN
)
# Consume the rows only to drive the processing; the values are not used.
for row in foi_offices_update_rows:
    pass

INFO    :updating group id 6c0ca2e7-d2b5-43cd-ba15-e6a403753075
DEBUG   :Starting new HTTPS connection (1): www.odata.org.il:443
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/group_patch HTTP/1.1" 200 None
INFO    :updating group id 4fd50e02-c9f9-49b0-a1a9-6dc941b1d149
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/group_patch HTTP/1.1" 200 None
INFO    :updating group id cc68beb9-ee06-4725-8708-ea038fd3fa9e
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/group_patch HTTP/1.1" 200 None
INFO    :updating group id 7f6af937-e855-4dbb-b43c-2c9440843e3c
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/group_patch HTTP/1.1" 200 None
INFO    :updating group id 83817cca-312e-43fb-be3d-b0b3624f7c2e
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/group_patch HTTP/1.1" 200 None
INFO    :updating group id 41064ba7-34ec-4e83-96b1-633f632a9dfc
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/group_patch HTTP/1.1" 200 None
INFO    :updating

## Restore the datasets

In [65]:
from os import path, environ
import requests
from dataflows import Flow, load
from datapackage_pipelines_ckanext.helpers import get_plugin_configuration

def restore_group_datasets(row):
    """Re-attach a group's saved datasets through CKAN's ``member_create`` action.

    Rows whose ``group_id`` is not in ``mismatched_group_ids`` are left alone.
    For the others, one POST per id in ``row['dataset_ids']`` is sent through
    the shared ``session``.

    :param row: mapping with ``group_id`` and ``dataset_ids`` keys.
    :raises AssertionError: if a response body is empty or its ``success``
        entry is falsy; the message names the group and dataset involved.
    """
    group_id = row['group_id']
    if group_id not in mismatched_group_ids:
        return
    # A group without datasets may carry an empty or missing list.
    for dataset_id in row['dataset_ids'] or []:
        res = session.post('{}/api/3/action/member_create'.format(CKAN_URL),
                           json=dict(id=group_id,
                                     object=dataset_id,
                                     object_type='package',
                                     capacity='')).json()
        assert res and res['success'], (
            f'member_create failed for group {group_id}, dataset {dataset_id}: {res}'
        )

# Stream the dumped group -> datasets rows through restore_group_datasets.
group_datasets_datapackage = path.join(data_path, 'dump_group_datasets/datapackage.json')
restore_flow = Flow(
    load(group_datasets_datapackage, resources=['group_datasets']),
    restore_group_datasets,
)
# Kept as the final expression so the cell still displays the process() result.
restore_flow.process()


DEBUG   :Starting new HTTPS connection (1): www.odata.org.il:443
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/member_create HTTP/1.1" 200 None
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/member_create HTTP/1.1" 200 None
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/member_create HTTP/1.1" 200 None
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/member_create HTTP/1.1" 200 None
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/member_create HTTP/1.1" 200 None
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/member_create HTTP/1.1" 200 None
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/member_create HTTP/1.1" 200 None
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/member_create HTTP/1.1" 200 None
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/member_create HTTP/1.1" 200 None
DEBUG   :https://www.odata.org.il:443 "POST /api/3/action/member_create HTTP/1.1" 200 None
DEBUG   :https://www.odat

(<datapackage.package.Package at 0x7ff5c1ee9e48>, {})