In [1]:
import requests
from datetime import datetime
import pandas as pd
from io import StringIO
import os
import json
import gzip

from scripts import gather_metadata, process_report, create_dataframe, scrape_ibabs, TOOI_organisaties

In [2]:
def get_results_documenten(ori_id, amount, start_date, doc_type, last_discussed=True, id_list=None,
                           end_date="2024-12-31T23:59:59+02:00"):
    """Run a search against the Open Raadsinformatie Elasticsearch endpoint.

    The query matches every document of `doc_type` inside the index `ori_id`,
    sorted ascending on `last_discussed_at`.

    Parameters
    ----------
    ori_id : str
        Name of the index to restrict the search to.
    amount : int
        Value sent as the `size` of the search (maximum number of hits).
    start_date : str or None
        Lower bound (`gte`) for `last_discussed_at`; only used when
        `last_discussed` is truthy.
    doc_type : str
        Value matched against the `@type` field.
    last_discussed : bool, default True
        When truthy, a `last_discussed_at` range filter from `start_date` to
        `end_date` is added.
    id_list : list or None, default None
        When non-empty, only documents whose `_id` is in this list are returned.
    end_date : str, default "2024-12-31T23:59:59+02:00"
        Upper bound (`lte`) of the range filter. The default reproduces the
        previously hard-coded value.

    Returns
    -------
    dict or None
        The decoded JSON response on HTTP 200. On any other status the status
        code and body are printed and None is returned.
    """
    # URL and headers
    url = "https://api.openraadsinformatie.nl/v1/elastic/ori_*/_search?"
    headers = {
        "Content-Type": "application/json"
    }

    # Request JSON data
    data = {
        "query": {
            "bool": {
                "must": [
                    {
                        "simple_query_string": {
                            "fields": ["text", "title", "description", "name"],
                            "default_operator": "or",
                            "query": "*"
                        }
                    },
                    {
                        "terms": {
                            "_index": [ori_id],
                        }
                    },
                    {
                        "match": {
                            "@type": doc_type,
                        }
                    }
                ]
            }
        },
        "size": amount,
        "_source": {
            "includes": ["*"],
            "excludes": []
        },
        "from": 0,
        "sort": [
            {
                "last_discussed_at": {
                    "order": "asc"
                }
            }
        ],
    }

    must_clauses = data["query"]["bool"]["must"]

    # Optional date window on last_discussed_at
    if last_discussed:
        must_clauses.append({
            "range": {
                "last_discussed_at": {
                    "gte": start_date,
                    "lte": end_date,
                }
            }
        })

    # Optional restriction to an explicit set of document ids
    if id_list:
        must_clauses.append({
            "terms": {
                "_id": id_list
            }
        })

    # Send the POST request
    response = requests.post(url, json=data, headers=headers)

    # Check the response
    if response.status_code == 200:
        return response.json()

    print("Request failed with status code:", response.status_code)
    print("Response content:", response.text)
    # Explicit so callers can see that a failed request yields None.
    return None

In [3]:
def amount_of_results(ori_id):
    """Return the document count reported by the `_cat/count` endpoint for `ori_id`.

    The response body is split on single spaces and the last piece is parsed
    as an integer (presumably the count column of the `_cat/count` row).
    """
    count_url = 'https://api.openraadsinformatie.nl/v1/elastic/_cat/count/{}'.format(ori_id)
    reply = requests.get(count_url)
    pieces = reply.text.split(' ')
    last_piece = pieces[-1]
    return int(last_piece)

In [4]:
def media_objecten_grote_orgs(ori_id):
    """Collect all MediaObject hits of a large index by paging on `last_discussed_at`.

    Each request asks for up to 10000 hits sorted ascending on
    `last_discussed_at`; the date of the last hit becomes the lower bound of
    the next request. Because that bound is inclusive (`gte`), pages overlap
    and hits are de-duplicated on `_id`.

    Parameters
    ----------
    ori_id : str
        Index to query.

    Returns
    -------
    dict
        Mapping of hit `_id` to the raw hit. Empty when nothing matched.

    Raises
    ------
    RuntimeError
        If a page request fails (get_results_documenten returned None).
    """
    page_size = 10000
    start_date = "1990-01-01T00:00:00+02:00"
    all_mediaObjects = dict()

    while True:
        res = get_results_documenten(ori_id, page_size, start_date, 'MediaObject', True)
        if res is None:
            raise RuntimeError(f"MediaObject request failed for {ori_id} (start_date={start_date})")

        hits = res['hits']['hits']
        for r in hits:
            # Keep the first occurrence; later pages repeat boundary hits.
            all_mediaObjects.setdefault(r['_id'], r)

        # A short (or empty) page means there is nothing left to fetch.
        if len(hits) < page_size:
            break

        next_start = hits[-1]['_source']['last_discussed_at']
        if next_start == start_date:
            # The cursor did not move: the next request would be identical to
            # this one, so continuing would loop forever.
            print(f"Warning: pagination stalled for {ori_id} at {start_date}; results may be incomplete")
            break
        start_date = next_start

    print(len(all_mediaObjects))
    return all_mediaObjects

In [5]:
def gather_report_ids(all_mediaObjects):
    """Return the set of distinct `is_referenced_by` values of the given hits.

    `all_mediaObjects` maps hit ids to raw hits; each hit's
    `_source.is_referenced_by` is collected. The number of distinct values is
    printed before returning.
    """
    report_ids = {
        media_hit['_source']['is_referenced_by']
        for media_hit in all_mediaObjects.values()
    }

    print(len(report_ids))
    return report_ids

In [6]:
def gather_all_reports(report_ids, ori_id=None):
    """Fetch the Report documents with the given ids, 10000 ids per request.

    Parameters
    ----------
    report_ids : iterable
        Ids of the reports to fetch.
    ori_id : str, optional
        Index to query. When omitted, the module-level variable `ori_id` is
        used, which is what this function relied on before the parameter
        existed.

    Returns
    -------
    dict
        Mapping of hit `_id` to the raw hit. The size is printed before
        returning.

    Raises
    ------
    NameError
        If `ori_id` is not passed and no module-level `ori_id` exists.
    RuntimeError
        If a request fails (get_results_documenten returned None).
    """
    chunk_size = 10000
    # Materialise once so every slice is taken from the same ordering.
    id_pool = list(report_ids)
    all_reports = dict()

    if ori_id is None and id_pool:
        try:
            ori_id = globals()['ori_id']
        except KeyError:
            raise NameError("name 'ori_id' is not defined; pass ori_id explicitly") from None

    for offset in range(0, len(id_pool), chunk_size):
        chunk = id_pool[offset:offset + chunk_size]
        reports = get_results_documenten(ori_id, chunk_size, None, 'Report', False, chunk)
        if reports is None:
            raise RuntimeError(f"Report request failed for {ori_id} (ids {offset}-{offset + len(chunk) - 1})")

        for r in reports['hits']['hits']:
            all_reports.setdefault(r['_id'], r)

    print(len(all_reports))
    return all_reports

In [7]:
def openraadsinformatie_data(ori_id, tooi):
    """Gather the data of one index and hand it to `create_dataframe`.

    Indices holding at most 10000 documents are fetched in one go through
    `gather_metadata`, and that single result is passed as both the first and
    second argument of `create_dataframe`. Larger indices are collected in two
    steps: first the media objects, then the reports they refer to.

    Returns whatever `create_dataframe` returns.
    """
    if amount_of_results(ori_id) <= 10000:
        everything = gather_metadata(ori_id, 10000)
        return create_dataframe(everything, everything, ori_id, tooi)

    media_objects = media_objecten_grote_orgs(ori_id)
    referenced_ids = gather_report_ids(media_objects)
    reports = gather_all_reports(referenced_ids)
    return create_dataframe(reports, media_objects, ori_id, tooi)

    

In [8]:
def get_tooi(ori_id, alle_gemeenten):
    """Look up the entry in `alle_gemeenten` for the municipality behind an index.

    The Organization documents of `ori_id` are searched for hits classified as
    'Municipality'. Only when exactly one such hit remains is its name (minus
    the first word) used as the key into `alle_gemeenten`.

    Returns the matching value, or None when there is not exactly one
    municipality or the name is not a key of `alle_gemeenten`.
    """
    response = get_results_documenten(ori_id, 1000, 'XXX', 'Organization', last_discussed=False, id_list = [])

    municipalities = []
    for hit in response['hits']['hits']:
        if '_source' not in hit:
            continue
        source = hit['_source']
        if 'classification' not in source or source['classification'] != 'Municipality':
            continue
        # The municipality of Noordwijk still carries an entry of the former
        # municipality Noordwijkerhout; it is filtered out by hand here.
        if source['name'] == 'Gemeente Noordwijkerhout':
            continue
        municipalities.append(hit)

    if len(municipalities) != 1:
        return None

    # Drop the first word of the name (everything up to the first space).
    full_name = municipalities[0]['_source']['name']
    gem_name = full_name.split(' ', 1)[1]
    if gem_name == 'Bergen NH':
        gem_name = 'Bergen (NH)'

    try:
        return alle_gemeenten[gem_name]
    except KeyError:
        return None


In [9]:
# Lookup passed to get_tooi, which indexes it by municipality name and treats a
# KeyError as "unknown municipality".
# NOTE(review): presumably a name -> TOOI code mapping; confirm against scripts.TOOI_organisaties.
alle_gemeenten = TOOI_organisaties(orgs = ['gemeenten'])

In [10]:
# List the indices exposed by the API. The `?v` flag presumably adds the header
# row that names the `index` column read below.
url = 'https://api.openraadsinformatie.nl/v1/elastic/_cat/indices?v'
response = requests.get(url)
# Fail here with a clear HTTP error instead of parsing an error page as a table.
response.raise_for_status()
# sep=r'\s+' is the supported spelling of the deprecated delim_whitespace=True.
df = pd.read_csv(StringIO(response.text), sep=r'\s+')
# Keep every index whose name contains 'ori_'.
ids = [index_name for index_name in df['index'] if 'ori_' in index_name]

In [11]:
output_dir = 'alle_gemeente_rapporten'

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(output_dir, exist_ok=True)

# NOTE: the loop variable must stay named `ori_id`; gather_all_reports can read
# it as a module-level name.
for ori_id in ids:

    # Cheap probe: skip indices without any Report document. A failed request
    # (None) is skipped as well instead of crashing the whole run.
    test_result = get_results_documenten(ori_id, 1, None, 'Report', False)
    if not test_result or not test_result['hits']['hits']:
        continue

    tooi_code = get_tooi(ori_id, alle_gemeenten)
    if not tooi_code:
        continue

    print(tooi_code)

    # An existing file marks this municipality as done, which makes the cell
    # resumable. NOTE: the content is gzip-compressed despite the .json name.
    file_name = os.path.join(output_dir, f"{tooi_code}.json")
    if os.path.exists(file_name):
        continue

    json_data = openraadsinformatie_data(ori_id, tooi_code)

    # Write to a temporary name and rename afterwards, so an interrupted run
    # never leaves a truncated file that the resume check above would skip.
    tmp_name = file_name + ".tmp"
    with gzip.open(tmp_name, "wt", encoding="utf-8") as json_file:
        json.dump(json_data, json_file)
    os.replace(tmp_name, file_name)
    

gm0603
5611
2930
2649
gm0575
gm0779
12863
4898
164
gm1978
gm0432
7236
2658
549
gm1931
gm0119
gm0777
gm0059
gm0312
gm1884
gm0303
gm0362
13865
5409
2605
gm0762
gm1525
7377
3105
1802
gm0168
gm1719
gm0268
gm0277
gm0737
gm1894
gm1895
gm0275
gm0965
gm0193
4759
2134
363
gm0917
gm0385
gm0243
gm0153
gm0180
gm0246
gm0828
9477
2999
740
gm1949
gm0809
gm0202
11829
6116
5700
gm1961
12064
4363
1739
gm0080
7738
2463
2156
gm0356
gm0114
6812
3252
968
gm0177
gm0986
gm1734
gm0321
gm0957
gm1842
gm0141
8676
2804
1939
gm0453
9213
1849
0
gm1701
gm0203
7598
1418
848
gm0402
13777
6425
3830
gm0074
12326
7065
5268
gm0396
gm1859
8166
3555
1717
gm0866
gm1721
gm0946
gm0096
gm1699
7675
2497
1987
gm0294
11040
3091
1915
gm1954
gm0384
gm0269
gm0629
gm0405
gm0757
gm0971
gm0569
10118
2029
0
gm0513
6653
3240
2912
gm0267
gm0050
gm0296
gm0244
gm0420
18756
8358
5026
gm0085
gm1640
gm1970
6218
2857
1224
gm0415
gm0944
gm0037
6890
3030
1570
gm0626
gm0852
gm0060
gm0770
gm1659
14427
4396
1182
gm1729
4818
3058
2833
gm0448
8866
4485
