In [2]:
import requests
from datetime import datetime
import pandas as pd
from io import StringIO
from collections import Counter

In [3]:
# Code for querying the Open Raadsinformatie search API

def gather_metadata(org_id, count, timeout=120):
    """Fetch documents of one index from the Open Raadsinformatie search API.

    A single POST request is sent (``from`` is fixed at 0, no pagination), so
    at most ``count`` documents are returned; anything beyond that is silently
    not fetched.

    Parameters
    ----------
    org_id : str
        Name of the index to restrict the search to (passed as the only
        value of the ``_index`` terms filter).
    count : int
        Maximum number of hits to request (the ``size`` of the query).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` may block forever on an unresponsive server.

    Returns
    -------
    dict
        Mapping of each hit's ``_id`` to the complete hit.

    Raises
    ------
    RuntimeError
        If the server answers with any status code other than 200.
    """
    # URL and headers
    url = "https://api.openraadsinformatie.nl/v1/elastic/ori_*/_search?"
    headers = {
        "Content-Type": "application/json"
    }

    # Request JSON data: match everything, restricted to the requested index
    data = {
        "query": {
            "bool": {
                "must": [
                    {
                        "simple_query_string": {
                            "fields": ["text", "title", "description", "name"],
                            "default_operator": "or",
                            "query": "*"
                        }
                    },
                    {
                        "terms": {
                            "_index": [org_id],
                        }
                    }
                ]
            }
        },
        "size": count,
        "_source": {
            "includes": ["*"],
            "excludes": []
        },
        "from": 0,
        "sort": [
            {
                "_score": {
                    "order": "desc"
                }
            }
        ],
    }

    # Send the POST request
    response = requests.post(url, json=data, headers=headers, timeout=timeout)

    # Fail loudly on an unsuccessful response instead of continuing with an
    # undefined result (which previously surfaced as an unrelated NameError).
    if response.status_code != 200:
        print("Request failed with status code:", response.status_code)
        print("Response content:", response.text)
        raise RuntimeError(
            f"Search request for index {org_id!r} failed with status code {response.status_code}"
        )

    hits = response.json()['hits']['hits']

    print(len(hits))

    return {hit['_id']: hit for hit in hits}


In [5]:
def process_report(id, all_results, gemeentenaam):
    """Assemble metadata for one meeting and count the text entries of its attachments.

    Parameters
    ----------
    id
        Key of the meeting in ``all_results``.
    all_results : dict
        Mapping of document id to raw search hit; every hit is expected to
        carry a ``_source`` dict.
    gemeentenaam : str
        Municipality name, used as the sub-domain of the source URL.

    Returns
    -------
    int or dict
        The number of ``text`` entries (form-feed-only entries excluded) over
        all attachments of the meeting's agenda items. Presumably one entry
        corresponds to one page -- TODO confirm. An empty dict ``{}`` is
        returned instead when the meeting has no ``name``.

    Notes
    -----
    The metadata record is built but not returned; only the count is.
    The source URL is printed as a side effect. A missing
    ``was_generated_by``, ``@type`` or ``has_organization_name`` raises
    ``KeyError``.
    """
    record = {'id': id}
    page_total = 0

    meeting = all_results[id]

    # A meeting without a name is reported back as an empty dict.
    try:
        source = meeting['_source']
        record['dc_title'] = source['name']
    except KeyError:
        return {}

    identifier = source['was_generated_by']['original_identifier']
    record['dc_source'] = f"https://{gemeentenaam}.bestuurlijkeinformatie.nl/Agenda/Index/{identifier}"
    print(record['dc_source'])

    record['dc_type'] = source['@type']
    publisher_id = source['has_organization_name']
    record['dc_publisher'] = all_results[publisher_id]['_source']['name']

    # Optional fields: copied when present, otherwise left out.
    if 'chair' in source:
        record['foi_chair'] = source['chair']
    try:
        record['foi_committee'] = all_results[source['committee']]['_source']['name']
    except KeyError:
        pass
    for target, key in (('foi_location', 'location'), ('dc_description', 'description')):
        if key in source:
            record[target] = source[key]

    # Prefer the meeting's own start date; fall back to the generation time.
    try:
        timestamp = source['start_date']
    except KeyError:
        timestamp = source['was_generated_by']['started_at_time']
    record['foi_publishedDate'] = datetime.fromisoformat(timestamp).strftime('%Y-%m-%d')
    record['dc_date_year'] = record['foi_publishedDate'][:4]

    if 'agenda' in source:
        agenda_entries = []

        for item_id in source['agenda']['@list']:
            # Agenda items that were not fetched are skipped.
            if item_id not in all_results:
                continue
            item_source = all_results[item_id]['_source']
            item_name = item_source['name']

            if 'attachment' not in item_source:
                continue

            # A single attachment may be given as a bare id instead of a list.
            attachment_ids = item_source['attachment']
            if isinstance(attachment_ids, str):
                attachment_ids = [attachment_ids]

            for attachment_id in attachment_ids:
                # Attachments that are missing or lack any of the fields
                # below are ignored entirely (not listed, not counted).
                try:
                    document = all_results[attachment_id]['_source']
                    entry = {
                        'dc_isPartOf': item_name,
                        'dc_title': document['name'],
                        'dc_type': document['@type'],
                        'dc_source': document['original_url'],
                        'dc_format': document['content_type'],
                    }
                    page_total += sum(1 for chunk in document['text'] if chunk != '\x0c')
                    agenda_entries.append(entry)
                except KeyError:
                    continue

        record['agenda'] = agenda_entries

    try:
        invitee_ids = source['invitee']
        if isinstance(invitee_ids, str):
            invitee_ids = [invitee_ids]
        record['foi_invitees'] = [
            all_results[invitee_id]['_source']['name'] for invitee_id in invitee_ids
        ]
    except KeyError:
        pass

    return page_total

In [6]:
def create_dataframe(all_results, org_id, min_rows=100):
    """Build a DataFrame with one row per successfully processed meeting.

    Parameters
    ----------
    all_results : dict
        Mapping of document id to raw search hit, as returned by
        ``gather_metadata``.
    org_id : str
        Index name; its second ``_``-separated part, lower-cased, is passed
        to ``process_report`` as the municipality name.
    min_rows : int, optional
        Minimum number of records required; with fewer an empty DataFrame is
        returned. Defaults to 100, the previously hard-coded threshold.

    Returns
    -------
    pandas.DataFrame
        One row per meeting record. A ``foi_files_length`` column is added
        when the records contain a ``foi_files`` column. Empty when fewer
        than ``min_rows`` records were collected.
    """
    records = []

    for doc_id, hit in all_results.items():
        if hit['_source']['@type'] != 'Meeting':
            continue

        print(doc_id)

        report = process_report(doc_id, all_results, org_id.split('_')[1].lower())

        display(report)

        print('')

        # Only non-empty dicts are usable rows; process_report signals a
        # skipped meeting with {} and may return a non-dict value.
        if isinstance(report, dict) and report:
            records.append(report)

    if len(records) < min_rows:
        return pd.DataFrame()

    df = pd.DataFrame(records)

    # The column is absent when no record carried 'foi_files'; rows that lack
    # it hold NaN, which has no length and is counted as 0.
    if 'foi_files' in df.columns:
        df['foi_files_length'] = df['foi_files'].apply(
            lambda files: len(files) if hasattr(files, '__len__') else 0
        )

    return df

In [7]:
# Request up to 10000 documents from the Drechterland index; the result maps
# each document id to its raw search hit.
all_results = gather_metadata('ori_drechterland_20230708034005', 10000)

5449


In [None]:
# Sum the attachment text counts over all meetings and report the average
# per meeting.
total_all = 0
total_docs = 0

for k, v in all_results.items():
    if v['_source']['@type'] != 'Meeting':
        continue

    # NOTE(review): the municipality name only ends up in the source URL that
    # process_report prints; 'westervoort' does not match the index loaded
    # into all_results above -- confirm which one is intended.
    length = process_report(k, all_results, 'westervoort')

    # process_report returns {} (not None) for meetings it skips, so only
    # real integer counts may be added to the totals.
    if isinstance(length, int):
        total_all += length
        total_docs += 1

print(total_docs)
print(total_all)
if total_docs:
    print(total_all / total_docs)
else:
    # Avoid a ZeroDivisionError when no meeting could be counted.
    print("No meetings counted; average is undefined.")

- Westervoort = 352 Agenda's, 57837 pagina's. Dus 164 pagina's per agenda.
- Drechterland = 146 Agenda's, 28539 pagina's. Dus 195 pagina's per agenda.
- De Ronde Venen = 422 Agenda's, 20610 pagina's. Dus 49 pagina's per agenda.
- Maastricht = 115 Agenda's, 7376 pagina's. Dus 64 pagina's per agenda.
- Apeldoorn = 151 Agenda's, 16886 pagina's. Dus 112 pagina's per agenda.
- Zaanstad (had meer dan 600 agenda's volgens de zoekmachine, maar waarschijnlijk de meeste zonder agendapunten).
Ik vond hier 68 agenda's met 21224 items, dus 312 pagina's per agenda.

In [None]:
# df = create_dataframe(all_results, 'ori_westervoort_20230707060943')
# display(df)

In [None]:
# Inspect the raw search hit of a single document, looked up by its id.
# NOTE(review): the id is hard-coded and raises KeyError when it is not part
# of the currently loaded all_results.
display(all_results['2324597'])