# Download debates

This notebook downloads the initial debates dataset and takes about 3 hours to execute. If you want to skip that effort, feel free to reach out to s252890@dtu.dk and we can send you the raw data file.

In [None]:
import requests
import json
import os

Query all debate information from the UK Parliament's House of Commons for the 58th electoral period (2019-12-17 to 2024-05-30).

In [None]:
# Hansard search endpoint that lists the debates matching the query parameters.
url = "https://hansard-api.parliament.uk/search/debates.json"
params = {
    "queryParameters.house": "Commons",
    "queryParameters.startDate": "2019-12-17",
    "queryParameters.endDate": "2024-05-30",
    # NOTE(review): presumably the maximum number of results to return, set far
    # above the expected count so everything arrives in one response - confirm
    # against the API documentation.
    "queryParameters.take": 100000,
}

# Without a timeout, requests waits indefinitely on a stalled connection.
# Allow 10 s to connect and a generous 300 s for the (large) response to start.
response = requests.get(url, params=params, timeout=(10, 300))

# Stop here on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
data = response.json()
print(len(data['Results']))

22585


In [None]:
# Collect the unique external section ID of every debate in the search result;
# using a set collapses IDs that appear more than once.
debates_to_download = {
    entry['DebateSectionExtId'] for entry in data['Results']
}

Filter out debates that have already been downloaded, so that the script can be interrupted and rerun later without downloading them again.

In [None]:
def get_downloaded_debate_ids(path="data"):
    """Return the IDs of all debates that are already saved in ``path``.

    Every ``*.json`` file in ``path`` counts as one downloaded debate. The ID
    is the file name without the trailing ``.json`` extension and without a
    leading ``debate_`` prefix (if present).

    Args:
        path: Directory that holds the downloaded debate files.

    Returns:
        A set of debate-ID strings; empty when ``path`` does not exist yet.
    """
    # On the very first run nothing has been downloaded and the directory may
    # not exist; treat that as "no debates yet" instead of raising.
    if not os.path.isdir(path):
        return set()

    suffix = ".json"
    prefix = "debate_"
    debate_ids = set()
    for file in os.listdir(path):
        if not file.endswith(suffix):
            continue
        # Strip only the trailing extension and the leading prefix, so an ID
        # that happens to contain either substring is left intact.
        debate_id = file[:-len(suffix)]
        if debate_id.startswith(prefix):
            debate_id = debate_id[len(prefix):]
        debate_ids.add(debate_id)
    return debate_ids

# IDs that already have a file on disk from an earlier run.
downloaded_debate_ids = get_downloaded_debate_ids()

# Report how many debates were requested in total ...
print(len(debates_to_download))

# ... and how many are still missing once the saved ones are removed.
debates_to_download = debates_to_download.difference(downloaded_debate_ids)
print(len(debates_to_download))


22584
30


Downloading... Have fun waiting

In [None]:
# Make sure the output directory exists so a first run on a fresh checkout works.
os.makedirs("data", exist_ok=True)

# One session reuses the underlying connection across the many requests that
# all go to the same host.
with requests.Session() as session:
    for debate_id in debates_to_download:
        debate_url = f"https://hansard-api.parliament.uk/Debates/Debate/{debate_id}.json"
        try:
            # A timeout keeps a single stalled request from blocking the whole run.
            debate_response = session.get(debate_url, timeout=60)
        except requests.RequestException as exc:
            # Network problems should not abort a multi-hour download; the
            # debate stays missing on disk and is retried on the next run.
            print(f"Failed to download debate {debate_id}: {exc}")
            continue
        if debate_response.status_code != 200:
            print(f"Failed to download debate {debate_id}: {debate_response.status_code}")
            continue
        try:
            debate_data = debate_response.json()
        except ValueError as exc:
            # The body was not valid JSON; skip it rather than stop the loop.
            print(f"Failed to download debate {debate_id}: {exc}")
            continue

        # Save each debate JSON to a file. Write to a temporary name first and
        # rename afterwards, so an interrupted run never leaves a truncated
        # "debate_<id>.json" that a later run would count as downloaded.
        target_path = f"data/debate_{debate_id}.json"
        partial_path = f"{target_path}.part"
        with open(partial_path, "w", encoding="utf-8") as f:
            json.dump(debate_data, f, ensure_ascii=False, indent=4)
        os.replace(partial_path, target_path)