# Prison population of England and Wales

## Using the GOV.UK Content API to download all weekly prison population data
https://content-api.publishing.service.gov.uk/

## Interacting with API to filter relevant files

In [1]:
import requests
import pandas as pd

In [4]:
# Fetch the Content API record for the 2012 prison population statistics page
url = 'https://www.gov.uk/api/content/government/statistics/prison-population-2012'
response = requests.get(url, timeout=30)  # without a timeout a stalled connection hangs the cell
# Fail here on an HTTP error status instead of later, when the body is parsed as JSON
response.raise_for_status()
# Show the content type declared by the server
response.headers.get("Content-Type")

'application/json; charset=utf-8'

In [35]:
response.json()

{'analytics_identifier': None,
 'base_path': '/government/statistics/prison-population-2012',
 'content_id': '5dc71785-7631-11e4-a3cb-005056011aef',
 'description': 'Statistics on the prison population in 2012.\r\n',
 'details': {'attachments': [{'accessible': False,
    'alternative_format_contact_email': 'web.comments@justice.gov.uk',
    'attachment_type': 'file',
    'command_paper_number': '',
    'content_type': 'application/msword',
    'file_size': 50836,
    'filename': 'prison-population-monthly-dec2012.doc',
    'id': '367193',
    'isbn': '',
    'title': 'Population bulletin - Monthly December 2012',
    'unique_reference': '',
    'unnumbered_command_paper': False,
    'unnumbered_hoc_paper': False,
    'url': 'https://assets.publishing.service.gov.uk/media/5a7564c840f0b6360e473c50/prison-population-monthly-dec2012.doc'},
   {'accessible': False,
    'alternative_format_contact_email': 'web.comments@justice.gov.uk',
    'attachment_type': 'file',
    'command_paper_number

In [6]:
response.json()['details']['attachments']

[{'accessible': False,
  'alternative_format_contact_email': 'web.comments@justice.gov.uk',
  'attachment_type': 'file',
  'command_paper_number': '',
  'content_type': 'application/msword',
  'file_size': 50836,
  'filename': 'prison-population-monthly-dec2012.doc',
  'id': '367193',
  'isbn': '',
  'title': 'Population bulletin - Monthly December 2012',
  'unique_reference': '',
  'unnumbered_command_paper': False,
  'unnumbered_hoc_paper': False,
  'url': 'https://assets.publishing.service.gov.uk/media/5a7564c840f0b6360e473c50/prison-population-monthly-dec2012.doc'},
 {'accessible': False,
  'alternative_format_contact_email': 'web.comments@justice.gov.uk',
  'attachment_type': 'file',
  'command_paper_number': '',
  'content_type': 'application/vnd.ms-excel',
  'file_size': 28672,
  'filename': 'prison-population-281212.xls',
  'id': '367194',
  'isbn': '',
  'title': 'Population Bulletin - Weekly 28 December 2012',
  'unique_reference': '',
  'unnumbered_command_paper': False,
  '

In [27]:
# Pull the list of attachment records out of the Content API response
attachments = response.json()['details']['attachments']

# Keep the URL of every attachment whose content type is not a Word document
spreadsheet_attachments = [
    item['url']
    for item in attachments
    if item.get('content_type') not in {
        'application/msword',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    }
]

In [28]:
spreadsheet_attachments

['https://assets.publishing.service.gov.uk/media/5a7c77d8ed915d6969f45155/prison-population-281212.xls',
 'https://assets.publishing.service.gov.uk/media/5a7ca1cae5274a38e5755c58/prison-population-211212.xls',
 'https://assets.publishing.service.gov.uk/media/5a74ee70ed915d502d6cc273/prison-population-141212.xls',
 'https://assets.publishing.service.gov.uk/media/5a7b7f07ed915d131105f9e1/prison-population-071212.xls',
 'https://assets.publishing.service.gov.uk/media/5a7cd07c40f0b6629523bf9d/prison-population-301112.xls',
 'https://assets.publishing.service.gov.uk/media/5a756b83e5274a3edd9a4c6d/prison-population-231112.xls',
 'https://assets.publishing.service.gov.uk/media/5a7c5bf940f0b6601cef574d/prison-population-161112.xls',
 'https://assets.publishing.service.gov.uk/media/5a7c681fe5274a5590059a74/prison-population-091112.xls',
 'https://assets.publishing.service.gov.uk/media/5a7c233b40f0b645ba3c6eb3/prison-population-021112.xls',
 'https://assets.publishing.service.gov.uk/media/5a7c33

Great, that has filtered the list so that the Word documents are excluded and only the spreadsheets remain. Now to attempt to download them.

## File download development

In [38]:
from concurrent.futures import ThreadPoolExecutor
import requests
import os

In [None]:
url = 'https://www.gov.uk/api/content/government/statistics/prison-population-2012'

In [32]:
def download_file(url, path='data/raw/', timeout=30):
    """Download the non-Word attachments of one GOV.UK content item into `path`.

    The content item's JSON is fetched from `url`. Every attachment whose
    `content_type` is not a Word document is downloaded and saved directly in
    `path` under the name `<year><name from the attachment URL>`, where the
    year is the text after the last hyphen of the item's `base_path`.

    Args:
        url: Content API URL of the content item to download from.
        path: Directory to save the files in; created if it does not exist.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.HTTPError: If the API call or a file download returns an
            HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Stop on an HTTP error rather than trying to parse an error page as JSON
    response.raise_for_status()
    data = response.json()  # parse once and reuse for every lookup below
    year = data['base_path'].split('-')[-1]

    # Exclude Word document content types
    word_content_types = {
        'application/msword',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    }
    spreadsheet_attachments = [
        attachment['url'] for attachment in data['details']['attachments']
        if attachment.get('content_type') not in word_content_types
    ]

    # open() does not create missing directories, so make sure the target exists
    if path:
        os.makedirs(path, exist_ok=True)

    for spreadsheet_url in spreadsheet_attachments:
        # Make a GET request to download the spreadsheet file
        spreadsheet_response = requests.get(spreadsheet_url, timeout=timeout)
        # Never write an error page to disk under a spreadsheet file name
        spreadsheet_response.raise_for_status()
        # Build the target from the URL's last segment; os.path.join keeps the
        # result inside `path` whether or not `path` ends with a separator
        filename = os.path.join(path, year + spreadsheet_url.split('/')[-1])
        # Save the spreadsheet content to a local file
        with open(filename, 'wb') as file:
            file.write(spreadsheet_response.content)
        print(f"Downloaded file {os.path.basename(filename)}")

In [33]:
download_file(url=url)

Downloaded file data/raw/test/prison-population-06-01-12.xls


Great, that's downloaded a year's worth of data. It would be helpful to separate this out into a year folder. Having investigated `response.json()`, it seems that `base_path` is probably the most reliable attribute to use for this.

In [36]:
response.json()['base_path']

'/government/statistics/prison-population-2012'

In [37]:
response.json()['base_path'].split('-')[-1]

'2012'

Redefining the function to include the year folder logic, using `os.makedirs()` and `os.path.join()`.

In [42]:
def download_file(url, path='data/raw/', timeout=30):
    """Download the non-Word attachments of one GOV.UK content item into a year folder.

    The content item's JSON is fetched from `url`. Every attachment whose
    `content_type` is not a Word document is downloaded into `<path>/<year>/`,
    where the year is the text after the last hyphen of the item's
    `base_path`. Files keep the last path segment of their URL as their name.

    Args:
        url: Content API URL of the content item to download from.
        path: Base directory; the year subfolder is created beneath it.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.HTTPError: If the API call or a file download returns an
            HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Stop on an HTTP error rather than trying to parse an error page as JSON
    response.raise_for_status()
    data = response.json()
    year = data['base_path'].split('-')[-1]

    # Exclude Word document content types
    word_content_types = {
        'application/msword',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    }
    spreadsheet_attachments = [
        attachment['url'] for attachment in data['details']['attachments']
        if attachment.get('content_type') not in word_content_types
    ]

    # Define the full directory path with the year subfolder
    year_path = os.path.join(path, year)
    os.makedirs(year_path, exist_ok=True)  # Ensure the directory exists

    for spreadsheet_url in spreadsheet_attachments:
        # Make a GET request to download the spreadsheet file
        spreadsheet_response = requests.get(spreadsheet_url, timeout=timeout)
        # Never write an error page to disk under a spreadsheet file name
        spreadsheet_response.raise_for_status()

        # Extract the filename from the URL
        filename = os.path.join(year_path, spreadsheet_url.split('/')[-1])

        # Save the spreadsheet content to a local file
        with open(filename, 'wb') as file:
            file.write(spreadsheet_response.content)

        print(f"Downloaded file {os.path.basename(filename)} to {year_path}/")
    print("Download complete!")

In [40]:
download_file(url=url)

Downloaded file prison-population-281212.xls to data/raw/2012/
Downloaded file prison-population-211212.xls to data/raw/2012/
Downloaded file prison-population-141212.xls to data/raw/2012/
Downloaded file prison-population-071212.xls to data/raw/2012/
Downloaded file prison-population-301112.xls to data/raw/2012/
Downloaded file prison-population-231112.xls to data/raw/2012/
Downloaded file prison-population-161112.xls to data/raw/2012/
Downloaded file prison-population-091112.xls to data/raw/2012/
Downloaded file prison-population-021112.xls to data/raw/2012/
Downloaded file prison-population-26-10-12.xls to data/raw/2012/
Downloaded file prison-population-191012.xls to data/raw/2012/
Downloaded file prison-population-121012.xls to data/raw/2012/
Downloaded file prison-population-051012.xls to data/raw/2012/
Downloaded file prison-population-280912.xls to data/raw/2012/
Downloaded file prison-population-210912.xls to data/raw/2012/
Downloaded file prison-population-140912.xls to data/

Great, that works. Now to move one level up so I can apply the `download_file` function to all of the years of data that I want.

## Expanding to download data for multiple years

In [43]:
url = 'https://www.gov.uk/api/content/government/collections/prison-population-statistics'
response = requests.get(url)
response.json()

{'analytics_identifier': None,
 'base_path': '/government/collections/prison-population-statistics',
 'content_id': '5eb8ebe9-7631-11e4-a3cb-005056011aef',
 'description': 'Statistics on the prison population.',
 'details': {'body': '<div class="govspeak"><p>Statistics on the prison population.</p>\n</div>',
  'change_history': [{'note': ' Prison population: weekly estate figures 2025 added. ',
    'public_timestamp': '2025-01-06T15:18:12.000+00:00'},
   {'note': 'Prison population figures: 2024  added',
    'public_timestamp': '2024-01-05T11:25:00.000+00:00'},
   {'note': 'Prison population figures: 2023 published.',
    'public_timestamp': '2023-01-06T10:56:24.000+00:00'},
   {'note': '2015 page added',
    'public_timestamp': '2015-01-02T11:24:44.000+00:00'},
   {'note': 'First published.',
    'public_timestamp': '2013-10-18T00:00:00.000+01:00'}],
  'collection_groups': [{'body': '<div class="govspeak">\n</div>',
    'documents': ['edd0df70-cd64-4f90-958e-bd7717d8e445',
     'dff9e

In [46]:
response.json()['links']['documents']

[{'api_path': '/api/content/government/publications/prison-population-weekly-estate-figures-2025',
  'api_url': 'https://www.gov.uk/api/content/government/publications/prison-population-weekly-estate-figures-2025',
  'base_path': '/government/publications/prison-population-weekly-estate-figures-2025',
  'content_id': 'edd0df70-cd64-4f90-958e-bd7717d8e445',
  'document_type': 'transparency',
  'links': {},
  'locale': 'en',
  'public_updated_at': '2025-02-03T11:43:39Z',
  'schema_name': 'publication',
  'title': 'Prison population: weekly estate figures 2025',
  'web_url': 'https://www.gov.uk/government/publications/prison-population-weekly-estate-figures-2025',
  'withdrawn': False},
 {'api_path': '/api/content/government/publications/prison-population-figures-2024',
  'api_url': 'https://www.gov.uk/api/content/government/publications/prison-population-figures-2024',
  'base_path': '/government/publications/prison-population-figures-2024',
  'content_id': 'dff9e4c8-7845-417e-8798-1955b

In [66]:
response.json()['links']['documents'][0]['api_url']

'https://www.gov.uk/api/content/government/publications/prison-population-weekly-estate-figures-2025'

In [65]:
# Fetch the collection page that links to each prison population publication
url = 'https://www.gov.uk/api/content/government/collections/prison-population-statistics'
response = requests.get(url, timeout=30)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
data = response.json()  # parsed once; reused below instead of parsing the body again
documents = data['links']['documents']

# Create list of API URLs for each year of prison population statistics
api_urls = [document['api_url'] for document in documents]
api_urls

['https://www.gov.uk/api/content/government/publications/prison-population-weekly-estate-figures-2025',
 'https://www.gov.uk/api/content/government/publications/prison-population-figures-2024',
 'https://www.gov.uk/api/content/government/publications/prison-population-monthly-prison-figures-2024',
 'https://www.gov.uk/api/content/government/publications/prison-population-figures-2023',
 'https://www.gov.uk/api/content/government/publications/prison-population-figures-2022',
 'https://www.gov.uk/api/content/government/statistics/prison-population-figures-2021',
 'https://www.gov.uk/api/content/government/statistics/prison-population-figures-2020',
 'https://www.gov.uk/api/content/government/statistics/prison-population-figures-2019',
 'https://www.gov.uk/api/content/government/statistics/prison-population-figures-2018',
 'https://www.gov.uk/api/content/government/statistics/prison-population-figures-2017',
 'https://www.gov.uk/api/content/government/statistics/prison-population-figures-

### Downloading multiple years

In [67]:
test_urls = api_urls[:2]
test_urls

['https://www.gov.uk/api/content/government/publications/prison-population-weekly-estate-figures-2025',
 'https://www.gov.uk/api/content/government/publications/prison-population-figures-2024']

In [68]:
# Download the test publications in parallel, one download_file call per API URL
with ThreadPoolExecutor() as executor:
    # executor.map only re-raises a worker's exception when its result is
    # retrieved, so consume the iterator to surface any failed download here.
    list(executor.map(download_file, test_urls))

Downloaded file prison-pop-3-feb-2025.ods to data/raw/2025/
Downloaded file PSWEBREPORT_30_December.ods to data/raw/2024/
Downloaded file prison-pop-27-jan-2025.ods to data/raw/2025/
Downloaded file 23_December_2024.ods to data/raw/2024/
Downloaded file prison-pop-20-jan-2025.ods to data/raw/2025/
Downloaded file prison-pop-16-dec-2024.ods to data/raw/2024/
Downloaded file prison-pop-13-january-2025.ods to data/raw/2025/
Downloaded file prison_pop__dec_24.ods to data/raw/2024/
Downloaded file 6_January_PSWEBREPORT.ods to data/raw/2025/
Download complete!
Downloaded file prison-pop-2-dec-2024.ods to data/raw/2024/
Downloaded file prison-pop-25-nov-2024.ods to data/raw/2024/
Downloaded file prison-pop-18-nov-2024.ods to data/raw/2024/
Downloaded file prison-pop-11-nov-2024.ods to data/raw/2024/
Downloaded file prison-pop-4-november-2024.ods to data/raw/2024/
Downloaded file Prison_stats_28_October.ods to data/raw/2024/
Downloaded file Prison_stats_weekly_21_October_2024.ods to data/raw/2