# Search across State Library's family history index datasets

State Library of Queensland has released a number of family history datasets on the <a href="https://www.data.qld.gov.au/organization/state-library-queensland">Queensland Government Open Data Portal</a>.

Instead of downloading each family history dataset from the portal and searching through them individually, use this tool to search across the open datasets using the portal's API.

This Jupyter notebook lets you search across multiple data files without downloading them first. After viewing the results, you can download the full CSV files directly from the notebook.

In [3]:
# Setup data file import from open data portal
import requests
import ipywidgets as widgets
import pandas as pd

from IPython.display import display, HTML, clear_output

# ANSI escape sequences used to style text printed in the notebook
class color:
    """Named ANSI escape codes for coloured and emphasised console text.

    Concatenate a code before the text to style and append ``END`` to
    switch the styling off again.
    """

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Emphasis
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset all styling
    END = '\033[0m'

# State Library's family history CSVs available on the open data portal.
# Listed at file (resource) level rather than dataset level, and recorded
# here by hand to maintain accurate references.
# Each entry is ['Resource title', 'datasetID', 'resourceID'].
nestedDatafiles = [
    [
        'Police Gazette Inquests 1875-1885',
        'f1a5089b-01f9-40a8-8aee-ce62a4721bde',
        '15d01937-097c-45ed-a365-feb493b05c2c',
    ],
    [
        'British convict transportation registers',
        '458eb59f-e5f1-466f-925b-9dbcebb4f073',
        'dbcfa4a6-3ec7-4264-bcee-43b21a470d34',
    ],
    [
        'Queensland mining accidents 1882-1945',
        '2e5b65d7-09d5-403f-a5d5-a552410f2d5d',
        '35ea936d-083e-4ad6-beab-e0fede2cd3a6',
    ],
    [
        'World War 1 soldier portraits',
        '4a99c4e7-393b-40cb-afb7-1754cd23a551',
        'a46b4d2b-243f-41f9-9a61-a231f1d1b6d0',
    ],
    [
        'Queensland railway employees 1890-1915 - Appointees',
        'a6149d43-9f72-40e3-8ed6-905b3131ef55',
        'cdafbbbf-c9ca-46a1-9f18-ecd9e8943040',
    ],
    [
        'Queensland railway employees 1890-1915 - Removals',
        'a6149d43-9f72-40e3-8ed6-905b3131ef55',
        '200ab9db-4071-47f1-971b-1286f89c097c',
    ],
    [
        'Licensed victuallers Index',
        '96c624e5-c6fd-4e26-8c07-90a236185437',
        'e427f733-1386-4299-9c8d-852c69ce3575',
    ],
    [
        'Southern and Western Railway employees 1866-1876 - Appointees',
        'eef8cc6b-6bdd-4439-994e-5f74c509618a',
        '9e230691-9ca2-41a8-ba09-ae9ca70c9f00',
    ],
    [
        'Southern and Western Railway employees 1866-1876 - Removals',
        'eef8cc6b-6bdd-4439-994e-5f74c509618a',
        '75ef82f6-16cd-4432-b76d-b9dade6d21e7',
    ],
    [
        'Persons called before Queensland Government committees 1860-1920',
        '623d1ff9-8c20-486b-ada2-a69a35c16134',
        '390ffd43-8a07-4b3f-b523-27dceadb904d',
    ],
]

# Split the nested list into three parallel flat lists; position i in each
# list refers to the same data file as nestedDatafiles[i].
# Comprehensions avoid leaving a stray loop index in the notebook namespace.
resourceTitles = [title for title, _, _ in nestedDatafiles]
datasetIDs = [dataset_id for _, dataset_id, _ in nestedDatafiles]
resourceIDs = [resource_id for _, _, resource_id in nestedDatafiles]
    
# Root of the datastore API on data.qld.gov.au; an action suffix such as
# '_search' is appended to it to form a complete endpoint
baseURL = 'https://www.data.qld.gov.au/api/3/action/datastore'
resourceURL = f'{baseURL}_search?resource_id='

# Indexes of the data files chosen for the current search (none yet)
selectSearch = list()

In [4]:
%%javascript
// Needed in the classic Jupyter Notebook so the output area stays fully
// expanded instead of folding up.
// Note: running this cell in JupyterLab gives an error.
IPython.OutputArea.prototype._should_scroll = function (lines) {
    return false;
};

<IPython.core.display.Javascript object>

In [5]:
# Build the input widgets: dataset picker, keyword box, result limit and search button

# Multi-select list showing every title in resourceTitles (widget still needs formatting).
# TODO: populate the options dynamically from the API instead of recording each file by hand.
datasetSelector = widgets.SelectMultiple(
    options=resourceTitles,
    disabled=False,
    layout=widgets.Layout(width='600px', height='180px'),
)

# Free-text box: any string typed here is used as the search keyword
keywordSetter = widgets.Text(
    placeholder='Search anything',
    disabled=False,
    layout=widgets.Layout(width='500px'),
)

# Integer box for the result limit: the value can be typed manually within
# min/max, and 'step' sets how far each increase/decrease moves it
limitSetter = widgets.BoundedIntText(
    value=10000,
    min=1,
    max=10000,
    step=50,
    disabled=False,
    layout=widgets.Layout(width='100px'),
)

# Action button the user clicks to kick off the search
button = widgets.Button(
    description='Start search',
    button_style='primary',
    disabled=False,
)

# Output area that the search results are written into
output = widgets.Output()

In [6]:
# This defines the function that will be run when users click the Start search button in a later cell

def search_values(b):
    """Search the selected data files for the keyword and show the results.

    Reads the current values of ``datasetSelector``, ``keywordSetter`` and
    ``limitSetter``, queries the portal's ``datastore_search`` endpoint once
    per selected data file, and writes a heading, a result count, a download
    link and a table of records for each file into ``output``. When no data
    file is selected, every file in ``nestedDatafiles`` is searched.

    A failed request (network error, HTTP error status, invalid JSON or an
    unexpected response layout) is reported in ``output`` for that data file
    and the search carries on with the next one.

    Side effects: rebinds the globals ``selectSearch``, ``keyword`` and
    ``resultLimit``, and sets the pandas option ``display.max_rows`` to
    ``None``.

    Args:
        b: Object passed in by the button's click callback; not used.
    """
    global selectSearch
    global keyword
    global resultLimit

    # Clear output from any previous searches
    output.clear_output()

    # Set the values from the widgets
    selectSearch = datasetSelector.index
    # If nothing is selected, search all files
    if not selectSearch:
        selectSearch = list(range(len(nestedDatafiles)))
    keyword = keywordSetter.value
    resultLimit = limitSetter.value

    # Show every returned row instead of a truncated table; the setting does
    # not change between data files, so it is applied once before the loop
    pd.set_option('display.max_rows', None)

    # Search each selected data file in turn: query, display, then move on
    for index in selectSearch:
        title = resourceTitles[index]
        datasetID = datasetIDs[index]
        resourceID = resourceIDs[index]

        try:
            # Passing the query as params lets requests URL-encode the
            # keyword, so spaces, '&' or '#' cannot corrupt the query string
            response = requests.get(
                baseURL + '_search',
                params={'q': keyword, 'resource_id': resourceID, 'limit': resultLimit},
                timeout=30,
            )
            response.raise_for_status()

            # The matching rows and the total match count both sit under
            # 'result' in the JSON response
            result = response.json()['result']
            records = result['records']
            total = result['total']
        except (requests.exceptions.RequestException, ValueError, KeyError) as err:
            # Report the failure for this data file and keep going
            with output:
                print('\n' + color.BOLD + color.UNDERLINE + title + color.END)
                print(color.RED + 'Search failed: ' + str(err) + color.END)
            continue

        with output:
            # print the total count of results, search term, and dataset title
            print('\n' + color.BOLD + color.UNDERLINE + title + color.END)
            print(color.BOLD + str(total) + color.END + ' search results for ' + color.BOLD + color.GREEN + keyword + color.END)

            # link to download full file from portal
            # The data API URL for download isn't turned on for the Qld portal,
            # but the URL can still be built from the datasetID and resourceID
            downloadURL = 'https://www.data.qld.gov.au/dataset/' + datasetID + '/resource/' + resourceID + '/download/'
            display(HTML(f'<a href="{downloadURL}" download="{title}.csv">Download the full CSV</a>'))

            # display a message if the total count of results is larger than the result limit selected
            if total > resultLimit:
                print(color.RED + '(Only displaying the first ' + str(resultLimit) + ' of ' + str(total) + ' results)' + color.END)

            # displays records from search in this dataset
            df = pd.DataFrame(records)
            display(df)

In [7]:
# Register search_values as the click handler for the 'Start search' button,
# so clicking it runs the search. The function is defined in the previous cell.
button.on_click(search_values)

In [8]:
# Render the search form: each label is shown directly above its input widget

form_rows = (
    ('<b>Select datasets to search</b> (leave blank for all):', datasetSelector),
    ('<b>Search:</b>', keywordSetter),
    ('Record limit per CSV:', limitSetter),
)
for label_html, form_widget in form_rows:
    display(HTML(label_html))
    display(form_widget)

# The search button, followed by the area the results are written into
display(button)
display(output)

# Start with an empty results area
output.clear_output()

SelectMultiple(layout=Layout(height='180px', width='600px'), options=('Police Gazette Inquests 1875-1885', 'Br…

Text(value='', layout=Layout(width='500px'), placeholder='Search anything')

BoundedIntText(value=10000, layout=Layout(width='100px'), max=10000, min=1, step=50)

Button(button_style='primary', description='Start search', style=ButtonStyle())

Output()

In [11]:
# This prints a static list of the datasets selected. Only to check code is working correctly
#print (color.BOLD + 'keyword= ' + color.END + keyword)
#print (color.BOLD + 'limit= ' + color.END + str(resultLimit))
#print (color.BOLD + 'selected= ' + color.END + str(selectSearch))
#print ('\n' + color.BOLD + 'You are searching in the following datasets:' + color.END)
#for x in range(len(selectSearch)): 
#    print ('\u2022 ' + resourceTitles[selectSearch[x]],)

See all of State Library's open datasets: https://www.data.qld.gov.au/organization/state-library-queensland