# Notebook for retrieving, processing, and saving the latest UK rainfall data via the Environment Agency (EA) API

In [1]:
# importing required libraries
import numpy as np
import pandas as pd
import json
import requests
from datetime import datetime, date
import os

In [2]:
# show when this notebook run started (microseconds stripped for readability)
print(f'Date and time: {datetime.today().replace(microsecond=0)}')

Date and time: 2023-05-21 21:03:40


In [3]:
# list present working directory
#os.getcwd()

## Retrieving the data

In [4]:
# data retrieval functions

# function to retrieve data on rainfall stations
def get_data_stations(
    parameter_name=None,
    parameter=None,
    qualifier=None,
    label=None,
    town=None,
    catchment_name=None,
    river_name=None,
    station_reference=None,
    rloi_id=None,
    search=None,
    lat=None,
    long=None,
    d=None,
    type=None,  # shadows the builtin; name kept so existing keyword callers still work
    status=None,
    timeout=30,
):
    """Get details of monitoring stations from the EA API.

    Every filter is optional; filters left as ``None`` are not sent to the API.

    :param parameter_name: Return only those stations which measure parameters with the given name, for example Rainfall, Water Level or Flow.
    :param parameter: Return only those stations which measure parameters with the given short form name, for example rainfall, level or flow.
    :param qualifier: Return only those stations which measure parameters with qualifier. Useful qualifiers are Stage and Downstream Stage (for stations such as weirs which measure levels at two locations), Groundwater for groundwater levels as opposed to river levels and Tidal Level for tidal levels.
    :param label: Return only those stations whose label is exactly as given.
    :param town: Return only those stations whose town is as given. Not all stations have an associated town.
    :param catchment_name: Return only those stations whose catchment name is exactly as given. Not all stations have an associated catchment area.
    :param river_name: Return only those stations whose river name is exactly as given. Not all stations have an associated river name.
    :param station_reference: Return only those stations whose reference identifier is as given. The station reference is an internal identifier used by the Environment Agency.
    :param rloi_id: Return only the station (if there is one) whose RLOIid (River Levels on the Internet identifier) matches.
    :param search: Return only those stations whose label contains the given value.
    :param lat: Latitude (WGS84) of the centre of a distance search; used together with ``long`` and ``d``.
    :param long: Longitude (WGS84) of the centre of a distance search; used together with ``lat`` and ``d``.
    :param d: Radius in km of the distance search around ``lat``/``long`` (sent to the API as ``dist``); the API may approximate it by a bounding box.
    :param type: Return only those stations of the given type, where type can be one of "SingleLevel", "MultiTraceLevel", "Coastal", "Groundwater" or "Meteorological".
    :param status: Return only those stations with the given status. Can be one of "Active", "Closed" or "Suspended".
    :param timeout: Seconds to wait for the server before giving up (passed to ``requests.get``).
    :return: pandas DataFrame built from the ``items`` array of the JSON response.
    :raises requests.exceptions.HTTPError: if the API answers with a 4xx/5xx status.
    :raises requests.exceptions.RequestException: on connection problems or timeout.
    :raises KeyError: if the JSON response has no ``items`` entry.
    """
    api_url = "https://environment.data.gov.uk/flood-monitoring/id/stations"

    # map the Python argument names onto the query parameter names used by the API;
    # requests leaves out any entry whose value is None
    params = {
        'parameterName': parameter_name,
        'parameter': parameter,
        'qualifier': qualifier,
        'label': label,
        'town': town,
        'catchmentName': catchment_name,
        'riverName': river_name,
        'stationReference': station_reference,
        'RLOIid': rloi_id,
        'search': search,
        'lat': lat,
        'long': long,
        'dist': d,
        'type': type,
        'status': status,
    }

    # a timeout stops the notebook from hanging forever on an unresponsive server
    response = requests.get(api_url, params=params, timeout=timeout)

    # surface HTTP errors here instead of as a confusing JSON/KeyError further down
    response.raise_for_status()

    payload = response.json()

    # the records of interest live under the "items" key of the response
    return pd.DataFrame(payload["items"])


# function to retrieve data on rainfall measures
def get_data_measures(
    parameter_name=None,
    parameter=None,
    qualifier=None,
    station_reference=None,
    station=None,
    search=None,
    timeout=30,
):
    """Get details of measures available from the EA API.

    Every filter is optional; filters left as ``None`` are not sent to the API.

    :param parameter_name: Return only measures for parameters with the given name, for example Water Level or Flow.
    :param parameter: Return only measures for parameters with the given short form name, for example level or flow.
    :param qualifier: Return only those measures with qualifier. Useful qualifiers are Stage and Downstream Stage (for stations such as weirs which measure levels at two locations), Groundwater for groundwater levels as opposed to river levels and Tidal Level for tidal levels.
    :param station_reference: Return only those measures which are available from the station with the given reference identifier.
    :param station: Return only those measures which are available from the station with the given URI.
    :param search: Return only those measures whose label contains the given value.
    :param timeout: Seconds to wait for the server before giving up (passed to ``requests.get``).
    :return: pandas DataFrame built from the ``items`` array of the JSON response.
    :raises requests.exceptions.HTTPError: if the API answers with a 4xx/5xx status.
    :raises requests.exceptions.RequestException: on connection problems or timeout.
    :raises KeyError: if the JSON response has no ``items`` entry.
    """
    # https, matching the stations endpoint on the same host
    api_url = "https://environment.data.gov.uk/flood-monitoring/id/measures"

    # map the Python argument names onto the query parameter names used by the API;
    # requests leaves out any entry whose value is None
    params = {
        'parameterName': parameter_name,
        'parameter': parameter,
        'qualifier': qualifier,
        'stationReference': station_reference,
        'station': station,
        'search': search,
    }

    # a timeout stops the notebook from hanging forever on an unresponsive server
    response = requests.get(api_url, params=params, timeout=timeout)

    # surface HTTP errors here instead of as a confusing JSON/KeyError further down
    response.raise_for_status()

    payload = response.json()

    # the records of interest live under the "items" key of the response
    return pd.DataFrame(payload["items"])

# function to retrieve data on rainfall readings
def get_data_readings(
    limit=None,
    date=None,  # shadows datetime.date inside this function only; name kept for callers
    startdate=None,
    enddate=None,
    since=None,
    latest=False,
    today=False,
    sorted=True,  # shadows the builtin; name kept so existing keyword callers still work
    parameter_name=None,
    parameter=None,
    qualifier=None,
    station_reference=None,
    station=None,
    search=None,
    timeout=30,
):
    """Get readings from the EA API (rainfall readings unless ``parameter`` says otherwise).

    Filters left as ``None`` are not sent to the API.

    :param limit: Maximum number of records to return. Defaults to 500. Max 10000.
    :param date: Return all the readings taken on the specified day.
    :param startdate: Return the readings taken on the specified range of days. Date format 2023-05-17.
    :param enddate: Return the readings taken on the specified range of days. Date format 2023-05-17.
    :param since: Return the readings taken since the given date time (not inclusive), up to the specified limit. If no limit is given then a default limit of 500 will be used. Typically when tracking a particular measurement then use the dateTime of the last retrieved value as the since parameter to find any new readings. Will accept a simple date value such as 2016-09-07 which will be interpreted as 2016-09-07T:00:00:00Z. The latter (with timestamp) is also accepted.
    :param latest: If true, return only the latest reading.
    :param today: If true, return only readings from today.
    :param sorted: If true, order the returned readings into descending order by date; this is done before the limit is applied, which makes it possible to fetch the most recent n readings.
    :param parameter_name: Return only measures for parameters with the given name, for example Water Level or Flow.
    :param parameter: Return only measures for parameters with the given short form name, for example level or flow. When omitted, ``rainfall`` is used.
    :param qualifier: Return only those measures with qualifier. Useful qualifiers are Stage and Downstream Stage (for stations such as weirs which measure levels at two locations), Groundwater for groundwater levels as opposed to river levels and Tidal Level for tidal levels.
    :param station_reference: Return only those measures which are available from the station with the given reference identifier.
    :param station: Return only those measures which are available from the station with the given URI.
    :param search: Return only those measures whose label contains the given value.
    :param timeout: Seconds to wait for the server before giving up (passed to ``requests.get``).
    :return: pandas DataFrame built from the ``items`` array of the JSON response.
    :raises requests.exceptions.HTTPError: if the API answers with a 4xx/5xx status.
    :raises requests.exceptions.RequestException: on connection problems or timeout.
    :raises KeyError: if the JSON response has no ``items`` entry.
    """

    def _flag(enabled):
        # Switch-style parameters are sent as a bare key: an empty string makes
        # requests emit "key=", while None makes it leave the key out entirely.
        return '' if enabled else None

    # The rainfall filter used to be baked into the URL, which sent "parameter"
    # twice whenever the caller supplied one. It is now a single query parameter
    # that still defaults to rainfall.
    api_url = 'https://environment.data.gov.uk/flood-monitoring/data/readings'

    params = {
        '_limit': limit,
        'date': date,
        'startdate': startdate,
        'enddate': enddate,
        'since': since,
        'latest': _flag(latest),
        'today': _flag(today),
        '_sorted': _flag(sorted),
        'parameterName': parameter_name,
        'parameter': 'rainfall' if parameter is None else parameter,
        'qualifier': qualifier,
        'stationReference': station_reference,
        'station': station,
        'search': search,
    }

    # a timeout stops the notebook from hanging forever on an unresponsive server
    response = requests.get(api_url, params=params, timeout=timeout)

    # surface HTTP errors here instead of as a confusing JSON/KeyError further down
    response.raise_for_status()

    payload = response.json()

    # the records of interest live under the "items" key of the response
    return pd.DataFrame(payload["items"])

In [5]:
# pull the three rainfall datasets from the EA API: stations, measures, and latest readings
rainfall_stations_data = get_data_stations(parameter='rainfall')
rainfall_measures_data = get_data_measures(parameter='rainfall')

# limit=10000 is the maximum documented by get_data_readings
rainfall_readings_data_latest = get_data_readings(
    limit=10000,
    latest=True,
    sorted=True,
)

In [6]:
# report the (rows, columns) shape of each freshly retrieved dataset
print(f'Shape of station data: {rainfall_stations_data.shape}')
print(f'Shape of measures data: {rainfall_measures_data.shape}')
print(f'Shape of latest readings data: {rainfall_readings_data_latest.shape}')

Shape of station data: (997, 19)
Shape of measures data: (1023, 13)
Shape of latest readings data: (919, 4)


In [7]:
# count the rows that repeat an earlier key value:
# 'stationReference' for stations and measures, '@id' for readings
print(
    'Number of duplicates in the rainfall stations data: '
    f"{rainfall_stations_data.duplicated(['stationReference']).sum()}"
)
print(
    'Number of duplicates in the rainfall measures data: '
    f"{rainfall_measures_data.duplicated(['stationReference']).sum()}"
)
print(
    'Number of duplicates in the rainfall readings data: '
    f"{rainfall_readings_data_latest.duplicated(['@id']).sum()}"
)

# keep only the first occurrence of each key - this can be investigated later so that the
# row carrying the most information is the one retained.
# ignore_index=True renumbers the surviving rows 0..n-1 as part of the same call.
rainfall_stations_data.drop_duplicates(
    subset=['stationReference'], keep='first', inplace=True, ignore_index=True
)
rainfall_measures_data.drop_duplicates(
    subset=['stationReference'], keep='first', inplace=True, ignore_index=True
)
rainfall_readings_data_latest.drop_duplicates(
    subset=['@id'], keep='first', inplace=True, ignore_index=True
)

# shapes once the duplicates are gone
print('Shape of datasets after dropping duplicates:')
print(f'Shape of station data: {rainfall_stations_data.shape}')
print(f'Shape of measures data: {rainfall_measures_data.shape}')
print(f'Shape of latest readings data: {rainfall_readings_data_latest.shape}')

Number of duplicates in the rainfall stations data: 2
Number of duplicates in the rainfall measures data: 25
Number of duplicates in the rainfall readings data: 0
Shape of datasets after dropping duplicates:
Shape of station data: (995, 19)
Shape of measures data: (998, 13)
Shape of latest readings data: (919, 4)


## Data cleaning and processing

In [8]:
# preparing the datasets to be combined

# the 'measure' variable serves as the shared key on which the datasets are merged


def _first_measure_id(measures):
    """Return the first value of the first measure dict, or None if there is none.

    NOTE(review): this relies on the identifier being the first entry of each
    measure dict, exactly as the original loop did - confirm against the API payload.
    """
    if isinstance(measures, list) and measures:
        return next(iter(measures[0].values()), None)
    # empty list or missing value: leave the key empty instead of raising
    return None


# stations data: derive 'measure' from each station's 'measures' list.
# Building the column from a list assigns by position, so it does not depend on
# the frame having a 0..n-1 index.
rainfall_stations_data['measure'] = [
    _first_measure_id(station_measures)
    for station_measures in rainfall_stations_data['measures']
]

# measures data: renaming the '@id' column to 'measure' to enable merging and better readability
rainfall_measures_data.rename(columns={'@id': 'measure'}, inplace=True)

In [9]:
# combine readings, measures and stations into one frame, keyed on 'measure'
#   1. readings LEFT JOIN measures  -> every reading is kept
#   2. result  OUTER JOIN stations  -> stations without a reading are kept as well
rainfall_combined_data_latest = (
    rainfall_readings_data_latest
    .merge(rainfall_measures_data, how='left', on='measure')
    .merge(rainfall_stations_data, how='outer', on='measure')
)

# shape of the merged frame
print(rainfall_combined_data_latest.shape)

(1000, 35)


In [10]:
# further data cleaning

# 'stationReference_x' comes from the measures frame and 'stationReference_y' from the
# stations frame (suffixes added by the merge); fill gaps in the former from the latter
rainfall_combined_data_latest['stationReference_x'] = (
    rainfall_combined_data_latest['stationReference_x']
    .fillna(rainfall_combined_data_latest['stationReference_y'])
)

# dropping duplicate and redundant columns
rainfall_combined_data_latest = rainfall_combined_data_latest.drop(
    columns=[
        '@id_x',
        'label_x',
        'measure',
        'latestReading',
        'notation_x',
        'station',
        '@id_y',
        'stationReference_y',
        'measures',
        'notation_y',
        # columns relevant only for river data
        'catchmentName',
        'dateOpened',
        'riverName',
        'stageScale',
        'status',
        'town',
        'wiskiID',
        'datumOffset',
        'RLOIid',
    ]
)

# renaming columns (each source column listed exactly once)
rainfall_combined_data_latest = rainfall_combined_data_latest.rename(
    columns={
        'dateTime': 'date_and_time',
        'value': 'reading_value',
        'parameter': 'parameter_id',
        'parameterName': 'parameter_name',
        'period': 'reading_period',
        'qualifier': 'reading_qualifier',
        'stationReference_x': 'station_id',
        'unit': 'reading_unit_id',
        'unitName': 'reading_unit_name',
        'valueType': 'reading_value_type',
        'easting': 'station_easting',
        'northing': 'station_northing',
        'gridReference': 'station_grid_reference',
        'label_y': 'station_type',
        'lat': 'station_latitude',
        'long': 'station_longitude',
    }
)

# separating date and time variables

# parse the timestamp strings into datetime values
rainfall_combined_data_latest['date_and_time'] = pd.to_datetime(
    rainfall_combined_data_latest['date_and_time']
)

# creating date variable
rainfall_combined_data_latest['date'] = rainfall_combined_data_latest['date_and_time'].dt.date

# creating time variable
rainfall_combined_data_latest['time'] = rainfall_combined_data_latest['date_and_time'].dt.time

# reordering columns; 'date_and_time' is left out on purpose now that it has been split
rainfall_combined_data_latest = rainfall_combined_data_latest[
    [
        'date',
        'time',
        'station_id',
        'station_type',
        'station_grid_reference',
        'station_latitude',
        'station_longitude',
        'station_easting',
        'station_northing',
        'parameter_id',
        'parameter_name',
        'reading_qualifier',
        'reading_value',
        'reading_unit_id',
        'reading_unit_name',
        'reading_value_type',
        'reading_period',
    ]
]

# viewing the shape of the resulting combined df
print("Shape of the combined dataframe:" + str(rainfall_combined_data_latest.shape))

Shape of the combined dataframe:(1000, 17)


## Explanation of columns

__date__: date  
__time__: time  
__station_id__: station identifier  
__station_type__: station label as returned by the API (the stations dataset's `label` field, renamed here)  
__station_grid_reference__: grid reference for the station, rounded to a 100m grid  
__station_latitude__: latitude coordinates of station  
__station_longitude__: longitude coordinates of station  
__station_easting__: easting coordinates of station  
__station_northing__: northing coordinates of station  
__parameter_id__: short name/id of the quantity being measured  
__parameter_name__: name of the quantity being measured  
__reading_qualifier__: a qualifier for the quantity being measured, "Tipping Bucket Raingauge" for rainfall  
__reading_value__: the value of the reading for the associated measurement  
__reading_unit_id__: unit id/url for the reading  
__reading_unit_name__: unit name for the reading  
__reading_value_type__: type of measurement, e.g., total, mean, etc.  
__reading_period__: the period between successive readings, in seconds

## Data quality checks

In [11]:
# data quality: which stations came back without a recent rainfall reading?

# how many rows have no reading value
print(
    "Total stations with no recent rainfall measurements: "
    f"{rainfall_combined_data_latest['reading_value'].isna().sum()}"
)

print()

print('IDs of the respective stations:')

print()

# station IDs of those rows
print(
    rainfall_combined_data_latest.loc[
        rainfall_combined_data_latest['reading_value'].isna(), 'station_id'
    ].values
)

Total stations with no recent rainfall measurements: 81

IDs of the respective stations:

['4163' 'E1310' '0' '1117' '45101' '270400TP' '055223' 'E15520' 'E1965'
 'E11060' '008632' '000032' 'E23535' '019356' 'E43041' '266474TP' '577805'
 'E22876' 'E11040' '000102TP' '49106' '592848' '4103' '550148' '1795'
 'E23657' '50108' 'E7120' 'E22735' '238097TP' '593321' 'E2450' '562992'
 'E1691' '45157' '007533' '584098' 'E14880' 'E7190' '46101' 'E2859'
 '476898_TG_333' '246847TP' 'E24775' '568363' 'E1928' '585022' 'E24585'
 '605382' '563599' '55000A' 'E24141' '562811' 'E24499' 'E5720' '45108_'
 'E1711' '4527' '6666LO' '1409Lucas' '4549' '4548' 'E5650' '7015' '1607'
 '47176' '1338' '1750' '1810' '1408' 'Coldw1' 'Mitch1' 'Edgeh1' 'Dubbs1'
 '585122_' '2102SO' '575092______________________________________'
 '574786______________________________________' '062916' 'E60880' 'NE063']


In [12]:
# NaN value check with an assert statement
#assert len(rainfall_combined_data_latest[rainfall_combined_data_latest['reading_value'].isna() == True]['station_id'].tolist()) == 0, f"0 expected, got: {len(rainfall_combined_data_latest[rainfall_combined_data_latest['reading_value'].isna() == True]['station_id'].tolist())}"

In [13]:
# data quality: which rows lack a latitude? (the same check works for longitude)
print(
    "Total stations without latitude coordinates: "
    f"{rainfall_combined_data_latest['station_latitude'].isna().sum()}"
)

print()

print('IDs of the respective stations:')

print()

# station IDs of those rows
print(
    rainfall_combined_data_latest.loc[
        rainfall_combined_data_latest['station_latitude'].isna(), 'station_id'
    ].values
)

Total stations without latitude coordinates: 11

IDs of the respective stations:

['024121' '282947TP' 'E7040' nan '068416' nan 'E23500' '305111' '6666LO'
 '4549' '4548']


In [14]:
# the merge can leave more than one row per station ID; count the repeats
# NOTE(review): rows whose station_id is NaN count as duplicates of one another and
# collapse into a single row here - confirm that this is intended
print(
    'Number of duplicates in the combined data frame: '
    f"{rainfall_combined_data_latest.duplicated(['station_id']).sum()}"
)

# keep the first row per station - this can be investigated later so that the row
# carrying the most information is the one retained.
# ignore_index=True renumbers the surviving rows 0..n-1 as part of the same call.
rainfall_combined_data_latest.drop_duplicates(
    subset=['station_id'], keep='first', inplace=True, ignore_index=True
)

# shape once the duplicates are gone
print(f'Shape of combined data after dropping duplicates: {rainfall_combined_data_latest.shape}')

Number of duplicates in the combined data frame: 2
Shape of combined data after dropping duplicates: (998, 17)


## Saving processed data

In [15]:
# saving combined dataset as csv and excel files

# build the dated file stem once so both files carry the same date stamp even if
# the run crosses midnight between the two writes
output_stem = f'../data/rainfall_data_latest_{date.today()}'

rainfall_combined_data_latest.to_csv(f'{output_stem}.csv')
rainfall_combined_data_latest.to_excel(f'{output_stem}.xlsx')