In [1]:
#Imports
import pandas as pd
import matplotlib.pyplot as plt
import requests
import ipywidgets as wdg
from IPython.display import display, clear_output
import time

%matplotlib inline

#File paths and API URLs
file_path = 'timeseries.json'

#Every endpoint used in this notebook shares one UKHSA dashboard prefix; only the topic and metric differ
_ukhsa_endpoint = (
    "https://api.ukhsa-dashboard.data.gov.uk/themes/infectious_disease/sub_themes/respiratory"
    "/topics/{topic}/geography_types/Nation/geographies/England/metrics/{metric}"
)

#(topic, metric) pairs for the weekly testing-positivity series; the topic doubles as the dictionary key
_positivity_metrics = (
    ("Adenovirus", "adenovirus_testing_positivityByWeek"),
    ("hMPV", "hMPV_testing_positivityByWeek"),
    ("Influenza", "influenza_testing_positivityByWeek"),
    ("Rhinovirus", "rhinovirus_testing_positivityByWeek"),
    ("RSV", "RSV_testing_positivityByWeek"),
)

respiratory_api_urls = {
    topic: _ukhsa_endpoint.format(topic=topic, metric=metric)
    for topic, metric in _positivity_metrics
}

lineage_api_url = _ukhsa_endpoint.format(topic="COVID-19", metric="COVID-19_cases_lineagePercentByWeek")
lineage_output_file = "lineage_data.json"

#Module-level state: each frame starts empty and is rebound later in the notebook
covid_df = pd.DataFrame()  #rebound by the initialisation cell from wrangle_data(file_path)
lineage_df = pd.DataFrame()  #rebound (via `global`) by fetch_and_update_lineage_data
respiratory_data = pd.DataFrame()  #rebound (via `global`) by fetch_and_combine_data

#Output widgets: each plotting/handler function captures its prints and figures inside one of these
covid_output = wdg.Output()  #target of plot_data
lineage_output = wdg.Output()  #target of plot_lineage_time_series
debug_output = wdg.Output()  #progress/log text from both button handlers
plot_output = wdg.Output()  #target of plot_time_series

"""This cell of code sets up the imports I am using, global variables, and the configurations necessary for fetching,
processing and visualising the COVID-19 and other resp virus data. This is done by setting out the file path for the
timeseries file I use to make the timeseries graph, a dictionary containing respiratory virus APIs, and my lineage data API
along with the output file where I'm storing the data. The global variables are just 3 dataframes to hold the time series,
lineage, and combined resp virus positivity rate data. The output widgets are pretty self explanatory - just widgets to
display the plots and outputs for all the data, and an extra widget to display debug logs during the API requests."""

"This cell of code sets up the imports I am using, global variables, and the configurations necessary for fetching,\nprocessing and visualising the COVID-19 and other resp virus data. This is done by setting out the file path for the\ntimeseries file I use to make the timeseries graph, a dictionary containing respiratory virus APIs, and my lineage data API\nalong with the output file where I'm storing the data. The global variables are just 3 dataframes to hold the time series,\nlineage, and combined resp virus positivity rate data. The output widgets are pretty self explanatory - just widgets to\ndisplay the plots and outputs for all the data, and an extra widget to display debug logs during the API requests."

In [2]:
#Covid timeseries funcs
def wrangle_data(file_path):
    """Load the COVID time series JSON file into a flat DataFrame.

    The file is read with pandas; if it has a 'data' column, the entries of that
    column are flattened with json_normalize and the resulting "date" column is
    converted to datetime.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The flattened DataFrame, or an empty DataFrame when there is no 'data'
        column or when anything goes wrong while reading/converting (in which
        case the error is printed rather than raised).
    """
    try:
        raw_frame = pd.read_json(file_path)
        #Guard clause: nothing to flatten without the 'data' column
        if 'data' not in raw_frame.columns:
            return pd.DataFrame()
        flattened = pd.json_normalize(raw_frame['data'])
        flattened["date"] = pd.to_datetime(flattened["date"])
        return flattened
    except Exception as e:
        #Best-effort loader: report the problem and hand back an empty frame
        print(f"Error loading data: {e}")
        return pd.DataFrame()

def plot_data(dataframe, column):
    """Draw one column of the COVID time series against date inside covid_output.

    Args:
        dataframe: Frame holding a "date" column plus the metric columns.
        column: Name of the column to plot on the y axis.

    Whatever was previously shown in covid_output is cleared first. If the frame
    is empty or does not contain `column`, a message is printed instead of a plot.
    """
    with covid_output:
        clear_output(wait=True)
        nothing_to_plot = dataframe.empty or column not in dataframe.columns
        if nothing_to_plot:
            print("No data available to plot.")
            return

        #The column is called "hospital" but is titled "Hospitalisations"
        if column == "hospital":
            display_name = "Hospitalisations"
        else:
            display_name = column.capitalize()

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.plot(dataframe["date"], dataframe[column], marker='', linestyle='-')
        ax.set_title(f"{display_name} Over Time")
        ax.set_xlabel("Date")
        ax.set_ylabel("Count")
        ax.grid()
        fig.tight_layout()
        plt.show()

def refresh_plot(change=None):
    """Redraw the COVID plot for the metric currently chosen in the dropdown.

    Args:
        change: Unused; present so the function can be registered as a widget
            observer as well as called directly.
    """
    selected = covid_column_selector.value.lower()
    #The dropdown says "Hospitalisations" but plot_data is given the column name "hospital"
    if selected == "hospitalisations":
        selected = "hospital"
    plot_data(covid_df, selected)

"""This cell contains all the functions needed to handle the covid time series data. Wrangle data reads the data from a
json file into a pandas dataframe, normalizes it, and converts the date column to datetime so that it can be plotted on a
timeseries graph. If the file cannot be read or is structured incorrectly, it returns an empty dataframe, and the plotting
function then prints a message (no data available to plot) instead of a graph. The plotting function plots, and the refresh
function refreshes the plot based upon metrics selected by user from a dropdown menu."""

'This cell contains all the functions needed to handle the covid time series data. Wrangle data reads the data from a\njson file into a pandas dataframe, normalizes it, and converts the date column to datetime so that it can be plotted on a\ntimeseries graph. If the file cannot be read or is structured incorrectly, it returns an empty dataframe, and the plotting\nfunction then prints a message (no data available to plot) instead of a graph. The plotting function plots, and the refresh\nfunction refreshes the plot based upon metrics selected by user from a dropdown menu.'

In [3]:
#Lineage timeseries funcs
def fetch_lineage_data_all_pages(timeout=30):
    """Download every page of the lineage endpoint and return the rows as a DataFrame.

    Starts at lineage_api_url and keeps following the "next" link in each
    response until there is none.

    Args:
        timeout: Seconds to wait for each request before giving up on it.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        A DataFrame built from all "results" entries collected so far. If a
        request fails (network error, timeout, non-200 status, or a body that
        is not valid JSON) a message is printed, fetching stops, and the rows
        gathered up to that point are still returned (possibly none).
    """
    results = []
    next_url = lineage_api_url
    while next_url:
        try:
            response = requests.get(next_url, timeout=timeout)
            if response.status_code != 200:
                print(f"Failed to fetch data from {next_url}")
                break
            data = response.json()
        except (requests.RequestException, ValueError) as error:
            #Network problems/timeouts and undecodable bodies end the loop instead of
            #raising out of the button callback and discarding the pages already fetched
            print(f"Failed to fetch data from {next_url}: {error}")
            break
        #A page without a "results" key contributes nothing rather than raising KeyError
        results.extend(data.get("results", []))
        next_url = data.get("next")
    return pd.DataFrame(results)

def save_data_to_json(dataframe, filename):
    """Write a DataFrame to `filename` as a JSON list of records and print a confirmation.

    Args:
        dataframe: Frame to serialise.
        filename: Destination path; dates are written in ISO format.
    """
    dataframe.to_json(path_or_buf=filename, date_format="iso", orient="records")
    print("Data saved to {}".format(filename))

def preprocess_lineage_data(dataframe):
    """Reduce the raw lineage rows to the three columns used for plotting.

    Args:
        dataframe: Raw rows with at least "date", "stratum" and "metric_value".

    Returns:
        A new frame with columns "date" (converted to datetime), "Lineage"
        (from "stratum") and "Percentage" (from "metric_value"). An empty input
        prints a message and returns an empty DataFrame.
    """
    if dataframe.empty:
        print("No lineage data available.")
        return pd.DataFrame()

    column_labels = {"stratum": "Lineage", "metric_value": "Percentage"}
    return (
        dataframe.loc[:, ["date", "stratum", "metric_value"]]
        .rename(columns=column_labels)
        .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    )

def plot_lineage_time_series(dataframe):
    """Plot one line per lineage (Percentage against date) inside lineage_output.

    Args:
        dataframe: Frame with "date", "Lineage" and "Percentage" columns.

    The previous contents of lineage_output are cleared first; an empty frame
    prints a message instead of drawing anything.
    """
    with lineage_output:
        clear_output(wait=True)
        if dataframe.empty:
            print("No data available to plot.")
            return

        fig, ax = plt.subplots(figsize=(12, 6))
        lineage_column = dataframe["Lineage"]
        #Lines are drawn in order of first appearance, which fixes the legend order
        for name in lineage_column.unique():
            rows = dataframe[lineage_column == name]
            ax.plot(rows["date"], rows["Percentage"], label=name, marker='', linestyle='-')

        ax.set_title("COVID-19 Cases by Lineage Over Time")
        ax.set_xlabel("Date")
        ax.set_ylabel("Percentage of Cases (%)")
        ax.legend(title="Lineage", loc='upper left')
        ax.grid()
        fig.tight_layout()
        plt.show()

def fetch_and_update_lineage_data(button):
    """Button handler: fetch the lineage data, store it, save it to disk and plot it.

    Args:
        button: The widget that triggered the click (not used).

    Progress messages are written to debug_output. On success the module-level
    lineage_df is rebound to the preprocessed frame, the raw rows are written to
    lineage_output_file, and the plot is redrawn.
    """
    global lineage_df
    with debug_output:
        clear_output(wait=True)
        print("Fetching lineage data from API...")
        fetched = fetch_lineage_data_all_pages()
        if fetched.empty:
            print("Failed to fetch lineage data or no new data available.")
            return

        print("Data fetched successfully.")
        lineage_df = preprocess_lineage_data(fetched)
        #The file receives the raw rows, not the trimmed plotting frame
        save_data_to_json(fetched, lineage_output_file)
        plot_lineage_time_series(lineage_df)

"""This cell's functions handle the lineage data. This includes fetching, saving, processing and plotting the data. It 
fetches data from the API with a while loop that gets pages so long as more pages from the url exist, combines it all into 
a dataframe and handles previous errors I fixed that happened when data fetching failed. It then converts and
saves the lineage data to a json file. It then cleans the data by filtering by metric and changing date to datetime.
It then plots it on a graph, and the final function combines all of this to be done by a button the user presses."""

"This cell's functions handle the lineage data. This includes fetching, saving, processing and plotting the data. It \nfetches data from the API with a while loop that gets pages so long as more pages from the url exist, combines it all into \na dataframe and handles previous errors I fixed that happened when data fetching failed. It then converts and\nsaves the lineage data to a json file. It then cleans the data by filtering by metric and changing date to datetime.\nIt then plots it on a graph, and the final function combines all of this to be done by a button the user presses."

In [4]:
#Other respiratory virus funcs
def fetch_all_pages(api_url, virus_name, delay=0.2, timeout=30):
    """Download every page for one virus and return the rows tagged with its name.

    Starts at `api_url` and follows the "next" link in each response until there
    is none, pausing between consecutive requests.

    Args:
        api_url: URL of the first page.
        virus_name: Value written into the added "virus" column.
        delay: Seconds to sleep between one page and the next.
        timeout: Seconds to wait for each request before giving up on it.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        A DataFrame of all "results" entries with an extra "virus" column, or an
        empty DataFrame when nothing was collected. If a request fails (network
        error, timeout, non-200 status, or a body that is not valid JSON) a
        message is printed, fetching stops, and the rows gathered so far are
        still returned.
    """
    #NOTE(review): the page count is driven by the API's page size; check the UKHSA API
    #docs for a page-size query parameter that would cut the number of requests - TODO confirm
    results = []
    next_url = api_url
    while next_url:
        try:
            response = requests.get(next_url, timeout=timeout)
            if response.status_code != 200: #Status code 200 signifies success (server returned desired response)
                print(f"Failed to fetch data from {next_url}")
                break
            data = response.json()
        except (requests.RequestException, ValueError) as error:
            #Network problems/timeouts and undecodable bodies end the loop instead of
            #raising out of the button callback and discarding the pages already fetched
            print(f"Failed to fetch data from {next_url}: {error}")
            break
        #A page without a "results" key contributes nothing rather than raising KeyError
        results.extend(data.get("results", []))
        next_url = data.get("next")
        if next_url:
            #Only pause when another request will follow; sleeping after the last page is wasted time
            time.sleep(delay)
    if not results:
        return pd.DataFrame()
    df = pd.DataFrame(results)
    df["virus"] = virus_name
    return df
#The function above is horrifically inefficient, but, I tried.

def fetch_and_combine_data(delay=0.2):
    """Fetch every virus in respiratory_api_urls and store the combined rows in respiratory_data.

    Args:
        delay: Passed through to fetch_all_pages as the pause between pages.

    Viruses that return no rows are skipped. When at least one virus returned
    data, the module-level respiratory_data is rebound to the concatenation
    (with a fresh index); otherwise it is left as it was and a message is printed.
    """
    global respiratory_data
    per_virus_frames = []
    for virus_name, api_url in respiratory_api_urls.items():
        print(f"Fetching data for {virus_name}...")
        frame = fetch_all_pages(api_url, virus_name, delay=delay)
        if frame.empty:
            continue
        per_virus_frames.append(frame)

    if not per_virus_frames:
        print("No data fetched from APIs.")
        return

    respiratory_data = pd.concat(per_virus_frames, ignore_index=True)
    print("All data fetched and combined.")

def preprocess_data(dataframe):
    """Reduce the combined respiratory rows to the three columns used for plotting.

    Args:
        dataframe: Rows with at least "date", "metric_value" and "virus".

    Returns:
        A new frame with columns "date" (converted to datetime),
        "PositivityRate" (from "metric_value") and "virus". An empty input
        prints a message and returns an empty DataFrame. The input frame is not
        modified.
    """
    if dataframe.empty:
        print("No data available for preprocessing.")
        return pd.DataFrame()

    wanted_columns = ["date", "metric_value", "virus"]
    trimmed = dataframe.loc[:, wanted_columns].rename(columns={"metric_value": "PositivityRate"})
    trimmed["date"] = pd.to_datetime(trimmed["date"])
    return trimmed

def plot_time_series(dataframe):
    """Plot one line per virus (PositivityRate against date) inside plot_output.

    Args:
        dataframe: Frame with "date", "PositivityRate" and "virus" columns.

    The previous contents of plot_output are cleared first; an empty frame
    prints a message instead of drawing anything.
    """
    with plot_output:
        clear_output(wait=True)
        if dataframe.empty:
            print("No data available to plot.")
            return

        fig, ax = plt.subplots(figsize=(12, 6))
        virus_column = dataframe["virus"]
        #Lines are drawn in order of first appearance, which fixes the legend order
        for name in virus_column.unique():
            rows = dataframe[virus_column == name]
            ax.plot(rows["date"], rows["PositivityRate"], label=name, marker='', linestyle='-')

        ax.set_title("Respiratory Viruses Positivity Rates by Week")
        ax.set_xlabel("Date")
        ax.set_ylabel("Positivity Rate (%)")
        ax.legend(title="Virus", loc='upper left')
        ax.grid()
        fig.tight_layout()
        plt.show()

def rip_data_from_apis(button):
    """Button handler: fetch all respiratory virus data, preprocess it and plot it.

    Args:
        button: The widget that triggered the click (not used).

    Progress messages are written to debug_output. The plot is only drawn when
    the module-level respiratory_data is non-empty after fetching.
    """
    with debug_output:
        clear_output(wait=True)
        print("Fetching data from APIs...")
        fetch_and_combine_data()
        if respiratory_data.empty:
            print("No data available after fetching from APIs.")
            return
        plot_time_series(preprocess_data(respiratory_data))

"""This cell fetches, processes and plots a timeseries graph for 5 different respiratory viruses (Adenovirus, hMPV, 
influenza, rhinovirus and RSV). The graph is for positive test rates. It fetches the pages with a delay as I kept getting
timed out when I didn't use a delay, thus failing to generate results. It does this by iterating through the pages using
the next URL, packages it and returns it in a dataframe containing data for each virus with an added column for the virus
name (fetch_all_pages). It then iterates through the API dictionary of URLs for the different viruses, 
using the fetch_all_pages function to receive data for each virus and combines each individual dataframe into a 
consolidated dataframe (fetch_and_combine_data). It outputs error messages when data cannot be received and a success message when all data
is fetched. It then cleans the data, selecting only relevant columns, and renames metric_value to PositivityRate for 
greater clarity for the user. All of this is done when the user clicks the "rip_data_from_apis" button."""

"""IMPORTANT KEY NOTE PLEASE READ BEFORE TRYING TO USE THE BUTTON: This process loops through (I think it was) 5164 pages
worth of data. With the 200ms delays, this takes around 15 to 20 minutes, and the risk of certain data being lost to
timeouts isn't entirely mitigated (though the grand majority of the data will be received and plotted). Just bear
in mind this process will take a long while. You will know it works as when one virus' pages have been fetched, it will
give a message moving onto the next one. Please please please bear this in mind when using."""

"IMPORTANT KEY NOTE PLEASE READ BEFORE TRYING TO USE THE BUTTON: This process loops through (I think it was) 5164 pages\nworth of data. With the 200ms delays, this takes around 15 to 20 minutes, and the risk of certain data being lost to\ntimeouts isn't entirely mitigated (though the grand majority of the data will be received and plotted). Just bear\nin mind this process will take a long while. You will know it works as when one virus' pages have been fetched, it will\ngive a message moving onto the next one. Please please please bear this in mind when using."

In [5]:
#Widget logic
#Dropdown choosing which COVID metric plot_data shows; any change of value triggers refresh_plot
covid_column_selector = wdg.Dropdown(
    options=['Cases', 'Hospitalisations', 'Deaths'],
    value='Cases',
    description='COVID Metric:',
)
covid_column_selector.observe(refresh_plot, names='value')

def _download_button(label, on_press):
    """Build an info-styled download button and register its click handler."""
    button = wdg.Button(description=label, icon="download", button_style="info")
    button.on_click(on_press)
    return button

lineage_button = _download_button("Rip Data from Lineage API", fetch_and_update_lineage_data)

respiratory_button = _download_button("Rip Data from APIs", rip_data_from_apis)

"""This cell just defines the interactive widgets so that the user can select options and trigger the functions for
dealing with the data. There's a dropdown menu for the timeseries metrics, the rip from lineage button and the rip from
respiratory button."""


"This cell just defines the interactive widgets so that the user can select options and trigger the functions for\ndealing with the data. There's a dropdown menu for the timeseries metrics, the rip from lineage button and the rip from\nrespiratory button."

In [6]:
#Display widgets
#Clear whatever this cell showed on a previous run before laying the controls out again
clear_output(wait=True)
print("COVID-19 Metrics:")
#Dropdown first, then the Output area that plot_data draws into
display(wdg.VBox([covid_column_selector]), covid_output)

print("\nLineage Data:")
#Button + log area, then the Output area that plot_lineage_time_series draws into.
#NOTE(review): debug_output is the same widget object placed in the respiratory section below,
#so log text from either button presumably shows up in both sections - confirm this is intended
display(wdg.VBox([lineage_button, debug_output]), lineage_output)

print("\nRespiratory Virus Data:")
#Button + the shared log area, then the Output area that plot_time_series draws into
display(wdg.VBox([respiratory_button, debug_output]), plot_output)

"""This cell just clears existing outputs and displays the buttons. This is so a clean display can be created for updates 
and so that the user has something to interact with"""

COVID-19 Metrics:


VBox(children=(Dropdown(description='COVID Metric:', options=('Cases', 'Hospitalisations', 'Deaths'), value='C…

Output()


Lineage Data:


VBox(children=(Button(button_style='info', description='Rip Data from Lineage API', icon='download', style=But…

Output()


Respiratory Virus Data:


VBox(children=(Button(button_style='info', description='Rip Data from APIs', icon='download', style=ButtonStyl…

Output()

'This cell just clears existing outputs and displays the buttons. This is so a clean display can be created for updates \nand so that the user has something to interact with'

In [7]:
#Initialise plots
#Load the COVID time series from file_path and rebind the module-level covid_df that refresh_plot reads
covid_df = wrangle_data(file_path)
#Draw the metric currently selected in the dropdown
refresh_plot()
#lineage_df is only rebound by the lineage button handler, so at this point it is still the empty
#frame from the setup cell and this call takes the "No data available to plot." branch
plot_lineage_time_series(lineage_df)

"""This cell initialises the first set of visualisations for the widgets and their corresponding datasets. It ensures that
the data is loaded, processed then displayed when the notebook is run for the first time"""

'This cell initialises the first set of visualisations for the widgets and their corresponding datasets. It ensures that\nthe data is loaded, processed then displayed when the notebook is run for the first time'