## 0. Setup

This cell must always be run first: the later cells rely on the `json` import made here.

In [7]:
import os
import json

# The later cells save and load files through relative '../data/...' paths, which are
# resolved against the current working directory.
# NOTE(review): os.getcwd() already IS the current working directory, so the chdir below
# leaves it unchanged; this presumably relies on the kernel being started in the
# notebook's own folder - confirm.
this_dir = os.getcwd()
os.chdir(this_dir)

## 1. Get raw data from EU

In [2]:
import urllib.request

# Tongue-in-cheek progress message: the data is simply downloaded from the URL below.
print('Step 1: Hacking into the EU mainframe...')

covid_19_data_url = "https://opendata.ecdc.europa.eu/covid19/casedistribution/json/"
covid_19_raw_path = '../data/input/covid_19_raw.json'

# Download everything first, so the connection is released before we touch the disk:
with urllib.request.urlopen(covid_19_data_url) as input_file:
    # We read the json, but decode it as plaintext, not bytes:
    covid_19_json = input_file.read().decode('utf-8')

# The target folder may not exist yet (e.g. on a fresh checkout), so create it if needed:
os.makedirs(os.path.dirname(covid_19_raw_path), exist_ok=True)

# We save the fetched data to file:
with open(covid_19_raw_path, 'w') as output_file:
    output_file.write(covid_19_json)

Step 1: Hacking into the EU mainframe...


## Optional - Save 1.
Save the output of step 1 in `../data/input/` for later use

## Optional - Load 1.
Load the data needed for step 2, if the data from step 1 has already been saved in `../data/input/`

## 2. Calculate deaths and cases per capita

In [3]:
# Announce the start of step 2 in the cell output:
print('Step 2: Calculating deaths and cases per capita...')

def insert_variable_per_cap_columns(covid_19_records):
    """Add 'deaths_per_cap' and 'cases_per_cap' to every record with a usable population.

    The records are modified in place; nothing is returned.

    Args:
        covid_19_records: iterable of dicts holding 'deaths', 'cases' and
            (optionally) 'popData2019'.

    Records whose 'popData2019' is missing, None or 0 are left untouched, so
    they never get the two per-capita keys.
    """
    for record in covid_19_records:
        population = record.get('popData2019')

        # Skip cases in international waters etc. (no population figure), and
        # never divide by a zero population:
        if not population:
            continue

        # Insertion of new 'columns':
        record['deaths_per_cap'] = record['deaths'] / population
        record['cases_per_cap'] = record['cases'] / population


# Read the raw download from step 1:
with open('../data/input/covid_19_raw.json') as input_file:
    covid_19_dict = json.load(input_file)

# The records are enriched in place with the per-capita columns:
insert_variable_per_cap_columns(covid_19_dict['records'])

# Persist the enriched data for the following steps:
with open('../data/temp/covid_19_vals_per_cap.json', 'w') as output_file:
    json.dump(covid_19_dict, output_file)


Step 2: Calculating deaths and cases per capita...


## Optional - Save 2.
Save the output of step 2 in `../data/temp/` for later use

## Optional - Load 2.
Load the data needed for step 3, if the data from step 2 has already been saved in `../data/temp/`

## 3. Calculate max values for the visualiser limits

In [4]:
# Announce the start of step 3 in the cell output:
print('Step 3: Calculating max covid-19 cases and deaths...')

def remove_top_countries(covid_19_records, sort_by, remove_amt=3):
  """Sort the records by `sort_by` and drop the countries with the highest values.

  Args:
    covid_19_records: iterable of record dicts; each needs the key
      'countriesAndTerritories' and (see below) the `sort_by` key.
    sort_by: name of the variable to sort on, e.g. 'cases' or 'cases_per_cap'.
    remove_amt: the amount of countries to remove (defaults to 3).

  Returns:
    A new list sorted ascending by `sort_by`, without ANY record of the
    `remove_amt` countries that held the highest values. The list is empty
    when there are no more than `remove_amt` distinct countries.
  """
  # There aren't numbers for all records with these variables, so records with
  # a missing (or falsy, e.g. 0) value are dropped before sorting:
  if sort_by == 'cases_per_cap' or sort_by == 'deaths_per_cap':
    covid_19_records = [
        record for record in covid_19_records if record.get(sort_by)]

  # Sorts by the given variable:
  sorted_list = sorted(covid_19_records, key=lambda record: record[sort_by])

  # Removes the countries with the top numbers
  for _ in range(remove_amt):
    # Nothing left to remove when there are fewer countries than remove_amt:
    if not sorted_list:
      break
    highest_num_country = sorted_list[-1]['countriesAndTerritories']
    sorted_list = [
        record for record in sorted_list
        if record['countriesAndTerritories'] != highest_num_country]
  return sorted_list


def get_max_vals(covid_19_records):
  """Return the highest remaining value per variable once the top countries are removed.

  The result maps 'cases', 'deaths', 'cases_per_cap' and 'deaths_per_cap' to
  the largest value found after remove_top_countries() filtered the records
  for that variable.
  """
  def get_max_value(variable):
    remaining = remove_top_countries(covid_19_records, sort_by=variable)
    # The list is sorted ascending, so its last record holds the maximum:
    return remaining[-1][variable]

  variables = ('cases', 'deaths', 'cases_per_cap', 'deaths_per_cap')
  return {variable: get_max_value(variable) for variable in variables}

# The flat (not yet date-nested) records are used, since they are easier to loop through:
with open('../data/temp/covid_19_vals_per_cap.json') as input_file:
  covid_19_records = json.load(input_file)['records']

# The max values are not necessarily all for the same country, since they are
# just used to set the color limits in the visualizer:
max_vals = get_max_vals(covid_19_records)

# Store the limits so that step 5 can merge them into the final output:
with open('../data/temp/covid_19_max_vals.json', 'w') as output_file:
  json.dump(max_vals, output_file)


Step 3: Calculating max covid-19 cases and deaths...


## Optional - Save 3.
Save the output of step 3 in `../data/temp/` for later use

## Optional - Load 3.
Load the data needed for step 4, if the data from step 3 has already been saved in `../data/temp/`

## 4. Transform data into a better format for the visualiser

In [5]:
# Announce the start of step 4 in the cell output:
print('Step 4: Transforming the data to make it easier to work with...')

def get_unique_dates(covid_19_list):
    """Return the distinct 'dateRep' values of the entries, in order of first appearance.

    Args:
        covid_19_list: iterable of dicts that each hold a hashable 'dateRep' value.

    Returns:
        A list with every date exactly once.
    """
    # A dict keeps insertion order and de-duplicates in a single pass, instead of
    # scanning a growing list for every entry:
    return list(dict.fromkeys(entry['dateRep'] for entry in covid_19_list))


# The per-capita file written in step 2 is read from disk:
with open("../data/temp/covid_19_vals_per_cap.json") as input_file:
    # The file contains a list "records" which holds the information
    covid_19_dict = json.load(input_file)

# All data points are gathered per date in a single pass over the dataset
# (instead of re-scanning the whole dataset once for every date):
records_by_date = {}
for entry in covid_19_dict['records']:
    records_by_date.setdefault(entry['dateRep'], []).append(entry)

# This is the form i want the data to be in: one dictionary per date, holding that
# date and - under the 'data' key - every data point reported for it.
# The dates keep the order in which they first appear in the dataset.
data_correct_form = [
    {'date': date, 'data': records_by_date[date]}
    for date in get_unique_dates(covid_19_dict['records'])
]

covid_19_dict['records'] = data_correct_form

# The transformed data is written to the temp folder for step 5.
with open('../data/temp/covid_19_transformed.json', 'w') as output_file:
    json.dump(covid_19_dict, output_file, indent=2)

Step 4: Transforming the data to make it easier to work with...


## Optional - Save 4.
Save the output of step 4 in `../data/temp/` for later use

## Optional - Load 4.
Load the data needed for step 5, if the data from steps 3 and 4 has already been saved in `../data/temp/`

## 5. Sorting and filtering out unnecessary data

In [6]:
# Announce the start of step 5 in the cell output:
print('Step 5: Sorting and filtering the data for the visualization...')

# Paths to the json files we need:
# the max values written in step 3 and the date-grouped records written in step 4.
max_vals_path = "../data/temp/covid_19_max_vals.json"
covid_19_path = "../data/temp/covid_19_transformed.json"

# list.sort() takes a key function (not a compare function):
# it is called once per element and the elements are ordered by the returned value.


def compare_function(e):
  """Sort key that orders entries chronologically, earliest date first.

  Args:
    e: dict whose "date" value is a "day/month/year" string.

  Returns:
    The date as the integer yyyymmdd, e.g. "05/03/2020" -> 20200305.
  """
  split_date = e["date"].split("/")
  day, month, year = int(split_date[0]), int(split_date[1]), int(split_date[2])
  # Computed arithmetically rather than by concatenating the strings, so dates
  # that are not zero-padded ("5/3/2020") still sort correctly:
  return year * 10000 + month * 100 + day


# Fields the visualizer does not need; they are stripped from every data point.
# (The date itself is kept on the enclosing entry under the "date" key.)
unused_fields = (
    'dateRep',
    'day',
    'month',
    'year',
    'popData2019',
    'continentExp',
    'Cumulative_number_for_14_days_of_COVID-19_cases_per_100000',
)

# loads both covid_19_max_vals.json and covid_19_transformed.json
# The file handle is deliberately not called `max_vals`, so it does not overwrite
# the `max_vals` dictionary that step 3 leaves in the notebook namespace.
with open(max_vals_path) as max_vals_file, open(covid_19_path) as covid_19_data:
  covid_19_dict = json.load(covid_19_data)
  max_vals_dict = json.load(max_vals_file)

# sorts the records list in the data (earliest date first)
covid_19_dict["records"].sort(key=compare_function)

# merges the two files into a final product
output_dict = {
    "max_vals": max_vals_dict,
    "records": covid_19_dict['records']
}

# Further filtering might be added later
for entry in output_dict['records']:
  for data_point in entry['data']:
    for field in unused_fields:
      # pop() with a default: a field that is absent from the data must not abort the step
      data_point.pop(field, None)

with open("../data/output/covid_19_output.json", "w") as output_file:
  json.dump(output_dict, output_file, indent=2)


Step 5: Sorting and filtering the data for the visualization...
