# Set-Up

In [2]:
import logging 
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import altair as alt
import pycountry
import geopandas
import countrynames
from matplotlib.ticker import FixedLocator
from sklearn.metrics import pairwise_distances
from geopy.geocoders import Nominatim

In [None]:
# Send all log records (DEBUG and above) to a UTF-8 log file one directory above
# the notebook. NOTE: the `encoding` keyword of basicConfig needs Python 3.9+.
logging.basicConfig(filename='../debug_info.log', encoding='utf-8', level=logging.DEBUG)

# Fetch data

In [None]:
# Reference links for where the data came from. They are not fetched in the
# visible part of this notebook: the CSV files are read from the local
# ../data directory instead.
initial_data_url = "https://www.arcgis.com/apps/opsdashboard/index.html#/bda7594740fd40299423467b48e9ecf6"
final_data_url = "https://data.mendeley.com/datasets/kg72dst75p/1"

In [None]:
# All six exports are parsed with the same settings: ';' as field separator,
# Latin-1 encoding, row labels in the first column, column names in the first
# row and ',' as thousands separator.
_CSV_OPTIONS = dict(sep=";", encoding="latin-1", index_col=0, header=0, thousands=',')

# Series 1: "a" is the per-day table, "b" the per-date table.
data_1day = pd.read_csv("../data/Coronavirus_Data_Cases_1a. Per day.csv", **_CSV_OPTIONS)
data_1date = pd.read_csv("../data/Coronavirus_Data_Cases_1b. Per date.csv", **_CSV_OPTIONS)

# Series 2: same split, "_population" variants.
data_2day = pd.read_csv("../data/Coronavirus_Data_Cases_2a. Per day_population.csv", **_CSV_OPTIONS)
data_2date = pd.read_csv("../data/Coronavirus_Data_Cases_2b. Per date_population.csv", **_CSV_OPTIONS)

# Series 5: "_popul_surf" variants.
# NOTE(review): both files below are named "5a", whereas series 1 and 2 use
# "a" for per-day and "b" for per-date -- confirm the per-date file name
# against the contents of ../data.
data_5date = pd.read_csv("../data/Coronavirus_Data_Cases_5a. Per date_popul_surf.csv", **_CSV_OPTIONS)
data_5day = pd.read_csv("../data/Coronavirus_Data_Cases_5a. Per day_popul_surf.csv", **_CSV_OPTIONS)

# Cleaning Data
## Adjust Data types and columns

In [None]:
def clean_util(dataframe, name, statistics=False):
    """Split a table into its data rows and its extra (non-data) rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose index still mixes data labels with extra rows.
    name : str
        Name given to the index of the data part. ``'Date'`` parses the
        labels as ``dd/mm/YYYY`` dates; any other value casts them to ``int``.
    statistics : bool, default False
        If False, the first row is the extra row and everything after it is
        data. If True, the last four rows are the extra rows and the data
        part is everything except the last five rows, so the fifth row from
        the end is discarded (presumably a separator line -- confirm against
        the CSV).

    Returns
    -------
    tuple
        ``(data, extra)``. ``extra`` is the squeezed first row when
        ``statistics`` is False (a Series, or a scalar for a one-column
        table), otherwise a DataFrame holding the last four rows.
    """
    if not statistics:
        # Leading row carries the per-column value; squeeze collapses a
        # single-column result down to a scalar.
        extra_rows = dataframe.iloc[0].squeeze()
        body = dataframe.iloc[1:].copy()
    else:
        extra_rows = dataframe.iloc[-4:].copy()
        body = dataframe.iloc[:-5].copy()

    # Name the index first; the conversions below keep the name.
    body.index.name = name
    body.index = (
        pd.to_datetime(body.index, format='%d/%m/%Y')
        if name == 'Date'
        else body.index.astype(int)
    )
    return body, extra_rows

def clean_up(dataframe):
    """Normalise a raw case table and, where present, split off its extra rows.

    Steps applied to every table:

    1. drop columns that contain only missing values,
    2. convert every column to numeric, turning unparseable cells into NaN,
    3. rename any ``'United Kingdom'`` column label to ``'UK'``.

    The input frame is left untouched; all work happens on new objects.

    The layout of the table is then detected from its index labels:

    * first label is not a string: the table has no extra rows; the index is
      named ``'Day'`` and the frame is returned on its own.
    * first label mentions ``'Density'`` or ``'Population'``: the first row is
      an extra row. The second label decides whether the remaining index is a
      date (contains ``'/'``) or a day number.
    * anything else: treated as a per-date table with trailing statistics
      rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw table with string column labels.

    Returns
    -------
    pandas.DataFrame or tuple
        The cleaned frame alone for the first layout, otherwise the
        ``(data, extra)`` pair produced by ``clean_util``.
    """
    # Non-mutating drop: the caller's frame keeps its all-NaN columns.
    dataframe = dataframe.dropna(axis=1, how='all')
    # Pass `errors` by keyword instead of smuggling it through `args`.
    dataframe = dataframe.apply(pd.to_numeric, errors='coerce')
    dataframe.columns = [col.replace('United Kingdom', 'UK') for col in dataframe.columns]

    first_label = dataframe.index[0]

    if not isinstance(first_label, str):
        # rename_axis returns a new frame, so the input index keeps its name.
        return dataframe.rename_axis('Day')

    if 'Density' in first_label or 'Population' in first_label:
        # Date labels look like dd/mm/YYYY; day numbers contain no slash.
        index_name = 'Date' if '/' in dataframe.index[1] else 'Day'
        return clean_util(dataframe, index_name)

    return clean_util(dataframe, 'Date', statistics=True)


# Clean every table, rebinding each name to its cleaned version.
# clean_up returns a bare DataFrame only when the first index label is not a
# string; data_1day is bound without unpacking, so it is presumably such a table
# (plain numeric day index) -- confirm against the CSV.
data_1day = clean_up(data_1day)
# The remaining tables are unpacked as (data, extra rows). The second name
# reflects what the extra rows are expected to hold (trailing statistics,
# population, density) -- confirm against the CSVs.
data_1date, data_1date_metrics = clean_up(data_1date)
data_2day, data_2day_population = clean_up(data_2day)
data_2date, data_2date_population = clean_up(data_2date)
data_5day, data_5day_density = clean_up(data_5day)
data_5date, data_5date_density = clean_up(data_5date)