In [3]:
# from get_data import get_data
import pandas as pd
from datetime import datetime


def get_data(
    date_from=None,
    date_to=None,
    location=None
):
    """Get covid data
    Retrieve covid data in pandas dataframe format with the time periods and countries provided.

    Parameters
    ----------
    date_from : str, optional
        Start date of the data range with format in "YYYY-MM-DD" format. By default 'None' is used to represent 7 days prior to today's date
    date_to : str, optional
        End date of data range with format in "YYYY-MM-DD" format. By default 'None' is used to represent today's date
    location : list, optional
        List of target country names. By default "None" is used for all countries.

    Returns
    -------
    pandas.DataFrame
        Pandas dataframe of the selected covid data.

    Examples
    --------
    >>> get_data(date_from="2022-01-01", date_to="2022-01-07", location=["Canada", "China"])
    """
    query = "@date_from <= date <= @date_to"
    url = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"

    if date_from is None:
        date_from = (pd.to_datetime("today").normalize() -
                     pd.to_timedelta(7, unit="d")).strftime('%Y-%m-%d')

    if date_to is None:
        date_to = pd.to_datetime("today").normalize().strftime('%Y-%m-%d')

    try:
        if date_from != datetime.strptime(date_from, "%Y-%m-%d").strftime("%Y-%m-%d"):
            raise ValueError
    except ValueError:
        raise ValueError(
            'Invalid argument value: date_from must be in format of YYYY-MM-DD. Also check if it is a valid date.'
        )
    except TypeError:
        raise TypeError(
            'Invalid argument type: date_from must be in string format of YYYY-MM-DD.'
        )

    try:
        if date_to != datetime.strptime(date_to, "%Y-%m-%d").strftime("%Y-%m-%d"):
            raise ValueError
    except ValueError:
        raise ValueError(
            'Invalid argument value: date_to must be in format of YYYY-MM-DD. Also check if it is a valid date.'
        )
    except TypeError:
        raise TypeError(
            'Invalid argument type: date_to must be in string format of YYYY-MM-DD.'
        )

    if pd.to_datetime(date_to) < pd.to_datetime(date_from):
        raise ValueError(
            "Invalid values: date_from should be smaller or equal to date_to (or today's date if date_to is not specified)."
        )
    if pd.to_datetime(date_to) > pd.to_datetime("today").normalize():
        raise ValueError(
            "Invalid values: date_to should be smaller or equal to today."
        )

    if location is not None:

        if not (isinstance(location, list)):
            raise TypeError(
                "Invalid argument type: location must be a list of strings."
            )

        for item in location:
            if not (isinstance(item, str)):
                raise TypeError(
                    "Invalid argument type: values inside location list must be a strings."
                )

        query += " and location in @location"

    try:
        covid_df = pd.read_csv(url, parse_dates=["date"])
    except:
        return "The link to the data is broken."

    covid_df = covid_df.query(query)

    return covid_df


In [31]:
plot_metric()

'date_from: Date Formart incorrect, should be YYYY-MM-DD'

In [30]:
df = get_data()
plot_metric(date_from="2021-01-01")

'date_from: Date Formart incorrect, should be YYYY-MM-DD'

In [13]:
date_to = "2010"
try:
    if date_to != datetime.strptime(date_to, "%Y-%m-%d").strftime("%Y-%m-%d"):
        raise ValueError
except ValueError:
    raise ValueError(
        'Invalid argument value: date_to must be in format of YYYY-MM-DD. Also check if it is a valid date.'
    )

ValueError: Invalid argument value: date_to must be in format of YYYY-MM-DD. Also check if it is a valid date.

In [12]:
import pandas as pd
import altair as alt
from dateutil.parser import parse

def plot_summary(
    df,
    var="location",
    val="new_cases",
    fun="sum",
    date_from=None,
    date_to=None,
    top_n=5,
):
    """Generate summary plot

    Create a horizontal bar chart summarising a specified variable and value
    within a time period

    Parameters
    ----------
    df  : Pandas dataframe
        Pandas dataframe of the selected covid data from get_data()
    var : str, optional
        Qualitative values to segment data. Must be a categorical variable.
        Also known as a 'dimension'. By default 'location'
    val : str, optional
        Quantitative values to be aggregated. Must be numeric variable.
        Also known as a 'measure'. By default 'new_cases'
    fun : str, optional
        Aggregation function for val, by default 'sum'
    date_from : str, optional
        Start date of the data range with format 'YYYY-MM-DD'. By default 'None' is used to represent 7 days prior to today's date
    date_to : str, optional
        End date of data range with format 'YYYY-MM-DD'. By default 'None' is used to represent today's date
    top_n : int, optional
        Specify number of qualitative values to show, by default 5
    """
    # init dates if None
    if date_from is None:
        date_from = (
            pd.to_datetime("today").normalize() - pd.to_timedelta(7, unit="d")
        ).strftime("%Y-%m-%d")

    if date_to is None:
        date_to = pd.to_datetime("today").normalize().strftime("%Y-%m-%d")

    # Exception Handling
    if not isinstance(df, pd.DataFrame):
        raise FileNotFoundError("Data not found! There may be a problem with data URL.")

    if not isinstance(var, str):
        raise TypeError("var needs to be of str type!")

    if not isinstance(val, str):
        raise TypeError("val needs to be of str type!")

    if not isinstance(fun, str):
        raise TypeError("fun needs to be of str type!")

    if df[var].dtypes.kind != "O":
        raise TypeError("var needs to be a categorical variable!")

    if df[val].dtypes.kind == "O":
        raise TypeError("val needs to be a numeric variable!")

    if not isinstance(top_n, int) or top_n <= 0:
        raise ValueError("top_n must be an integer bigger than zero")

    if pd.to_datetime(date_to) < pd.to_datetime(date_from):
        raise ValueError(
            "Invalid values: date_from should be smaller or equal to date_to (or today's date if date_to is not specified)."
        )
    if pd.to_datetime(date_to) > pd.to_datetime("today").normalize():
        raise ValueError("Invalid values: date_to should be smaller or equal to today.")

    # Parse date, else raise ValueError
    date_from = parse(date_from)
    date_to = parse(date_to)

    # Convert 'date' to date format
    df["date"] = pd.to_datetime(df["date"])

    # Filter by date
    df = df.query("date >= @date_from & date <= @date_to")

    # Remove aggregated locations
    df = df[~df["iso_code"].str.startswith("OWID")]

    # Aggregation
    df_plot = df.groupby(var).agg({val: fun})[val].nlargest(top_n)
    df_plot = df_plot.to_frame().reset_index()

    return alt.Chart(df_plot).mark_bar().encode(y=alt.Y(var, sort="x"), x=val)


In [34]:
# File Name: plot_metric.py
# Author: Rohit Rawat

# import altair as alt
# from get_data import get_data
# alt.data_transformers.enable('data_server')

def plot_metric(metric='positive_rate', date_from=None, date_to=None):
    """
    Create a line chart visualizing COVID total new
    cases and another metric for a specific time period
    
    Parameters
    ----------
    metric    : str, optional
                The name of the metric to be plotted with the new COVID cases. 
                It can be one of the these: "reproduction_rate", "positive_rate",
                or any other numeric column
    date_from : str, optional
                Start date of the plot in "YYYY-MM-DD" format, by default "2022-01-01"
    date_to   : str, optional
                End date of the plot in "YYYY-MM-DD" format, by default "2022-01-13"
    Returns
    -------
    chart
        The line chart created
    """
    
    # Check the input format of arguments
    if not isinstance(metric, str):
        return 'Incorrect argument type: Metric 1 input should be a float'

    if (not isinstance(date_from, str)) and date_from is not None:
        return 'Incorrect argument type: The starting date should be in string format'

    if (not isinstance(date_to, str)) and date_to is not None:
        return 'Incorrect argument type: The end date should be in string format'

    try:
        df = get_data(date_from, date_to)
    except:
        return 'Error in date format: Could not fetch data using get_data. Incorrect date format'
    
    # Check if the metric provided is present in the data frame or not
    if metric not in df.columns:
        return 'Incorrect argument value: The metric chosen is not one of the columns in dataframe'

    
    metric_label = "Mean " + metric.replace("_", " ")
    
    base = alt.Chart(df).encode(x=alt.X('yearmonthdate(date):T',
                                axis=alt.Axis(format='%e %b, %Y'),
                                title='Date'))

    line1 = base.mark_line(color='skyblue', interpolate='monotone'
                           ).encode(alt.Y('sum(new_cases)',
                                    scale=alt.Scale(zero=False),
                                    axis=alt.Axis(title='Daily new cases'
                                    , titleColor='skyblue')))
    
    line2 = base.mark_line(color='orange', interpolate='monotone'
                           ).encode(alt.Y(f"mean({metric})",
                                    scale=alt.Scale(zero=False),
                                    axis=alt.Axis(title=metric_label
                                    , titleColor='orange')))

    plot = alt.layer(line1, line2,
                     title= 'Daily COVID cases versus ' + metric_label
                     ).resolve_scale(y='independent')

    return plot


In [14]:
df=get_data(date_from="2020-10-10")
plot_summary(df, val="new_cases")

In [28]:
date_to ="20139"
if date_to != datetime.strptime(date_to, "%Y-%m-%d").strftime("%Y-%m-%d"):
    raise ValueError('Invalid argument value: date_to must be in format of YYYY-MM-DD. Also check if it is a valid date.')

ValueError: time data '20139' does not match format '%Y-%m-%d'

In [225]:
title=None
if title!=None:
    if not isinstance(title, str):
        raise TypeError("Invalid argument type: title must be a string.")

In [226]:
import pandas as pd
from datetime import datetime
import altair as alt
from dateutil.parser import parse


def plot_spec(
    df,
    location=["Canada"],
    val="new_cases",
    date_from=None,
    date_to=None,
    title=None
):
    """
    Create a line chart presenting specific country/countries COVID information
    within a time period
    
    Parameters
    ----------
    df  : Pandas dataframe
        Pandas dataframe of the selected covid data from get_data()
    location : list, optional
        List of target country names or . By default ["Canada"]
    val : str, optional
        Quantitative values of interests. Must be numeric variable.
        Also known as a 'measure'. By default 'new_cases'
    date_from : str, optional
        Start date of the data range with format in "YYYY-MM-DD" format. By 
        default 'None' is used to represent 7 days prior to today's date
    date_to : str, optional
        End date of data range with format in "YYYY-MM-DD" format. By default
        'None' is used to represent today's date
    title : str, optional
        The title of the plot. By default 'None' will be generated based on val.
    
    Returns
    -------
    plot
        The line chart created
    """
    # init dates if None
    if date_from is None:
        date_from = (pd.to_datetime("today").normalize() - pd.to_timedelta(7, unit="d")).strftime("%Y-%m-%d")

    if date_to is None:
        date_to = pd.to_datetime("today").normalize().strftime("%Y-%m-%d")
    

    
    # Exception Handling
    if not isinstance(df, pd.DataFrame):
        raise FileNotFoundError("Data not found. There may be a problem with data URL.")

    if not isinstance(location, list):
        raise TypeError("Invalid argument type: location must be a list of strings.")
    for item in location:
        if not (isinstance(item, str)):
            raise TypeError("Invalid argument type: values inside location list must be strings.")
    
    if not isinstance(val, str):
        raise TypeError("Invalid argument type: val must be a string.")

    if df[val].dtypes.kind == "O":
        raise TypeError("Invalid argument type: val must be a numeric variable.")
    
    try:
        date_from != datetime.strptime(date_from, "%Y-%m-%d").strftime("%Y-%m-%d")
    except ValueError:
        raise ValueError(
            'Invalid argument value: date_from must be in format of YYYY-MM-DD. Also check if it is a valid date.'
        )
    
    try:
        date_to != datetime.strptime(date_to, "%Y-%m-%d").strftime("%Y-%m-%d")
    except ValueError:
        raise ValueError(
            'Invalid argument value: date_to must be in format of YYYY-MM-DD. Also check if it is a valid date.'
        )

    if pd.to_datetime(date_to) < pd.to_datetime(date_from):
        raise ValueError(
            "Invalid values: date_from should be smaller or equal to date_to (or today's date if date_to is not specified)."
        )
    if pd.to_datetime(date_to) > pd.to_datetime("today").normalize():
        raise ValueError("Invalid values: date_to should be smaller or equal to today.")
        
    if title != None:
        if not isinstance(title, str):
            raise TypeError("Invalid argument type: title must be a string.")
    
    # Parse date, else raise ValueError
    date_from = parse(date_from)
    date_to = parse(date_to)

    # Convert 'date' to date format
    df["date"] = pd.to_datetime(df["date"])

    # Filter by date
    df = df.query("date >= @date_from & date <= @date_to")
    
    # Filter by country
    df = df.query("location in @location")
    
    # Remove aggregated locations
    df = df[~df["iso_code"].str.startswith("OWID")]
    
    # Create Y axis lable
    val_label = val.replace("_", " ").title()

    # init plot title if None
    if title is None:
        title = f"COVID {val_label}"
    
    # Create line plot
    line = alt.Chart(df, title = title).mark_line().encode(
        x=alt.X('monthdate(date):T', axis=alt.Axis(format='%e %b, %Y'), title='Date'),
        y=alt.Y(val, title=val_label),
        color=alt.Color('location', legend=None),
        tooltip=['location', val]
    )
    
    # Use direct labels
    order = (df.loc[df['date'] == df['date'].max()].sort_values(val, ascending=False))

    text = alt.Chart(order).mark_text(dx=20).encode(
        x=alt.X('monthdate(date):T', axis=alt.Axis(format='%e %b, %Y'), title='Date'),
        y=alt.Y(val, title=val_label),
        text='location',
        color='location',
    )
    plot = line + text
    return plot

In [244]:
assert plot_metric().layer[1].mark.type, 'not a line'

In [246]:
assert plot_metric().layer[1].mark.type == 'not a line', "some optional message if the test failed"

AssertionError: some optional message if the test failed

In [233]:
assert plot_metric(date_from=123), 'Incorrect argument type: The starting date should be in string format'
assert plot_metric(date_from=123), 'Incorrect'
assert True, "sdfihs"

In [144]:
import pickle


pickle.dump(df, open("test_df_plot_spec.pkl", "wb"))  # save it into a file named save.p


df = pickle.load(open("test_df_plot_spec.pkl", "rb"))
df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
607,AFG,Asia,Afghanistan,2021-10-23,155940.0,9.0,28.714,7253.0,1.0,2.143,...,,,37.746,0.5,64.83,0.511,,,,
608,AFG,Asia,Afghanistan,2021-10-24,155944.0,4.0,25.714,7255.0,2.0,1.714,...,,,37.746,0.5,64.83,0.511,,,,
609,AFG,Asia,Afghanistan,2021-10-25,156040.0,96.0,37.714,7260.0,5.0,2.000,...,,,37.746,0.5,64.83,0.511,,,,
610,AFG,Asia,Afghanistan,2021-10-26,156071.0,31.0,38.571,7262.0,2.0,2.143,...,,,37.746,0.5,64.83,0.511,,,,
611,AFG,Asia,Afghanistan,2021-10-27,156124.0,53.0,37.857,7266.0,4.0,2.714,...,,,37.746,0.5,64.83,0.511,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156309,ZWE,Africa,Zimbabwe,2022-01-17,226460.0,382.0,494.286,5258.0,11.0,11.143,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
156310,ZWE,Africa,Zimbabwe,2022-01-18,226460.0,0.0,494.286,5258.0,0.0,11.143,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
156311,ZWE,Africa,Zimbabwe,2022-01-19,226887.0,427.0,350.571,5266.0,8.0,7.286,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
156312,ZWE,Africa,Zimbabwe,2022-01-20,227552.0,665.0,352.571,5276.0,10.0,7.714,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,


In [179]:
assert plot_metric().layer[0].mark.type, 'line'
assert plot_metric().layer[0].mark.type, 'xfsafdsfsdf'

In [141]:
df

<function __main__.df()>

In [199]:
from covizpy.plot_spec import plot_spec
from covizpy.get_data import get_data
import pandas as pd
from pytest import raises, fixture
import pickle


@fixture
def df():
    """
    Retrieve the dataframe
    """
    with open("tests/test_df_plot_spec.pkl", "rb") as pickle_file:
        return pickle.load(pickle_file)


def test_plot_spec_inputs(df):
    """
    Test the input type exceptions of plot_spec()
    """
    # check input type of location
    with raises(TypeError) as e:
        plot_spec(df, location="Canada")
    assert "Invalid argument type: location must be a list of strings." == str(e.value)

    # check input type of the item inside location
    with raises(TypeError) as e:
        plot_spec(df, location=["Canada", 231])
    assert "Invalid argument type: values inside location list must be strings." == str(e.value)

    # check input type of val
    with raises(TypeError) as e:
        plot_spec(df, val=123)
    assert "Invalid argument type: val must be a string." == str(e.value)
    
    # check the column type of val
    with raises(TypeError) as e:
        plot_spec(df, val="iso_code")
    assert "Invalid argument type: val must be a numeric variable." == str(e.value)
    
    # check the Value of date_from
    with raises(ValueError) as e:
        plot_spec(df, date_from="iso_code")
    assert "Invalid argument value: date_from must be in format of YYYY-MM-DD. Also check if it is a valid date." == str(e.value)
    
    # check the Value of date_to
    with raises(ValueError) as e:
        plot_spec(df, date_to="342432")
    assert "Invalid argument value: date_to must be in format of YYYY-MM-DD. Also check if it is a valid date." == str(e.value)
    
    # check date_from and date_to logic
    with raises(ValueError) as e:
        plot_spec(df, date_from="2021-06-15", date_to="2021-06-11")
    assert "Invalid values: date_from should be smaller or equal to date_to (or today's date if date_to is not specified)." == str(e.value)
    
    # check date_to value
    with raises(ValueError) as e:
        plot_spec(df, date_to="2022-06-11")
    assert "Invalid values: date_to should be smaller or equal to today." == str(e.value)


def test_plot_spec_mapping(df):
    """
    Test plot output of plot_spec()
    """
    # check y-axis is using the correct variable
    assert plot_spec(df, val="new_deaths").layer[0].encoding.y.shorthand == "new_deaths", "Altair chart y-axis should be using variable 'new_deaths'"
    
    # check y-axis is using the correct lable
    assert plot_spec(df, val="new_deaths").layer[1].encoding.y.title == "New Deaths", "Altair chart y-axis should have lable 'New Deaths'"
    
    # check the first layer of the graph is line
    assert plot_spec(df).layer[0].mark == 'line', "Altair chart first layer should be line"
    
    # check the title of the graph
    assert plot_spec(df, title="Daily cases").layer[0].title=="Daily cases", "Altair chart title should be changeable"
    
    # check the data of the graph
    assert (plot_spec(df, date_from="2022-01-19", date_to="2022-01-20").layer[0].data["new_cases"] == [16849, 15775]).all(), "Altair chart data is wrong"

In [220]:
assert (plot_spec(df, date_from="2022-01-19", date_to="2022-01-20").layer[0].data["new_cases"] == [16849, 15775]).all(), "Altair chart data retried is wrong"

AssertionError: dsf

In [218]:
plot_spec(df, date_from="2022-01-19", date_to="2022-01-20").layer[0].data["new_cases"].all()

True

In [137]:
def df():
    """
    Retrieve the dataframe
    """
    with open("test_df_plot_spec.pkl", "rb") as pickle_file:
        return pickle.load(pickle_file)


def test_plot_spec_inputs(df):
    """
    Test the input type exceptions of plot_spec()
    """
    # check input type of location
    with raises(TypeError) as e:
        plot_spec(df, location="sadsd")
    assert "Invalid argument type: location must be a list of strings." == str(e.value)
    
test_plot_spec_inputs(df())

In [109]:
df2 = pd.read_pickle('test_df_plot_spec.pkl')
# print the dataframe
print(df2)

<function df at 0x7f8f7bf1f0d0>


In [35]:
name = list(df["location"].unique())
name

['Afghanistan',
 'Africa',
 'Albania',
 'Algeria',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Asia',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bonaire Sint Eustatius and Saba',
 'Bosnia and Herzegovina',
 'Botswana',
 'Brazil',
 'British Virgin Islands',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Colombia',
 'Comoros',
 'Congo',
 'Cook Islands',
 'Costa Rica',
 "Cote d'Ivoire",
 'Croatia',
 'Cuba',
 'Curacao',
 'Cyprus',
 'Czechia',
 'Democratic Republic of Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Eswatini',
 'Ethiopia',
 'Europe',
 'European Union',


In [78]:
plot_spec(
    df,
    location=name,
    val="new_cases",
    date_from= "2020-12-23",
    date_to=None,
)


MaxRowsError: The number of rows in your dataset is greater than the maximum allowed (5000). For information on how to plot larger datasets in Altair, see the documentation

alt.LayerChart(...)

In [37]:
plot_metric()

In [88]:
import plot_metric
def test_plot_metric_input():
    """
    Test the inputs of plot_metric()
    """
    # check the data type of the date_from
    assert plot_metric(date_from=123), 'Incorrect argument type: The starting date should be in string format'
test_plot_metric_input()

ModuleNotFoundError: No module named 'covizpy.get_data'; 'covizpy' is not a package

In [36]:
assert plot_spec(df,val=123), "Invalid argument type: val must be a numeric variable."

TypeError: Invalid argument type: val must be a string.