# Import necessary modules

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

# Load dataset

1. Super_dataset is the final dataset from our previous stage.
2. World_dataset is the dataset for world-wide data.

In [2]:
super_dataset = pd.read_csv('../../../data/stage_I/superDataset.csv')
world_dataset = pd.read_csv('../../../data/stage_II/owid-covid-data.csv')
super_dataset.head()

Unnamed: 0,countyFIPS,County Name,State,StateFIPS,population,2020-01-22_cases,2020-01-22_deaths,2020-01-23_cases,2020-01-23_deaths,2020-01-24_cases,...,2021-02-28_cases,2021-02-28_deaths,2021-03-01_cases,2021-03-01_deaths,2021-03-02_cases,2021-03-02_deaths,2021-03-03_cases,2021-03-03_deaths,2021-03-04_cases,2021-03-04_deaths
0,0,Statewide Unallocated,AL,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1001,Autauga County,AL,1,55869,0,0,0,0,0,...,6264,91,6270,91,6303,91,6313,91,6324,92
2,1003,Baldwin County,AL,1,223234,0,0,0,0,0,...,19732,283,19758,283,19790,284,19856,285,19873,289
3,1005,Barbour County,AL,1,24686,0,0,0,0,0,...,2115,51,2116,51,2124,51,2129,51,2136,51
4,1007,Bibb County,AL,1,22394,0,0,0,0,0,...,2450,60,2450,60,2454,60,2459,60,2461,60


In [3]:
world_dataset.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index
0,AFG,Asia,Afghanistan,2020-02-24,1.0,1.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
1,AFG,Asia,Afghanistan,2020-02-25,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
2,AFG,Asia,Afghanistan,2020-02-26,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
3,AFG,Asia,Afghanistan,2020-02-27,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511
4,AFG,Asia,Afghanistan,2020-02-28,1.0,0.0,,,,,...,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511


## Get United States data

1. Pull DataFrame whose location is "United States".
2. Fill empty columns with 0 value. Empty columns in this case represents that there are no recorded values
3. Extract year and week number from date column by converting date string to datetime object

In [4]:
us_dataset = world_dataset[world_dataset.location == 'United States'].fillna(0)
us_dataset[['year', 'week', 'day']] = pd.DataFrame(us_dataset.date.apply(lambda x: datetime.strptime(x, "%Y-%m-%d").isocalendar()).to_list(), index=us_dataset.index)
us_dataset.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,year,week,day
68974,USA,North America,United States,2020-01-22,1.0,0.0,0.0,0.0,0.0,0.0,...,10.79,19.1,24.6,0.0,2.77,78.86,0.926,2020,4,3
68975,USA,North America,United States,2020-01-23,1.0,0.0,0.0,0.0,0.0,0.0,...,10.79,19.1,24.6,0.0,2.77,78.86,0.926,2020,4,4
68976,USA,North America,United States,2020-01-24,2.0,1.0,0.0,0.0,0.0,0.0,...,10.79,19.1,24.6,0.0,2.77,78.86,0.926,2020,4,5
68977,USA,North America,United States,2020-01-25,2.0,0.0,0.0,0.0,0.0,0.0,...,10.79,19.1,24.6,0.0,2.77,78.86,0.926,2020,4,6
68978,USA,North America,United States,2020-01-26,5.0,3.0,0.0,0.0,0.0,0.0,...,10.79,19.1,24.6,0.0,2.77,78.86,0.926,2020,4,7


## Group data

1. Group data by year and week.
2. Apply mean on each group.
3. Apply lambda function to round each value to its nearest integer value.
4. Append starting date of each week in the aggregated dataframe.

In [5]:
weekly_data = us_dataset.groupby(['year', 'week']).mean()[["new_cases", "new_deaths"]].apply(lambda x: np.round(x).astype("int")).reset_index()
weekly_data['date'] = weekly_data.apply(lambda x: str(datetime.strptime("{0} {1} 2".format(x[0], x[1]), "%Y %W %w").date()), axis=1)
weekly_data.head(10)

Unnamed: 0,year,week,new_cases,new_deaths,date
0,2020,4,1,0,2020-01-28
1,2020,5,0,0,2020-02-04
2,2020,6,1,0,2020-02-11
3,2020,7,0,0,2020-02-18
4,2020,8,0,0,2020-02-25
5,2020,9,2,0,2020-03-03
6,2020,10,70,3,2020-03-10
7,2020,11,385,7,2020-03-17
8,2020,12,4527,75,2020-03-24
9,2020,13,15573,425,2020-03-31


## Display aggregated data

For both new_cases and new_deaths, read and display mean, median and mode values.

In [6]:
desc = weekly_data.describe().apply(lambda x: np.round(x).astype('int'))
for i in ['new_cases', 'new_deaths']:
    print("MEAN {0} per WEEK:\t".format(i.upper()), desc[i].loc["mean"])
    print("MEDIAN {0} per WEEK:\t".format(i.upper()), desc[i].loc["50%"])
    for k, v in weekly_data.new_cases.value_counts()[:1].to_dict().items():
        print("MODE {0} per WEEK:\t".format(i.upper()), k, ", frequency:", v)
    print("\n\n")

MEAN NEW_CASES per WEEK:	 70251
MEDIAN NEW_CASES per WEEK:	 43990
MODE NEW_CASES per WEEK:	 0 , frequency: 3



MEAN NEW_DEATHS per WEEK:	 1274
MEDIAN NEW_DEATHS per WEEK:	 957
MODE NEW_DEATHS per WEEK:	 0 , frequency: 3



