#### 4.4 Create the state facts table
- Load county facts table
- Discard weather data
- Group by state and timestamp
- Add up case/death total/delta
- Write out to parquet, partitioned by the `timestamp` column (epoch seconds; the sample rows suggest one value per day)

##### Setup
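I need Spark for this step: the county facts table is loaded as a Spark DataFrame, aggregated with the DataFrame API, and written out as partitioned parquet. Having a Spark session also gives me access to other functionality, such as the ability to create temporary SQL views of my dataframes.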

In [1]:
from setup import create_spark_session

# Build the session object used by every later cell. create_spark_session is the
# project helper imported from setup; presumably it returns a configured
# SparkSession — confirm against setup.py.
spark = create_spark_session()

Imports and output paths:

In [2]:
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.types import *

from clean import *
from etl import *

# Base directory for the parquet output. This is a relative local path for now;
# a later version may point at S3 instead. Keep the trailing slash: the file
# name is appended to this value by plain string concatenation.
output_path = "output/"

In [3]:
# Load the county-level facts table using the Spark session created earlier.
# NOTE(review): load_county_facts_table arrives via one of the star imports
# (clean or etl) — confirm which, and consider importing it by name.
county_facts_df = load_county_facts_table(spark)

In [4]:
# Preview five rows to sanity-check the columns and values that were loaded.
county_facts_df.limit(5).show()

+----+----------+----------------+----------------+-----------------+-----------------+--------+--------+-----------+----+----------+
|fips|     state|covid_case_total|covid_case_delta|covid_death_total|covid_death_delta|min_temp|max_temp|cloud_cover|wind| timestamp|
+----+----------+----------------+----------------+-----------------+-----------------+--------+--------+-----------+----+----------+
|1055|   Alabama|            6005|             100|               64|                0|    1.18|   15.19|       17.0|2.93|1606176000|
|1097|   Alabama|           19446|             140|              360|                2|    3.75|   20.04|       17.0|3.29|1606176000|
|5111|  Arkansas|            1602|              12|               39|                0|    2.77|   13.97|       20.0|1.41|1606176000|
|6079|California|            5885|              74|               35|                0|    6.09|   15.35|       12.0|2.19|1606176000|
|8059|  Colorado|           18369|             368|           

In [5]:
# Project down to the grouping keys (state, timestamp) and the four COVID
# metrics; fips and the weather columns are dropped here.
county_facts_df_reduced = county_facts_df.select(
    'state',
    'timestamp',
    'covid_case_total',
    'covid_case_delta',
    'covid_death_total',
    'covid_death_delta',
)
# Preview five rows of the reduced frame.
county_facts_df_reduced.limit(5).show()

+----------+----------+----------------+----------------+-----------------+-----------------+
|     state| timestamp|covid_case_total|covid_case_delta|covid_death_total|covid_death_delta|
+----------+----------+----------------+----------------+-----------------+-----------------+
|   Alabama|1606176000|            6005|             100|               64|                0|
|   Alabama|1606176000|           19446|             140|              360|                2|
|  Arkansas|1606176000|            1602|              12|               39|                0|
|California|1606176000|            5885|              74|               35|                0|
|  Colorado|1606176000|           18369|             368|              425|                9|
+----------+----------+----------------+----------------+-----------------+-----------------+



In [6]:
# Roll the county rows up to one row per (state, timestamp). Each COVID metric
# is summed across the group and keeps the name of its source column.
state_facts_df = (
    county_facts_df_reduced
    .groupBy('state', 'timestamp')
    .agg(*[
        F.sum(metric).alias(metric)
        for metric in (
            'covid_case_total',
            'covid_case_delta',
            'covid_death_total',
            'covid_death_delta',
        )
    ])
)

In [7]:
# Preview five aggregated rows. No ordering is applied, so which rows are shown
# is arbitrary.
state_facts_df.limit(5).show()

+---------+----------+----------------+----------------+-----------------+-----------------+
|    state| timestamp|covid_case_total|covid_case_delta|covid_death_total|covid_death_delta|
+---------+----------+----------------+----------------+-----------------+-----------------+
|Tennessee|1600905600|          181808|             882|             2281|               37|
|    Texas|1608768000|         1661626|            9138|            26868|              288|
| Michigan|1602288000|          143360|            1600|             7137|               19|
|   Oregon|1601683200|           34511|             348|              571|                8|
|    Texas|1601337600|          774438|            6448|            16179|               87|
+---------+----------+----------------+----------------+-----------------+-----------------+



In [8]:
# Persist the state facts as parquet, with one sub-directory per distinct
# `timestamp` value.
#
# Overwrite instead of append so that re-running the notebook is idempotent:
# with mode('append') every run added another full copy of each
# (state, timestamp) row to the existing partitions.
#
# partitionOverwriteMode=dynamic (Spark >= 2.3) restricts the overwrite to the
# partitions present in state_facts_df, so partitions written by earlier loads
# that are not part of this run are left in place.
(
    state_facts_df.write
    .partitionBy('timestamp')
    .option('partitionOverwriteMode', 'dynamic')
    .mode('overwrite')
    .parquet(output_path + "state_facts.parquet")
)