In [7]:
import pandas as pd
import glob
import re
import matplotlib.pyplot as plt
import seaborn as sns
from zipfile import ZipFile

In [8]:
def init_state_social_distancing_actions(path):
  """Load every CSV inside the zip archive(s) matching ``path`` into one DataFrame.

  Each CSV member is treated as one dated snapshot: the date is parsed from the
  member's file name, the first (unnamed) column is renamed to "State", and rows
  whose state is "United States" or missing are dropped.

  Args:
    path: glob pattern (passed to ``glob.glob``) selecting the zip archive(s).

  Returns:
    DataFrame indexed by ("Date", "State") and sorted by that index.

  Raises:
    ValueError: if a CSV member's name contains no date, or if no CSV member
      is found at all (previously an opaque TypeError / KeyError).
  """
  frames = []
  for filename in glob.glob(path):
    # The context manager closes the archive even if parsing a member fails.
    with ZipFile(filename) as zip_file:
      for text_file in zip_file.infolist():
        # Skip non-CSV members completely. Previously the accumulate step ran for
        # them too, re-appending the previous frame (duplicate rows) or raising
        # NameError when the first member was not a CSV.
        if not text_file.filename.endswith('.csv'):
          continue
        date_match = re.search(r'\d*-\d*-\d*', text_file.filename)
        if date_match is None:
          raise ValueError(f"No date found in CSV file name: {text_file.filename!r}")
        date = date_match[0]
        # Special case kept from the original: a five-digit year (presumably a
        # typo in one file name) is corrected to 2021-06-01.
        if date == "20201-06-01":
          date = "2021-06-01"
        with zip_file.open(text_file.filename) as csv_file:
          df = pd.read_csv(csv_file, sep=",", header=0)
        df["Date"] = pd.to_datetime(date)
        df = df.rename(columns={'Unnamed: 0': 'State'})
        # Boolean mask instead of `Index | Index`, which newer pandas no longer
        # treats as a set union.
        unwanted_rows = df["State"].isin(["United States"]) | df["State"].isnull()
        frames.append(df[~unwanted_rows])
  if not frames:
    raise ValueError(f"No CSV files found in archives matching {path!r}")
  # Concatenate once at the end rather than growing the frame inside the loop.
  df_master = pd.concat(frames)
  df_master = df_master.set_index(["Date", "State"]).sort_index()
  return df_master

# Build the master frame from the zipped CSV snapshots.
# NOTE(review): the loader calls glob.glob without recursive=True, so the leading "**"
# behaves like a plain "*" wildcard and does not search nested directories.
state_social_distancing_actions = init_state_social_distancing_actions(r'**csv_files/state_social_distancing_actions.zip')

In [9]:
def clean_state_social_distancing_actions(df):
  """Return a copy of ``df`` without the "Primary Election Postponement" column.

  Args:
    df: DataFrame that contains a "Primary Election Postponement" column.

  Returns:
    A new DataFrame with that column removed; ``df`` itself is not modified.

  Raises:
    KeyError: if the column is not present in ``df``.
  """
  unused_columns = ["Primary Election Postponement"]
  return df.drop(labels=unused_columns, axis="columns")

# Apply the cleaning step to the loaded frame; the bare name on the last line
# makes the cleaned frame the cell's displayed output.
cleaned_state_social_distancing_actions = clean_state_social_distancing_actions(state_social_distancing_actions)
cleaned_state_social_distancing_actions

Unnamed: 0_level_0,Unnamed: 1_level_0,State Is Easing Social Distancing Measures,Stay at Home Order,Mandatory Quarantine for Travelers,Non-Essential Business Closures,Large Gatherings Ban,School Closures,Restaurant Limits,Emergency Declaration,Face Covering Requirement,Status of Reopening,Bar Closures,Bar Closures*,Statewide Face Mask Requirement
Date,State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-06-04,Alabama,Yes,Lifted,-,All Non-Essential Businesses Permitted to Reop...,Lifted,Closed for School Year,Reopened to Dine-in Service,Yes,,,,,
2020-06-04,Alaska,Yes,Lifted,All Travelers,All Non-Essential Businesses Permitted to Reopen,Lifted,Closed for School Year,Reopened to Dine-in Service,Yes,,,,,
2020-06-04,Arizona,Yes,Lifted,Lifted,All Non-Essential Businesses Permitted to Reop...,Lifted,Closed for School Year,Reopened to Dine-in Service with Capacity Limits,Yes,,,,,
2020-06-04,Arkansas,Yes,-,From Certain States,-,>10 People Prohibited,Closed for School Year,Reopened to Dine-in Service with Capacity Limits,Yes,,,,,
2020-06-04,California,Yes,Statewide,-,Some Non-Essential Businesses Permitted to Reo...,All Gatherings Prohibited,Recommended Closure for School Year,Closed Except for Takeout/Delivery,Yes,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-11-09,Virginia,,,,,,,,No,,Reopened,,,No
2021-11-09,Washington,,,,,,,,Yes,,Reopened,,,Indoor Only
2021-11-09,West Virginia,,,,,,,,Yes,,Reopened,,,No
2021-11-09,Wisconsin,,,,,,,,No,,Reopened,,,No


Clean up the dataframe to remove unused columns and fill in NaN fields.
- Have to manually insert face mask requirements for recent months based on: https://statepolicies.com/data/graphs/face-masks/
- Face mask mandate was reintroduced from Dec 15, 2021 -> Feb 15, 2022

In [36]:
# Select California's rows and move "State" out of the index so the frame is
# indexed by date only. The non-inplace chain avoids mutating a filtered slice
# in place, which can trigger pandas' SettingWithCopyWarning.
california_data = (
  cleaned_state_social_distancing_actions
  .loc[cleaned_state_social_distancing_actions.index.get_level_values('State').isin(['California'])]
  .reset_index("State")
)
# Keep only the first row for any date that appears more than once.
california_data = california_data[~california_data.index.duplicated(keep='first')]
# Fill in missing dates with rows equal to the previous date that has data.
# Note: this pads missing *rows* only; NaN values inside existing rows stay NaN.
days_idx = pd.date_range(start=california_data.index[0], end="2022-04-18", freq="D")
california_data = california_data.reindex(days_idx, method="pad")

# Update face mask data. Periods are entered manually (inclusive start/end dates;
# None means open-ended), based on https://statepolicies.com/data/graphs/face-masks/
ca_face_mask_periods = [
  (None, "2020-06-17", "No"),
  ("2020-06-18", "2021-06-14", "Yes"),
  ("2021-06-15", "2021-12-14", "No"),
  ("2021-12-15", "2022-02-14", "Yes"),
  ("2022-02-15", None, "No"),
]
for ca_mask_start, ca_mask_end, ca_mask_required in ca_face_mask_periods:
  california_data.loc[ca_mask_start:ca_mask_end, "Face Covering Requirement"] = ca_mask_required

# TODO: Could add
# - Stay at home order (https://statepolicies.com/data/graphs/stay-at-home-order/)
# - Closed places like restaurants, schools, ... (https://statepolicies.com/data/graphs/reopening-2/)

# Must add
# - R number / infection rate
# - Vaccination rate

# Keep only the columns used downstream and display them as the cell output.
neccessary_ca_data = california_data[["Face Covering Requirement", "Large Gatherings Ban"]]
neccessary_ca_data


Unnamed: 0,Face Covering Requirement,Large Gatherings Ban
2020-06-04,No,All Gatherings Prohibited
2020-06-05,No,All Gatherings Prohibited
2020-06-06,No,All Gatherings Prohibited
2020-06-07,No,All Gatherings Prohibited
2020-06-08,No,All Gatherings Prohibited
...,...,...
2022-04-14,No,
2022-04-15,No,
2022-04-16,No,
2022-04-17,No,
