In [None]:
import pandas as pd
import glob
import re
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from zipfile import ZipFile

## Import Covid State Restrictions

In [None]:
def init_state_social_distancing_actions(path):
  """Load the state social-distancing snapshots from zipped CSV files.

  Every ``.csv`` entry of every archive matching ``path`` is read as one
  snapshot. The snapshot date is taken from the entry's file name, the
  unnamed first column is renamed to ``State``, and rows for
  "United States" or without a state are dropped.

  Args:
    path: glob pattern selecting the zip archive(s) to read.

  Returns:
    DataFrame indexed by (``Date``, ``State``), sorted by that index.

  Raises:
    FileNotFoundError: if no archive matches ``path``.
    ValueError: if a CSV entry has no date in its name, or if the
      archives contain no CSV entries at all.
  """
  files_in_folder = glob.glob(path)
  if not files_in_folder:
    raise FileNotFoundError("No archive matches pattern {!r}".format(path))

  frames = []
  for filename in files_in_folder:
    # Context manager guarantees the archive handle is closed again.
    with ZipFile(filename) as zip_file:
      for text_file in zip_file.infolist():
        entry_name = text_file.filename
        # Only real CSV members are snapshots; directory entries, other file
        # types and macOS resource-fork copies must not contribute rows.
        if not entry_name.endswith('.csv') or entry_name.startswith('__MACOSX/'):
          continue
        date_match = re.search(r'\d+-\d+-\d+', entry_name)
        if date_match is None:
          raise ValueError("No date found in file name {!r}".format(entry_name))
        date = date_match[0]
        # One snapshot file carries a typo in its name.
        if date == "20201-06-01":
          date = "2021-06-01"
        with zip_file.open(entry_name) as csv_handle:
          df = pd.read_csv(csv_handle, sep=",", header=0)
        df["Date"] = pd.to_datetime(date)
        df = df.rename(columns={'Unnamed: 0': 'State'})
        # Boolean mask instead of `Index | Index`, which is no longer a set
        # union in pandas 2.
        unwanted = df["State"].isin(["United States"]) | df["State"].isnull()
        frames.append(df[~unwanted])

  if not frames:
    raise ValueError("No CSV files found in archives matching {!r}".format(path))

  # Concatenate once instead of growing the frame inside the loop.
  df_master = pd.concat(frames)
  df_master = df_master.set_index(["Date", "State"]).sort_index()
  return df_master

# Build the (Date, State)-indexed restrictions table from the zipped snapshots.
# NOTE(review): the loader calls glob.glob without recursive=True, so the leading
# '**' presumably behaves like a plain '*' here - confirm the data directory layout.
state_social_distancing_actions = init_state_social_distancing_actions(r'**csv_files/state_social_distancing_actions.zip')

In [None]:
def clean_state_social_distancing_actions(df):
  """Return a copy of ``df`` without the unused election-postponement column.

  Args:
    df: restrictions DataFrame containing a
      "Primary Election Postponement" column.

  Returns:
    A new DataFrame with that column removed; ``df`` itself is not modified.
  """
  unused_columns = ["Primary Election Postponement"]
  return df.drop(columns=unused_columns)

# Apply the cleaning step to the loaded restrictions table; the bare name on the
# last line makes the cleaned frame the cell's displayed output.
cleaned_state_social_distancing_actions = clean_state_social_distancing_actions(state_social_distancing_actions)
cleaned_state_social_distancing_actions

Clean up the dataframe: remove unused columns and fill in the NaN fields.
- Have to manually insert face mask requirements for recent months based on: https://statepolicies.com/data/graphs/face-masks/
- Face mask mandate was reintroduced from Dec 15, 2021 -> Feb 15, 2022

Other changes
- Manually inserted "No Limit" for the large gatherings ban after 2021-08-15 because the data is missing

In [None]:
# Keep only the California rows and index them by date alone
is_california = cleaned_state_social_distancing_actions.index.get_level_values('State').isin(['California'])
california_restrictions_data = cleaned_state_social_distancing_actions[is_california].reset_index("State")
california_restrictions_data = california_restrictions_data[~california_restrictions_data.index.duplicated(keep='first')]
# Fill in missing dates with rows equal to the previous date that has data
days_idx = pd.date_range(start=california_restrictions_data.index[0], end="2022-04-18", freq="D")
california_restrictions_data = california_restrictions_data.reindex(days_idx, method="pad")
# Update facemask data
california_restrictions_data.loc[: "2020-06-17", "Face Covering Requirement"] = 0 # No
california_restrictions_data.loc["2020-06-18" : "2021-06-14", "Face Covering Requirement"] = 1 # Yes
california_restrictions_data.loc["2021-06-15" : "2021-12-14", "Face Covering Requirement"] = 0
california_restrictions_data.loc["2021-12-15" : "2022-02-14", "Face Covering Requirement"] = 1
california_restrictions_data.loc["2022-02-15" : , "Face Covering Requirement"] = 0
# Manually insert gathering limit for missing values
california_restrictions_data.loc["2021-08-16":, "Large Gatherings Ban"] = "No Limit"
# Transform Large Gatherings Ban to a severity of the rules (1: no restrictions, 5: all gatherings prohibited)
large_gatherings_severity = {'All Gatherings Prohibited': 5, '>50 Prohibited': 4, 'Expanded Limit to 25 or Fewer': 3, '>25 Prohibited': 3, '>10 Prohibited': 2, 'No Limit': 1}
# Assign the result back: an in-place replace on a column selection is not
# guaranteed to reach the parent frame (it does not under copy-on-write).
california_restrictions_data["Large Gatherings Ban"] = california_restrictions_data["Large Gatherings Ban"].replace(large_gatherings_severity)
# Set the missing restaurant values to open. The previous chained assignment
# (df[mask]["col"] = ...) wrote to a temporary copy and left the NaNs in place.
california_restrictions_data["Restaurant Limits"] = california_restrictions_data["Restaurant Limits"].fillna("Open")

# One-hot encode the restaurant limits; the first category is dropped as the baseline
ohe_restaurant_limits = pd.get_dummies(california_restrictions_data["Restaurant Limits"], prefix='Restaurant Limits', drop_first=True)

selected_ca_restrictions = pd.concat([california_restrictions_data[["Face Covering Requirement", "Large Gatherings Ban"]], ohe_restaurant_limits], axis=1)

# Show a preview of the selected features instead of dumping a whole frame
selected_ca_restrictions.head()

## Import Vaccination Data

In [None]:
def init_vaccination_df_from_zip(path, index_col):
    """Read the zipped vaccination CSV selected by a glob pattern.

    Args:
        path: glob pattern for the zip archive; the first match is read.
        index_col: column name(s) passed to ``pd.read_csv`` as the index.

    Returns:
        DataFrame with the archive's CSV contents, indexed by ``index_col``.

    Raises:
        FileNotFoundError: if no file matches ``path``.
    """
    matches = glob.glob(path)
    if not matches:
        # Without this check the failure is a bare IndexError from `[0]`.
        raise FileNotFoundError(f"No vaccination archive matches pattern {path!r}")
    return pd.read_csv(matches[0], compression='zip', header=0, sep=',', index_col=index_col)

# Load the vaccination table, keep the California rows, and order them by date
vaccination_df = init_vaccination_df_from_zip(r'**csv_files/COVID-19_Vaccinations_in_the_United_States_Jurisdiction.csv.zip', ["Date"])
is_ca_row = vaccination_df["Location"] == "CA"
vaccination_df = vaccination_df[is_ca_row]
# The index is read in as text; convert it to timestamps before sorting
vaccination_df.index = pd.to_datetime(vaccination_df.index)
vaccination_df = vaccination_df.sort_index()
vaccination_df

Add extra rows so that the vaccination data covers the same time range as the other datasets
- Insert 0 as the number of vaccinations for these dates, since no one had been vaccinated before the first date in the current dataframe

In [None]:
# Dates between the start of the study window and the day before vaccinations begin
days_idx = pd.date_range(start="2020-06-04", end="2020-12-13", freq="D")
# Rows for dates that are absent from vaccination_df are filled with 0 in every column
days_with_missing_data = vaccination_df.reindex(days_idx, fill_value=0)
days_with_missing_data["Location"] = "CA"
days_with_missing_data.index.names = ['Date']
# DataFrame.append was removed in pandas 2.0, so use pd.concat. Only rows after
# the padded range are taken from the existing frame, which keeps this cell
# idempotent when it is re-run.
vaccination_df = pd.concat([days_with_missing_data, vaccination_df[vaccination_df.index > days_idx[-1]]])

In [None]:
# Turn the running total of administered doses into a per-day count
total_administered = vaccination_df["Administered"]
daily_vaccinations = total_administered.diff(periods=1)
# Keep a single row per date
daily_vaccinations = daily_vaccinations[~daily_vaccinations.index.duplicated(keep='first')]
# The first day has no predecessor to difference against, so it is set to 0
daily_vaccinations.loc["2020-06-04"] = 0

# Because there is a mistake in the data on 2022-01-27, where the total number of
# vaccines suddenly drops by 1593072, we chose to set the value for that day to 0
daily_vaccinations.loc["2022-01-27"] = 0
vaccination_df_administered = daily_vaccinations.to_frame(name="Daily vaccinations")
vaccination_df_administered

Convert Total Vaccines Administered from a cumulative count to a day-by-day number of vaccinations

## Import daily covid deaths

In [None]:
def init_daily_reports(path):
  """Load the daily COVID reports from zipped CSV files.

  Every ``.csv`` entry of every archive matching ``path`` is read as one
  daily report. The report date is taken from the entry's file name, the
  ``Province_State`` column is renamed to ``State``, and rows for
  "United States" or without a state are dropped.

  Args:
    path: glob pattern selecting the zip archive(s) to read.

  Returns:
    DataFrame indexed by (``Date``, ``State``), sorted by that index.

  Raises:
    FileNotFoundError: if no archive matches ``path``.
    ValueError: if a CSV entry has no date in its name, or if the
      archives contain no CSV entries at all.
  """
  files_in_folder = glob.glob(path)
  if not files_in_folder:
    raise FileNotFoundError("No archive matches pattern {!r}".format(path))

  frames = []
  for filename in files_in_folder:
    # Context manager guarantees the archive handle is closed again.
    with ZipFile(filename) as zip_file:
      for text_file in zip_file.infolist():
        entry_name = text_file.filename
        # Only real CSV members are reports; directory entries, other file
        # types and macOS resource-fork copies must not contribute rows.
        if not entry_name.endswith('.csv') or entry_name.startswith('__MACOSX/'):
          continue
        date_match = re.search(r'\d+-\d+-\d+', entry_name)
        if date_match is None:
          raise ValueError("No date found in file name {!r}".format(entry_name))
        with zip_file.open(entry_name) as csv_handle:
          df = pd.read_csv(csv_handle, sep=",", header=0)
        df["Date"] = pd.to_datetime(date_match[0])
        df = df.rename(columns={'Province_State': 'State'})
        # Boolean mask instead of `Index | Index`, which is no longer a set
        # union in pandas 2.
        unwanted = df["State"].isin(["United States"]) | df["State"].isnull()
        frames.append(df[~unwanted])

  if not frames:
    raise ValueError("No CSV files found in archives matching {!r}".format(path))

  # Concatenate once instead of growing the frame inside the loop.
  df_master = pd.concat(frames)
  df_master = df_master.set_index(["Date", "State"]).sort_index()
  return df_master

# Load all daily reports, then keep only the selected states, indexed by date
covid_daily_reports = init_daily_reports(r'**csv_files/csse_covid_19_daily_reports_us.zip')

states = ['California']
covid_daily_reports = covid_daily_reports.reset_index()
in_selected_states = covid_daily_reports["State"].isin(states)
covid_daily_reports = covid_daily_reports[in_selected_states].set_index(["Date"])

In [None]:
# Difference the "Deaths" column row by row to get a daily figure
daily_deaths = covid_daily_reports["Deaths"].diff(periods=1)
# Keep a single row per date
daily_deaths = daily_deaths[~daily_deaths.index.duplicated(keep='first')]
# For all the values where the daily death count is less than 0 we set the value to 0
daily_deaths = daily_deaths.mask(daily_deaths < 0, 0)
deaths_data = daily_deaths.to_frame(name="Daily Deaths")

## Import infection rate data

In [None]:
def init_infection_dataframe_from_zip(path, index_col):
    """Read the zipped infection CSV selected by a glob pattern.

    Args:
        path: glob pattern for the zip archive; the first match is read.
        index_col: column name(s) passed to ``pd.read_csv`` as the index.

    Returns:
        DataFrame with the archive's CSV contents, indexed by ``index_col``.

    Raises:
        FileNotFoundError: if no file matches ``path``.
    """
    matches = glob.glob(path)
    if not matches:
        # Without this check the failure is a bare IndexError from `[0]`.
        raise FileNotFoundError(f"No infection archive matches pattern {path!r}")
    return pd.read_csv(matches[0], compression='zip', header=0, sep=',', index_col=index_col)

# Load cases per (submission_date, state) and convert the date level to timestamps
infected_df = init_infection_dataframe_from_zip(r'**csv_files/United_States_COVID-19_Cases_and_Deaths_by_State_over_Time.csv.zip', ["submission_date", "state"])
date_level, state_level = infected_df.index.levels[0], infected_df.index.levels[1]
infected_df.index = infected_df.index.set_levels([pd.to_datetime(date_level), state_level])
# California only: total cases indexed and sorted by submission date
is_ca_row = infected_df.index.get_level_values('state').isin(['CA'])
infected_df_CA = (
    infected_df[is_ca_row]
    .reset_index()
    .set_index(["submission_date"])[["tot_cases"]]
    .sort_index()
)

## Import Covid Virus Variant Data

In [None]:
def init_virus_variant_df(path):
    """Build the per-week California variant table from the zipped JSON file.

    The JSON is read into a frame, the "plotting_dates" column and the
    "min_date"/"max_date" rows are removed, and the "countries" entry for
    "California" is expanded so that each position of its per-key lists
    becomes one row.

    Args:
        path: glob pattern for the zip archive; the first match is read.

    Returns:
        DataFrame indexed by ``week`` with a leading ``State`` column
        ("California") followed by the remaining keys of the California entry.

    Raises:
        FileNotFoundError: if no file matches ``path``.
    """
    matches = glob.glob(path)
    if not matches:
        raise FileNotFoundError(f"No variant archive matches pattern {path!r}")

    df = pd.read_json(matches[0], compression='zip', orient="records")
    df.index.names = ['County']
    df = df.drop("plotting_dates", axis=1)
    df = df.drop(["min_date", "max_date"], axis=0)

    ca_df = df[df.index == "California"]["countries"]
    normalized = pd.json_normalize(ca_df)

    # Each cell of the single normalized row holds one list per key. Use
    # positional access (.iloc[0]) so this does not depend on the row label
    # that json_normalize assigns.
    lists_by_column = {col_name: normalized[col_name].iloc[0] for col_name in normalized.columns}
    n_weeks = len(lists_by_column["week"])
    # Transpose the per-key lists into one row per week.
    full_data = [
        [lists_by_column[col_name][i] for col_name in normalized.columns]
        for i in range(n_weeks)
    ]
    df_output = pd.DataFrame(data=full_data, columns=normalized.columns)
    # Put State at the front
    df_output.insert(0, "State", "California")

    return df_output.set_index("week")

# Per-week California variant table, indexed by "week" (see init_virus_variant_df).
virus_variants_df = init_virus_variant_df(r'**csv_files/USAClusters_data.json.zip')

### Clean up and format covid variants data
- Convert virus variants to be percentage of total infections
- Fill in missing dates

In [None]:
# Columns from position 2 onwards are divided row-wise by total_sequences
# (presumably these are the per-variant counts - verify against the loaded frame)
variant_counts = virus_variants_df.iloc[:, 2:]
variant_percentage_df = variant_counts.div(virus_variants_df.total_sequences, axis=0)
variant_percentage_df.index = pd.to_datetime(variant_percentage_df.index)
# Expand to one row per day, carrying the most recent earlier value forward
first_date, last_date = variant_percentage_df.index[0], variant_percentage_df.index[-1]
days_index = pd.date_range(start=first_date, end=last_date, freq="D")
variant_percentage_df = variant_percentage_df.reindex(days_index, method="pad")
variant_percentage_df

## Make sure all dataframes are for the same time interval
- selected_ca_restrictions: 2020-06-04 - 2022-04-18	
- vaccination_df: 2020-06-04 - 2022-04-20
- Death data: 2020-04-12 - 2022-03-28

In [None]:
# Common date window shared by all datasets. The deaths frame is bounded too, so
# later death rows cannot add rows that have no features.
MERGE_START, MERGE_END = "2020-06-04", "2022-03-28"
merged_data = pd.concat(
    [
        selected_ca_restrictions.loc[MERGE_START:MERGE_END],
        vaccination_df_administered.loc[MERGE_START:MERGE_END],
        infected_df_CA.loc[MERGE_START:MERGE_END],
        deaths_data.loc[MERGE_START:MERGE_END],
        # Optional: variant_percentage_df.loc[MERGE_START:MERGE_END] adds the variant shares
    ],
    axis=1,
)

## Feature Engineering

In [None]:
# Apply log(x + 1) to both count columns; NaN results are replaced by 0.
# NOTE: this overwrites the columns in merged_data, so re-running the cell
# applies the transform a second time.
for count_column in ("Daily vaccinations", "tot_cases"):
    shifted_counts = merged_data[count_column] + 1
    merged_data[count_column] = np.log(shifted_counts).fillna(0)
merged_data

In [None]:
from sklearn import linear_model as lm
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Random 90/10 split; only the training part is used below
train, val = train_test_split(merged_data, test_size = 0.1, random_state = 42)
Y_train = train["Daily Deaths"]
X_train = train.drop(["Daily Deaths"], axis=1)

linear_model = lm.LinearRegression(fit_intercept=True)
linear_model.fit(X_train, Y_train)
y_prediction = linear_model.predict(X_train)
# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(Y_train, y_prediction))

# Residuals against the true values on the training set
plt.scatter(Y_train, Y_train-y_prediction, alpha=0.5)
plt.xlabel("Daily Deaths (train)")
plt.ylabel("Residual (true - predicted)")
plt.title("Linear regression residuals on the training set");

#clf = svm.SVC(kernel='linear', C=1, random_state=42)
#scores = cross_val_score(clf, X_train, Y_train, cv=5)
#scores

# Training RMSE is the cell output
rmse

In [None]:
# Followed example: https://towardsdatascience.com/improve-linear-regression-for-time-series-forecasting-e36f3c3e3534
# https://github.com/cerlymarco/MEDIUM_NoteBook/blob/master/ModelTrees_TimeSeries/ModelTrees_TimeSeries.ipynb
# https://github.com/cerlymarco/MEDIUM_NoteBook

# Unshuffled split: the first 70% of rows are used for training and the last 30% for testing
target_column = 'Daily Deaths'
X_train_, X_test_, y_train_, y_test_ = train_test_split(
    merged_data.drop(columns=[target_column]),
    merged_data[target_column],
    test_size=0.3,
    shuffle=False,
)

X_train_.shape, X_test_.shape

In [None]:
# Plot the train and test parts of the target on the same axes
split_ax = y_train_.plot(label='train', figsize=(16,6))
y_test_.plot(ax=split_ax, label='test')
split_ax.set_title("State: {}".format("California"))
split_ax.legend();

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge

# Alternative that was tried: Ridge with a grid search over alpha
#model = GridSearchCV(estimator=Ridge(), param_grid={'alpha': [1, 3, 5, 10, 20]},
#                     scoring='neg_mean_squared_error', cv=2, refit=True)
#model.best_params_
model = lm.LinearRegression(fit_intercept=True)
model.fit(X_train_, y_train_)

# Predictions on the held-out rows, aligned with the true test series
pred_lr = pd.Series(model.predict(X_test_), index = y_test_.index)
forecast_ax = pred_lr.plot(label='linear_regression', figsize=(10,6))
y_test_.plot(ax=forecast_ax, label='true')
forecast_ax.set_title("Daily Deaths: linear regression vs. true values (test set)")
forecast_ax.set_ylabel("Daily Deaths")
# Series.plot does not draw a legend by default, so the labels above stay
# invisible unless the legend is requested explicitly.
forecast_ax.legend();

# Total cases for the last 2 weeks for each day

In [None]:
# Find number of days with deaths equal to 0
#zero_death_days = merged_data.loc[merged_data["Daily Deaths"] == 0]
#zero_death_days.reset_index(inplace=True)
#zero_death_days['weekday'] = zero_death_days['index'].dt.dayofweek # Monday=0, Sunday=6
#zero_death_days

## When days with negative numbers are not set to zero, there are 13 days with deaths equal to 0
## 6 of them are Sundays, 6 are Saturdays, and 1 is a Monday