In [None]:
import pandas as pd

In [None]:
import tensorflow as tf
# Record the installed TensorFlow version alongside the notebook output.
print(tf.__version__)

## Loading the Data

In [None]:
# JHU CSSE global time-series CSVs (confirmed / deaths / recovered).
url1 = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"  # confirmed cases, day by day
url2 = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"  # deaths
url3 = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"  # recoveries, day by day

# Download the three tables in a fixed order; covid_confirmed is the main frame used below.
covid_confirmed, covid_death, covid_recovered = (
    pd.read_csv(source_url) for source_url in (url1, url2, url3)
)

print(covid_death.head())

In [None]:
# Show the confirmed-case rows whose 'Country/Region' value is 'US'.
covid_confirmed[covid_confirmed['Country/Region'] == 'US']

#### Very few null entries, but a lot of columns (one for each day)
Let's zoom in on one country in particular: the US.

In [None]:
# Keep only the US rows and discard the metadata columns, leaving the per-date columns.
us_rows = covid_confirmed['Country/Region'] == 'US'
metadata_cols = ['Lat', 'Long', 'Country/Region', 'Province/State']
df_us = covid_confirmed[us_rows].drop(columns=metadata_cols, errors='ignore')

# Sum the rows per date column, then difference consecutive dates. The first date has
# no predecessor, so its NaN difference is replaced with 0.
us_totals_by_date = df_us.sum(axis=0)
daily_cases = us_totals_by_date.diff().fillna(0)
daily_cases.index = pd.to_datetime(daily_cases.index)  # date-string labels -> datetimes
daily_cases = daily_cases.to_frame(name='new_cases_diff_by_day')  # Series -> one-column DataFrame

In [None]:
# Cumulative confirmed cases per date: sum the US rows column by column,
# then relabel the result with real datetimes instead of date strings.
us_cumulative_raw = df_us.sum(axis=0)
df_us_cumulative = us_cumulative_raw.set_axis(pd.to_datetime(us_cumulative_raw.index))

In [None]:
import matplotlib.pyplot as plt

us_fig = plt.figure(figsize=(14, 10))

# Top panel: cumulative confirmed cases.
us_total_ax = us_fig.add_subplot(2, 1, 1)  # (nrows, ncols, index)
us_total_ax.plot(df_us_cumulative.index, df_us_cumulative.values, color='blue')
us_total_ax.set_title("Cumulative COVID-19 Confirmed Cases in the US (by hundreds of millions)")
us_total_ax.set_ylabel("Total Confirmed Cases")
us_total_ax.grid(True)

# Bottom panel: day-over-day new cases.
us_daily_ax = us_fig.add_subplot(2, 1, 2)
us_daily_ax.plot(daily_cases.index, daily_cases["new_cases_diff_by_day"], color='orange')
us_daily_ax.set_title("Daily New COVID-19 Cases in the US (by millions)")
us_daily_ax.set_xlabel("Date")
us_daily_ax.set_ylabel("New Cases")
us_daily_ax.grid(True)

us_fig.tight_layout()
plt.show()

In [None]:
# Quick inspection cell: only the last expression of a cell is displayed, so
# uncomment a different frame (as the final line) to look at another table.
#covid_confirmed 
#covid_death
covid_recovered 

#### Compare to New Zealand, which was said to have had a better response to COVID, with one of the lowest COVID death rates in the world.



In [None]:
# Keep only the New Zealand rows and discard the metadata columns, leaving the per-date columns.
nz_rows = covid_confirmed['Country/Region'] == 'New Zealand'
metadata_cols = ['Lat', 'Long', 'Country/Region', 'Province/State']
df_nz = covid_confirmed[nz_rows].drop(columns=metadata_cols, errors='ignore')

# Sum the rows per date column, then difference consecutive dates. The first date has
# no predecessor, so its NaN difference is replaced with 0.
nz_totals_by_date = df_nz.sum(axis=0)
daily_cases_nz = nz_totals_by_date.diff().fillna(0)
daily_cases_nz.index = pd.to_datetime(daily_cases_nz.index)  # date-string labels -> datetimes
daily_cases_nz = daily_cases_nz.to_frame(name='new_cases_diff_by_day')  # Series -> one-column DataFrame

In [None]:
# Display the New Zealand daily-difference frame built in the previous cell.
daily_cases_nz

In [None]:
# Cumulative confirmed cases per date: sum the New Zealand rows column by column,
# then relabel the result with real datetimes instead of date strings.
nz_cumulative_raw = df_nz.sum(axis=0)
df_nz_cumulative = nz_cumulative_raw.set_axis(pd.to_datetime(nz_cumulative_raw.index))

In [None]:
import matplotlib.pyplot as plt

nz_fig = plt.figure(figsize=(14, 10))

# Top panel: cumulative confirmed cases.
nz_total_ax = nz_fig.add_subplot(2, 1, 1)  # (nrows, ncols, index)
nz_total_ax.plot(df_nz_cumulative.index, df_nz_cumulative.values, color='blue')
nz_total_ax.set_title("Cumulative COVID-19 Confirmed Cases in New Zealand (by millions)")
nz_total_ax.set_ylabel("Total Confirmed Cases")
nz_total_ax.grid(True)

# Bottom panel: day-over-day new cases.
nz_daily_ax = nz_fig.add_subplot(2, 1, 2)
nz_daily_ax.plot(daily_cases_nz.index, daily_cases_nz["new_cases_diff_by_day"], color='orange')
nz_daily_ax.set_title("Daily New COVID-19 Cases in New Zealand")
nz_daily_ax.set_xlabel("Date")
nz_daily_ax.set_ylabel("New Cases")
nz_daily_ax.grid(True)

nz_fig.tight_layout()
plt.show()

### Rolling Averages comparison of US and NZ

In [None]:
# A 7-day rolling mean smooths out spikes and noise so the trend lines read more clearly.
# With the default min_periods, rows whose 7-day window is incomplete come out as NaN.
us_rolling, nz_rolling = (
    country_daily.rolling(window=7).mean()
    for country_daily in (daily_cases, daily_cases_nz)
)


In [None]:
plt.figure(figsize=(14, 6))
trend_ax = plt.gca()

# One line per country, both on the same axes.
for rolling_frame, series_label, line_color in (
    (us_rolling, 'US (7-day avg)', 'blue'),
    (nz_rolling, 'New Zealand (7-day avg)', 'green'),
):
    trend_ax.plot(
        rolling_frame.index,
        rolling_frame['new_cases_diff_by_day'],
        label=series_label,
        color=line_color,
    )

trend_ax.set_title("7-Day Rolling Average of COVID-19 New Cases")
trend_ax.set_xlabel("Date")
trend_ax.set_ylabel("New Cases")
trend_ax.legend()
trend_ax.grid(True)
plt.tight_layout()

# Policy milestones to mark on the chart (label -> ISO date string).
lockdown_dates = {
    'US Lockdown': '2020-03-19',
    'NZ Full Lockdown': '2020-03-25',
    'NZ Reopening': '2020-05-14',
    'US CDC Drops Mask Reccomendations for Fully Vaccinated': '2021-05-13'
}

# Draw a dashed vertical line per milestone and write its label at 80% of the
# current upper y-limit, rotated to run along the line.
for label, date in lockdown_dates.items():
    milestone = pd.to_datetime(date)
    trend_ax.axvline(milestone, color='red', linestyle='--', linewidth=1)
    trend_ax.text(milestone, trend_ax.get_ylim()[1] * 0.8, label, rotation=90, color='red')

plt.show()



The rolling average shows the daily COVID spikes alongside some key policy dates.

#### Deep dive into the Stringency Index and COVID cases together
Stringency is a 0-100 scale measuring how strict a government's COVID-19 policies are.

In [None]:
# Download the Our World in Data COVID table; 'date' is parsed to datetimes at load
# time so it can later be used as a datetime index.
owid_url = "https://covid.ourworldindata.org/data/owid-covid-data.csv" # our world in data COVID dataset (stringency index)
owid = pd.read_csv(owid_url, parse_dates=['date'])

In [None]:
# Per-country stringency series, indexed by date.
stringency_cols = ['date', 'stringency_index']
stringency_us = owid.loc[owid['location'] == 'United States', stringency_cols].set_index('date')
stringency_nz = owid.loc[owid['location'] == 'New Zealand', stringency_cols].set_index('date')

In [None]:
cases_color, policy_color = 'blue', 'red'

fig, ax1 = plt.subplots(figsize=(14, 6))

# Left y-axis: smoothed US new cases.
ax1.plot(
    us_rolling.index,
    us_rolling['new_cases_diff_by_day'],
    label='US New Cases (7-day avg)',
    color=cases_color,
)
ax1.set_ylabel('New Cases', color=cases_color)
ax1.tick_params(axis='y', labelcolor=cases_color)

# Right y-axis (shares the date axis): government stringency index.
ax2 = ax1.twinx()
ax2.plot(
    stringency_us.index,
    stringency_us['stringency_index'],
    label='Stringency Index',
    color=policy_color,
    alpha=0.7,
)
ax2.set_ylabel('Stringency Index (0-100)', color=policy_color)
ax2.tick_params(axis='y', labelcolor=policy_color)

plt.title("US: COVID-19 Cases vs Government Stringency Index")
fig.tight_layout()
plt.grid(True)

In [None]:
fig, ax1 = plt.subplots(figsize=(14, 6))

# Left y-axis: smoothed NZ new cases, plotted against the NZ frame's own date index
# so the x and y values always come from the same frame.
ax1.plot(nz_rolling.index, nz_rolling['new_cases_diff_by_day'], label='NZ New Cases (7-day avg)', color='blue')
ax1.set_ylabel('New Cases', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')

# Right y-axis (shares the date axis): government stringency index.
ax2 = ax1.twinx()
ax2.plot(stringency_nz.index, stringency_nz['stringency_index'], label='Stringency Index', color='red', alpha=0.7)
ax2.set_ylabel('Stringency Index (0-100)', color='red')
ax2.tick_params(axis='y', labelcolor='red')

plt.title("New Zealand: COVID-19 Cases vs Government Stringency Index")
fig.tight_layout()
plt.grid(True)

With relaxed policies, there do seem to be spikes in cases after each relaxation shown by the stringency index. This could be used as a feature in the model! We can combine the dataframes as follows.

In [None]:
# Left-join the stringency index onto each daily-case frame by date, then impute
# missing values by forward fill: a date with no stringency value inherits the most
# recent earlier one (i.e. the policy is assumed unchanged).
us_combined = daily_cases.join(stringency_us, how='left').assign(
    stringency_index=lambda joined: joined['stringency_index'].ffill()
)
nz_combined = daily_cases_nz.join(stringency_nz, how='left').assign(
    stringency_index=lambda joined: joined['stringency_index'].ffill()
)

In [None]:
# Candidate model features taken from the OWID dataset.
columns = [
    'date',
    'location',
    'new_cases',
    'stringency_index',
    'people_vaccinated_per_hundred',
]

owid_filtered = owid[columns]

# Independent copies per country so later column edits do not touch `owid_filtered`.
is_us = owid_filtered['location'] == 'United States'
is_nz = owid_filtered['location'] == 'New Zealand'
us_df = owid_filtered.loc[is_us].copy()
nz_df = owid_filtered.loc[is_nz].copy()

us_df

In [None]:


# Drop the 'location' column (constant within each per-country frame), index by date,
# and forward-fill gaps with the previous available value (time-series imputation).
# DataFrame.ffill() returns a NEW frame, so the result has to be assigned back;
# calling it without assignment leaves the frame unchanged.
us_df = us_df.drop(columns=['location']).set_index('date').ffill()
nz_df = nz_df.drop(columns=['location']).set_index('date').ffill()

nz_df  # display the forward-filled New Zealand frame

In [None]:
# Replace new_cases with its 7-day rolling mean to smooth noise, spikes and irregular
# values; rows without a complete window come out as NaN and are set to 0.
for country_frame in (us_df, nz_df):
    smoothed_cases = country_frame['new_cases'].rolling(window=7).mean()
    country_frame['new_cases'] = smoothed_cases.fillna(0)

In [None]:
import seaborn as sns
# Pairwise correlations between the US features, drawn as an annotated heat map.
plt.figure(figsize=(8, 6))
corr = us_df.corr()
heatmap_ax = sns.heatmap(corr, annot=True, cmap='coolwarm')
heatmap_ax.set_title("Feature Correlation Matrix")
plt.show()

In [None]:
# US: smoothed new cases (y) against the stringency index (x).
sns.scatterplot(x=us_df['stringency_index'], y=us_df['new_cases'])

In [None]:
# New Zealand: smoothed new cases (y) against the stringency index (x).
sns.scatterplot(x=nz_df['stringency_index'], y=nz_df['new_cases'])

In [None]:
# TODO 
# normalize the features
# create sliding window sequences for TF to predict upon

In [None]:
# Count the missing values remaining in each US feature column.
us_df.isnull().sum()

In [None]:
# Count the missing values remaining in each New Zealand feature column.
nz_df.isnull().sum()

### Fully clean the NaNs and normalize the features used in our model; use forward fill on the time series data

In [None]:
# Forward-fill first (carry the last known value forward), then back-fill whatever is
# still missing at the start of the series, where there is no earlier value to carry.
us_df = us_df.ffill().bfill()

In [None]:
# Forward-fill first (carry the last known value forward), then back-fill whatever is
# still missing at the start of the series, where there is no earlier value to carry.
nz_df = nz_df.ffill().bfill()


In [None]:
# Keep only "active" days (smoothed new cases above 1000 and a non-zero stringency
# index) to address the problem of having too many zero values.
us_is_active = (us_df['new_cases'] > 1000) & (us_df['stringency_index'] > 0)
nz_is_active = (nz_df['new_cases'] > 1000) & (nz_df['stringency_index'] > 0)

us_df_active = us_df[us_is_active]
nz_df_active = nz_df[nz_is_active]

In [None]:
# Percentage of exact zeros in each column of the filtered US frame. Only the last
# expression of a cell is displayed, so this check is printed explicitly;
# otherwise its result would be computed and silently dropped.
zero_share_pct = (us_df_active == 0).mean() * 100
print(zero_share_pct)

us_df_active

#### Normalize with Min-Max scaling, which rescales each feature without changing the shape of its distribution or centering it around 0

In [None]:
from sklearn.preprocessing import MinMaxScaler

# One shared scaler instance; the cells below call fit_transform on it, which re-fits it each time.
scaler = MinMaxScaler()

In [None]:
# Scale each country's active frame, keeping the original column names and date index.
# NOTE: the same `scaler` object is fitted twice, so after this cell it holds the
# New Zealand fit only; the US frame was scaled with its own, now overwritten, fit.
us_scaled = pd.DataFrame(
    scaler.fit_transform(us_df_active),  # using this for now
    index=us_df_active.index,
    columns=us_df_active.columns,
)
nz_scaled = pd.DataFrame(
    scaler.fit_transform(nz_df_active),
    index=nz_df_active.index,
    columns=nz_df_active.columns,
)

#### Sliding Windows in Tensor Flow -
For time series models (LSTMs and GRUs), this converts a continuous time series into overlapping fixed-size sequences for the model to learn from.

In [None]:
import numpy as np

def create_sliding_windows(data, target_col='new_cases', window_size=14):
    """Turn a time-ordered feature frame into overlapping (window, next-value) pairs.

    Window ``i`` holds rows ``i .. i + window_size - 1`` of ``data`` (all columns);
    its target is the ``target_col`` value of row ``i + window_size``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows in time order, one column per feature.
    target_col : str
        Column whose next-step value is predicted.
    window_size : int
        Number of consecutive rows per window; must be at least 1.

    Returns
    -------
    X : numpy.ndarray, shape (n_windows, window_size, n_features)
    y : numpy.ndarray, shape (n_windows,)
        ``n_windows`` is ``max(len(data) - window_size, 0)``. When ``data`` is too
        short, the arrays are empty but keep the shapes above, so downstream code
        that reads ``X.shape[1]`` / ``X.shape[2]`` still works.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    values = data.to_numpy()
    n_windows = len(values) - window_size
    if n_windows <= 0:
        empty_X = np.empty((0, window_size, values.shape[1]), dtype=values.dtype)
        empty_y = np.empty((0,), dtype=values.dtype)
        return empty_X, empty_y

    target_idx = data.columns.get_loc(target_col)

    # row_idx[i, j] == i + j: the rows that make up window i. Indexing with this
    # 2-D integer array gathers every window at once instead of slicing row by row.
    row_idx = np.arange(n_windows)[:, None] + np.arange(window_size)[None, :]
    X = values[row_idx]
    y = values[window_size:, target_idx]  # the row right after each window
    return X, y

In [None]:
# Two-week (14-row) windows for both countries; 14 is also the function's default.
X, y = create_sliding_windows(us_scaled, window_size=14)
X_nz, y_nz = create_sliding_windows(nz_scaled, window_size=14)

##### Train/test split prior to modelling; we don't want to shuffle the data for a time series model

In [None]:
# Chronological split: the first 80% of the windows train the model and the
# remaining 20% are held out for testing (no shuffling).
split_ratio = 0.8
split_idx = int(len(X) * split_ratio)
(X_train, X_test), (y_train, y_test) = (
    (windows[:split_idx], windows[split_idx:]) for windows in (X, y)
)

#### Time to build a simple LSTM model!

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential # stack layers linearly
from tensorflow.keras.layers import LSTM, Dense, Dropout 

# Three stacked layers: LSTM (64 units) -> Dropout -> Dense(1).
window_len, n_features = X_train.shape[1], X_train.shape[2]

model = Sequential()
model.add(LSTM(64, input_shape=(window_len, n_features)))  # one window of rows x features per sample
model.add(Dropout(0.2))  # randomly drops 20% of units during training to limit overfitting
model.add(Dense(1))  # single output neuron: one predicted value per window

# Mean squared error is the training loss; mean absolute error is tracked as a metric.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()

In [None]:
# Training settings gathered in one place.
fit_options = {
    'validation_split': 0.1,  # last 10% of the training data is used for validation
    'epochs': 30,
    'batch_size': 16,
    'verbose': 1,
}
history = model.fit(X_train, y_train, **fit_options)

In [None]:
import matplotlib.pyplot as plt

y_pred = model.predict(X_test)

# Overlay the held-out targets and the predictions (both still in scaled units).
plt.figure(figsize=(14, 6))
test_ax = plt.gca()
for curve, curve_label in ((y_test, 'Actual'), (y_pred, 'Predicted')):
    test_ax.plot(curve, label=curve_label)
test_ax.set_title("LSTM Model: Actual vs Predicted New Cases")
test_ax.set_xlabel("Days (test set)")
test_ax.set_ylabel("Normalized New Cases")
test_ax.legend()
test_ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# predict on new zealand using US trained model
# Run the US-trained model on every New Zealand window (no NZ training involved).
y_nz_pred = model.predict(X_nz)


In [None]:
# Overlay the New Zealand targets and the US-trained model's predictions (scaled units).
plt.figure(figsize=(14, 6))
nz_pred_ax = plt.gca()
for curve, curve_label in ((y_nz, 'NZ Actual'), (y_nz_pred, 'NZ Predicted (US-trained)')):
    nz_pred_ax.plot(curve, label=curve_label)
nz_pred_ax.set_title("Generalization: US-Trained Model on New Zealand")
nz_pred_ax.set_xlabel("Days (Test Set)")
nz_pred_ax.set_ylabel("Normalized New Cases")
nz_pred_ax.legend()
nz_pred_ax.grid(True)
plt.tight_layout()
plt.show()

#### New Zealand data is predicted VERY well by the US-trained COVID model

In [None]:
# Convert the scaled NZ predictions back to real case counts. The scaler expects one
# column per feature, so the predictions are placed in the 'new_cases' column of a
# zero-filled array and only that column is read back after the inverse transform.
# Writing and reading the SAME column index keeps this correct even if 'new_cases'
# is not the first column.
# `scaler` was last fitted on the NZ frame, so it matches these NZ predictions.
new_cases_idx = list(nz_df.columns).index('new_cases')
padded_nz_pred = np.zeros((len(y_nz_pred), len(nz_df.columns)))
padded_nz_pred[:, new_cases_idx] = np.asarray(y_nz_pred).reshape(-1)
y_nz_pred_real = scaler.inverse_transform(padded_nz_pred)[:, new_cases_idx]

In [None]:
# Display the New Zealand predictions after the inverse transform (original units).
y_nz_pred_real # real predictions of new zealand 

#### Next step: create a multi-country model that generalizes predictions to MOST countries based on policy, vaccination rate, and disease spread rates. Countries: US, New Zealand, India, and Brazil

In [None]:
import multi_country_pipeline as mcp
import importlib

# Step 1: load the OWID data through the project's pipeline module.
df_raw = mcp.load_owid_data() # laod the owid data as step 1!

In [None]:
importlib.reload(mcp)  # pick up edits to the pipeline module while debugging

# Initialize and fit the custom preprocessor defined in the pipeline script.
# The country list is defined once and passed by name so the two cannot drift apart.
countries = ['United States', 'New Zealand', 'India', 'Brazil']
preprocessor = mcp.MultiCountryCOVIDPreprocessor(
    countries=countries,
    window_size=14
)
preprocessor.fit(df_raw)

In [None]:
# transform the data now with the preprocessor
# Transform the raw data with the fitted preprocessor.
# Note: this rebinds X and y, replacing the US-only windows built earlier.
X, y = preprocessor.transform(df_raw)

In [None]:
from sklearn.model_selection import train_test_split


# Unshuffled split: shuffle=False keeps the rows in their existing order, so the last
# 20% of the samples become the test set.
# NOTE(review): if X stacks the countries one after another, the test set may come
# mostly from the last country — confirm against the preprocessor's output order.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)



In [None]:
# Build the model from the pipeline module; input_shape is (axis 1, axis 2) of X_train.
# Note: this rebinds `model`, replacing the US-only LSTM trained earlier.
model = mcp.build_lstm_model(input_shape=(X_train.shape[1], X_train.shape[2]))

# Train for 50 epochs, with validation_split=0.1 holding back part of the training data for validation.
history = model.fit(X_train, y_train, epochs=50, validation_split=0.1, verbose=1)

In [None]:
y_pred = model.predict(X_test)

# NOTE(review): `target_scaler` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel. Presumably it is the fitted scaler held by the
# preprocessor — confirm and bind it before running this cell.


def _to_real_new_cases(scaled_values, fitted_scaler, n_features=3, target_idx=0):
    """Inverse-transform a single scaled feature column.

    The scaler expects ``n_features`` columns, so ``scaled_values`` is placed in
    column ``target_idx`` of a zero-filled array and only that column is returned
    after the inverse transform. The defaults (3 features, target in column 0)
    reproduce the layout this cell used before.
    """
    padded = np.zeros((len(scaled_values), n_features))
    padded[:, target_idx] = np.asarray(scaled_values).reshape(-1)
    return fitted_scaler.inverse_transform(padded)[:, target_idx]


# Inverse transform just the new_cases column for both the targets and the predictions.
y_test_real = _to_real_new_cases(y_test, target_scaler)
y_pred_real = _to_real_new_cases(y_pred, target_scaler)

plt.figure(figsize=(14, 6))
plt.plot(y_test_real, label='Actual')
plt.plot(y_pred_real, label='Predicted')
plt.title("LSTM Multi-Country Model: Actual vs Predicted")
plt.xlabel("Days (Test Set)")
plt.ylabel("New Cases")  # values are inverse-transformed, so they are no longer normalized
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import boto3
import io

In [None]:
# Fetch one processed OWID extract from S3. No credentials are passed explicitly here.
# `obj` is the raw get_object response; the line that would parse its body into a
# DataFrame is currently commented out, so nothing is read from it yet.
s3 = boto3.client('s3')
obj = s3.get_object(Bucket='covid-pipeline-data', Key='processed/owid-covid-data-filtered_2025-04-06_13-47-00.csv')
#df = pd.read_csv(io.BytesIO(obj['Body'].read()))