In [None]:
#Data Analysis
import numpy as np
import pandas as pd
import datetime as dt

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Machine Learning
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Import dataset
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
covid = pd.read_csv(r'C:\Users\samra\Desktop\Data Science Bootcamp\Regression-Project\Quebec Covid-19.csv')

# Filter dataset to Quebec only.
# .copy() makes `covid` an independent frame, so the column assignments below
# modify it directly instead of a slice of the raw data (this avoids
# SettingWithCopyWarning and keeps working under pandas Copy-on-Write).
covid = covid.loc[covid['prname'] == 'Quebec'].copy()

# Remove unwanted columns
unwanted_columns = [
    'pruid', 'prnameFR', 'numprob', 'update', 'numtested', 'numtests', 'numrecover', 'percentrecover',
    'ratetested', 'ratetests', 'percentoday', 'ratetotal', 'ratedeaths',
    'percentdeath', 'numtestedtoday', 'numteststoday', 'numrecoveredtoday', 'percentactive',
    'rateactive', 'numtotal_last14', 'ratetotal_last14', 'ratedeaths_last14',
    'numdeaths_last14', 'ratetotal_last7', 'ratedeaths_last7', 'avgtotal_last7',
    'avgincidence_last7', 'avgdeaths_last7', 'avgratedeaths_last7', 'raterecovered',
]
covid = covid.drop(columns = unwanted_columns)

# Convert date to datetime and derive year / month helper columns
covid['date'] = pd.to_datetime(covid['date'])

covid['date_year'] = covid['date'].dt.year
covid['date_month'] = covid['date'].dt.month
#covid['date'] = covid['date'].dt.date

# Deal with N/A values
# Only numtotal_last7 and numdeaths_last7 have a few empty values due to the nature of the variable. Fill them with 0.
# The filled columns are assigned back to the frame: calling
# covid[col].fillna(0, inplace = True) operates on a column selection, which is
# deprecated chained assignment and does not update `covid` under Copy-on-Write.
rolling_columns = ['numtotal_last7', 'numdeaths_last7']
covid[rolling_columns] = covid[rolling_columns].fillna(0)

# X = date
# Y = numconf, numtoday, numdeathstoday, numrecoveredtoday, numtotal_last7, numdeaths_last7
# NOTE(review): numrecoveredtoday is listed above but is one of the columns dropped in this cell.

covid.info()

In [None]:
# We are interested in designing an RDD using (numtoday | numtotal_last7 | numdeaths_last7)

# Let's use the date variable and set a threshold to see what we can expect from the final model
# covid['date_month'] -= 8

# plt.figure(figsize = (8,8))
# ax = plt.subplot(3, 1, 1)
# covid.plot.scatter(x = 'date_month', y = 'numtoday', ax = ax)

# ax = plt.subplot(3, 1, 2, sharex = ax)
# covid.plot.scatter(x = 'date_month', y = 'numtotal_last7', ax = ax)

# ax = plt.subplot(3, 1, 3, sharex = ax)
# covid.plot.scatter(x = 'date_month', y = 'numdeaths_last7', ax = ax)

# Following the opening of schools in August, the total cases and the 7-day rolling average seem to increase.
# This doesn't give a good picture, as the input date variable is not ideal.

In [None]:
# Give the frame a fresh 0..n-1 row index; pandas keeps the previous row
# labels as an extra column (named 'index' by default).
covid = covid.reset_index()

In [None]:
# One possible solution is a new column, days_from_start: a numeric value that
# increments by 1 for each day that has passed after 2020-03-01 (keeping in mind
# that an increment represents a day with available data).
# Correction: start from 2020-03-05, as data is missing for the 02 and 04 —
# hence the rows labelled 0 and 1 are removed before re-indexing.
# The re-index again keeps the previous labels as a helper column.
covid = covid.drop(index = [0, 1]).reset_index()

In [None]:
# Use the current row labels as the day counter.
covid = covid.assign(days_from_start = covid.index)

In [None]:
# Discard the helper columns left behind by the earlier reset_index calls.
covid = covid.drop(['level_0', 'index'], axis = 'columns')

In [None]:
# Now we can use the new column as the threshold for the RDD model.
# Plug into a simple linear regression model
