# Linear Regression Analysis of Covid 19 Deaths in Canada

### Group 2: Ian, Sanjaya, Nermin, Stephanie

#### Import needed Libraries

In [45]:
##Import Key Libraries for use
import datetime as dt
import math
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score, mean_squared_error

### Import and Scrub Data from Github

In [47]:
#Import CSV
covid19Dataframe = pd.read_csv("https://raw.githubusercontent.com/WhipSnake23/Python-Class-Project/main/Data/owid-covid-data.csv")
#Filter Dataset to Canada
covid19Dataframe = covid19Dataframe[covid19Dataframe.iso_code.str.contains("CAN")]
#Remove Smoothed Columns from dataframe
covid19Dataframe = covid19Dataframe.drop(columns=['new_cases_smoothed','new_deaths_smoothed','new_cases_smoothed_per_million','new_deaths_smoothed_per_million','new_tests_smoothed','new_tests_smoothed_per_thousand','new_vaccinations_smoothed','new_vaccinations_smoothed_per_million'])
#Reset Index
covid19Dataframe.reset_index(drop=True, inplace=True)
#Drop rows where Total Deaths is blank. This is required for the ML
covid19Dataframe.dropna(subset=['total_deaths'],inplace=True)

#Import Comparison Data set containing data from March. The data set we are using stops in February
canadaMarchCovidData = pd.read_csv("https://raw.githubusercontent.com/WhipSnake23/Python-Class-Project/main/Data/Canada-MarchData.csv")
canadaMarchCovidData = canadaMarchCovidData.drop(columns=['new_cases_smoothed','new_deaths_smoothed','new_cases_smoothed_per_million','new_deaths_smoothed_per_million','new_tests_smoothed','new_tests_smoothed_per_thousand','new_vaccinations_smoothed','new_vaccinations_smoothed_per_million'])
#Reset Index
canadaMarchCovidData.reset_index(drop=True, inplace=True)
#Drop rows where hosp_patients hasnt been reported
canadaMarchCovidData.dropna(subset=['hosp_patients'],inplace=True)

### Define Notebook Functions

In [48]:
#These functions make it easier to test configuration changes and features easier

#Prints the Scores from the run of the algorithum against the data
def printMetrics(test, predictions):
    print(f"Score: {explained_variance_score(test, predictions):.2f}")
    print(f"MAE: {mean_absolute_error(test, predictions):.2f}")
    print(f"RMSE: {math.sqrt(mean_squared_error(test, predictions)):.2f}")
    print(f"r2: {r2_score(test, predictions):.2f}")

#Function to run the algorithum and call metric function
def  calculateScore(F_Features,F_Target):
    X_train, X_test, y_train, y_test = train_test_split(F_Features, F_Target, test_size=0.25, random_state=1)
    model = SVR(kernel='rbf',epsilon=0)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    printMetrics(y_test, predictions)

#Function which will run the predictions and return the predictions given a block of data
def  predictValue(F_Features,F_Target,predict_df):
    X_train, X_test, y_train, y_test = train_test_split(F_Features, F_Target, test_size=0.25, random_state=1)
    model = SVR(kernel='rbf',epsilon=0)
    model.fit(X_train, y_train)
    predictions = model.predict(predict_df)
    return predictions

### Set Target Column

In [49]:
#Target column is the same across all the test
targetColumn = "total_deaths"

## Test 1
### Using the following features
#### New Cases
#### Median Age
#### Population Density
#### Population
#### Total Cases

In [38]:
featureColumns = ["new_cases", "median_age","population_density","population",'total_cases']

features=covid19Dataframe[featureColumns]
target=covid19Dataframe[targetColumn]

calculateScore(features,target)

#Prediction Methods
featureColumns = ["new_cases", "median_age","population_density","population",'total_cases']
predictionDataFrame = canadaMarchCovidData[featureColumns]
predictedValues = predictValue(features,target,predictionDataFrame)
predictedValues = np.around(predictedValues)
predictedValuesFrame = predictionDataFrame.copy()
predictedValuesFrame['total_deaths'] = predictedValues

Score: 0.00
MAE: 4202.09
RMSE: 6022.67
r2: -0.09




## Test 2
### Using the following features
#### New Cases
#### Median Age
#### Population Density
#### Population


In [50]:
##Missing Total Cases
featureColumns = ["new_cases", "median_age","population_density","population"]

features=covid19Dataframe[featureColumns]
target=covid19Dataframe[targetColumn]

calculateScore(features,target)

Score: 0.00
MAE: 4201.90
RMSE: 6022.60
r2: -0.09




## Test 3
### Using the following features
#### New Cases
#### Median Age
#### Population Density
#### Population
#### Hospital Patients

In [52]:
covid19Dataframe.dropna(subset=['hosp_patients'],inplace=True)
featureColumns = ["new_cases", "median_age","population_density","population","hosp_patients"]

features=covid19Dataframe[featureColumns]
target=covid19Dataframe[targetColumn]

calculateScore(features,target)

Score: 0.00
MAE: 3231.30
RMSE: 4964.96
r2: -0.01




## Test 4
### Using the following features
#### New Cases
#### Median Age
#### Population Density
#### Population
#### Total Cases
#### Hospital Patients

In [53]:
featureColumns = ["new_cases", "median_age","population_density","population","hosp_patients","total_cases"]

features=covid19Dataframe[featureColumns]
target=covid19Dataframe[targetColumn]

calculateScore(features,target)

Score: 0.00
MAE: 3231.30
RMSE: 4964.96
r2: -0.01




## Test 5
### Using the following features
#### New Cases
#### Median Age
#### Population Density
#### Population
#### Total Cases
#### Hospital Patients
#### Positive Rate

In [54]:
covid19Dataframe.dropna(subset=['positive_rate'],inplace=True)
featureColumns = ["new_cases", "median_age","population_density","population","hosp_patients","total_cases","positive_rate"]

features=covid19Dataframe[featureColumns]
target=covid19Dataframe[targetColumn]

calculateScore(features,target)

Score: 0.00
MAE: 2875.20
RMSE: 4560.93
r2: -0.00




## Test 6
### Using the following features
#### New Cases
#### Median Age
#### Population Density
#### Population
#### Total Cases
#### Hospital Patients
#### Positive Rate

In [43]:
covid19Dataframe['positive_rate'] = covid19Dataframe['positive_rate'] * 100
featureColumns = ["new_cases", "median_age","population_density","population","hosp_patients","total_cases","positive_rate"]

features=covid19Dataframe[featureColumns]
target=covid19Dataframe[targetColumn]

calculateScore(features,target)


Score: 0.00
MAE: 2875.20
RMSE: 4560.93
r2: -0.00


