## Linear Regression Analysis of Covid 19 Deaths in Canada
#### Group 2: Ian, Sanjaya, Nermin, Stephanie

### Import needed Libraries

In [25]:
##Import Key Libraries for use
import datetime as dt
import math
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score, mean_squared_error

### Import and Scrub Data from Github

In [26]:
#Import CSV
covid19Dataframe = pd.read_csv("https://raw.githubusercontent.com/WhipSnake23/Python-Class-Project/main/Data/owid-covid-data.csv")
#Filter Dataset to Canada
covid19Dataframe = covid19Dataframe[covid19Dataframe.iso_code.str.contains("CAN")]
#Remove Smoothed Columns from dataframe
covid19Dataframe = covid19Dataframe.drop(columns=['new_cases_smoothed','new_deaths_smoothed','new_cases_smoothed_per_million','new_deaths_smoothed_per_million','new_tests_smoothed','new_tests_smoothed_per_thousand','new_vaccinations_smoothed','new_vaccinations_smoothed_per_million'])
#Reset Index
covid19Dataframe.reset_index(drop=True, inplace=True)
#Drop rows where Total Deaths is blank. This is required for the ML
covid19Dataframe.dropna(subset=['total_deaths'],inplace=True)

#Import Comparison Data set containing data from March. The data set we are using stops in February
canadaMarchCovidData = pd.read_csv("https://raw.githubusercontent.com/WhipSnake23/Python-Class-Project/main/Data/Canada-MarchData.csv")
canadaMarchCovidData = canadaMarchCovidData.drop(columns=['new_cases_smoothed','new_deaths_smoothed','new_cases_smoothed_per_million','new_deaths_smoothed_per_million','new_tests_smoothed','new_tests_smoothed_per_thousand','new_vaccinations_smoothed','new_vaccinations_smoothed_per_million'])
#Reset Index
canadaMarchCovidData.reset_index(drop=True, inplace=True)
#Drop rows where hosp_patients hasnt been reported
canadaMarchCovidData.dropna(subset=['hosp_patients'],inplace=True)

### Define Notebook Functions

In [27]:
#These functions make it easier to test configuration changes and features easier

#Prints the Scores from the run of the algorithum against the data
def printMetrics(test, predictions):
    print(f"Score: {explained_variance_score(test, predictions):.2f}")
    print(f"MAE: {mean_absolute_error(test, predictions):.2f}")
    print(f"RMSE: {math.sqrt(mean_squared_error(test, predictions)):.2f}")
    print(f"r2: {r2_score(test, predictions):.2f}")

#Function to run the algorithum and call metric function
def  calculateScore(F_Features,F_Target):
    X_train, X_test, y_train, y_test = train_test_split(F_Features, F_Target, test_size=0.25, random_state=1)
    model = ElasticNet(random_state=1)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    printMetrics(y_test, predictions)

#Function which will run the predictions and return the predictions given a block of data
def  predictValue(F_Features,F_Target,predict_df):
    X_train, X_test, y_train, y_test = train_test_split(F_Features, F_Target, test_size=0.25, random_state=1)
    model = ElasticNet(random_state=1)
    model.fit(X_train, y_train)
    predictions = model.predict(predict_df)
    return predictions

#### Set Target Column

In [28]:
#Target column is the same across all the test
targetColumn = "total_deaths"

### Test 1
#### Using the following features
- New Cases
- Median Age
- Population Density
- Population
- Total Cases

In [29]:
featureColumns = ["new_cases", "median_age","population_density","population",'total_cases']

features=covid19Dataframe[featureColumns]
target=covid19Dataframe[targetColumn]

calculateScore(features,target)

#Prediction Methods
featureColumns = ["new_cases", "median_age","population_density","population",'total_cases']
predictionDataFrame = canadaMarchCovidData[featureColumns]
predictedValues = predictValue(features,target,predictionDataFrame)
predictedValues = np.around(predictedValues)
predictedValuesFrame = predictionDataFrame.copy()
predictedValuesFrame['total_deaths'] = predictedValues

Score: 0.89
MAE: 1548.75
RMSE: 1921.68
r2: 0.89


#### Comparing Model against real data

In [30]:
columnToCompare = ['total_deaths']
actualDeaths = canadaMarchCovidData[columnToCompare]
predictedDeaths = predictedValuesFrame[columnToCompare]
comparedDataFrame = actualDeaths.compare(predictedDeaths)
comparedDataFrame = comparedDataFrame["total_deaths"]

comparedDataFrame = comparedDataFrame.rename(columns={"self": "Actual_Deaths", "other": "Predicted_Deaths"})

comparedDataFrame["Difference"] = np.nan
comparedDataFrame['Difference'] = comparedDataFrame['Actual_Deaths'] - comparedDataFrame['Predicted_Deaths']
comparedDataFrame

Unnamed: 0,Actual_Deaths,Predicted_Deaths,Difference
0,22014,23683.0,-1669.0
1,22043,24062.0,-2019.0
2,22105,24088.0,-1983.0
3,22152,24096.0,-1944.0
4,22183,24167.0,-1984.0
5,22212,24292.0,-2080.0
6,22234,24390.0,-2156.0
7,22271,24051.0,-1780.0
8,22302,24463.0,-2161.0
9,22330,24492.0,-2162.0


### Test 2
#### Using the following Columns
- New Cases
- Median Age
- Population Density
- Population

In [31]:
##Missing Total Cases
featureColumns = ["new_cases", "median_age","population_density","population"]

features=covid19Dataframe[featureColumns]
target=covid19Dataframe[targetColumn]

calculateScore(features,target)

Score: 0.35
MAE: 3503.82
RMSE: 4831.17
r2: 0.30


### Test 3
#### Using the following Columns
- New Cases
- Median Age
- Population Density
- Population
- Hospital Patients

In [32]:
covid19Dataframe.dropna(subset=['hosp_patients'],inplace=True)
featureColumns = ["new_cases", "median_age","population_density","population","hosp_patients"]

features=covid19Dataframe[featureColumns]
target=covid19Dataframe[targetColumn]

calculateScore(features,target)

Score: 0.45
MAE: 2792.94
RMSE: 3672.82
r2: 0.45


### Test 4
#### Using the following Columns
- New Cases
- Median Age
- Population Density
- Population
- Total Cases
- Hospital Patients

In [33]:
featureColumns = ["new_cases", "median_age","population_density","population","hosp_patients","total_cases"]

features=covid19Dataframe[featureColumns]
target=covid19Dataframe[targetColumn]

calculateScore(features,target)


Score: 0.85
MAE: 1431.59
RMSE: 1941.16
r2: 0.85


### Test 5
#### Using the following Columns
- New Cases
- Median Age
- Population Density
- Population
- Total Cases
- Hospital Patients
- Positive Rate

In [34]:
covid19Dataframe.dropna(subset=['positive_rate'],inplace=True)
featureColumns = ["new_cases", "median_age","population_density","population","hosp_patients","total_cases","positive_rate"]

features=covid19Dataframe[featureColumns]
target=covid19Dataframe[targetColumn]

calculateScore(features,target)

Score: 0.85
MAE: 1197.78
RMSE: 1784.20
r2: 0.85


### Test 6
#### Using the following Columns
- New Cases
- Median Age
- Population Density
- Population
- Total Cases
- Hospital Patients
- Positive Rate

In [35]:
covid19Dataframe['positive_rate'] = covid19Dataframe['positive_rate'] * 100
featureColumns = ["new_cases", "median_age","population_density","population","hosp_patients","total_cases","positive_rate"]

features=covid19Dataframe[featureColumns]
target=covid19Dataframe[targetColumn]

calculateScore(features,target)

Score: 0.89
MAE: 923.87
RMSE: 1485.52
r2: 0.89


#### Compairing Model against real data

In [36]:
featureColumns = ["new_cases", "median_age","population_density","population","hosp_patients","total_cases","positive_rate"]
predictionDataFrame = canadaMarchCovidData[featureColumns]
predictedValues = predictValue(features,target,predictionDataFrame)
predictedValues = np.around(predictedValues)
predictedValuesFrame = predictionDataFrame.copy()
predictedValuesFrame['total_deaths'] = predictedValues


columnToCompare = ['total_deaths']
actualDeaths = canadaMarchCovidData[columnToCompare]
predictedDeaths = predictedValuesFrame[columnToCompare]
comparedDataFrame = actualDeaths.compare(predictedDeaths)
comparedDataFrame = comparedDataFrame["total_deaths"]

comparedDataFrame = comparedDataFrame.rename(columns={"self": "Actual_Deaths", "other": "Predicted_Deaths"})

comparedDataFrame["Difference"] = np.nan
comparedDataFrame['Difference'] = comparedDataFrame['Actual_Deaths'] - comparedDataFrame['Predicted_Deaths']
comparedDataFrame

Unnamed: 0,Actual_Deaths,Predicted_Deaths,Difference
0,22014,26685.0,-4671.0
1,22043,26756.0,-4713.0
2,22105,26823.0,-4718.0
3,22152,26894.0,-4742.0
4,22183,26968.0,-4785.0
5,22212,27042.0,-4830.0
6,22234,27108.0,-4874.0
7,22271,27182.0,-4911.0
8,22302,27258.0,-4956.0
9,22330,27331.0,-5001.0


### Observations
- total_cases plays a significant factor in the model being good
- Adding more features does not mean the model will get better

### Questions
- How well does the algorithm perform?
    It performed moderately well. I think there are additional improvements to the data we could make further refining the feature list.
- What options performed the best?
    After trying several of the options, we found that the defaults really performed the best. Adjusting the Alpha or the L1_Ratio didn't have any real positive impact.
- What features were more useful than others?
    The feature with the biggest impact was total cases. Removing that column rendered the model unusable. Additionally, the hospital patients feature had positive impact as well.