## Linear Regression Analysis of Covid 19 data for Canada, Italy, India, and the United States
#### By: Ian

### Import needed Libraries

In [400]:
##Import Key Libraries for use
import datetime as dt
import math
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score, mean_squared_error

### Import and Scrub Data from Github

In [401]:
#Import CSV
covid19Dataframe = pd.read_csv("https://raw.githubusercontent.com/WhipSnake23/Python-Class-Project/main/Data/owid-covid-data.csv")
#Remove Smoothed Columns from dataframe
covid19Dataframe = covid19Dataframe[covid19Dataframe.iso_code.str.contains("CAN")| (covid19Dataframe.iso_code.str.contains("IND")) | (covid19Dataframe.iso_code.str.contains("ITA")) | (covid19Dataframe.iso_code.str.contains("USA"))]
covid19Dataframe = covid19Dataframe.drop(columns=['new_cases_smoothed','new_deaths_smoothed','new_cases_smoothed_per_million','new_deaths_smoothed_per_million','new_tests_smoothed','new_tests_smoothed_per_thousand','new_vaccinations_smoothed','new_vaccinations_smoothed_per_million'])

#Reset Index
covid19Dataframe.reset_index(drop=True, inplace=True)
#Drop rows where Total Deaths is blank. This is required for the ML
covid19Dataframe.dropna(subset=['total_cases_per_million'],inplace=True)
covid19Dataframe.dropna(subset=['total_deaths'],inplace=True)
covid19Dataframe.dropna(subset=['hosp_patients'],inplace=True)
covid19Dataframe.dropna(subset=['total_vaccinations'],inplace=True)
covid19Dataframe.dropna(subset=['extreme_poverty'],inplace=True)

#Import Comparison Data set containing data from March. The data set we are using stops in February
canadaMarchCovidData = pd.read_csv("https://raw.githubusercontent.com/WhipSnake23/Python-Class-Project/main/Data/Canada-MarchData.csv")
canadaMarchCovidData = canadaMarchCovidData.drop(columns=['new_cases_smoothed','new_deaths_smoothed','new_cases_smoothed_per_million','new_deaths_smoothed_per_million','new_tests_smoothed','new_tests_smoothed_per_thousand','new_vaccinations_smoothed','new_vaccinations_smoothed_per_million'])
#Reset Index
canadaMarchCovidData.reset_index(drop=True, inplace=True)
#Drop rows where hosp_patients hasnt been reported
canadaMarchCovidData.dropna(subset=['hosp_patients'],inplace=True)

### Define Notebook Functions

In [402]:
#These functions make it easier to test configuration changes and features easier

#Prints the Scores from the run of the algorithum against the data
def printMetrics(test, predictions):
    print(f"Score: {explained_variance_score(test, predictions):.2f}")
    print(f"MAE: {mean_absolute_error(test, predictions):.2f}")
    print(f"RMSE: {math.sqrt(mean_squared_error(test, predictions)):.2f}")
    print(f"r2: {r2_score(test, predictions):.2f}")

#Function to run the algorithum and call metric function
# def RunModel(f_Features,f_Target,f_execution,f_dataframe="NAN"):
#     x_train, x_test, y_train, y_test = train_test_split(f_Features, f_Target, test_size=0.25, random_state=1)
#     model = ElasticNet(random_state=1)
#     model.fit(x_train, y_train)
#     if f_execution == "score":
#         predictions = model.predict(x_test)
#         printMetrics(y_test, predictions)
#     elif f_execution == "predict":
#         predictions = model.predict(f_dataframe)
#         printMetrics(y_test, predictions)
#     return predictions

In [403]:
targetColumn = "total_cases_per_million"

In [404]:
featureColumns = ["new_cases_per_million","total_deaths_per_million","human_development_index","median_age","population"]
#featureColumns = ["total_deaths_per_million","new_cases_per_million","human_development_index","gdp_per_capita","median_age","population_density","population"]
features=covid19Dataframe[featureColumns]
target=covid19Dataframe[targetColumn]

#RunModel(features,target,"score")
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=1)
model = ElasticNet(random_state=1,max_iter=10000)
model.fit(x_train, y_train)
predictions = model.predict(x_test)
printMetrics(y_test, predictions)



Score: 0.99
MAE: 1229.18
RMSE: 2095.38
r2: 0.99


In [405]:
#Prediction Methods
predictionDataFrame = canadaMarchCovidData[featureColumns]
predictedValues = model.predict(predictionDataFrame)
printMetrics(y_test, predictions)
#predictedValues = RunModel(features,target,"predict",predictionDataFrame)
predictedValues = np.around(predictedValues,decimals=0)
#predictedValues
predictedValuesFrame = predictionDataFrame.copy()
predictedValuesFrame['total_cases_per_million'] = predictedValues

predictedValuesFrame
#columnToCompare = ['total_cases_per_million']
#actualDeaths = canadaMarchCovidData[columnToCompare]
#predictedDeaths = predictedValuesFrame[columnToCompare]
#comparedDataFrame = actualDeaths.compare(predictedDeaths)
#comparedDataFrame = comparedDataFrame["total_cases_per_million"]

#comparedDataFrame = comparedDataFrame.rename(columns={"self": "Actual_Deaths", "other": "Predicted_Deaths"})

#comparedDataFrame["Difference"] = np.nan
#comparedDataFrame['Difference'] = comparedDataFrame['Actual_Deaths'] - comparedDataFrame['Predicted_Deaths']
#comparedDataFrame

Score: 0.99
MAE: 1229.18
RMSE: 2095.38
r2: 0.99


Unnamed: 0,new_cases_per_million,total_deaths_per_million,human_development_index,median_age,population,total_cases_per_million
0,97.875,583.273,0.929,41.4,37742157,22350.0
1,70.558,584.042,0.929,41.4,37742157,22760.0
2,73.764,585.685,0.929,41.4,37742157,22766.0
3,78.957,586.93,0.929,41.4,37742157,22731.0
4,78.612,587.751,0.929,41.4,37742157,22761.0
5,73.366,588.52,0.929,41.4,37742157,22859.0
6,70.16,589.103,0.929,41.4,37742157,22923.0
7,107.254,590.083,0.929,41.4,37742157,22430.0
8,77.579,590.904,0.929,41.4,37742157,22874.0
9,81.05,591.646,0.929,41.4,37742157,22848.0
