## Linear Regression Analysis of COVID-19 Data for Canada, Italy, India, and the United States
#### By: Ian

### Import needed Libraries

In [474]:
##Import Key Libraries for use
import datetime as dt
import math
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import explained_variance_score, mean_absolute_error, r2_score, mean_squared_error

### Import and Scrub Data from GitHub

In [475]:
# Columns kept from the source data: the regression target
# (total_cases_per_million) plus the candidate feature columns.
neededColumns = ["total_cases_per_million","new_cases_per_million","total_deaths_per_million","human_development_index","median_age","population"]
# ISO codes of the four countries analysed: Canada, India, Italy, United States.
countryIsoCodes = ["CAN", "IND", "ITA", "USA"]

#Import CSV
covid19RawData = pd.read_csv("https://raw.githubusercontent.com/WhipSnake23/Python-Class-Project/main/Data/owid-covid-data.csv")

# Keep only the rows of the four countries and only the needed columns, then
# drop rows missing either value the model relies on and renumber the index.
# - isin() matches the ISO code exactly (the previous substring match would
#   also accept any code that merely contains one of these strings) and
#   yields False for missing codes.
# - Building a new frame instead of calling dropna(inplace=True) leaves the
#   raw data untouched, so this cell can be re-run safely.
covid19Dataframe = (
    covid19RawData.loc[covid19RawData["iso_code"].isin(countryIsoCodes), neededColumns]
    .dropna(subset=["new_cases_per_million", "total_deaths_per_million"])
    .reset_index(drop=True)
)



#Import the comparison data set containing data from March. The main data set we are using stops in February.
# Same scrub as the main data set: keep the needed columns, drop rows missing
# either new_cases_per_million or total_deaths_per_million, and reset the index.
# Done as one non-inplace chain so no frame derived from a slice is mutated.
MarchCovidData = (
    pd.read_csv("https://raw.githubusercontent.com/WhipSnake23/Python-Class-Project/main/Data/Covid-IND_CAN-USA_ITA-March.csv")[neededColumns]
    .dropna(subset=["new_cases_per_million", "total_deaths_per_million"])
    .reset_index(drop=True)
)


### Define Notebook Functions

In [476]:
#These functions make it easier to test configuration changes and features

#Prints the scores from a run of the algorithm against the data
def printMetrics(test, predictions):
    """Print four regression metrics for `predictions` measured against `test`.

    Each metric is printed on its own line with two decimals, in this order:
    explained variance (labelled "Score"), mean absolute error ("MAE"),
    root mean squared error ("RMSE") and the R^2 score ("r2").

    Args:
        test: The true target values.
        predictions: The values predicted by the model for the same rows.
    """
    # (label, metric) pairs; each metric is evaluated right before its line
    # is printed, one after the other.
    metricTable = (
        ("Score", explained_variance_score),
        ("MAE", mean_absolute_error),
        ("RMSE", lambda actual, predicted: math.sqrt(mean_squared_error(actual, predicted))),
        ("r2", r2_score),
    )
    for label, metric in metricTable:
        print(f"{label}: {metric(test, predictions):.2f}")

#Function to run the algorithm and call the metric function (currently commented out)
# def RunModel(f_Features,f_Target,f_execution,f_dataframe="NAN"):
#     x_train, x_test, y_train, y_test = train_test_split(f_Features, f_Target, test_size=0.25, random_state=1)
#     model = ElasticNet(random_state=1)
#     model.fit(x_train, y_train)
#     if f_execution == "score":
#         predictions = model.predict(x_test)
#         printMetrics(y_test, predictions)
#     elif f_execution == "predict":
#         predictions = model.predict(f_dataframe)
#         printMetrics(y_test, predictions)
#     return predictions

In [477]:
# Name of the column used as the regression target when fitting the model.
targetColumn = "total_cases_per_million"

In [478]:
# Input columns the model is trained on.
featureColumns = [
    "new_cases_per_million",
    "total_deaths_per_million",
    "human_development_index",
    "median_age",
    "population",
]
# Alternative, wider feature set (not in use):
#featureColumns = ["total_deaths_per_million","new_cases_per_million","human_development_index","gdp_per_capita","median_age","population_density","population"]

# Split the scrubbed data into the feature matrix and the target series.
features = covid19Dataframe.loc[:, featureColumns]
target = covid19Dataframe.loc[:, targetColumn]

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.25,
    random_state=1,
)

# Fit the ElasticNet regressor on the training rows (fit() returns the estimator itself).
model = ElasticNet(max_iter=10000, random_state=1).fit(x_train, y_train)

# Score the model on the held-out rows.
predictions = model.predict(x_test)
printMetrics(y_test, predictions)


Score: 0.92
MAE: 3185.80
RMSE: 4726.74
r2: 0.92


In [479]:
#Prediction Methods (disabled: the code below references canadaMarchCovidData, which is not defined in any cell above)
# predictionDataFrame = canadaMarchCovidData[featureColumns]
# predictedValues = model.predict(predictionDataFrame)
# printMetrics(y_test, predictions)
# #predictedValues = RunModel(features,target,"predict",predictionDataFrame)
# predictedValues = np.around(predictedValues,decimals=0)
# #predictedValues
# predictedValuesFrame = predictionDataFrame.copy()
# predictedValuesFrame['total_cases_per_million'] = predictedValues
#
# columnToCompare = ['total_cases_per_million']
# actual_total_cases_per_million = canadaMarchCovidData[columnToCompare]
# predicted_actual_total_cases_per_million = predictedValuesFrame[columnToCompare]
# comparedDataFrame = actual_total_cases_per_million.compare(predicted_actual_total_cases_per_million)
# comparedDataFrame = comparedDataFrame["total_cases_per_million"]
#
# comparedDataFrame = comparedDataFrame.rename(columns={"self": "Actual", "other": "Predicted"})
#
# comparedDataFrame["Difference"] = np.nan
# comparedDataFrame['Difference'] = comparedDataFrame['Actual'] - comparedDataFrame['Predicted']
# comparedDataFrame


In [480]:
##Increase new_cases_per_million and total_deaths_per_million by 15% - based on March Data
# Fractional increase applied to the March figures (0.15 == 15%).
increaseRate = .15
# Plain column assignment appends each new column at the end of the frame on
# the first run and overwrites it on later runs, so re-running this cell does
# not create duplicate columns (the earlier insert(..., True) allowed them).
MarchCovidData["new_cases_per_mil_Increase"] = (MarchCovidData['new_cases_per_million'] * increaseRate) + MarchCovidData['new_cases_per_million']
MarchCovidData["total_deaths_per_mil_Increase"] = (MarchCovidData['total_deaths_per_million'] * increaseRate) + MarchCovidData['total_deaths_per_million']
MarchCovidData


Unnamed: 0,total_cases_per_million,new_cases_per_million,total_deaths_per_million,human_development_index,median_age,population,new_cases_per_mil_Increase,total_deaths_per_mil_Increase
0,23193.905,97.875,583.273,0.929,41.4,37742157,112.55625,670.76395
1,23264.463,70.558,584.042,0.929,41.4,37742157,81.14170,671.64830
2,23338.226,73.764,585.685,0.929,41.4,37742157,84.82860,673.53775
3,23417.183,78.957,586.930,0.929,41.4,37742157,90.80055,674.96950
4,23495.795,78.612,587.751,0.929,41.4,37742157,90.40380,675.91365
...,...,...,...,...,...,...,...,...
119,91296.394,187.521,1658.334,0.926,38.3,331002647,215.64915,1907.08410
120,91428.399,132.005,1659.866,0.926,38.3,331002647,151.80575,1908.84590
121,91638.122,209.723,1661.984,0.926,38.3,331002647,241.18145,1911.28160
122,91823.136,185.014,1664.627,0.926,38.3,331002647,212.76610,1914.32105
