In [1]:
from bs4 import BeautifulSoup as bs4
from datetime import date
from sklearn import preprocessing
import pandas as pd
import numpy as np
import requests
import os
import csv
import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Activation
from keras.layers import Dropout
from keras.layers import InputLayer
from keras.layers.core import Dense, Flatten
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy 

Using TensorFlow backend.


Pulls data from websites and stores them in csv files

In [2]:
def update_data():
    positive_cases_csv_URL = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
    req = requests.get(positive_cases_csv_URL)
    URL_content = req.content
    positive_cases_file = open("positive_cases.csv", "wb")
    positive_cases_file.write(URL_content)
    positive_cases_file.close()

This is purely for the sources of data that include data outside the UK as having international data would mean too much to parse through

In [3]:
def filter_data(filename):
    uk = list()
    with open(filename, 'r') as readFile:
        reader = csv.reader(readFile)
        for row in reader:
            if row[0] == "GBR" or row[0] == "iso_code":
                uk.append(row)
    
    with open(filename, 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerows(uk)  


Counts the number of days since the earliest data entry

In [4]:
def numberofdays(date_in_question):
    start_date = date(2020, 1, 31)
    dateq = date_in_question.split("-")
    cdate = date(int(dateq[0]), int(dateq[1]), int(dateq[2]))
    return (cdate - start_date).days

This will normalise all the data in a dataframe

In [5]:
def normalise_dataframe(df):
    for i in range(1, len(df.columns)):
        maxi = max(df.iloc[:,i])
        mini = min(df.iloc[:,i])
        for j in range(len(df.iloc[:,0])):
            df.iloc[j, i] = (df.iloc[j, i] - mini)/(maxi-mini)

This will only take columns in the data frame with no NaNs

In [27]:
def no_NaNs(df):
    data = []
    for column in range(len(df.columns)):
        temp = [float(i) for i in df.iloc[:, column]]
        if not np.isnan(np.sum(np.array(temp))):
            data.append(temp)
    return data

In [7]:
update_data() 
filter_data("positive_cases.csv")

Grabs the current working directory where the csv files are stored

In [8]:
working_dir = os.getcwd()

Reads the csv files into their respective dataframes

In [24]:
pos_cases_df = pd.read_csv(os.path.join(working_dir, "positive_cases.csv"))
pos_cases_df.drop(pos_cases_df.iloc[:, 0:3], inplace = True, axis=1)
dates = pos_cases_df["date"]
pos_cases_df.drop(["date"], inplace = True, axis=1)

In [28]:
no_nans = no_NaNs(pos_cases_df)

TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [17]:
print(no_nans)

['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases', 'total_cases_per_million', 'new_cases_per_million', 'population', 'population_density', 'median_age', 'aged_65_older', 'aged_70_older', 'gdp_per_capita', 'extreme_poverty', 'cardiovasc_death_rate', 'diabetes_prevalence', 'female_smokers', 'male_smokers', 'hospital_beds_per_thousand', 'life_expectancy', 'human_development_index']


Normalize all the data in the dataframe

In [None]:
normalise_dataframe(pos_cases_df)

Code just adds a column to the dataframe that counts the number of days since the earliest data entry

In [None]:
daysSince = []
for i in range(len(pos_cases_df)):
    daysSince.append(i)
pos_cases_df["daysSince"] = daysSince

In [25]:
print(pos_cases_df.tail())

     total_cases  new_cases  new_cases_smoothed  total_deaths  new_deaths  \
406    4261398.0     6684.0            5944.286      125579.0       176.0   
407    4267015.0     5617.0            5872.714      125701.0       122.0   
408    4271710.0     4695.0            5792.000      125753.0        52.0   
409    4276840.0     5130.0            5835.857      125817.0        64.0   
410    4282203.0     5363.0            5763.571      125927.0       110.0   

     new_deaths_smoothed  total_cases_per_million  new_cases_per_million  \
406              154.857                62772.851                 98.459   
407              149.571                62855.592                 82.742   
408              145.286                62924.753                 69.160   
409              145.143                63000.320                 75.568   
410              127.857                63079.320                 79.000   

     new_cases_smoothed_per_million  total_deaths_per_million  ...  \
406       