In [1]:
from bs4 import BeautifulSoup as bs4
from datetime import date
from sklearn import preprocessing
import pandas as pd
import numpy as np
import requests
import os
import csv
import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Activation
from keras.layers import Dropout
from keras.layers import InputLayer
from keras.layers.core import Dense, Flatten
from keras.optimizers import Adam
from keras.metrics import categorical_crossentropy 

Using TensorFlow backend.


Downloads the Our World in Data COVID-19 dataset and stores it in a local CSV file (`positive_cases.csv`)

In [2]:
def update_data():
    """Download the Our World in Data COVID-19 CSV and save it as ``positive_cases.csv``.

    The file is written relative to the current working directory and
    replaces any existing copy.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
    """
    positive_cases_csv_URL = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
    # A timeout stops the notebook from hanging forever on a stalled connection.
    req = requests.get(positive_cases_csv_URL, timeout=60)
    # Fail loudly on 4xx/5xx instead of saving an error page as the "CSV".
    req.raise_for_status()
    # The context manager closes the file even if the write itself fails.
    with open("positive_cases.csv", "wb") as positive_cases_file:
        positive_cases_file.write(req.content)

Some data sources also include data for countries outside the UK. This step keeps only the header row and the UK (`GBR`) rows, since the full international dataset would be too much to parse through

In [3]:
def filter_data(filename):
    """Rewrite a CSV file in place, keeping only the header row and the UK rows.

    A row is kept when its first field is ``"GBR"`` (UK data) or
    ``"iso_code"`` (the header row). All other rows, including blank lines,
    are discarded and the file at ``filename`` is overwritten with the result.

    Args:
        filename: Path of the CSV file to filter in place. The file is read
            and written as UTF-8, matching the default used by
            ``pandas.read_csv``.
    """
    wanted_codes = {"GBR", "iso_code"}

    # newline="" is what the csv module expects for both reading and writing;
    # without it the writer produces a blank line after every row on Windows.
    with open(filename, "r", newline="", encoding="utf-8") as readFile:
        # csv.reader yields [] for a blank line, so check the row is non-empty
        # before looking at its first field.
        uk = [row for row in csv.reader(readFile) if row and row[0] in wanted_codes]

    with open(filename, "w", newline="", encoding="utf-8") as writeFile:
        csv.writer(writeFile).writerows(uk)


Counts the number of days between a given date (`YYYY-MM-DD`) and the earliest data entry (31 January 2020)

In [4]:
def numberofdays(date_in_question):
    """Return how many days lie between 31 January 2020 and the given date.

    Args:
        date_in_question: Date string whose year, month and day are separated
            by hyphens, e.g. ``"2020-03-15"``.

    Returns:
        Whole days elapsed since 2020-01-31; negative for earlier dates.
    """
    parts = date_in_question.split("-")
    year, month, day = int(parts[0]), int(parts[1]), int(parts[2])
    elapsed = date(year, month, day) - date(2020, 1, 31)
    return elapsed.days

This min-max normalises every column of a dataframe except the first, scaling the values to the range 0 to 1

In [5]:
def normalise_dataframe(df):
    """Min-max scale every column except the first, modifying ``df`` in place.

    Each processed column is mapped to ``(value - min) / (max - min)`` so its
    values fall in the range 0 to 1. The first column (position 0) is left
    untouched.

    A column whose values are all equal has ``max - min == 0``; it is set to
    0.0 instead of dividing by zero, which would fill the column with NaN.
    NaN entries are ignored when finding the minimum and maximum and stay NaN
    in the result.

    Args:
        df: DataFrame whose columns after the first are numeric. Column labels
            are assumed to be unique.

    Returns:
        None. ``df`` is updated in place.
    """
    for label in df.columns[1:]:
        column = df[label].astype(float)
        lowest = column.min()
        spread = column.max() - lowest
        if spread == 0:
            # Constant column: value - min is already 0.0 for every row.
            df[label] = column - lowest
        else:
            df[label] = (column - lowest) / spread

This drops every column of the dataframe that contains a NaN, keeping only the fully populated columns

In [45]:
def no_NaNs(df):
    """Drop, in place, every column of ``df`` that contains a missing value.

    Missing values are detected with ``DataFrame.isna``, so the check works
    for columns of any dtype; non-numeric columns without missing values are
    kept rather than causing a conversion error.

    Args:
        df: DataFrame to prune.

    Returns:
        None. ``df`` is updated in place.
    """
    # Boolean mask over the columns: True where the column has any NaN.
    has_missing = df.isna().any().to_numpy()
    df.drop(columns=df.columns[has_missing], inplace=True)

In [49]:
# Fetch a fresh copy of the dataset into positive_cases.csv (needs network access).
update_data() 

In [50]:
# Trim the downloaded file, in place, to the header row plus the UK ("GBR") rows.
filter_data("positive_cases.csv")

Grabs the current working directory where the csv files are stored

In [51]:
# update_data() writes positive_cases.csv with a relative path, so the file
# lives in the current working directory.
working_dir = os.getcwd()

Reads the csv files into their respective dataframes

In [52]:
# Load the filtered CSV from the working directory.
pos_cases_df = pd.read_csv(os.path.join(working_dir, "positive_cases.csv"))
# Discard the first three columns by position.
pos_cases_df.drop(columns=list(pos_cases_df.columns[:3]), inplace=True)
# Keep the dates in their own Series and take them out of the feature frame.
dates = pos_cases_df.pop("date")
# Remove the tests_units column as well.
pos_cases_df.drop(columns=["tests_units"], inplace=True)

In [53]:
# Drop, in place, every column that contains a NaN.
no_NaNs(pos_cases_df)

In [54]:
# Preview the first rows to check which columns remain after the NaN filter.
print(pos_cases_df.head())

   total_cases  new_cases  total_cases_per_million  new_cases_per_million  \
0          2.0        2.0                    0.029                  0.029   
1          2.0        0.0                    0.029                  0.000   
2          2.0        0.0                    0.029                  0.000   
3          8.0        6.0                    0.118                  0.088   
4          8.0        0.0                    0.118                  0.000   

   population  population_density  median_age  aged_65_older  aged_70_older  \
0  67886004.0             272.898        40.8         18.517         12.527   
1  67886004.0             272.898        40.8         18.517         12.527   
2  67886004.0             272.898        40.8         18.517         12.527   
3  67886004.0             272.898        40.8         18.517         12.527   
4  67886004.0             272.898        40.8         18.517         12.527   

   gdp_per_capita  extreme_poverty  cardiovasc_death_rate  \
0

In [56]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line after the summary.
pos_cases_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 411 entries, 0 to 410
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   total_cases                 411 non-null    float64
 1   new_cases                   411 non-null    float64
 2   total_cases_per_million     411 non-null    float64
 3   new_cases_per_million       411 non-null    float64
 4   population                  411 non-null    float64
 5   population_density          411 non-null    float64
 6   median_age                  411 non-null    float64
 7   aged_65_older               411 non-null    float64
 8   aged_70_older               411 non-null    float64
 9   gdp_per_capita              411 non-null    float64
 10  extreme_poverty             411 non-null    float64
 11  cardiovasc_death_rate       411 non-null    float64
 12  diabetes_prevalence         411 non-null    float64
 13  female_smokers              411 non

Normalize all the data in the dataframe

In [None]:
# NOTE(review): this cell has no execution count ("In [None]") and the tail()
# output further down still shows unscaled values, so it looks like it was not
# run when the notebook was saved — confirm before relying on normalised data.
# normalise_dataframe() also skips the first column, which here is total_cases.
normalise_dataframe(pos_cases_df)

Adds a `daysSince` column holding each row's position (0, 1, 2, ...). This equals the number of days since the earliest data entry, provided the data has exactly one row per consecutive day

In [57]:
# One counter per row: 0 for the first row, 1 for the second, and so on.
daysSince = list(range(len(pos_cases_df)))
pos_cases_df["daysSince"] = daysSince

In [58]:
# Show the last rows of the frame after the daysSince column has been added.
print(pos_cases_df.tail())

     total_cases  new_cases  total_cases_per_million  new_cases_per_million  \
406    4261398.0     6684.0                62772.851                 98.459   
407    4267015.0     5617.0                62855.592                 82.742   
408    4271710.0     4695.0                62924.753                 69.160   
409    4276840.0     5130.0                63000.320                 75.568   
410    4282203.0     5363.0                63079.320                 79.000   

     population  population_density  median_age  aged_65_older  aged_70_older  \
406  67886004.0             272.898        40.8         18.517         12.527   
407  67886004.0             272.898        40.8         18.517         12.527   
408  67886004.0             272.898        40.8         18.517         12.527   
409  67886004.0             272.898        40.8         18.517         12.527   
410  67886004.0             272.898        40.8         18.517         12.527   

     gdp_per_capita  extreme_poverty  