In [1]:
import numpy as np
import pandas as pd



## Aggregate data from multiple sources

### Johns Hopkins COVID-19 data

In [2]:
# data cleaning and summing total events by country
def preprocess_summation(df, description):
    """Reduce a wide per-region time-series table to one total per country.

    The location columns are dropped, the remaining columns are summed per
    'Country/Region', and the value in the last column of the summed table is
    kept as the country's total.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Province/State', 'Country/Region', 'Lat' and 'Long';
        every other column is summed per country. The input is not modified.
    description : str
        Suffix for the output column, which is named 'Total <description>'.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'Country' and 'Total <description>', one row per country,
        with a handful of country names rewritten to their common form.
    """
    by_country = (
        df.drop(['Province/State', 'Lat', 'Long'], axis=1)
        .groupby(['Country/Region'])
        .sum()
        .reset_index()
    )
    col_name = 'Total' + ' ' + description
    # Take the last column positionally with iloc: integer keys in `row[-1]`
    # on a label-indexed Series are deprecated, and a row-wise apply is a slow
    # Python loop for what is a single column selection.
    totals = pd.DataFrame({
        'Country': by_country['Country/Region'],
        col_name: by_country.iloc[:, -1],
    })
    # Harmonise names so they match the other data sources joined on 'Country'.
    totals['Country'] = totals['Country'].replace({
        'Korea, South': 'South Korea',
        'Czechia': 'Czech Republic',
        'Taiwan*': 'Taiwan',
        'US': 'United States',
    })
    return totals

In [3]:
# Read the global confirmed-case time series, then reduce it to one total per country.
df_confirmed = pd.read_csv(
    './csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
)
total_infections = preprocess_summation(df_confirmed, description='Infected')
total_infections.head()

Unnamed: 0,Country,Total Infected
0,Afghanistan,94
1,Albania,174
2,Algeria,367
3,Andorra,224
4,Angola,4


### Import 2018 GDP data
https://data.worldbank.org/indicator/NY.GDP.MKTP.CD

In [4]:
# Keep only the country name and the 2018 column, relabel them, and align
# South Korea's name with the one used in the infection totals.
df_gdp = (
    pd.read_csv('./outside_data/GDP.csv')
    .loc[:, ['Country Name', '2018']]
    .rename(columns={'Country Name': 'Country', '2018': 'GDP 2018'})
    .replace({'Korea, Rep.': 'South Korea'})
)
df_gdp.shape

(264, 2)

In [5]:
# Inner join: only countries present in both tables under the same name survive.
df = pd.merge(total_infections, df_gdp, on='Country', how='inner')

### Crime and Population data
https://worldpopulationreview.com/countries/crime-rate-by-country/

In [6]:
# header=0 together with names= discards the file's own header row in favour
# of the column labels used throughout this notebook.
df_crime = pd.read_csv(
    './outside_data/crime_pop.csv',
    header=0,
    names=['Country', 'Crime Index', 'Population 2020'],
)
df = pd.merge(df, df_crime, on='Country', how='inner')

### Smoking data
https://ourworldindata.org/smoking#prevalence-of-smoking-across-the-world  
Percentage of each country's population aged 15+ who smoke (only the 2016 values are used)

In [7]:
# Load the smoking table, replacing the file's header row with notebook-wide labels.
df_smoke = pd.read_csv(
    './outside_data/smoking.csv',
    header=0,
    names=['Country', 'Code', 'Year', 'Smoking 2016'],
)
# Keep only the 2016 rows with a vectorized mask (no row-wise apply), then
# drop the helper columns without mutating the filtered frame in place.
df_smoke = df_smoke.loc[df_smoke['Year'] == 2016].drop(columns=['Code', 'Year'])
df_smoke.shape

(186, 2)

In [8]:
# Attach the 2016 smoking rate; countries missing from df_smoke are dropped.
df = pd.merge(df, df_smoke, on='Country', how='inner')

### Gender data
https://data.worldbank.org/indicator/SP.POP.TOTL.FE.ZS  
Females as a percentage of the total population, 2018

In [9]:
# Keep only the country name and the 2018 column, relabel them, and align
# South Korea's name with the one used in the infection totals.
df_gender = (
    pd.read_csv('./outside_data/gender.csv')
    .loc[:, ['Country Name', '2018']]
    .rename(columns={'Country Name': 'Country', '2018': 'Females 2018'})
    .replace({'Korea, Rep.': 'South Korea'})
)

In [10]:
# Attach the 2018 female-share column; countries missing from df_gender are dropped.
df = pd.merge(df, df_gender, on='Country', how='inner')

### Age data
https://worldpopulationreview.com/countries/median-age/  
Median age

In [11]:
# Keep the place name and median column only, relabelled to match the join key.
df_age = (
    pd.read_csv('./outside_data/age.csv')
    .loc[:, ['Place', 'Median']]
    .rename(columns={'Place': 'Country', 'Median': 'Median Age'})
)
df_age.shape

(214, 2)

In [12]:
# Final join: attach median age, then preview the combined table.
df = pd.merge(df, df_age, on='Country', how='inner')
df.head()

Unnamed: 0,Country,Total Infected,GDP 2018,Crime Index,Population 2020,Smoking 2016,Females 2018,Median Age
0,Albania,174,15102500000.0,40.02,2877.797,28.7,49.063095,32.9
1,Algeria,367,173758000000.0,54.41,43851.044,15.6,49.484268,28.1
2,Argentina,502,519871500000.0,62.96,45195.774,21.8,51.237348,31.7
3,Armenia,290,12433090000.0,20.78,2963.243,24.1,52.956577,35.1
4,Australia,2810,1433904000000.0,42.7,25499.884,14.7,50.199623,38.7


In [13]:
# Write the merged table to disk; the row index carries no information, so omit it.
df.to_csv(path_or_buf='covid19.csv', index=False)