## Combine my per-country data with the Kaggle country-info dataset and the CSSE COVID-19 time series

In [94]:
import numpy as np
import pandas as pd

In [95]:
# Load my own per-country dataset; it is joined to the other tables on its 'Country' column in a later cell.
df_mine = pd.read_csv('./covid19.csv')
# Display (rows, columns) as a quick sanity check of what was loaded.
df_mine.shape

(101, 8)

In [96]:
# Load the Kaggle per-country info table.
df_kag = pd.read_csv('./kaggle/covid19countryinfo.csv')
# Whatever the first column is called in the file, name it 'Country' so it can serve as the merge key.
df_kag = df_kag.rename(columns={df_kag.columns[0]: 'Country'})
# Keep only the columns used downstream.  The explicit .copy() makes this an independent frame,
# so the assignment below cannot raise SettingWithCopyWarning.
df_kag = df_kag[['Country', 'tests', 'testpop', 'density', 'urbanpop', 'quarantine', 'schools', 'restrictions', 'hospibed', 'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'sexratio', 'lung', 'femalelung', 'malelung']].copy()
# Rewrite selected country names so they match the spelling used in my own dataset.
# Only the key column is touched (no frame-wide, in-place replace).
df_kag['Country'] = df_kag['Country'].replace({'Korea, South': 'South Korea', 'Czechia': 'Czech Republic', 'Taiwan*': 'Taiwan', 'US': 'United States'})

In [97]:
# Inner join on the country name: only countries present in BOTH tables survive.
df = pd.merge(df_kag, df_mine, on="Country", how="inner")
df.head()

Unnamed: 0,Country,tests,testpop,density,urbanpop,quarantine,schools,restrictions,hospibed,sex0,...,lung,femalelung,malelung,Total Infected,GDP 2018,Crime Index,Population 2020,Smoking 2016,Females 2018,Median Age
0,Albania,,,105.0,63.0,,,,2.9,1.08,...,11.67,7.02,17.04,174,15102500000.0,40.02,2877.797,28.7,49.063095,32.9
1,Algeria,,,18.0,73.0,,,,1.9,1.05,...,8.77,5.03,12.81,367,173758000000.0,54.41,43851.044,15.6,49.484268,28.1
2,Argentina,,,17.0,93.0,3/20/2020,,,5.0,1.05,...,29.27,20.16,42.59,502,519871500000.0,62.96,45195.774,21.8,51.237348,31.7
3,Armenia,694.0,4269.802594,104.0,63.0,,,,4.2,1.13,...,23.86,16.17,35.99,290,12433090000.0,20.78,2963.243,24.1,52.956577,35.1
4,Australia,31635.0,806.06556,3.0,86.0,,,3/23/2020,3.8,1.06,...,18.79,15.9,22.16,2810,1433904000000.0,42.7,25499.884,14.7,50.199623,38.7


In [98]:
# Give the Kaggle columns readable names.  Renaming by label (instead of assigning a
# positional list to df.columns) cannot silently mislabel a column if the merge output
# ever changes order or gains/loses a column.  Columns not listed here keep their names.
df = df.rename(columns={
    'tests': 'Tests',
    'testpop': 'Test Pop',
    'density': 'Density',
    'urbanpop': 'Urban Pop',
    'quarantine': 'Quarantine',
    'schools': 'Schools',
    'restrictions': 'Restrictions',
    'hospibed': 'Hospital Bed',
    'sexratio': 'Sex Ratio',
    'femalelung': 'Female Lung',
    'malelung': 'Male Lung',
})
df.head()

Unnamed: 0,Country,Tests,Test Pop,Density,Urban Pop,Quarantine,Schools,Restrictions,Hospital Bed,sex0,...,lung,Female Lung,Male Lung,Total Infected,GDP 2018,Crime Index,Population 2020,Smoking 2016,Females 2018,Median Age
0,Albania,,,105.0,63.0,,,,2.9,1.08,...,11.67,7.02,17.04,174,15102500000.0,40.02,2877.797,28.7,49.063095,32.9
1,Algeria,,,18.0,73.0,,,,1.9,1.05,...,8.77,5.03,12.81,367,173758000000.0,54.41,43851.044,15.6,49.484268,28.1
2,Argentina,,,17.0,93.0,3/20/2020,,,5.0,1.05,...,29.27,20.16,42.59,502,519871500000.0,62.96,45195.774,21.8,51.237348,31.7
3,Armenia,694.0,4269.802594,104.0,63.0,,,,4.2,1.13,...,23.86,16.17,35.99,290,12433090000.0,20.78,2963.243,24.1,52.956577,35.1
4,Australia,31635.0,806.06556,3.0,86.0,,,3/23/2020,3.8,1.06,...,18.79,15.9,22.16,2810,1433904000000.0,42.7,25499.884,14.7,50.199623,38.7


In [99]:
# data cleaning and summing total events by country
def preprocess_summation(df, description):
    """Collapse a per-province table into a single total per country.

    Drops the 'Province/State', 'Lat' and 'Long' columns, sums every remaining
    column per 'Country/Region', and keeps the LAST column of the summed table
    as the country's total.  (Presumably the remaining columns are cumulative
    counts ordered by date, so the last one is the most recent total -- confirm
    against the source CSV.)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Province/State', 'Country/Region', 'Lat' and 'Long'
        plus at least one summable value column.  It is not modified.
    description : str
        Suffix for the output column name, e.g. 'Deaths' -> 'Total Deaths'.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ['Country', 'Total <description>'], one row
        per country, with selected country names rewritten (see the mapping
        below).

    Raises
    ------
    KeyError
        If any of the dropped or grouped columns is missing from ``df``.
    """
    # Alternate spellings -> the spelling used for the 'Country' merge key elsewhere in the notebook.
    country_aliases = {'Korea, South': 'South Korea', 'Czechia': 'Czech Republic', 'Taiwan*': 'Taiwan', 'US': 'United States'}

    summed = (
        df.drop(columns=['Province/State', 'Lat', 'Long'])
          .groupby('Country/Region')
          .sum()
          .reset_index()
    )
    col_name = 'Total' + ' ' + description

    # Build the result as a fresh frame: no in-place mutation of a slice, and the last
    # column is taken positionally with .iloc rather than a row-wise apply(r[-1]),
    # which depends on deprecated positional indexing of a label-indexed Series.
    return pd.DataFrame({
        'Country': summed['Country/Region'].replace(country_aliases),
        col_name: summed.iloc[:, -1],
    })

In [100]:
# Per-country death totals from the global deaths time-series file.
df_deaths = pd.read_csv('./csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
total_deaths = preprocess_summation(df_deaths, 'Deaths')
# Inner join: countries missing from the deaths table are dropped from df.
# NOTE: df is rebound here, so re-running this cell alone merges the totals a second time.
df = pd.merge(df, total_deaths, on='Country', how='inner')

In [101]:
# Per-country recovery totals from the global recovered time-series file.
df_recovered = pd.read_csv('./csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')
total_recover = preprocess_summation(df_recovered, 'Recovered')
# Inner join: countries missing from the recovered table are dropped from df.
# NOTE: df is rebound here, so re-running this cell alone merges the totals a second time.
df = pd.merge(df, total_recover, on='Country', how='inner')

In [102]:
# Swap 'Total Infected' with the column that is third from the end, so that it sits
# directly before the two total columns appended by the merges above.
cols = df.columns.tolist()
total_infected_index = cols.index('Total Infected')
cols[-3], cols[total_infected_index] = cols[total_infected_index], cols[-3]
df = df[cols]
# Spot-check one country; only this last expression is rendered as the cell output.
df[df.Country == 'India']

Unnamed: 0,Country,Tests,Test Pop,Density,Urban Pop,Quarantine,Schools,Restrictions,Hospital Bed,sex0,...,Male Lung,Median Age,GDP 2018,Crime Index,Population 2020,Smoking 2016,Females 2018,Total Infected,Total Deaths,Total Recovered
38,India,5900.0,233899.0483,464.0,35.0,3/23/2020,,,0.7,1.12,...,106.89,27.9,2718732000000.0,42.38,1380004.385,11.5,48.02354,727,20,45


In [104]:
# Write the combined per-country table to disk; index=False keeps the row index out of the file.
df.to_csv('covid19_by_country.csv', index=False)