## Combine my data with the Kaggle country-info dataset

In [27]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

In [28]:
# Load my own country-level table; the trailing expression displays its
# (rows, columns) shape as the cell output.
df_mine = pd.read_csv(filepath_or_buffer='./covid19.csv')
df_mine.shape

(102, 8)

In [29]:
# Load the Kaggle country-info table.
df_kag = pd.read_csv('./kaggle/covid19countryinfo.csv')

# Give the first two columns readable names; all remaining headers are kept.
df_kag.columns = ['Region', 'Country', *df_kag.columns[2:]]

# Keep only the fields used downstream. The explicit .copy() makes this an
# independent frame, so the clean-up below cannot raise SettingWithCopyWarning.
df_kag = df_kag[[
    'Country', 'density', 'urbanpop', 'quarantine', 'schools', 'publicplace',
    'gatheringlimit', 'nonessential', 'smokers', 'hospibed', 'sex0', 'sex14',
    'sex25', 'sex54', 'sex64', 'sex65plus', 'sexratio', 'lung', 'femalelung',
    'malelung', 'healthexp', 'firstcase',
]].copy()

# Normalise country spellings so they match the names used for the join key
# in the other tables.
df_kag = df_kag.replace({'Korea, South': 'South Korea', 'Czechia': 'Czech Republic', 'Taiwan*': 'Taiwan', 'US': 'United States'})

In [30]:
# Inner-join the Kaggle fields with my table on the country name; countries
# missing from either side are dropped.
# NOTE(review): later output shows India at row index 116 although df_mine has
# only 102 rows, so 'Country' is repeated on at least one side (presumably
# df_kag) and the join duplicates those countries — confirm whether
# duplicate keys should be removed before merging.
df = pd.merge(df_kag, df_mine, on="Country", how="inner")
df.head()

Unnamed: 0,Country,density,urbanpop,quarantine,schools,publicplace,gatheringlimit,nonessential,smokers,hospibed,sex0,sex14,sex25,sex54,sex64,sex65plus,sexratio,lung,femalelung,malelung,healthexp,firstcase,Total Infected,GDP 2018,Crime Index,Population 2020,Smoking 2016,Females 2018,Median Age
0,Albania,105.0,63.0,,,,,,29.4,2.9,1.08,1.11,1.09,0.93,0.95,0.87,0.98,11.67,7.02,17.04,774,3/7/2020,259,15102500000.0,40.02,2877.797,28.7,49.063095,32.9
1,Algeria,18.0,73.0,,,,,,,1.9,1.05,1.05,1.05,1.03,1.01,0.89,1.03,8.77,5.03,12.81,1031,2/24/2020,847,173758000000.0,54.41,43851.044,15.6,49.484268,28.1
2,Argentina,17.0,93.0,3/20/2020,,,,,23.95,5.0,1.05,1.06,1.05,1.0,0.94,0.71,0.98,29.27,20.16,42.59,1390,3/2/2020,1054,519871500000.0,62.96,45195.774,21.8,51.237348,31.7
3,Armenia,104.0,63.0,,,,,,26.9,4.2,1.13,1.14,1.06,0.93,0.84,0.67,0.94,23.86,16.17,35.99,883,2/29/2020,571,12433090000.0,20.78,2963.243,24.1,52.956577,35.1
4,Netherlands,593.0,44.0,,,,,,,1.2,1.02,1.01,1.01,0.93,0.87,0.64,0.9,,,,43,3/12/2020,13696,913658500000.0,28.54,17134.872,25.8,50.220944,42.6


In [31]:
# data cleaning and summing total events by country
def preprocess_summation(df, description):
    """Return one total per country from a wide CSSE time-series table.

    Rows are summed per 'Country/Region' (collapsing provinces/states) and
    the value in the last column is taken as the total. This presumes the
    series is cumulative and the columns are in chronological order.

    Args:
        df: Frame with 'Province/State', 'Country/Region', 'Lat', 'Long' and
            one numeric column per date. It is not modified.
        description: Label appended to 'Total ' to name the output column,
            e.g. 'Deaths' gives 'Total Deaths'.

    Returns:
        A new DataFrame with columns 'Country' and 'Total <description>',
        one row per country, ordered by the original 'Country/Region' value.
    """
    col_name = 'Total' + ' ' + description

    summed = (
        df.drop(['Province/State', 'Lat', 'Long'], axis=1)
        .groupby(['Country/Region'])
        .sum()
        .reset_index()
    )

    # Pick the last column by label instead of a row-wise `r[-1]`: integer
    # position access through [] on a label-indexed Series is deprecated in
    # pandas, and the per-row apply is needlessly slow.
    latest = summed.columns[-1]

    # .copy() gives an independent frame so the edits below never act on a
    # view of `summed` (avoids SettingWithCopyWarning).
    result = summed[['Country/Region', latest]].copy()
    result.columns = ['Country', col_name]

    # Normalise country spellings; only the name column is touched.
    result['Country'] = result['Country'].replace({'Korea, South': 'South Korea', 'Czechia': 'Czech Republic', 'Taiwan*': 'Taiwan', 'US': 'United States'})
    return result

In [32]:
# Read the global deaths time series, reduce it to a single 'Total Deaths'
# value per country, and inner-join it onto the combined table.
df_deaths = pd.read_csv(filepath_or_buffer='./csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv')
total_deaths = preprocess_summation(df_deaths, description='Deaths')
df = pd.merge(df, total_deaths, on='Country', how='inner')

In [33]:
# Read the global recoveries time series, reduce it to a single
# 'Total Recovered' value per country, and inner-join it onto the combined
# table.
df_recovered = pd.read_csv(filepath_or_buffer='./csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv')
total_recover = preprocess_summation(df_recovered, description='Recovered')
df = pd.merge(df, total_recover, on='Country', how='inner')
df.head()

Unnamed: 0,Country,density,urbanpop,quarantine,schools,publicplace,gatheringlimit,nonessential,smokers,hospibed,sex0,sex14,sex25,sex54,sex64,sex65plus,sexratio,lung,femalelung,malelung,healthexp,firstcase,Total Infected,GDP 2018,Crime Index,Population 2020,Smoking 2016,Females 2018,Median Age,Total Deaths,Total Recovered
0,Albania,105.0,63.0,,,,,,29.4,2.9,1.08,1.11,1.09,0.93,0.95,0.87,0.98,11.67,7.02,17.04,774,3/7/2020,259,15102500000.0,40.02,2877.797,28.7,49.063095,32.9,15,67
1,Algeria,18.0,73.0,,,,,,,1.9,1.05,1.05,1.05,1.03,1.01,0.89,1.03,8.77,5.03,12.81,1031,2/24/2020,847,173758000000.0,54.41,43851.044,15.6,49.484268,28.1,58,61
2,Argentina,17.0,93.0,3/20/2020,,,,,23.95,5.0,1.05,1.06,1.05,1.0,0.94,0.71,0.98,29.27,20.16,42.59,1390,3/2/2020,1054,519871500000.0,62.96,45195.774,21.8,51.237348,31.7,28,248
3,Armenia,104.0,63.0,,,,,,26.9,4.2,1.13,1.14,1.06,0.93,0.84,0.67,0.94,23.86,16.17,35.99,883,2/29/2020,571,12433090000.0,20.78,2963.243,24.1,52.956577,35.1,4,31
4,Netherlands,593.0,44.0,,,,,,,1.2,1.02,1.01,1.01,0.93,0.87,0.64,0.9,,,,43,3/12/2020,13696,913658500000.0,28.54,17134.872,25.8,50.220944,42.6,1175,260


In [35]:
# Give the Kaggle-derived columns readable titles. Renaming by name (rather
# than assigning a positional list of all headers) cannot mislabel data if the
# column order upstream changes; columns not listed keep their current names.
df = df.rename(columns={
    'density': 'Density',
    'urbanpop': 'Urban Pop',
    'quarantine': 'Quarantine',
    'schools': 'Schools',
    'publicplace': 'Public Place',
    'gatheringlimit': 'Gathering Limit',
    'nonessential': 'Nonessential',
    'smokers': 'Smokers',
    'hospibed': 'Hospital Beds',
    'sexratio': 'Sex Ratio',
    'lung': 'Lung',
    'femalelung': 'Female Lung',
    'malelung': 'Male Lung',
    'healthexp': 'Health Exp',
    'firstcase': 'First Case',
})

df.head()

Unnamed: 0,Country,Density,Urban Pop,Quarantine,Schools,Public Place,Gathering Limit,Nonessential,Smokers,Hospital Beds,sex0,sex14,sex25,sex54,sex64,sex65plus,Sex Ratio,Lung,Female Lung,Male Lung,Health Exp,First Case,Total Infected,GDP 2018,Crime Index,Population 2020,Smoking 2016,Females 2018,Median Age,Total Deaths,Total Recovered
0,Albania,105.0,63.0,,,,,,29.4,2.9,1.08,1.11,1.09,0.93,0.95,0.87,0.98,11.67,7.02,17.04,774,3/7/2020,259,15102500000.0,40.02,2877.797,28.7,49.063095,32.9,15,67
1,Algeria,18.0,73.0,,,,,,,1.9,1.05,1.05,1.05,1.03,1.01,0.89,1.03,8.77,5.03,12.81,1031,2/24/2020,847,173758000000.0,54.41,43851.044,15.6,49.484268,28.1,58,61
2,Argentina,17.0,93.0,3/20/2020,,,,,23.95,5.0,1.05,1.06,1.05,1.0,0.94,0.71,0.98,29.27,20.16,42.59,1390,3/2/2020,1054,519871500000.0,62.96,45195.774,21.8,51.237348,31.7,28,248
3,Armenia,104.0,63.0,,,,,,26.9,4.2,1.13,1.14,1.06,0.93,0.84,0.67,0.94,23.86,16.17,35.99,883,2/29/2020,571,12433090000.0,20.78,2963.243,24.1,52.956577,35.1,4,31
4,Netherlands,593.0,44.0,,,,,,,1.2,1.02,1.01,1.01,0.93,0.87,0.64,0.9,,,,43,3/12/2020,13696,913658500000.0,28.54,17134.872,25.8,50.220944,42.6,1175,260


In [36]:
# Swap 'Total Infected' with the third-from-last column so the three outcome
# totals (infected, deaths, recovered) sit together at the end of the table.
cols = df.columns.tolist()
total_infected_index = cols.index('Total Infected')
cols[-3], cols[total_infected_index] = cols[total_infected_index], cols[-3]
df = df[cols]

# Spot-check a single country in the reordered frame.
df[df.Country == 'India']

Unnamed: 0,Country,Density,Urban Pop,Quarantine,Schools,Public Place,Gathering Limit,Nonessential,Smokers,Hospital Beds,sex0,sex14,sex25,sex54,sex64,sex65plus,Sex Ratio,Lung,Female Lung,Male Lung,Health Exp,First Case,Median Age,GDP 2018,Crime Index,Population 2020,Smoking 2016,Females 2018,Total Infected,Total Deaths,Total Recovered
116,India,464.0,35.0,3/25/2020,,,,,11.15,0.7,1.12,1.13,1.13,1.06,1.01,0.9,1.08,96.92,87.54,106.89,238,1/29/2020,27.9,2718732000000.0,42.38,1380004.385,11.5,48.02354,1998,58,148


In [10]:
# Write the combined table to disk; index=False leaves the row index out of
# the file.
df.to_csv(path_or_buf='covid19_by_country.csv', index=False)